In [26]:
import torch
import json
import time
from torch.utils.data import random_split, DataLoader
from utils.bert_model import get_dataset, tokenize, avgPoolFunc
from transformers import BertTokenizer

import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose
from transformers import BertModel, BertTokenizer

torch.manual_seed(0)

<torch._C.Generator at 0x7fb7db86f9b0>

In [35]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
def clean_data():
    with open("data/bigger sample.json", "r") as file:
        with open("data/cleanedData.json", "w") as cleaned:
            for line in file:
                lineJ = json.loads(line)
                if lineJ["label"] in ["Bullish", "Bearish"]:
                    cleaned.write(json.dumps(lineJ)+"\n")

def get_max_length(cleanedDataFile, tokenizer):
    max_length = 0
    with open(cleanedDataFile, "r") as file:
        for line in file:
            lineJ = json.loads(line)
            max_length = max(max_length, len(tokenizer.encode(lineJ["body"])))
    
    return max_length

class BertFineTuned(nn.Module):
    def __init__(self, out_dim, poolFunc, hiddenStates=768):
        """
        out_dim is the dim of the output, default is 2 for sentiment classification
        poolFunc is the function for pooling the output embedings of bert, 
            suggested to use avgPoolFunc
        hiddenStates is the dim of output embeddings of bert, default is 768
        """
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        for param in self.bert.parameters():
            param.requires_grad = False
        #self.pooler = Pooler(hiddenStates, poolFunc)
        self.pooler = nn.Linear(hiddenStates, hiddenStates)
        self.dense = nn.Linear(hiddenStates, out_dim)
        self.pool_func = poolFunc
        self.normalize = nn.BatchNorm1d(hiddenStates)

    def forward(self, x):
        x = self.bert(x)[0]
        x = self.pool_func(x)
        x = self.pooler(x)
        x = self.normalize(x)
        x = torch.tanh(x)
        x = self.dense(x)
        x = F.softmax(x, dim=1)
        return x

In [36]:
max_len = get_max_length("data/cleanedData.json", tokenizer)

In [37]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else: 
    device = torch.device("cpu")

In [38]:
bigData = get_dataset("data/cleanedData.json", max_length=max_len)

In [39]:
dataSize = len(bigData)
trainSize = round(dataSize * 0.9)
testSize = dataSize - trainSize
train, test = random_split(bigData, [trainSize, testSize])

In [40]:
trainSize, testSize

(23784, 2643)

In [41]:
trainloader = DataLoader(train, batch_size = 32, shuffle=True, num_workers=10)
testloader = DataLoader(test, batch_size=32, num_workers=10)
model = BertFineTuned(2, avgPoolFunc).to(device)


sgdOptim = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True)
adamOptim = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()

loss_history = []
running_loss = 0.0
elapsed_time = 0.0

In [42]:
for j in range(50):
    for i, data in enumerate(testloader):
        t1 = time.time()

        inputs = data["text"].to(device)
        labels = data["label"].to(device)

        adamOptim.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        adamOptim.step()

        running_loss += loss.item()
        elapsed_time += time.time() - t1

        if i%50 == 49:
            print(f"{running_loss} loss at epoch {j+1}, step {i+1}, average time per step: {elapsed_time/50}")
            loss_history.append(running_loss)
            running_loss = 0.0
            elapsed_time = 0.0
    running_loss = 0.0

35.11595141887665 loss at epoch 1, step 50, average time per step: 2.005021057128906
32.611062467098236 loss at epoch 2, step 50, average time per step: 3.3772318601608275
32.223897993564606 loss at epoch 3, step 50, average time per step: 3.3960593509674073
32.086209774017334 loss at epoch 4, step 50, average time per step: 3.3805804586410524
31.989273250102997 loss at epoch 5, step 50, average time per step: 3.3869371700286863
31.892602145671844 loss at epoch 6, step 50, average time per step: 3.3837448358535767
31.802795737981796 loss at epoch 7, step 50, average time per step: 3.3840641260147093
31.73996177315712 loss at epoch 8, step 50, average time per step: 3.3805708360671995
31.639796525239944 loss at epoch 9, step 50, average time per step: 3.384413547515869
31.622190356254578 loss at epoch 10, step 50, average time per step: 3.387392454147339
31.488242626190186 loss at epoch 11, step 50, average time per step: 3.38781334400177
31.507656276226044 loss at epoch 12, step 50, av

In [22]:
2643/32

82.59375

In [18]:
result = model.bert(sample["text"].cuda())
result[0].size()

torch.Size([32, 461, 768])

In [None]:
loss.item()

In [None]:
loss.backward()

In [None]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose
from transformers import BertModel, BertTokenizer

In [None]:
class BertFineTuned(nn.Module):
    def __init__(self, out_dim, poolFunc, hiddenStates=768):
        """
        out_dim is the dim of the output, default is 2 for sentiment classification
        poolFunc is the function for pooling the output embedings of bert, 
            suggested to use avgPoolFunc
        hiddenStates is the dim of output embeddings of bert, default is 768
        """
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        for param in self.bert.parameters():
            param.requires_grad = False
        #self.pooler = Pooler(hiddenStates, poolFunc)
        self.pooler = nn.Linear(hiddenStates, hiddenStates//2)
        self.dense = nn.Linear(hiddenStates//2, out_dim)
        self.pool_func = poolFunc
        self.normalize = nn.BatchNorm1d(hiddenStates//2)

    def trainable(self):
        return [*self.dense.parameters()]

    def forward(self, x):
        x = self.bert(x)[0]
        x = self.pool_func(x)
        x = self.pooler(x)
        x = self.normalize(x)
        x = F.tanh(x)
        x = self.dense(x)
        x = F.softmax(x, dim=1)
        return x

In [None]:
6*60/83.4*5

In [None]:
right = 0
total = 0
for data in testloader:
    inputs = data["text"].to(device)
    labels = data["label"].to(device)

    output = model(inputs)
    predictions = torch.argmax(output, dim=1)
    correct = predictions == labels
    right += correct.sum().item()
    total += labels.size()[0]
    

In [None]:
labels.size()[0]

In [None]:
right.item()/total

In [None]:
total

In [24]:
result = model(sample["text"].cuda())

In [25]:
torch.save(result, "data/tensor.pt")