<a href="https://colab.research.google.com/github/viyas52/stock-market-prediction-using-twitter-sentiment-analysis/blob/main/BERT_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from transformers import BertTokenizer, BertModel
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
import pandas as pd
import preprocess
import numpy as np


data = pd.read_csv('train/stock_data.csv')

data = preprocess.Preprocess_Tweets(data)
display(data)

train_pct = .8
np.random.seed(1)
idx = np.random.permutation(len(data))

X_train = data['Text_Cleaned'].values[idx[:int(train_pct*len(data))]]
y_train = data['Sentiment'].values[idx[:int(train_pct*len(data))]]
y_train[y_train==-1] = 0
X_test = data['Text_Cleaned'].values[idx[int(train_pct*len(data)):]]
y_test = data['Sentiment'].values[idx[int(train_pct*len(data)):]]
y_test[y_test==-1] = 0


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def preprocessing_for_bert(data):
    input_ids = []
    attention_masks = []


    for line in data:

        encoding = tokenizer.encode_plus(
                text=line,
                add_special_tokens=True,
                padding='max_length',
                max_length = MAX_LEN,
                truncation=True,
                return_tensors="pt",
                return_attention_mask=True
                )


        input_ids.append(encoding.get('input_ids'))
        attention_masks.append(encoding.get('attention_mask'))

    input_ids = torch.concat(input_ids)
    attention_masks = torch.concat(attention_masks)

    return input_ids, attention_masks


encoded = [tokenizer.encode(sent, add_special_tokens=True) for sent in data['Text_Cleaned'].values]
MAX_LEN = max([len(sent) for sent in encoded])
print('Max length: ', MAX_LEN)



X_train_inputs, X_train_masks = preprocessing_for_bert(X_train)
X_test_inputs, X_test_masks = preprocessing_for_bert(X_test)


y_train_labels = torch.tensor(y_train)
y_test_labels = torch.tensor(y_test)

print(X_train_inputs.shape, X_train_masks.shape, y_train_labels.shape)
print(X_test_inputs.shape, X_test_masks.shape, y_test_labels.shape)


batch_size = 16


train_data = TensorDataset(X_train_inputs, X_train_masks, y_train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(X_test_inputs, X_test_masks, y_test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

  pat = re.compile(pat, flags=flags)


Unnamed: 0,Text,Sentiment,Text_Cleaned
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,aap movie 55 percent return for the fea and ge...
2,user I'd be afraid to short AMZN - they are lo...,1,id be afraid to short amzn to they are looking...
3,MNTA Over 12.00,1,mnta over 12.00
4,OI Over 21.37,1,oi over 21.37
...,...,...,...
5786,Industry body CII said #discoms are likely to ...,-1,industry body cii said are likely to suffer a ...
5787,"#Gold prices slip below Rs 46,000 as #investor...",-1,prices slip below rs 46000 as book profits ami...
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1,workers at bajaj auto have agreed to a 10 perc...
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1,live sensex off day’s high up 600 points tests...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Max length:  53
torch.Size([4632, 53]) torch.Size([4632, 53]) torch.Size([4632])
torch.Size([1159, 53]) torch.Size([1159, 53]) torch.Size([1159])


In [2]:
class BertClassifier(nn.Module):
    def __init__(self, freeze=False):
        super(BertClassifier, self).__init__()


        input_layer = 768
        hidden_layer = 50
        output_layer = 2


        self.bert = BertModel.from_pretrained('bert-base-uncased')


        self.classifier = nn.Sequential(
            nn.Linear(input_layer, hidden_layer),
            nn.ReLU(),
            nn.Linear(hidden_layer, output_layer))


        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False


    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        h_cls = outputs[0][:, 0, :]
        logits = self.classifier(h_cls)

        return logits


torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
np.random.seed(1)


if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


model = BertClassifier(freeze=False)


model.to(device)


epochs = 4
steps = len(train_dataloader) * epochs
learning_rate = 5e-5
epsilon = 1e-8


optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)


scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=steps)


loss_function = nn.CrossEntropyLoss()


for e in range(epochs):

    model.train()


    train_loss = 0


    for batch in train_dataloader:

        batch_inputs, batch_masks, batch_labels = batch


        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)
        batch_labels = batch_labels.to(device)


        model.zero_grad()


        logits = model(batch_inputs, batch_masks)


        loss = loss_function(logits, batch_labels)


        train_loss += loss.item()


        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()


    train_loss /= len(train_dataloader)


    model.eval()


    test_loss = 0
    test_acc = 0


    for batch in test_dataloader:

        batch_inputs, batch_masks, batch_labels = batch


        batch_inputs = batch_inputs.to(device)
        batch_masks = batch_masks.to(device)
        batch_labels = batch_labels.to(device)


        with torch.no_grad():
            logits = model(batch_inputs, batch_masks)


        loss = loss_function(logits, batch_labels)
        test_loss += loss.item()


        preds = torch.argmax(logits, dim=1).flatten()


        accuracy = (preds == batch_labels).cpu().numpy().mean() * 100
        test_acc += accuracy


    test_loss /= len(test_dataloader)
    test_acc /= len(test_dataloader)


    print('Epoch: %d  |  Train Loss: %1.5f  |  Test Loss: %1.5f  |  Test Accuracy: %1.2f'%(e+1, train_loss, test_loss, test_acc))


torch.save(model.state_dict(), 'stock_sentiment_model.pt')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch: 1  |  Train Loss: 0.52763  |  Test Loss: 0.47937  |  Test Accuracy: 80.28
Epoch: 2  |  Train Loss: 0.29166  |  Test Loss: 0.53739  |  Test Accuracy: 80.66
Epoch: 3  |  Train Loss: 0.13069  |  Test Loss: 0.74568  |  Test Accuracy: 80.54
Epoch: 4  |  Train Loss: 0.05439  |  Test Loss: 0.85913  |  Test Accuracy: 81.52
