In [3]:
# if you do not have transformers, please !pip install transformers
import transformers
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW

# if you do not have torch, please refer to https://pytorch.org/ [INSTALL PYTORCH]
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split

import pandas as pd
import re
import string
import operator
import numpy as np
import random

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

print(transformers.__version__)
seed = 38
device = torch.device('cuda')
print('\n')
print(device)

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True

4.36.2


cuda


In [4]:
from nltk.corpus import stopwords
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

nltk.download('stopwords')
stop = stopwords.words('english')

# Load your datasets
bjp = pd.read_csv('BJP.csv', encoding='ISO-8859-1')
congress = pd.read_csv('congress.csv', encoding='ISO-8859-1')

# Combine the datasets
df = pd.concat([bjp, congress])

# Fill missing values with an empty string
df['commentText'] = df['commentText'].fillna('')

# Shuffle the dataset
df = df.sample(frac=1, random_state=42)

# Split the dataset into train, validation, and test sets
df_train, temp_df = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(temp_df, test_size=0.5, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

print(df_train.shape, df_test.shape, df_val.shape)
print('\n')

# get the list of {content, token, ids}

df_val['pre_text'] = df_val['commentText'].str.lower()
df_val['pre_text'] = df_val['pre_text'].str.replace(r'[^\w\s]+', '')
df_val['pre_text'] = df_val['pre_text'].str.replace('<br />','')
df_val['pre_text'] = df_val['pre_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

df_train['pre_text'] = df_train['commentText'].str.lower()
df_train['pre_text'] = df_train['pre_text'].str.replace(r'[^\w\s]+', '')
df_train['pre_text'] = df_train['pre_text'].str.replace('<br />','')
df_train['pre_text'] = df_train['pre_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

print('Text Pre-Processing Finish!')

# To simplify the process, I make all df_val['text'] = df_val['pre_text']; df_train['text'] = df_train['pre_text']
df_val['commentText'] = df_val['pre_text']
df_train['commentText'] = df_train['pre_text']

print(df_train.shape, df_test.shape, df_val.shape)
content = df_train['commentText'].values
labels = df_train['Label'].values


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(2370, 3) (790, 3) (790, 3)


Text Pre-Processing Finish!
(2370, 4) (790, 3) (790, 4)


In [5]:
def encoding_process(_content):
    get_ids = []    
    for text in _content:
        input_ids = tokenizer.encode(
                        text,                      
                        add_special_tokens = True,
                        max_length = 256,
                        pad_to_max_length = True, 
                        return_tensors = 'pt')
        get_ids.append(input_ids)    
    
    get_ids = torch.cat(get_ids, dim=0)
    return get_ids

In [6]:
def encoding_process(content):
    input_ids = []
    attention_masks = []

    for sent in content:
        encoded_dict = tokenizer.encode_plus(
            sent,                      
            add_special_tokens = True, 
            max_length = 512,           
            pad_to_max_length = True,
            return_attention_mask = True,   
            return_tensors = 'pt',     
            truncation=True
        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks


In [7]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Download stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

# Load your datasets
bjp = pd.read_csv('BJP.csv', encoding='ISO-8859-1')
congress = pd.read_csv('congress.csv', encoding='ISO-8859-1')

# Combine the datasets
df = pd.concat([bjp, congress])

# Fill missing values with an empty string
df['commentText'] = df['commentText'].fillna('')

# Shuffle the dataset
df = df.sample(frac=1, random_state=42)

# Split the dataset into train, validation, and test sets
df_train, temp_df = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(temp_df, test_size=0.5, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the text and convert the tokens to numerical values
content = df_train['commentText'].values
labels = df_train['Label'].values

# Tokenize and pad the sequences
inputs = []
for sent in tqdm(content, desc="Tokenizing and Padding"):
    tokenized = tokenizer.encode(sent, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    inputs.append(tokenized)

# Convert all inputs and labels into torch tensors
inputs = torch.tensor(inputs)
labels = torch.tensor(labels)

# Create an iterator of our data with torch DataLoader 
train_data = TensorDataset(inputs, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_attentions=False, output_hidden_states=False)
model.cpu()
optimizer = AdamW(model.parameters(), lr=2e-5)
output_model = './content/model/imdb_bert.pth'
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Save function
def save(model, optimizer):
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

# Accuracy calculation functions
def accuracy_calc(preds, labels):
    pre = np.argmax(preds, axis=1).flatten()
    real = labels.flatten()
    return accuracy_score(real, pre)

def f1_accuracy(preds, labels):
    pre = np.argmax(preds, axis=1).flatten()
    real = labels.flatten()
    return f1_score(real, pre)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Tokenizing and Padding: 100%|██████████| 2370/2370 [00:00<00:00, 8509.76it/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# 💥 IMPORTANT: Please create the directory in your environment, 
# such like './content/model/', in order to save your model in your local!
for epoch in range(epochs):
    model.train()
    total_loss, total_val_loss = 0, 0
    total_eval_accuracy = 0
    _f1 = 0
    _train_f1 = 0
    for step, batch in enumerate(train_dataloader):
        model.zero_grad()
        loss, tval_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device),return_dict = False)
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step() 
        scheduler.step()
        tval_ = tval_.detach().cpu().numpy()
        label_ids = batch[1].to('cpu').numpy()
        _train_f1 += f1_accuracy(tval_, label_ids)
        
    model.eval()
    for i, batch in enumerate(val_dataloader):
        with torch.no_grad():
            loss, val_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device),return_dict = False)
                
            total_val_loss += loss.item()
            
            val_ = val_.detach().cpu().numpy()
            label_ids = batch[1].to('cpu').numpy()
            total_eval_accuracy += accuracy_calc(val_, label_ids)
            _f1 += f1_accuracy(val_, label_ids)
    
    training_loss = total_loss / len(train_dataloader)
    valid_loss = total_val_loss / len(val_dataloader)
    _accuracy = total_eval_accuracy / len(val_dataloader)
    _f1_score = _f1 / len(val_dataloader)
    train_f1_score = _train_f1/ len(train_dataloader)
    
    print('Training loss is', training_loss)
    print('Valid loss is:', valid_loss)
    print('Acc score is:', _accuracy)
    print('F1_score is:', _f1_score)
    print('train_F1_score is:', train_f1_score)
    print('\n')

save(model, optimizer)

AssertionError: Torch not compiled with CUDA enabled