In [None]:
import transformers
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW

# if you do not have torch, please refer to https://pytorch.org/ [INSTALL PYTORCH]
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split

import pandas as pd
import re
import string
import operator
import numpy as np
import random

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

# Print the library version so results can be tied to a specific release.
print(transformers.__version__)

# Single seed shared by every random number generator used below.
seed = 38

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing on the first
# `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('\n')
print(device)

# Seed Python, NumPy and PyTorch (CPU and every GPU) for repeatable runs.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

4.35.2


cuda


In [None]:
from nltk.corpus import stopwords
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

nltk.download('stopwords')
stop = stopwords.words('english')
# Set lookup is O(1); `stop` itself stays a list for any code that expects one.
stop_words = set(stop)

# Load your datasets
bjp = pd.read_csv('BJP.csv', encoding='ISO-8859-1')
congress = pd.read_csv('congress.csv', encoding='ISO-8859-1')

# Combine the datasets
df = pd.concat([bjp, congress])

# Fill missing values with an empty string
df['commentText'] = df['commentText'].fillna('')

# Shuffle the dataset
df = df.sample(frac=1, random_state=42)

# Split the dataset into train (60%), validation (20%) and test (20%) sets
df_train, temp_df = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(temp_df, test_size=0.5, random_state=42)

# Work on independent copies so that adding columns below never writes into
# (or warns about) a slice of the original frame.
df_train = df_train.copy()
df_val = df_val.copy()
df_test = df_test.copy()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

print(df_train.shape, df_test.shape, df_val.shape)
print('\n')


def _clean_comments(comments):
    """Return a lower-cased, tag-free, punctuation-free, stop-word-free copy.

    Args:
        comments: pandas Series of strings (missing values already filled).

    Returns:
        pandas Series of cleaned strings with the same index.
    """
    cleaned = comments.str.lower()
    # The HTML line break must be removed BEFORE punctuation is stripped;
    # afterwards the characters '<', '/' and '>' are gone and the literal
    # '<br />' can no longer match.
    cleaned = cleaned.str.replace('<br />', '', regex=False)
    # regex=True is spelled out: recent pandas versions treat the pattern as
    # a literal string by default, which would silently disable this step.
    cleaned = cleaned.str.replace(r'[^\w\s]+', '', regex=True)
    return cleaned.apply(
        lambda text: ' '.join(word for word in text.split() if word not in stop_words)
    )


# get the list of {content, token, ids}
# NOTE(review): df_test is left unprocessed, as before; run it through
# _clean_comments as well before evaluating a model trained on cleaned text.
df_val['pre_text'] = _clean_comments(df_val['commentText'])
df_train['pre_text'] = _clean_comments(df_train['commentText'])

print('Text Pre-Processing Finish!')

# To keep later cells simple, the cleaned text replaces the raw comment column.
df_val['commentText'] = df_val['pre_text']
df_train['commentText'] = df_train['pre_text']

print(df_train.shape, df_test.shape, df_val.shape)
content = df_train['commentText'].values
labels = df_train['Label'].values

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

(2370, 3) (790, 3) (790, 3)


Text Pre-Processing Finish!
(2370, 4) (790, 3) (790, 4)


  df_val['pre_text'] = df_val['pre_text'].str.replace(r'[^\w\s]+', '')
  df_train['pre_text'] = df_train['pre_text'].str.replace(r'[^\w\s]+', '')


In [None]:
def encoding_process(_content, max_length=256):
    """Encode texts into one tensor of fixed-length BERT input ids.

    Uses the notebook-level `tokenizer`. Every text is wrapped in the model's
    special tokens, truncated to `max_length` tokens and padded up to
    `max_length`, so all rows have the same width.

    NOTE: a later cell defines another `encoding_process` that also returns
    attention masks; once that cell runs it replaces this definition.

    Args:
        _content: iterable of strings to encode.
        max_length: fixed sequence length of every encoded row.

    Returns:
        A tensor with one row of token ids per input text, each row
        `max_length` wide. An empty input yields a tensor with zero rows.
    """
    get_ids = []
    for text in _content:
        input_ids = tokenizer.encode(
                        text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `padding=` replaces the deprecated pad_to_max_length flag.
                        padding = 'max_length',
                        # Explicit truncation: all rows must share one width
                        # for torch.cat below to succeed.
                        truncation = True,
                        return_tensors = 'pt')
        get_ids.append(input_ids)

    if not get_ids:
        # torch.cat refuses an empty list; return a well-formed empty batch.
        return torch.empty((0, max_length), dtype=torch.long)

    get_ids = torch.cat(get_ids, dim=0)
    return get_ids

In [None]:
def encoding_process(content, max_length=512):
    """Encode texts into fixed-length BERT input ids and attention masks.

    Uses the notebook-level `tokenizer`. Every text is wrapped in the model's
    special tokens, truncated to `max_length` tokens and padded up to
    `max_length`.

    Args:
        content: iterable of strings to encode.
        max_length: fixed sequence length of every encoded row.

    Returns:
        Tuple `(input_ids, attention_masks)`; both tensors have one row per
        input text and are `max_length` wide. An empty input yields two
        tensors with zero rows.
    """
    input_ids = []
    attention_masks = []

    for sent in content:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens = True,
            max_length = max_length,
            # `padding=` replaces the deprecated pad_to_max_length flag.
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt',
            truncation=True
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    if not input_ids:
        # torch.cat refuses an empty list; return well-formed empty batches.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [None]:
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# NOTE: this cell relies on `nltk`, `stopwords`, `pd`, `train_test_split` and
# `device` from earlier cells. It reloads the raw CSV files, so the text
# cleaning done in the earlier pre-processing cell is NOT applied here.

# Download stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

# Load your datasets
bjp = pd.read_csv('BJP.csv', encoding='ISO-8859-1')
congress = pd.read_csv('congress.csv', encoding='ISO-8859-1')

# Combine the datasets
df = pd.concat([bjp, congress])

# Fill missing values with an empty string
df['commentText'] = df['commentText'].fillna('')

# Shuffle the dataset
df = df.sample(frac=1, random_state=42)

# Split the dataset into train, validation, and test sets
df_train, temp_df = train_test_split(df, test_size=0.4, random_state=42)
df_val, df_test = train_test_split(temp_df, test_size=0.5, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize the text and convert the tokens to numerical values
content = df_train['commentText'].values
labels = df_train['Label'].values

# Tokenize and pad the sequences
inputs = []
for sent in tqdm(content, desc="Tokenizing and Padding"):
    tokenized = tokenizer.encode(sent, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    inputs.append(tokenized)

# Convert all inputs and labels into torch tensors
inputs = torch.tensor(inputs)
labels = torch.tensor(labels)

# Create an iterator of our data with torch DataLoader
train_data = TensorDataset(inputs, labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# The training cells iterate over `val_dataloader`, which was never built.
# Encode the validation split the same way and yield (ids, labels) batches in
# a fixed order.
val_inputs = [
    tokenizer.encode(sent, add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    for sent in tqdm(df_val['commentText'].values, desc="Tokenizing and Padding (val)")
]
val_inputs = torch.tensor(val_inputs)
val_labels = torch.tensor(df_val['Label'].values)
val_data = TensorDataset(val_inputs, val_labels)
val_dataloader = DataLoader(val_data, batch_size=32, shuffle=False)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2, output_attentions=False, output_hidden_states=False)
# The training loops move every batch to `device`; the model has to live on
# the same device or the forward pass raises a device-mismatch error.
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
output_model = './content/model/imdb_bert.pth'
epochs = 4
# One scheduler step per optimizer step: batches per epoch times epochs.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Save function
def save(model, optimizer):
    """Write model and optimizer state dicts to the path in `output_model`.

    The checkpoint is a dict with the keys 'model_state_dict' and
    'optimizer_state_dict'.

    Args:
        model: object exposing `state_dict()` (the network being trained).
        optimizer: object exposing `state_dict()` (its optimizer).
    """
    import os

    # torch.save does not create missing directories, so make sure the
    # target folder exists before writing.
    target_dir = os.path.dirname(output_model)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

# Accuracy calculation functions
def accuracy_calc(preds, labels):
    """Accuracy of the arg-max class against the true labels.

    Args:
        preds: array of per-class scores; the predicted class is the arg-max
            along axis 1 (presumably one row per sample).
        labels: array of true class ids; flattened before comparison.

    Returns:
        The value of `accuracy_score` for the flattened labels and predictions.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return accuracy_score(labels.flatten(), predicted_classes)

def f1_accuracy(preds, labels):
    """F1 score of the arg-max class against the true labels.

    Args:
        preds: array of per-class scores; the predicted class is the arg-max
            along axis 1 (presumably one row per sample).
        labels: array of true class ids; flattened before comparison.

    Returns:
        The value of `f1_score` (called with its default settings) for the
        flattened labels and predictions.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Tokenizing and Padding: 100%|██████████| 2370/2370 [00:00<00:00, 2488.30it/s]


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune for `epochs` passes over `train_dataloader`, validating after each
# pass. Batches are expected to be (input_ids, labels) pairs; the attention
# mask is derived from the ids by treating id 0 as padding.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss, total_val_loss = 0, 0
    train_logits, train_labels = [], []
    for step, batch in enumerate(train_dataloader):
        model.zero_grad()
        loss, tval_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device),return_dict = False)
        total_loss += loss.item()
        loss.backward()
        # Cap the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        train_logits.append(tval_.detach().cpu().numpy())
        train_labels.append(batch[1].to('cpu').numpy())

    # ---- validation pass (no gradients needed) ----
    model.eval()
    val_logits, val_labels = [], []
    with torch.no_grad():
        for i, batch in enumerate(val_dataloader):
            loss, val_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device),return_dict = False)

            total_val_loss += loss.item()

            val_logits.append(val_.detach().cpu().numpy())
            val_labels.append(batch[1].to('cpu').numpy())

    # Score each split once over all of its samples. Averaging per-batch
    # scores is not the same quantity: F1 is not linear in the samples, and
    # a smaller last batch would get the same weight as a full one.
    train_logits = np.concatenate(train_logits, axis=0)
    train_labels = np.concatenate(train_labels, axis=0)
    val_logits = np.concatenate(val_logits, axis=0)
    val_labels = np.concatenate(val_labels, axis=0)

    training_loss = total_loss / len(train_dataloader)
    valid_loss = total_val_loss / len(val_dataloader)
    _accuracy = accuracy_calc(val_logits, val_labels)
    _f1_score = f1_accuracy(val_logits, val_labels)
    train_f1_score = f1_accuracy(train_logits, train_labels)

    print('Training loss is', training_loss)
    print('Valid loss is:', valid_loss)
    print('Acc score is:', _accuracy)
    print('F1_score is:', _f1_score)
    print('train_F1_score is:', train_f1_score)
    print('\n')

# Persist the weights and optimizer state reached after the final epoch.
save(model, optimizer)

KeyError: 211

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

accumulation_steps = 2  # Accumulate gradients over 2 batches before updating weights

# NOTE(review): `scheduler` was created for one step per batch; with
# accumulation it advances only once per `accumulation_steps` batches, so the
# learning rate decays more slowly than planned. Rebuild it with the reduced
# step count if the original decay shape matters.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    _train_f1 = 0

    # Start the epoch with clean gradients. They must NOT be cleared at the
    # top of every step, or the contribution of the previous micro-batch is
    # erased and nothing is ever accumulated.
    model.zero_grad()

    for step, batch in enumerate(train_dataloader):
        loss, tval_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0] > 0).to(device),
                            labels=batch[1].to(device), return_dict=False)
        # Log the unscaled loss so the reported value stays comparable with
        # the non-accumulating loop.
        total_loss += loss.item()
        # Scale before backward so the summed gradient is the mean over the
        # accumulated micro-batches rather than their sum.
        (loss / accumulation_steps).backward()

        # Apply an update once enough micro-batches have been accumulated.
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        tval_ = tval_.detach().cpu().numpy()
        label_ids = batch[1].to('cpu').numpy()
        _train_f1 += f1_accuracy(tval_, label_ids)

    # Handling remaining accumulated gradients if any
    if len(train_dataloader) % accumulation_steps != 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        # Clear the leftovers so they cannot leak into later work.
        model.zero_grad()

    # Validation loop (similar to your existing code)

    training_loss = total_loss / len(train_dataloader)
    # ... (calculate other metrics as needed)

    print('Epoch:', epoch)
    print('Training loss is', training_loss)
    # ... (print other metrics)

# Save the model and optimizer state at the end of training
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # ... (add other components you want to save)
}, 'checkpoint.pth')


KeyError: 2166

In [None]:
batch_size = 8  # or any smaller value
# A DataLoader fetches samples as dataset[i] with integer positions. On a
# pandas DataFrame that is a column lookup, which is consistent with the
# KeyError (e.g. 211, 2166) recorded by the training cells. Feed it the
# tensor dataset built from the tokenized training split instead.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# A smaller batch size means more optimizer steps per epoch; rebuild the
# linear schedule so the learning rate does not reach zero part-way through.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [None]:
# Ask PyTorch's CUDA caching allocator to release cached blocks it is not
# currently using. Tensors that are still referenced are not freed by this.
torch.cuda.empty_cache()


In [None]:
# Clips the global gradient norm of the model parameters to 0.5 and returns
# the norm measured before clipping. Run on its own, outside a training step,
# this returned tensor(0.) in the recorded output, presumably because no
# gradients were populated. The call is only useful between loss.backward()
# and optimizer.step().
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # or any other value


tensor(0.)

In [None]:
# Repeat of the train/validate loop. The recorded IndentationError came from
# one line inside the `with` block being indented by 13 spaces instead of 12;
# it is aligned here. Batches are expected to be (input_ids, labels) pairs;
# the attention mask treats token id 0 as padding.
for epoch in range(epochs):
    model.train()
    total_loss, total_val_loss = 0, 0
    total_eval_accuracy = 0
    _f1 = 0
    _train_f1 = 0
    for step, batch in enumerate(train_dataloader):
        model.zero_grad()
        loss, tval_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device),return_dict = False)
        total_loss += loss.item()
        loss.backward()
        # Cap the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        tval_ = tval_.detach().cpu().numpy()
        label_ids = batch[1].to('cpu').numpy()
        _train_f1 += f1_accuracy(tval_, label_ids)

    model.eval()
    for i, batch in enumerate(val_dataloader):
        with torch.no_grad():
            loss, val_ = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0]>0).to(device), labels=batch[1].to(device),return_dict = False)

            total_val_loss += loss.item()

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 24)

In [None]:
#Trying another one from here
# if you do not have transformers, please !pip install transformers
import transformers
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW

# if you do not have torch, please refer to https://pytorch.org/ [INSTALL PYTORCH]
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split

import pandas as pd
import re
import string
import operator
import numpy as np
import random

from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Print the library version so results can be tied to a specific release.
print(transformers.__version__)

# Single seed shared by every random number generator used below.
seed = 38

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('\n')
print(device)

# Seed Python, NumPy and PyTorch (CPU and every GPU) for repeatable runs.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

4.35.2


cuda


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch


In [None]:
from torch.nn.utils.rnn import pad_sequence

# Load the comments in this cell so it does not depend on variables that are
# only created further down the notebook. Rows without a usable text are
# dropped; otherwise missing comments would be encoded as the literal string
# 'nan'.
combined_data = pd.concat(
    [
        pd.read_csv('BJP.csv', encoding='ISO-8859-1'),
        pd.read_csv('congress.csv', encoding='ISO-8859-1'),
    ],
    ignore_index=True,
)
combined_data = combined_data.dropna(subset=['commentText'])
combined_data = combined_data[combined_data['commentText'].apply(lambda x: isinstance(x, str))]

# The tokenizer must share its vocabulary with the model fine-tuned below
# ('bert-base-multilingual-cased'); ids from the uncased English tokenizer
# loaded earlier would index the wrong embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize input texts in one batched call. Every row is padded/truncated to
# exactly 64 tokens, so no further padding is needed.
encoded = tokenizer(
    combined_data['commentText'].astype(str).tolist(),
    padding='max_length',
    truncation=True,
    max_length=64,
    return_tensors="pt",
)

# Extract input IDs, attention masks, and labels
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(combined_data['Label'].tolist())

# Create PyTorch dataset
dataset = TensorDataset(input_ids, attention_masks, labels)


In [None]:
# Hold out 20% of the samples for validation; the rest is used for training.
n_samples = len(dataset)
train_size = int(0.8 * n_samples)
val_size = n_samples - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Batches of 16: training data is reshuffled every epoch, validation data is
# read in a fixed order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=16)


In [None]:
# Multilingual cased BERT with a two-class classification head. The recorded
# output shows the head weights are newly initialized and need fine-tuning.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=2,
)


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use PyTorch's own AdamW: the implementation shipped with transformers is
# deprecated in favour of it. weight_decay=0.0 keeps the effective settings
# of the previous call, whose default applied no weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 3
# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = len(train_dataloader) * epochs
# Linear decay from the initial learning rate to zero, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)




In [None]:
# Run on the GPU when one is present, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Fine-tune: one optimizer and one scheduler step per batch.
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels); move all three
        # to the training device in one go.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Passing the labels makes the model return the loss with its outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()
        # Clear gradients so the next batch starts from zero.
        optimizer.zero_grad()

# Save the model
torch.save(model.state_dict(), 'hinglish_sentiment_model.pth')


In [None]:
# Forward every validation batch without tracking gradients.
model.eval()
with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # Calculate metrics or make predictions as needed

# `outputs` is overwritten on each iteration, so this shows only the loss and
# logits of the final validation batch.
print(outputs)


SequenceClassifierOutput(loss=tensor(0.8705, device='cuda:0'), logits=tensor([[-0.3767, -0.3177],
        [-0.3767, -0.3177],
        [-0.3767, -0.3177],
        [ 0.9490, -1.1676],
        [-2.0471,  1.7929],
        [-1.5913,  1.3966]], device='cuda:0'), hidden_states=None, attentions=None)


In [None]:
from sklearn.metrics import accuracy_score

# Collect predictions and labels over the whole validation set.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # No labels are passed: only the logits are needed for prediction.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit per row.
        predictions = logits.argmax(dim=1)

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f'Accuracy: {accuracy}')


Accuracy: 0.6822784810126582


In [None]:
import torch
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load datasets
bjp_data = pd.read_csv('BJP.csv', encoding='ISO-8859-1')
congress_data = pd.read_csv('congress.csv', encoding='ISO-8859-1')

# Combine datasets
combined_data = pd.concat([bjp_data, congress_data], ignore_index=True)

# Drop rows with NaN or non-string values in 'commentText'
combined_data = combined_data.dropna(subset=['commentText'])
combined_data = combined_data[combined_data['commentText'].apply(lambda x: isinstance(x, str))]

# Split the dataset (80% train / 20% validation, fixed seed)
train_data, val_data = train_test_split(combined_data, test_size=0.2, random_state=42)

# Tokenize input texts. padding=True pads every row to the longest sequence of
# its split; truncation=True caps rows at the tokenizer's maximum length.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_tokenized = tokenizer(list(train_data['commentText']), padding=True, truncation=True, return_tensors="pt")
val_tokenized = tokenizer(list(val_data['commentText']), padding=True, truncation=True, return_tensors="pt")

# Create PyTorch datasets
train_labels = torch.tensor(train_data['Label'].tolist())
val_labels = torch.tensor(val_data['Label'].tolist())

train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], train_labels)
val_dataset = TensorDataset(val_tokenized['input_ids'], val_tokenized['attention_mask'], val_labels)

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Load pre-trained BERT model; the size of the classification head is stated
# explicitly instead of relying on the library default.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
# Multiplies the learning rate by 0.9 after every 5 calls to scheduler.step().
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.9)

# Training loop
epochs = 5
# Best validation accuracy seen so far; used to decide when to checkpoint.
best_accuracy = float('-inf')
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # The scheduler was created but never advanced; step it once per epoch so
    # the configured decay can actually take effect.
    scheduler.step()

    # Validation loop
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {total_loss}, Validation Accuracy: {accuracy}')

    # Save the trained model whenever validation accuracy improves, so the
    # file holds the best epoch rather than whatever the last epoch produced
    # (validation accuracy can drop again in later epochs).
    # NOTE(review): selecting on the validation set makes its accuracy an
    # optimistic estimate; use a separate test split for the final number.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'bert_model.pth')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5, Loss: 103.04284599423409, Validation Accuracy: 0.7953964194373402
Epoch 2/5, Loss: 66.81324778683484, Validation Accuracy: 0.8465473145780051
Epoch 3/5, Loss: 42.37092965096235, Validation Accuracy: 0.8465473145780051
Epoch 4/5, Loss: 28.504635978490114, Validation Accuracy: 0.8618925831202046
Epoch 5/5, Loss: 20.046503081684932, Validation Accuracy: 0.8414322250639387
