In [None]:
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import warnings
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import evaluate
from sklearn.metrics import accuracy_score
import torch
import os
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

file_path_1 = os.getenv("DATASET_PATH_1")
file_path_2 = os.getenv("DATASET_PATH_2")

warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv(file_path_1)

In [None]:
df_2 =  pd.read_cs(file_path_2)

In [4]:
data = df[["text","sentiment"]]

In [5]:
data = pd.concat([data, df_2])

In [6]:
data["sentiment"].unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [7]:
label_encoder = LabelEncoder()
data["Label"] = label_encoder.fit_transform(data["sentiment"])
data

Unnamed: 0,text,sentiment,Label
0,What a great day!!! Looks like dream.,positive,2
1,"I feel sorry, I miss you here in the sea beach",positive,2
2,Don't angry me,negative,0
3,We attend in the class just for listening teac...,negative,0
4,"Those who want to go, let them go",negative,0
...,...,...,...
10277,Cheap doesn't mean better btw! . . techsall.co...,negative,0
10278,Nvidia doesn’t want to give up its 2017 ‘crypt...,neutral,1
10279,Nvidia really delayed the 3070 2 weeks .,negative,0
10280,Let no elim go unnoticed. . . . NVIDIA Highlig...,positive,2


In [11]:
# Load the model and tokenizer from the correct path
model = BertForSequenceClassification.from_pretrained("./saved_bert_model")
tokenizer = BertTokenizer.from_pretrained("./saved_bert_model")

In [12]:
dataset = Dataset.from_pandas(data[['text','Label']])

In [13]:
def preprocess_tokens(examples):
    return tokenizer(examples["text"], padding= "max_length", truncation= True, max_length=128)

In [14]:
tokenized_dataset = dataset.map(preprocess_tokens, batched=True)

Map: 100%|██████████| 10781/10781 [00:03<00:00, 3211.76 examples/s]


In [15]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2)

In [16]:
tokenized_dataset = tokenized_dataset.rename_column("Label", "label")

In [17]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}


In [18]:
training_args = TrainingArguments(
    output_dir= "./results",
    evaluation_strategy= "epoch",
    save_strategy= "epoch",
    num_train_epochs= 10,
    learning_rate= 2e-5,
    weight_decay= 0.01,
    logging_dir= "./logs",
    logging_steps= 10,
    logging_strategy= "epoch"
)

trainer = Trainer(
    model = model,
    args= training_args,
    train_dataset= tokenized_dataset["train"],
    eval_dataset= tokenized_dataset["test"],
    compute_metrics= compute_metrics,
    tokenizer = tokenizer
)

In [216]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.773,No log
2,0.5286,No log
3,0.3497,No log
4,0.23,No log
5,0.1467,No log
6,0.0839,No log
7,0.0551,No log
8,0.0408,No log
9,0.0331,No log
10,0.0268,No log


TrainOutput(global_step=10780, training_loss=0.22676840624694258, metrics={'train_runtime': 3317.8408, 'train_samples_per_second': 25.993, 'train_steps_per_second': 3.249, 'total_flos': 5672725286215680.0, 'train_loss': 0.22676840624694258, 'epoch': 10.0})

In [217]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_runtime': 17.4029, 'eval_samples_per_second': 123.945, 'eval_steps_per_second': 15.515, 'epoch': 10.0}


In [249]:
predictions = trainer.predict(tokenized_dataset["test"])

In [21]:
texts = ["I like you"]
device = next(model.parameters()).device  
inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    preds = torch.argmax(probs, dim=1)

print(f'the text - {texts[0]} and the emotion is {label_encoder.inverse_transform([preds.cpu().numpy()[0]])}')

the text - I like you and the emotion is ['positive']


In [256]:
# Evaluate on test set
metrics = trainer.evaluate(tokenized_dataset["test"])
print(metrics)

{'eval_runtime': 16.6695, 'eval_samples_per_second': 129.398, 'eval_steps_per_second': 16.197, 'epoch': 10.0}


In [269]:
test_results = trainer.evaluate(tokenized_dataset["test"])
print(test_results)  # Confirm if "eval_accuracy" is now present
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f}")


{'eval_loss': 2.3420023918151855, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.71256374594344, 'eval_runtime': 16.6666, 'eval_samples_per_second': 129.42, 'eval_steps_per_second': 16.2}
Test Accuracy: 0.7126


In [80]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

model_name = f"cardiffnlp/twitter-roberta-base-sentiment"
roberta_tokenizer = AutoTokenizer.from_pretrained(model_name)
roberta_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [107]:
def classify(text):

    tokenized_text = roberta_tokenizer(text, return_tensors= 'pt')
    output = roberta_model(**tokenized_text)

    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    scores_dict = {0 : "Negative",1:  "Neutral", 2: "Positive"}

    sentiment = scores_dict[np.argmax(scores)]

    return sentiment

In [61]:
tokenized_dataset = dataset.map(preprocess_tokens, batched=True)

Map: 100%|██████████| 10781/10781 [00:02<00:00, 4340.94 examples/s]


In [110]:
test_set = tokenized_dataset['test']

In [63]:
tokenized_dataset.rename_column("Label", "label")

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 8624
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 2157
    })
})

In [124]:
def make_predictions(example):
    example['sentiment'] = classify(example['text'])

    return example

predictions = test_set.map(make_predictions)

Map: 100%|██████████| 2157/2157 [02:36<00:00, 13.77 examples/s]


In [125]:
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}

y_true = [label_map[label] for label in predictions['Label']]
y_pred = predictions['sentiment']

acc = accuracy_score(y_true, y_pred)
acc

0.6657394529439036