In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from transformers import BertTokenizer
import torch

In [11]:
# Load dataset
data = pd.read_csv('Sentiment.csv')
data = data[['text', 'sentiment']]

# Split the dataset into train and test sets
train, test = train_test_split(data, test_size=0.1)

# Remove neutral sentiments for binary classification
train = train[train.sentiment != 'Neutral']
test = test[test.sentiment != 'Neutral']

In [12]:
# Separate majority and minority classes
train_majority = train[train.sentiment == 'Positive']
train_minority = train[train.sentiment == 'Negative']

# Upsample minority class
train_minority_upsampled = resample(train_minority, 
                                    replace=True,     # sample with replacement
                                    n_samples=len(train_majority),    # to match majority class
                                    random_state=123) # reproducible results

# Combine majority class with upsampled minority class
train_upsampled = pd.concat([train_majority, train_minority_upsampled])

# Display new class counts
print(train_upsampled.sentiment.value_counts())


sentiment
Positive    2004
Negative    2004
Name: count, dtype: int64


In [13]:
# Separate majority and minority classes
train_majority = train[train.sentiment == 'Positive']
train_minority = train[train.sentiment == 'Negative']

# Upsample minority class
train_minority_upsampled = resample(train_minority, 
                                    replace=True,     # sample with replacement
                                    n_samples=len(train_majority),    # to match majority class
                                    random_state=123) # reproducible results

# Combine majority class with upsampled minority class
train_upsampled = pd.concat([train_majority, train_minority_upsampled])

# Display new class counts
print(train_upsampled.sentiment.value_counts())


sentiment
Positive    2004
Negative    2004
Name: count, dtype: int64


In [14]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize training data
train_texts = train_upsampled['text'].tolist()
train_labels = train_upsampled['sentiment'].apply(lambda x: 1 if x == 'Positive' else 0).tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

# Tokenize validation data
val_texts = test['text'].tolist()
val_labels = test['sentiment'].apply(lambda x: 1 if x == 'Positive' else 0).tolist()
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


In [18]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)


In [19]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments




In [21]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


OSError: The paging file is too small for this operation to complete. (os error 1455)

In [None]:
results = trainer.evaluate()
print(results)

from sklearn.metrics import classification_report, confusion_matrix

y_pred = trainer.predict(val_dataset).predictions.argmax(axis=1)
print(classification_report(val_labels, y_pred, target_names=['Negative', 'Positive']))
print(confusion_matrix(val_labels, y_pred))
