In [1]:
import os

requirements_installed = False  # flipped to True after the first successful install
max_retries = 3  # number of additional attempts allowed after a failed install
retries = 0  # retries consumed so far; shared across calls to install_requirements()


def install_requirements():
    """Install the packages listed in ``requirements.txt`` with pip.

    The function is a no-op once an install has succeeded in the current
    session (tracked by the module-level ``requirements_installed`` flag).
    When the pip command exits with a non-zero status it is retried until the
    module-level ``retries`` counter reaches ``max_retries``.

    Returns:
        None.

    Raises:
        SystemExit: with code 1 when the install still fails after all
            retries have been used.
    """
    # Both names are rebound below, so both must be declared global; without
    # `retries` here the `retries += 1` statement makes it a local and the
    # comparison against `max_retries` fails with UnboundLocalError.
    global requirements_installed, retries
    if requirements_installed:
        print("Requirements already installed.")
        return

    while True:
        print("Installing requirements...")
        install_status = os.system("pip install -r requirements.txt")
        if install_status == 0:
            print("Requirements installed successfully.")
            requirements_installed = True
            return

        print("Failed to install requirements.")
        if retries >= max_retries:
            # Raise SystemExit directly instead of calling the `exit` helper,
            # which is only injected for interactive use.
            raise SystemExit(1)
        print("Retrying...")
        retries += 1

In [2]:
# Install the packages from requirements.txt; does nothing if a previous call
# in this kernel session already succeeded.
install_requirements()

Installing requirements...
Requirements installed successfully.


In [3]:
from dotenv import load_dotenv
import os


def setup_env():
    """Load variables from a ``.env`` file and check the required ones.

    ``load_dotenv()`` is called first, then every name in the required list is
    looked up with ``os.getenv``. A missing variable prints a hint and calls
    ``exit(1)``; a present one is reported as set. The required list is
    currently empty, so no variable is actually checked.
    """

    def _require(env_var):
        # Guard clause for the happy path; the failure path falls through.
        if os.getenv(env_var) is not None:
            print(f"{env_var} is set.")
            return
        print(f"Please set the {env_var} environment variable.")
        exit(1)

    load_dotenv()

    required_variables = []  # nothing is required yet
    for required in required_variables:
        _require(required)

In [4]:
# Load the .env file and run the required-variable checks (the required list
# inside setup_env is currently empty).
setup_env()

In [5]:
import pandas as pd


def read_dataset(path="data/twitter_sentiment_analysis/twitter.csv"):
    """Read the Twitter sentiment CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to the path this notebook has
            always used, so existing zero-argument calls behave as before.

    Returns:
        pandas.DataFrame: the file contents as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(path)

In [11]:
# Load the CSV into a DataFrame bound to the notebook-level name `dataset`.
dataset = read_dataset()

# Preview the first rows to check the column layout.
dataset.head()

Unnamed: 0,tweet_id,entity,sentiment,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [9]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer


def get_train_test_data():
    """Prepare the training and testing data.

    Reads the dataset with ``read_dataset()``, label-encodes the ``sentiment``
    column as the target, and splits features/target 80/20 with
    ``random_state=42``. Numeric feature columns are standard-scaled and
    object-typed columns are one-hot encoded; the preprocessor is fitted on
    the training split only and then applied to the test split.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as ``float32`` tensors.
        The two target tensors are reshaped to a single column, ``(n, 1)``.
    """
    dataset = read_dataset()
    target_column = "sentiment"
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    categorical_cols = X.select_dtypes(include=["object"]).columns
    numeric_cols = X.select_dtypes(include=["number"]).columns
    # NOTE(review): the object columns appear to include the free-text
    # `content` column, so the dense one-hot matrix gets one column per
    # distinct training tweet. Confirm this is intended.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_cols),
            (
                "cat",
                # The encoder is fitted on the training split only. Without
                # handle_unknown="ignore" the transform of the test split
                # raises on any category that was not seen during fitting.
                OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
                categorical_cols,
            ),
        ]
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    X_train = torch.tensor(preprocessor.fit_transform(X_train), dtype=torch.float32)
    X_test = torch.tensor(preprocessor.transform(X_test), dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
    y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)
    return X_train, X_test, y_train, y_test

In [16]:
# Location of the Twitter sentiment CSV; read by the training cell through
# pd.read_csv(dataset_path).
dataset_path = "data/twitter_sentiment_analysis/twitter.csv"

In [17]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from transformers import BertTokenizer, BertModel

# Load the dataset from the path configured in `dataset_path`.
# This rebinds the notebook-level name `dataset`.
dataset = pd.read_csv(dataset_path)

# Replace missing `content` values with "" and cast every entry to str, since
# this column is what TwitterDataset.__getitem__ hands to the tokenizer.
dataset["content"] = dataset["content"].fillna("").astype(str)


# Preprocessing sentiment to numerical labels
def sentiment_to_label(sentiment):
    """Translate a sentiment name into its integer class id.

    ``"Negative"`` maps to 0, ``"Positive"`` to 1 and ``"Neutral"`` to 2.
    Any other value is treated as Neutral and also yields 2.
    """
    label_by_sentiment = {"Negative": 0, "Positive": 1, "Neutral": 2}
    try:
        return label_by_sentiment[sentiment]
    except KeyError:
        # Unrecognised sentiments fall back to the Neutral class.
        return 2


# Add an integer `sentiment_label` column by passing every `sentiment` value
# through sentiment_to_label.
dataset["sentiment_label"] = dataset["sentiment"].apply(sentiment_to_label)


# Dataset Class
class TwitterDataset(Dataset):
    """Dataset that tokenizes one row of a tweet table per item.

    ``data`` is indexed positionally with ``.iloc`` and must provide the
    ``content`` and ``sentiment_label`` fields. ``tokenizer`` is called with
    the tweet text and ``max_length``, padding to that length and truncating
    longer inputs.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for one row."""
        row = self.data.iloc[index]
        text, label_value = row["content"], row["sentiment_label"]

        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) drops the leading size-1 dimension of the tokenizer
        # tensors so each item is a single row.
        item = {
            key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")
        }
        item["label"] = torch.tensor(label_value, dtype=torch.long)
        return item


# Hyperparameters
BATCH_SIZE = 16
MAX_LENGTH = 128  # token length every tweet is padded or truncated to
EPOCHS = 3
LEARNING_RATE = 2e-5
SEED = 42  # fixes the shuffle below so the train/validation split is reproducible

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Datasets and DataLoaders
# Shuffle once with a fixed seed, then use the first 80% of rows for training
# and the remaining rows for validation.
dataset = dataset.sample(frac=1, random_state=SEED).reset_index(drop=True)
train_size = int(0.8 * len(dataset))
train_data = dataset.iloc[:train_size]
val_data = dataset.iloc[train_size:]

train_dataset = TwitterDataset(train_data, tokenizer, MAX_LENGTH)
val_dataset = TwitterDataset(val_data, tokenizer, MAX_LENGTH)

# Only the training loader reshuffles between epochs; validation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


# Model
class SentimentClassifier(nn.Module):
    """Pretrained ``bert-base-uncased`` encoder with a linear classification head.

    Element 1 of the encoder output (named the pooled output in this code)
    passes through dropout (p=0.3) and a linear layer that produces one logit
    per class.
    """

    def __init__(self, n_classes):
        super().__init__()
        encoder = BertModel.from_pretrained("bert-base-uncased")
        self.bert = encoder
        self.drop = nn.Dropout(p=0.3)
        # The head's input width follows the encoder's configured hidden size.
        self.out = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalised class scores for a batch of token ids."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask)[1]
        return self.out(self.drop(pooled))


# Three output classes, matching the labels 0/1/2 produced by sentiment_to_label.
model = SentimentClassifier(n_classes=3)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loss and Optimizer
# The criterion receives the raw model outputs together with integer labels.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)


# Training Function
def train_epoch(model, data_loader, criterion, optimizer, device, log_every=100):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Sized iterable of batches, each a mapping with
            ``input_ids``, ``attention_mask`` and ``label`` tensors. It must
            also expose ``.dataset`` for the sample count.
        criterion: Loss callable applied to ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        log_every: Print a progress line every ``log_every`` batches. Pass
            ``0`` or ``None`` to silence progress output.

    Returns:
        tuple: ``(accuracy, mean_loss)``. ``accuracy`` is a float64 tensor
        (correct predictions divided by ``len(data_loader.dataset)``) and
        ``mean_loss`` is the float average of the per-batch losses.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    # Fail early with a clear message instead of crashing while computing the
    # metrics once the (empty) loop has finished.
    if len(data_loader) == 0:
        raise ValueError("data_loader is empty; there is nothing to train on.")

    print("Training...")
    model.train()
    print("Model in training mode")
    total_loss = 0.0
    # Tensor accumulator so `.double()` below is always valid.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for i, batch in enumerate(data_loader):
        if log_every and i % log_every == 0:
            print(f"Batch {i}")
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        # The predicted class is the index of the largest score per row.
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    accuracy = correct_predictions.double() / len(data_loader.dataset)
    return accuracy, total_loss / len(data_loader)


# Evaluation Function
def eval_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Sized iterable of batches, each a mapping with
            ``input_ids``, ``attention_mask`` and ``label`` tensors. It must
            also expose ``.dataset`` for the sample count.
        criterion: Loss callable applied to ``(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, mean_loss)``. ``accuracy`` is a float64 tensor
        (correct predictions divided by ``len(data_loader.dataset)``) and
        ``mean_loss`` is the float average of the per-batch losses.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    # Fail early with a clear message instead of crashing while computing the
    # metrics once the (empty) loop has finished.
    if len(data_loader) == 0:
        raise ValueError("data_loader is empty; there is nothing to evaluate.")

    model.eval()
    total_loss = 0.0
    # Tensor accumulator so `.double()` below is always valid.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # The predicted class is the index of the largest score per row.
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)

    accuracy = correct_predictions.double() / len(data_loader.dataset)
    return accuracy, total_loss / len(data_loader)


# Training Loop
# Each epoch makes one full pass over train_loader, then evaluates on
# val_loader and prints the loss/accuracy for both.
for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model, train_loader, criterion, optimizer, device
    )
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

# Save Model
# Only the state_dict is written, so loading it back requires constructing a
# SentimentClassifier first.
# NOTE(review): the recorded output of this cell ends with KeyboardInterrupt,
# so this save may not have been reached in the saved run.
torch.save(model.state_dict(), "sentiment_model.pth")

print("Model training complete!")

KeyboardInterrupt: 