In [1]:
!pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
import numpy as np
import pickle
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

from imblearn.over_sampling import SMOTE

import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the NLTK "vader_lexicon" resource; presumably the lexicon that the
# SentimentIntensityAnalyzer imported above relies on.
nltk.download("vader_lexicon")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
WIKI_PATH = "/kaggle/input/wiki-data/AllCombined.txt"

# Collect the first 50,000 non-blank lines of the corpus, each stripped of
# surrounding whitespace and lower-cased.
wiki_sentences = []
with open(WIKI_PATH, "r", encoding="utf-8") as f:
    for raw_line in f:
        text = raw_line.strip()
        if not text:
            # Blank lines carry no sentence; skip them without counting.
            continue
        wiki_sentences.append(text.lower())
        if len(wiki_sentences) == 50000:
            break

print("Loaded Wikipedia sentences:", len(wiki_sentences))


Loaded Wikipedia sentences: 50000


In [5]:
# Build a BPE tokenizer that splits on whitespace first and maps unknown
# pieces to "<unk>".
bpe_model = BPE(unk_token="<unk>")
tokenizer = Tokenizer(bpe_model)
tokenizer.pre_tokenizer = Whitespace()

# Learn a 10,000-entry vocabulary (including the two special tokens) from the
# Wikipedia sentences loaded earlier.
trainer = BpeTrainer(vocab_size=10000, special_tokens=["<pad>", "<unk>"])
tokenizer.train_from_iterator(wiki_sentences, trainer=trainer)

print("BPE vocab size:", tokenizer.get_vocab_size())





BPE vocab size: 10000


In [6]:
class CBOWDataset(Dataset):
    """(context, target) token-id pairs for CBOW training.

    Every sentence is encoded with ``tokenizer.encode(...).ids``. Each position
    that has a full ``window_size`` ids on both sides becomes a target, and the
    ``2 * window_size`` surrounding ids form its context. Positions closer to a
    sentence edge, and sentences too short to hold a full window, yield no
    examples.
    """

    def __init__(self, sentences, tokenizer, window_size=5):
        self.data = []
        for sentence in sentences:
            token_ids = tokenizer.encode(sentence).ids
            first_center = window_size
            end_center = len(token_ids) - window_size
            self.data.extend(
                (
                    token_ids[center - window_size:center]
                    + token_ids[center + 1:center + window_size + 1],
                    token_ids[center],
                )
                for center in range(first_center, end_center)
            )

    def __len__(self):
        """Number of (context, target) examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return example ``idx`` as a (context tensor, scalar target tensor) pair."""
        context_ids, target_id = self.data[idx]
        return torch.tensor(context_ids), torch.tensor(target_id)


In [7]:
class CBOW(nn.Module):
    """Continuous bag-of-words encoder: one embedding table, averaged over context.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector (default 384).
    """

    def __init__(self, vocab_size, embed_dim=384):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

    def forward(self, x):
        """Embed the ids in ``x`` and average the vectors along dimension 1.

        ``x`` is expected to be an integer id tensor shaped (batch, context_len),
        as produced by batching ``CBOWDataset`` contexts.
        """
        context_vectors = self.emb(x)
        return torch.mean(context_vectors, dim=1)


class NegativeSamplingLoss(nn.Module):
    """Negative-sampling objective for a (context, target, negatives) batch.

    For each example the loss rewards a high dot product between the context
    vector and the target vector, and a low dot product between the context
    vector and every negative vector. The returned value is the batch mean of
    ``-(logsigmoid(pos) + sum(logsigmoid(-neg)))``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, ctx_vec, tgt_vec, neg_vecs):
        """Compute the scalar loss.

        The indexing used here implies ``ctx_vec`` and ``tgt_vec`` are
        (batch, dim) and ``neg_vecs`` is (batch, num_negatives, dim).
        """
        log_sigmoid = nn.functional.logsigmoid

        # Dot product of each context vector with its true target.
        positive_score = (ctx_vec * tgt_vec).sum(dim=1)
        # Dot product of each context vector with each of its negatives.
        negative_scores = torch.bmm(neg_vecs, ctx_vec.unsqueeze(2)).squeeze(2)

        per_example = log_sigmoid(positive_score) + log_sigmoid(-negative_scores).sum(dim=1)
        return -per_example.mean()


In [9]:
# Seed torch's global RNG so weight initialisation, DataLoader shuffling and
# negative sampling are repeatable across runs (GPU kernels may still introduce
# small nondeterminism).
torch.manual_seed(42)

dataset = CBOWDataset(wiki_sentences, tokenizer, window_size=5)
loader = DataLoader(dataset, batch_size=128, shuffle=True)

vocab_size = tokenizer.get_vocab_size()
model = CBOW(vocab_size).to(device)
loss_fn = NegativeSamplingLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    for ctx, tgt in loader:
        ctx, tgt = ctx.to(device), tgt.to(device)

        # Averaged context embedding and the embedding of the true centre token.
        # Both come from the same table (model.emb).
        ctx_vec = model(ctx)
        tgt_vec = model.emb(tgt)

        # Five negative ids per example, drawn uniformly over the whole
        # vocabulary (so a draw can occasionally equal the true target).
        # Allocated directly on the training device to avoid a host-to-device
        # copy on every step.
        neg_ids = torch.randint(0, vocab_size, (ctx.size(0), 5), device=device)
        neg_vecs = model.emb(neg_ids)

        loss = loss_fn(ctx_vec, tgt_vec, neg_vecs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/5 done")

# Save a CPU copy so the file can be loaded on machines without CUDA.
torch.save(model.emb.weight.detach().cpu(), "cbow_embeddings.pt")


Epoch 1/5 done
Epoch 2/5 done
Epoch 3/5 done
Epoch 4/5 done
Epoch 5/5 done


In [22]:
# -------- REPLACEMENT CELL 7 --------

PHRASEBANK_PATH = "/kaggle/input/fin-data/Sentences_50Agree.txt"

# Integer class ids used as classifier targets.
label_map = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

# Each line of the file is "<sentence>@<sentiment>".
phrasebank_raw = pd.read_csv(
    PHRASEBANK_PATH,
    sep="@",
    header=None,
    names=["sentence", "sentiment"],
    encoding="latin-1"
)

# Normalise the sentiment strings once, then derive the numeric label from them.
sentiment_clean = phrasebank_raw["sentiment"].str.strip().str.lower()

# REMOVE DUPLICATES BEFORE SPLIT
df = (
    phrasebank_raw
    .assign(sentiment=sentiment_clean, label=sentiment_clean.map(label_map))
    .drop_duplicates(subset="sentence")
    .reset_index(drop=True)
)

print("Class distribution:")
print(df["sentiment"].value_counts())
print("Dataset size:", len(df))


Class distribution:
sentiment
neutral     2872
positive    1362
negative     604
Name: count, dtype: int64
Dataset size: 4838


In [23]:
# -------- REPLACEMENT CELL 8 --------

# Load the trained embedding matrix straight onto the CPU. map_location makes
# this work even when the file was written from a CUDA tensor and the current
# machine has no GPU; calling .cpu() after an unmapped load would be too late.
embeddings = torch.load("cbow_embeddings.pt", map_location="cpu").numpy()
embed_dim = embeddings.shape[1]

def document_embedding(text):
    """Return the mean embedding of the BPE tokens in ``text``.

    The text is lower-cased, encoded with the module-level ``tokenizer``, and
    the matching rows of the module-level ``embeddings`` matrix are averaged.
    Token ids outside the matrix are ignored.

    Args:
        text: the sentence to embed (a ``str``).

    Returns:
        A 1-D array of length ``embed_dim`` with the same dtype as
        ``embeddings``; all zeros when no usable token remains.
    """
    ids = [i for i in tokenizer.encode(text.lower()).ids if i < embeddings.shape[0]]

    if not ids:
        # Use the embedding matrix's dtype so every document vector has the
        # same dtype and stacking them does not silently upcast the features.
        return np.zeros(embed_dim, dtype=embeddings.dtype)

    return embeddings[ids].mean(axis=0)

# One embedding row per sentence, in DataFrame order, so X and y stay aligned.
sentence_vectors = [document_embedding(sentence) for sentence in df["sentence"]]
X = np.vstack(sentence_vectors)
y = df["label"].to_numpy()

print("NaNs in X:", np.isnan(X).sum())


from sklearn.model_selection import train_test_split

# Split row positions rather than the arrays themselves, so the same held-out
# rows can be looked up in `df` as well as in X and y.
indices = np.arange(len(df))
train_idx, test_idx = train_test_split(
    indices, test_size=0.2, random_state=42, stratify=y
)

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Train size:", len(train_idx))
print("Test size:", len(test_idx))


NaNs in X: 0
Train size: 3870
Test size: 968


In [24]:
# -------- REPLACEMENT CELL 9 --------

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix

# Oversample with SMOTE on the training split only, so the held-out rows are
# never used to synthesise training examples.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# With the lbfgs solver and more than two classes, scikit-learn already fits a
# multinomial model by default. The explicit `multi_class` argument is
# deprecated in recent releases, so it is omitted here.
clf = LogisticRegression(
    max_iter=1000,
    solver="lbfgs"
)

clf.fit(X_train_resampled, y_train_resampled)

# Evaluate on the untouched (not resampled) test split.
y_pred = clf.predict(X_test)

print("CBOW + Logistic Regression (3-class)")
print("Macro F1-score:", f1_score(y_test, y_pred, average="macro"))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))




CBOW + Logistic Regression (3-class)
Macro F1-score: 0.5679725271435804
Confusion Matrix:
[[ 76  23  22]
 [ 71 370 134]
 [ 44  81 147]]


In [25]:
# -------- REPLACEMENT CELL 10 --------

from nltk.sentiment.vader import SentimentIntensityAnalyzer

vader = SentimentIntensityAnalyzer()


def _label_from_compound(score):
    """Map a VADER compound score to a class id.

    Returns 2 (positive) for scores >= 0.05, 0 (negative) for scores <= -0.05,
    and 1 (neutral) otherwise, matching the ids in ``label_map``.
    """
    if score >= 0.05:
        return 2
    if score <= -0.05:
        return 0
    return 1


# Score exactly the sentences held out for the classifier, in the same order as y_test.
test_sentences = df.iloc[test_idx]["sentence"]
vader_preds = [
    _label_from_compound(vader.polarity_scores(sentence)["compound"])
    for sentence in test_sentences
]

print("VADER Baseline (3-class)")
print("Macro F1-score:", f1_score(y_test, vader_preds, average="macro"))
print("Confusion Matrix:")
print(confusion_matrix(y_test, vader_preds))


VADER Baseline (3-class)
Macro F1-score: 0.490134921250291
Confusion Matrix:
[[ 38  35  48]
 [ 36 293 246]
 [  9  75 188]]
