In [36]:
from gensim.models import KeyedVectors
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import re
import numpy as np
from torch.utils.data import DataLoader, Dataset
import spacy

In [37]:
# Load the pretrained embeddings from the binary word2vec-format file.
# The resulting KeyedVectors object is what `encode` looks words up in.
word2vec_1 = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin",
    binary=True,
)

In [None]:
# Load the labelled corpus; its "label" column is encoded below and its
# "text" column is read when the dataset is built.
data = pd.read_csv("merged_training.csv")

# spaCy pipeline used by `tokenizer`; the parser and NER components are disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Integer class id for each emotion name.
label_mapping = {"joy": 0, "sadness": 1, "anger": 2, "fear": 3, "love": 4, "surprise": 5}
data["label_encoded"] = data["label"].map(label_mapping)

# Series.map leaves NaN for every label that is not a key of label_mapping.
# Those NaNs would later be cast to int64 class ids, so stop here with a clear
# message instead of training on garbage targets.
unmapped_rows = data["label_encoded"].isna()
if unmapped_rows.any():
    unknown_labels = sorted(data.loc[unmapped_rows, "label"].astype(str).unique())
    raise ValueError(f"Labels not present in label_mapping: {unknown_labels}")

# Words that `tokenizer` keeps even when spaCy flags them as stop words:
# negations, intensifiers, conditionals, scarcity words, temporal markers
# and contrast connectives.
important_stopwords = {
    "not", "no", "never", "none", "nowhere", "nothing", "neither", "nor",
    "n't", "cannot", "without",
    "very", "more", "most", "least", "less", "much", "many", "quite", "so",
    "such", "just", "too", "enough", "almost", "rather", "even",
    "if", "unless", "though", "although", "while", "whereas",
    "few", "little", "hardly", "scarcely", "seldom",
    "before", "after", "until", "since", "when", "whenever", "once",
    "against", "despite", "because", "besides", "however", "otherwise", "yet"
}

def tokenizer(text):
    """Lowercase ``text``, run it through spaCy and return the lemmas to keep.

    A token is kept when it is either

    * listed in ``important_stopwords`` (kept even though spaCy may flag it as
      a stop word), or
    * purely alphabetic and not a spaCy stop word.

    Args:
        text: The raw sentence as a string.

    Returns:
        A list of lemma strings, in document order.
    """
    doc = nlp(text.lower())
    tokens = []
    for token in doc:
        # The whitelist is consulted before the is_alpha test: "n't" is in
        # important_stopwords but contains an apostrophe, so gating on
        # is_alpha first would silently drop contracted negations.
        # NOTE(review): spaCy's English models normally lemmatise "n't" to
        # "not" -- confirm with the installed model.
        is_whitelisted = token.text in important_stopwords
        if is_whitelisted or (token.is_alpha and not token.is_stop):
            tokens.append(token.lemma_)
    return tokens
def encode(sentence):
    """Turn a sentence into one fixed-size vector by averaging word embeddings.

    The sentence is tokenised with ``tokenizer``; every token that exists in
    ``word2vec_1`` contributes its embedding, and out-of-vocabulary tokens are
    skipped.

    Args:
        sentence: The raw sentence to embed.

    Returns:
        The element-wise mean of the found embeddings, or an all-zero array of
        length ``word2vec_1.vector_size`` when no token has an embedding.
    """
    found = []
    for word in tokenizer(sentence):
        if word in word2vec_1:
            found.append(word2vec_1[word])

    # Nothing usable in the sentence: fall back to a zero vector.
    if not found:
        return np.zeros(word2vec_1.vector_size)
    return np.mean(found, axis=0)

['not', 'like', 'movie', 'because', 'not', 'very', 'interesting']
[ 8.05140883e-02 -2.99246646e-02 -1.73754003e-02  1.42159596e-01
 -6.22557215e-02  6.59179688e-02  1.24668665e-01 -2.18069889e-02
  7.81424418e-02  6.73653707e-02  2.90222168e-02 -1.65492460e-01
 -7.26231188e-02 -6.79364875e-02 -1.60853788e-01  1.25767305e-01
  6.49762824e-02  1.18809290e-01 -5.94526008e-02 -6.95800781e-02
 -5.04455566e-02  5.38853230e-03  7.46285543e-02  8.71930843e-06
  8.01478773e-02 -4.24630307e-02 -4.31038998e-02  2.99072266e-02
 -1.92696713e-02  3.19126658e-02 -7.59974867e-02  6.37207031e-02
  4.88804393e-02  1.12304688e-02  9.74469855e-02 -3.35518979e-02
  8.81200507e-02  4.41196971e-02  2.32456755e-02  1.11049108e-01
  1.69642851e-01  1.85350683e-02  1.62004739e-01  2.75530131e-03
 -7.30481818e-02  3.19998595e-03  9.91821289e-03 -2.22996306e-02
  5.27082160e-02 -1.48119242e-03  6.15801150e-03 -2.22516749e-02
 -9.32965949e-02 -3.10494564e-02  2.57742740e-02  5.75474314e-02
  1.74734928e-02 -3.6446

In [41]:
class TextDataset(Dataset):
    """Dataset of pre-computed sentence vectors paired with integer labels.

    Every entry of ``data["text"]`` is passed through ``encode`` once, at
    construction time, and stored as a float tensor; ``data["label_encoded"]``
    is stored as a single int64 tensor.
    """

    def __init__(self, data):
        """Encode all sentences and labels of ``data`` eagerly."""
        encoded = []
        for sentence in data["text"]:
            encoded.append(torch.tensor(encode(sentence), dtype=torch.float32))
        self.texts = encoded
        self.labels = torch.tensor(data["label_encoded"].values, dtype=torch.long)

    def __len__(self):
        """Return the number of encoded sentences."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(sentence_vector, label)`` pair stored at ``idx``."""
        features = self.texts[idx]
        target = self.labels[idx]
        return features, target
# Encode the whole frame once, then serve it in shuffled mini-batches of 32.
dataset = TextDataset(data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

KeyboardInterrupt: 

In [29]:
# Sanity check: look at the first mini-batch only and print the feature and
# label tensor shapes, one per line.
for batch in dataloader:
    text, label = batch
    print(text.shape, label.shape, sep="\n")
    break

torch.Size([32, 300])
torch.Size([32])


In [32]:
class TextClassification(nn.Module):
    """Two-layer feed-forward classifier over fixed-size sentence vectors.

    Architecture: ``Linear(input, output) -> ReLU -> Linear(output, classes)``.
    The forward pass returns raw logits (no softmax is applied).

    Args:
        vocab: Unused. Kept only so existing positional calls such as
            ``TextClassification(300)`` keep working; it now defaults to
            ``None`` so it no longer has to be supplied.
        input: Size of each input vector.
        output: Width of the hidden layer.
        classes: Number of target classes (size of the logit vector).
    """

    def __init__(self, vocab=None, input=300, output=32, classes=6):
        super().__init__()
        self.fc1 = nn.Linear(input, output)
        self.fc2 = nn.Linear(output, classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of input vectors to a batch of class logits."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

In [34]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextClassification(300).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 15

for epoch in range(epochs):
    # Running totals for this epoch, all accumulated per sample.
    total_loss = 0.0
    total_samples = 0
    correct = 0
    for text, label in dataloader:
        text = text.to(device)
        label = label.to(device)
        batch_size = label.size(0)

        optimizer.zero_grad()
        out = model(text)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()

        # criterion returns the mean over the batch; scale it back by the
        # batch size so a shorter final batch is not over-weighted in the
        # epoch average.
        total_loss += loss.item() * batch_size
        pred = torch.argmax(out, dim=1)
        correct += torch.eq(pred, label).sum().item()
        total_samples += batch_size

    # Both metrics are computed on the training data seen during this epoch.
    acc = 100*correct/total_samples
    avg_loss = total_loss/total_samples
    print(f"epoch {epoch+1} loss = {avg_loss:.4f} accuracy = {acc:.4f}")

epoch 1 loss = 0.8973 accuracy = 66.4703
epoch 2 loss = 0.7443 accuracy = 71.6076
epoch 3 loss = 0.7036 accuracy = 72.8319
epoch 4 loss = 0.6789 accuracy = 73.7206
epoch 5 loss = 0.6610 accuracy = 74.3799
epoch 6 loss = 0.6472 accuracy = 74.9144
epoch 7 loss = 0.6368 accuracy = 75.2839
epoch 8 loss = 0.6285 accuracy = 75.6354
epoch 9 loss = 0.6212 accuracy = 75.9290
epoch 10 loss = 0.6154 accuracy = 76.1512
epoch 11 loss = 0.6095 accuracy = 76.3690
epoch 12 loss = 0.6053 accuracy = 76.6051
epoch 13 loss = 0.6015 accuracy = 76.7335
epoch 14 loss = 0.5981 accuracy = 76.9715
epoch 15 loss = 0.5951 accuracy = 77.0518
