# Using MLP with Word2Vec

## Model

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

class MLP(nn.Module):
    """Feed-forward binary classifier over fixed-size feature vectors.

    The network is a stack of ``Linear -> ReLU -> Dropout`` stages followed by
    a single-unit ``Linear`` layer and a ``Sigmoid``, so the output is a value
    in (0, 1) per input row (suitable for ``nn.BCELoss``).

    With the default arguments the architecture, and therefore the
    ``state_dict`` keys (``layers.0``, ``layers.3``, ``layers.6``), is exactly
    300 -> 512 -> 256 -> 1 with dropout 0.6 and 0.5.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Width of each hidden layer, in order.
        dropouts: Dropout probability applied after each hidden layer; must
            have the same length as ``hidden_dims``.

    Raises:
        ValueError: If ``hidden_dims`` and ``dropouts`` differ in length.
    """

    def __init__(self, input_dim=300, hidden_dims=(512, 256), dropouts=(0.6, 0.5)):
        super().__init__()
        if len(hidden_dims) != len(dropouts):
            raise ValueError("hidden_dims and dropouts must have the same length")

        modules = []
        in_features = input_dim
        for out_features, drop_p in zip(hidden_dims, dropouts):
            modules += [
                nn.Linear(in_features, out_features),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ]
            in_features = out_features
        # Output head: one probability per sample.
        modules += [nn.Linear(in_features, 1), nn.Sigmoid()]

        # Kept as a single Sequential named `layers` so checkpoints saved from
        # the original hard-coded model still load.
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Return the predicted probability for each row of ``x``."""
        return self.layers(x)

## Dataset

In [2]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import pandas as pd

def review_to_words( review, remove_stopwords=False ):
    """Convert one raw review string into a list of lowercase word tokens.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lowercase, split on whitespace, and optionally drop English
    stop words.

    Args:
        review: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, remove NLTK English stop words.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can vary between machines.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # Keep ASCII letters only; digits and punctuation become separators.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    words = review_text.lower().split()

    if remove_stopwords:
        # A set gives O(1) membership tests while filtering.
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words

# Labelled training reviews: tab-separated with a header row.
# quoting=3 (csv.QUOTE_NONE) keeps quote characters inside reviews as-is.
train_data = pd.read_csv(
    "./data/labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

# Tokenise every review; stop words are kept (remove_stopwords defaults to False).
clean_train_reviews = [
    review_to_words(raw_review) for raw_review in train_data["review"]
]


  review_text = BeautifulSoup(review).get_text()


In [3]:
# Every raw review must have produced exactly one cleaned token list.
assert train_data["review"].size == len(clean_train_reviews), \
    "Error: Cleaning training set reviews failed"
import numpy as np
from gensim.models import Word2Vec

# Load a previously saved gensim Word2Vec model from disk.
# NOTE(review): the file name suggests 300-dim vectors, min word count 40 and
# context window 10 -- presumably produced by an earlier training step; confirm.
w2v = Word2Vec.load('./data/300features_40minwords_10context')

def review2vec(model, review):
    """Embed a tokenised review as the mean of its known word vectors.

    Args:
        model: A trained gensim ``Word2Vec`` model (its ``wv`` keyed vectors
            are used for lookup).
        review: Iterable of word tokens.

    Returns:
        numpy.ndarray: 1-D vector of length ``model.wv.vector_size``. It is
        all zeros (float32) when none of the tokens are in the vocabulary.
    """
    keyed_vectors = model.wv
    # `key_to_index` is a dict, so membership is O(1) and nothing has to be
    # rebuilt per call (the vocabulary used to be copied into a set for
    # every single review).
    vocab = keyed_vectors.key_to_index
    vectors = [keyed_vectors[word] for word in review if word in vocab]
    if vectors:
        return np.mean(vectors, axis=0)
    # Match the model's dimensionality instead of assuming 300, and use
    # float32 so stacking many reviews does not upcast the whole matrix.
    return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

# Feature matrix: one averaged word-vector row per cleaned review.
X = np.array([review2vec(w2v, tokens) for tokens in clean_train_reviews])
# Sentiment labels as a flat NumPy array.
Y = np.array(train_data["sentiment"])

# float32 tensors for PyTorch; the labels get a trailing axis so they have
# one column, like the model's single-unit output.
X_tensor = torch.tensor(X, dtype=torch.float32)
Y_tensor = torch.tensor(Y, dtype=torch.float32).unsqueeze(1)


In [4]:
# Display the feature and label tensor shapes as a quick sanity check.
X_tensor.shape, Y_tensor.shape

(torch.Size([25000, 300]), torch.Size([25000, 1]))

In [5]:

dataset = TensorDataset(X_tensor, Y_tensor)

# 80/20 hold-out split. The validation size is derived by subtraction so the
# two lengths always sum to len(dataset); truncating both 0.8*n and 0.2*n
# independently can lose a sample, which makes random_split raise.
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    # Seeded so the split is the same on every run.
    generator=torch.Generator().manual_seed(42),
)

# Only the training loader is shuffled; validation order is irrelevant.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


# Train

## 5-Fold Cross Validation

In [None]:
from torch.utils.data import TensorDataset, DataLoader, Subset
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
import torch.optim as optim

# ---------------------------------------------------------------------------
# 5-fold cross-validation of the MLP.
#
# For every fold a fresh model is trained for NUM_EPOCHS epochs with Adam and
# a StepLR schedule (lr multiplied by 0.2 every 10 epochs). Two kinds of
# checkpoints are written:
#   * ./data/mlp_model_fold{k}_best.pth - weights with the lowest validation
#     loss seen within fold k;
#   * ./data/best_mlp_model.pth         - weights from the single epoch with
#     the highest validation accuracy across all folds.
# ---------------------------------------------------------------------------
NUM_FOLDS = 5
NUM_EPOCHS = 30
BATCH_SIZE = 64

dataset = TensorDataset(X_tensor, Y_tensor)
kfold = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-fold learning curves (one inner list per fold, one entry per epoch).
fold_train_losses, fold_val_losses, fold_val_accs = [], [], []

# Best validation accuracy over all folds/epochs and the matching weights.
best_val_acc = 0.0
best_model_state = None

for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
    print(f"Fold {fold+1}")
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)

    train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE, shuffle=False)

    model = MLP().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.2)
    loss_fc = nn.BCELoss()

    train_losses, va_losses, va_acc = [], [], []

    # Reset for every fold: otherwise a later fold only saves its "fold best"
    # checkpoint if it beats the best loss of all previous folds.
    best_loss = float('inf')

    for epoch in range(NUM_EPOCHS):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_fc(outputs, labels)
            loss.backward()
            optimizer.step()

            # loss.item() is a batch mean; weight by batch size so the epoch
            # average is exact even when the last batch is smaller.
            train_loss += loss.item() * inputs.size(0)

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = model(inputs)
                loss = loss_fc(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                # Threshold the sigmoid output at 0.5 to get a class label.
                predicted = (outputs > 0.5).float()
                correct += (predicted == labels).sum().item()
                total += labels.size(0)

        scheduler.step()

        train_loss = train_loss / len(train_loader.dataset)
        val_loss = val_loss / len(val_loader.dataset)
        val_acc = correct / total
        train_losses.append(train_loss)
        va_losses.append(val_loss)
        va_acc.append(val_acc)

        print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
        print("-" * 50)

        if val_loss < best_loss:
            best_loss = val_loss
            # Save the best weights of the current fold.
            torch.save(model.state_dict(), f"./data/mlp_model_fold{fold+1}_best.pth")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # Snapshot the weights now, as independent CPU copies, so the saved
            # state belongs to the epoch that achieved this accuracy rather
            # than to whatever the model looks like after the final epoch.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

    fold_train_losses.append(train_losses)
    fold_val_losses.append(va_losses)
    fold_val_accs.append(va_acc)

# Nothing is written if no epoch ever improved on the initial accuracy of 0.0.
if best_model_state is not None:
    torch.save(best_model_state, "./data/best_mlp_model.pth")



Fold 1
Epoch 1/30
Train Loss: 0.4508 | Val Loss: 0.3332 | Val Acc: 0.8610
--------------------------------------------------
Epoch 2/30
Train Loss: 0.3425 | Val Loss: 0.3151 | Val Acc: 0.8658
--------------------------------------------------
Epoch 3/30
Train Loss: 0.3337 | Val Loss: 0.3105 | Val Acc: 0.8688
--------------------------------------------------
Epoch 4/30
Train Loss: 0.3260 | Val Loss: 0.3052 | Val Acc: 0.8698
--------------------------------------------------
Epoch 5/30
Train Loss: 0.3219 | Val Loss: 0.3034 | Val Acc: 0.8708
--------------------------------------------------
Epoch 6/30
Train Loss: 0.3195 | Val Loss: 0.2979 | Val Acc: 0.8742
--------------------------------------------------
Epoch 7/30
Train Loss: 0.3171 | Val Loss: 0.2984 | Val Acc: 0.8736
--------------------------------------------------
Epoch 8/30
Train Loss: 0.3154 | Val Loss: 0.2955 | Val Acc: 0.8764
--------------------------------------------------
Epoch 9/30
Train Loss: 0.3127 | Val Loss: 0.3113 

In [None]:
# Unlabelled test reviews, read with the same TSV settings as the training set.
test = pd.read_csv("./data/testData.tsv", header=0,
                   delimiter="\t", quoting=3)

model = MLP().to(device)
# map_location lets a checkpoint saved on one device load on another
# (for example GPU-trained weights on a CPU-only machine).
model.load_state_dict(torch.load("./data/best_mlp_model.pth", map_location=device))
model.eval()

print(model)

# Same preprocessing as for training: tokenise, then average word vectors.
clean_test_reviews = [review_to_words(review) for review in test["review"]]
Xtest = np.array([review2vec(w2v, review) for review in clean_test_reviews])

# The input must be on the same device as the model, otherwise the forward
# pass fails when the model is on a GPU.
Xtest_tensor = torch.FloatTensor(Xtest).to(device)

with torch.no_grad():
    probs = model(Xtest_tensor)

# Threshold at 0.5, matching the rule used during validation.
result = (probs > 0.5).cpu().numpy().astype(int).flatten()

# Submission file: one row per test review with its predicted label.
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
output.to_csv( "./data/mlp.csv", index=False, quoting=3 )

MLP(
  (layers): Sequential(
    (0): Linear(in_features=300, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.6, inplace=False)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=256, out_features=1, bias=True)
    (7): Sigmoid()
  )
)


  review_text = BeautifulSoup(review).get_text()
