In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch import nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv("./data/movie.csv")
x = data["text"]
y = data["label"]

#tfidf: convert the words string inputs into numbers
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True, norm="l1")
x_train_v = vectorizer.fit_transform(x_train)
x_test_v = vectorizer.transform(x_test)

#convert data to torch tensors
x_train_tensor = torch.tensor(x_train_v.toarray(), dtype=torch.float32)
x_test_tensor = torch.tensor(x_test_v.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)



In [7]:
#prepare dataset
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True)


In [9]:
#defining mlp
class MLP(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 2),
        )
    
    def forward(self, x):
        return self.layers(x)

input_size = x_train_tensor.shape[1]
mlp = MLP(input_size, 2)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(mlp.parameters(), lr=0.01, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

num_epochs = 20
epsilon = 1e-5
prev_loss = float('inf')
patience_batches = 10

for epoch in range(num_epochs):
    print(f'Starting Epoch {epoch+1}')
    current_loss = 0.0
    batch_no_improve = 0

    for i, (inputs, targets) in enumerate(trainloader, 0):
        inputs, targets = inputs.float(), targets.long()  # Ensure correct types

        optimizer.zero_grad()
        outputs = mlp(inputs)

        loss = loss_function(outputs, targets)

        loss.backward()
        optimizer.step()

        current_loss += loss.item()
        avg_loss = current_loss / (i + 1)

        if i % 100 == 0:
            print(f'Loss after mini-batch {i+1}: {avg_loss:.6f}')

        if abs(prev_loss - avg_loss) < epsilon:
            batch_no_improve += 1
            if batch_no_improve >= patience_batches:
                print(f"Stopping epoch {epoch+1} early after {i+1} batches due to no improvement.")
                break
        else:
            batch_no_improve = 0

        prev_loss = avg_loss

    scheduler.step()  # Reduce learning rate after step_size epochs
    print(f"Epoch {epoch+1} finished after {i+1} batches.")

print("Training has completed")


Starting Epoch 1
Loss after mini-batch 1: 0.704226
Loss after mini-batch 101: 0.694554
Loss after mini-batch 201: 0.693947
Loss after mini-batch 301: 0.693816
Stopping epoch 1 early after 351 batches due to no improvement.
Epoch 1 finished after 351 batches.
Starting Epoch 2
Loss after mini-batch 1: 0.693609
Loss after mini-batch 101: 0.693556
Loss after mini-batch 201: 0.693637
Loss after mini-batch 301: 0.693718
Stopping epoch 2 early after 359 batches due to no improvement.
Epoch 2 finished after 359 batches.
Starting Epoch 3
Loss after mini-batch 1: 0.694988
Loss after mini-batch 101: 0.694353
Loss after mini-batch 201: 0.694090
Stopping epoch 3 early after 204 batches due to no improvement.
Epoch 3 finished after 204 batches.
Starting Epoch 4
Loss after mini-batch 1: 0.694248
Loss after mini-batch 101: 0.693259
Stopping epoch 4 early after 169 batches due to no improvement.
Epoch 4 finished after 169 batches.
Starting Epoch 5
Loss after mini-batch 1: 0.691648
Loss after mini-batch