In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ys19-2023-assignment-3/sample_submission.csv
/kaggle/input/ys19-2023-assignment-3/test_set.csv
/kaggle/input/ys19-2023-assignment-3/valid_set.csv
/kaggle/input/ys19-2023-assignment-3/train_set.csv


# Load and Pre-process Data

In [2]:
import re
from nltk.tokenize import word_tokenize
import unicodedata

def remove_urls(text):
    text = re.sub(r'http\S+|www\S+|https\S', '', text)
    return text

def remove_hashtags_mentions(text):
    hashtags = re.findall(r'\#\w+', text)
    for hashtag in hashtags:
        split_words = hashtag[1:].split('_')
        text = text.replace(hashtag, ' '.join(split_words))
    text = re.sub(r'\@\w+', '', text)
    return text

def remove_punctuations(text):
    text = re.sub(r'[.,!?:;/()|&]+', '', text)
    return text

def remove_special_characters(text):
    text = re.sub(r'[\"«»-]', '', text)
    return text

def strip_accents_and_lowercase(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn').lower()

def preprocess_tweets(text, stopwords=False):
    text = remove_urls(text)
    text = remove_hashtags_mentions(text)
    text = remove_punctuations(text)
    text = remove_special_characters(text)
    text = strip_accents_and_lowercase(text)
    if stopwords:
        tokens = word_tokenize(text)
        custom_greek_stopwords = pd.read_csv('../input/greek-stopwords-custom/custom_greek_stopwords_lowercase.csv',
                                                 delimiter='\t')
        greek_stopwords = set(custom_greek_stopwords['greek_stopwords'].values.flatten().tolist())
        tokens = [token for token in tokens if token not in greek_stopwords]
        return " ".join(tokens)
    else:
        return text
    

train_data = pd.read_csv('../input/ys19-2023-assignment-3/train_set.csv')
valid_data = pd.read_csv('../input/ys19-2023-assignment-3/valid_set.csv')
test_data = pd.read_csv('../input/ys19-2023-assignment-3/test_set.csv')

train_data['Text'] = train_data['Text'].apply(preprocess_tweets)
valid_data['Text'] = valid_data['Text'].apply(preprocess_tweets)
test_data['Text'] = test_data['Text'].apply(preprocess_tweets)

# Load pre-trained Word2Vec model

In [3]:
from gensim.models import KeyedVectors
!wget http://vectors.nlpl.eu/repository/20/46.zip
!unzip -o 46.zip

word2vec_greek_model = KeyedVectors.load_word2vec_format('model.bin', binary=True)
# model's vector size is 100

--2024-01-24 14:14:14--  http://vectors.nlpl.eu/repository/20/46.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 899316822 (858M) [application/zip]
Saving to: '46.zip'


2024-01-24 14:14:50 (23.8 MB/s) - '46.zip' saved [899316822/899316822]

Archive:  46.zip
  inflating: LIST                    
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               
  inflating: README                  


# Conver Tweet to padded Embeddings

In [4]:
import torch
from torch.nn.utils.rnn import pad_sequence


def tweet_to_embeddings(tweet, model):
    embeddings = []
    for word in tweet.split():
        if word in model:
            embedding = model[word]
            embeddings.append(embedding)
    return embeddings

def padding_tweets(tweets, model, embedding_size=100):
    padded_tweets = []
    for tweet in tweets:
        tweet_embedding = tweet_to_embeddings(tweet, model)
        if not tweet_embedding: # no words in tweet are in the model
            tweet_embedding = [np.zeros(embedding_size)]
        padded_tweets.append(torch.tensor(tweet_embedding)) # convert to tensor
    
    return pad_sequence(padded_tweets, batch_first=False, padding_value=0)

# Padded Embeddings and Sentiments for train and validation datasets

In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# train tensors

train_padded_embeddings = padding_tweets(train_data['Text'], word2vec_greek_model)

train_sentiments = label_encoder.fit_transform(train_data['Sentiment'])
train_sentiments_tensor = torch.tensor(train_sentiments, dtype=torch.long)

# valid tensors

valid_padded_embeddings = padding_tweets(valid_data['Text'], word2vec_greek_model)

valid_sentiments = label_encoder.fit_transform(valid_data['Sentiment'])
valid_sentiments_tensor = torch.tensor(valid_sentiments, dtype=torch.long)

  padded_tweets.append(torch.tensor(tweet_embedding)) # convert to tensor


# Tensor Datasets

In [6]:
from torch.utils.data import TensorDataset, DataLoader

#Transpose the first 2 dimensions of padded embeddings -> [batch_size, sequence_length, embedding_size]
train_padded_embeddings = train_padded_embeddings.transpose(0, 1)
valid_padded_embeddings = valid_padded_embeddings.transpose(0, 1)

train_tensor_dataset = TensorDataset(train_padded_embeddings, train_sentiments_tensor)
valid_tensor_dataset = TensorDataset(valid_padded_embeddings, valid_sentiments_tensor)

train_loader = DataLoader(train_tensor_dataset, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_tensor_dataset, batch_size=128, shuffle=True)

# Sentiment RNN Initialization

In [7]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    def __init__(self, cell_type, input_size, hidden_size, output_size,
                num_layers=1, bidirectional=False, dropout=0):
        super(SentimentRNN, self).__init__()

        cells = {
          "RNN" : nn.RNN,
          "LSTM" : nn.LSTM,
          "GRU" : nn.GRU
        }

        self.cell_type = cell_type

        self.rnn = cells[cell_type](         # Pick the specific model
            input_size=input_size,           # Number of features for each time step
            hidden_size=hidden_size,         # rnn hidden units
            batch_first=True,       # input & output will have batch size as 1s dimension. e.g. (batch, time_step, input_size)
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=dropout
        )

        bidirectional_multiplier = 2 if bidirectional else 1
        self.out = nn.Linear(hidden_size * bidirectional_multiplier, output_size) # Feed forward network

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size): final hidden state
        # h_c shape (n_layers, batch, hidden_size): final cell state

        # LSTM returns output and tuple (hidden_state, cell_state)
        # Other RNN return output and hidden_state
        if self.cell_type == 'LSTM':
            r_out, (h_n, h_c) = self.rnn(x)
        else:
            r_out, h_n = self.rnn(x, None)
            
        # For bidirectional RNN, concat the final states
        # if self.cell_type == 'LSTM':
        if self.rnn.bidirectional:
            h_n = torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)
        else:
            h_n = h_n[-1,:,:]
        # else:
        #     # For non-LSTM, use last hidden state
        #     h_n = h_n[-1,:,:]

        # in sentiment classification, we're interested in the overall sentiment of the sequence
        # not the sentiment at each individual time step. Therefore
        # we use h_n (final hidden state) as the input to our linear layer (self.out)
        out = self.out(h_n)
        return out

# Train and testing the model

In [8]:
def train(model, train_loader, optimizer, loss_func, device):
    model.train()
    train_loss, train_acc = 0, 0
    all_labels, all_preds = [], []

    for inputs, labels in train_loader:
        # setting gpu
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad() # Clear gradients
        outputs = model(inputs)
        loss = loss_func(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == labels).float().mean()
        train_acc += accuracy.item()

        all_labels.extend(labels.tolist())
        all_preds.extend(predicted.tolist())
    
    avg_train_loss = train_loss / len(train_loader)
    avg_train_acc = train_acc / len(train_loader)

    return avg_train_loss, avg_train_acc, all_labels, all_preds

In [9]:
def evaluate(model, valid_loader, loss_func, device):
    model.eval()
    valid_loss, valid_acc = 0, 0
    all_labels, all_preds = [], []

    with torch.no_grad():
        for inputs, labels in valid_loader:
            # setting gpu
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = loss_func(outputs, labels)

            valid_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            accuracy = (predicted == labels).float().mean()
            valid_acc += accuracy.item()

            all_labels.extend(labels.tolist())
            all_preds.extend(predicted.tolist())
    
    avg_valid_loss = valid_loss / len(valid_loader)
    avg_valid_acc = valid_acc / len(valid_loader)

    return avg_valid_loss, avg_valid_acc, all_labels, all_preds

# Optuna usage

In [10]:
# import optuna
# import torch.optim as optim
# import matplotlib.pyplot as plt

# def objective(trial):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
#     lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
#     hidden_size = trial.suggest_categorical('hidden_size', [64, 128, 256])
#     num_layers = trial.suggest_int('num_layers', 1, 3)
#     if num_layers > 1:
#         dropout = trial.suggest_uniform('dropout', 0.1, 0.5)
#     else:
#         dropout = 0
#     cell_type = trial.suggest_categorical('cell_type', ['LSTM', 'GRU'])

#     model = SentimentRNN(cell_type=cell_type, input_size=100,
#                          hidden_size=hidden_size, output_size=3,
#                          num_layers=num_layers, bidirectional=True,
#                          dropout=dropout).to(device)
    
#     optimizer = optim.Adam(model.parameters(), lr=lr)
#     loss_func = nn.CrossEntropyLoss()

#     EPOCH = 10

#     train_epoch_loss, valid_epoch_loss = [], []
#     train_epoch_acc, valid_epoch_acc = [], []

#     for epoch in range(EPOCH):
#         train_loss, train_acc, _, _ = train(model, train_loader, optimizer, loss_func, device)
#         valid_loss, valid_acc, _, _ = evaluate(model, valid_loader, loss_func, device)

#         train_epoch_loss.append(train_loss)
#         valid_epoch_loss.append(valid_loss)
#         train_epoch_acc.append(train_acc)
#         valid_epoch_acc.append(valid_acc)

#         print(f'Epoch {epoch+1} | Training Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}')
#         print(f'Epoch {epoch+1} | Validation Loss: {valid_loss:.4f}, Accuracy: {valid_acc:.4f}')

#     # Combined plot for both Training and Validation Loss
#     plt.figure(figsize=(10, 5))
#     plt.plot(train_epoch_loss, label='Training Loss', color='blue')
#     plt.plot(valid_epoch_loss, label='Validation Loss', color='red')
#     plt.xlabel('Epochs')
#     plt.ylabel('Loss')
#     plt.title('Training & Validation Loss Over Time')
#     plt.legend()
#     plt.show()

#     # Combined plot for both Training and Validation Accuracy
#     plt.figure(figsize=(10, 5))
#     plt.plot(train_epoch_acc, label='Training Accuracy', color='orange')
#     plt.plot(valid_epoch_acc, label='Validation Accuracy', color='green')
#     plt.xlabel('Epochs')
#     plt.ylabel('Accuracy')
#     plt.title('Training & Validation Accuracy Over Time')
#     plt.legend()
#     plt.show()

#     avg_valid_acc = sum(valid_epoch_acc) / len(valid_epoch_acc)
#     avg_train_acc = sum(train_epoch_acc) / len(train_epoch_acc)
#     # returning the avg_train_acc can show how well the model was trained on the seen data, optimizing the hyperparameters based on the training accuracy
#     # which can lead to a model that is overfitted to the training data.
#     return avg_valid_acc


# study = optuna.create_study(direction='maximize') # minimize for loss
# study.optimize(objective, n_trials=30)

# Optuna Visualizations

In [11]:
# print("Best trial:")
# trial = study.best_trial

# print(f"Trial value: {trial.value}")
# print("Parameters: ")
# for key, value in trial.params.items():
#     print(f"{key}: {value}")

# for trial in study.trials:
#     print(f"Trial {trial.number}: Cell Type: {trial.params['cell_type']}")
    
# optuna.visualization.plot_optimization_history(study)
# optuna.visualization.plot_param_importances(study)

# Use the Best Hyperparameters after running Optuna

In [12]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# best_params = study.best_trial.params
# best_model = SentimentRNN(cell_type=best_params['cell_type'], input_size=100,
#                          hidden_size=best_params['hidden_size'], output_size=3,
#                          num_layers=best_params['num_layers'], bidirectional=True,
#                          dropout=best_params['dropout']).to(device)

# optimizer = optim.Adam(best_model.parameters(), lr=best_params['lr'])
# loss_func = nn.CrossEntropyLoss()

# EPOCH = 10

# train_epoch_loss, valid_epoch_loss = [], []
# train_epoch_acc, valid_epoch_acc = [], []

# for epoch in range(EPOCH):
#         train_loss, train_acc, _, _ = train(best_model, train_loader, optimizer, loss_func, device)
#         print(f'Epoch {epoch+1} | Training Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}')

import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = SentimentRNN(cell_type='LSTM', input_size=100,
                         hidden_size=64, output_size=3,
                         num_layers=2, bidirectional=True,
                         dropout=0.24).to(device)

optimizer = optim.Adam(best_model.parameters(), lr=0.0005)
loss_func = nn.CrossEntropyLoss()

EPOCH = 10

train_epoch_loss, valid_epoch_loss = [], []
train_epoch_acc, valid_epoch_acc = [], []

for epoch in range(EPOCH):
        train_loss, train_acc, _, _ = train(best_model, train_loader, optimizer, loss_func, device)
        print(f'Epoch {epoch+1} | Training Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}')
    
    

Epoch 1 | Training Loss: 1.0955, Accuracy: 0.3594
Epoch 2 | Training Loss: 1.0889, Accuracy: 0.3804
Epoch 3 | Training Loss: 1.0841, Accuracy: 0.3847
Epoch 4 | Training Loss: 1.0801, Accuracy: 0.3892
Epoch 5 | Training Loss: 1.0764, Accuracy: 0.3948
Epoch 6 | Training Loss: 1.0739, Accuracy: 0.3963
Epoch 7 | Training Loss: 1.0709, Accuracy: 0.4013
Epoch 8 | Training Loss: 1.0684, Accuracy: 0.4021
Epoch 9 | Training Loss: 1.0648, Accuracy: 0.4049
Epoch 10 | Training Loss: 1.0621, Accuracy: 0.4102


# Test Predictions

In [13]:

test_padded_embeddings = padding_tweets(test_data['Text'], word2vec_greek_model)

# embedding in the correct shape (batch_size, sequence_length, embedding_size)
test_padded_embeddings = test_padded_embeddings.transpose(0, 1)

test_loader = DataLoader(test_padded_embeddings, batch_size=128, shuffle=False)

best_model.eval()
best_model.to(device)
test_predictions = []
with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs.to(device)
        outputs = best_model(inputs).to(device)
        _, predicted = torch.max(outputs, 1)
        test_predictions.extend(predicted.tolist())
        
predicted_sentiment_labels = label_encoder.inverse_transform(test_predictions)

predictions_df = pd.DataFrame({
    'Id': range(1, len(test_data) + 1),
    'Predicted': predicted_sentiment_labels})
predictions_df.to_csv('submission.csv', index=False)