## Import libraries
Here, we import the libraries required to develop our neural network model. Besides familiar libraries such as sklearn and nltk, we also import PyTorch, a deep learning library used for applications such as computer vision and natural language processing.

In [None]:
import os
import io
import sys
import torch
from torch.autograd import Variable
from sklearn.metrics import f1_score, classification_report, roc_curve, auc
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('punkt')


## Upload data files and load data from csv to pandas dataframe
Here, you upload the training and test files that we provide from your local machine to Google Colab.

In [None]:
# Upload the data files from your local machine. Note that it can take about
# 3 minutes for Colab to finish uploading the files.
from google.colab import files
# `uploaded` is indexed by file name in the next cell, and each value is wrapped
# in io.BytesIO before being handed to pandas.
uploaded = files.upload()

In [None]:
from sklearn import preprocessing

def _read_uploaded_split(file_name):
    """Read one uploaded CSV and keep only the tweet text and its sentiment.

    The two retained columns are renamed to TEXT and LABEL for convenience.
    """
    frame = pd.read_csv(io.BytesIO(uploaded[file_name]))
    frame = frame[["ProcessedTweet","Sentiment"]]
    frame.columns = ["TEXT","LABEL"]
    return frame

# Load the training and test sets into pandas dataframes
train_df = _read_uploaded_split('assignment5_processed_train.csv')
test_df = _read_uploaded_split('assignment5_processed_test.csv')

# Map the three sentiment names onto integer class ids
le = preprocessing.LabelEncoder()
le.fit(["Positive","Negative","Neutral"])
print ("List of labels: ", list(le.classes_))
for split_df in (train_df, test_df):
    split_df["LABEL"] = le.transform(split_df["LABEL"])

# Report how many rows each set contains
print ("Training set: ", len(train_df))
print ("Test set: ", len(test_df))

# Show the first 5 rows of each set as a sanity check
for split_df in (train_df, test_df):
    display(split_df.head(5))

## Check if GPU is available to run the neural network

In [None]:
# Select the CUDA device when torch reports one is available; otherwise fall
# back to the CPU. `device` is read later by train() and evaluate() when they
# move each batch with `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running: ")
print(device)

## Function 1:

In [None]:
from nltk.tokenize import word_tokenize
from collections import defaultdict

# define function
def tokenize(texts):
    """Tokenize every text and build a vocabulary over all of the tokens seen.

    Args:
        texts: iterable of strings; each one is split with nltk's `word_tokenize`.

    Returns:
        A tuple `(tokenized_texts, word2idx, max_len)` where
        `tokenized_texts` is a list holding one token list per input text,
        `word2idx` maps each token to an integer id ('<pad>' is 0, '<unk>' is 1,
        and real tokens are numbered from 2 in order of first appearance), and
        `max_len` is the token count of the longest text (0 if `texts` is empty).
    """
    vocabulary = {'<pad>': 0, '<unk>': 1}
    sentences = []
    longest = 0

    for text in texts:
        words = word_tokenize(text)
        sentences.append(words)

        for word in words:
            # setdefault leaves known tokens alone, so the next free id is
            # always the current size of the vocabulary.
            vocabulary.setdefault(word, len(vocabulary))

        longest = max(longest, len(words))

    return sentences, vocabulary, longest

# Run the function
# The training texts are concatenated in front of the test texts and tokenized in
# one call, so word2idx and max_len are computed over both sets together.
all_text = train_df.TEXT.to_list() + test_df.TEXT.to_list()
tokenized_texts, word2idx, max_len = tokenize(all_text)


## Function 2:

In [None]:
def encode(tokenized_texts, word2idx, max_len):
    """Convert tokenized sentences into a fixed-width array of vocabulary ids.

    Each sentence is cut to at most `max_len` tokens, mapped token by token
    through `word2idx`, and right-padded with the '<pad>' id up to `max_len`.
    Tokens missing from `word2idx` are mapped to the '<unk>' id. The token
    lists passed in are not modified.

    Args:
        tokenized_texts: iterable of token lists.
        word2idx: dict mapping token -> integer id; expected to contain the
            '<pad>' and '<unk>' entries.
        max_len: number of ids produced per sentence.

    Returns:
        np.ndarray with one row per sentence and `max_len` columns.
    """
    pad_id = word2idx.get('<pad>')
    unk_id = word2idx.get('<unk>')

    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Slice into a new list: this both truncates over-long sentences (so
        # every row has the same width) and avoids mutating the caller's data.
        clipped_sent = tokenized_sent[:max_len]

        input_id = [word2idx.get(token, unk_id) for token in clipped_sent]
        input_id += [pad_id] * (max_len - len(clipped_sent))
        input_ids.append(input_id)

    return np.array(input_ids)

# Run the function
# Encode every tokenized text with the vocabulary and max_len produced by tokenize().
input_ids = encode(tokenized_texts, word2idx, max_len)

## Download pre-trained word embeddings
In this step, we are going to use fasttext pre-trained word embeddings.

In [None]:
%%time
# Download and unzip the fastText crawl-300d-2M vectors into the FILE directory.
# Both steps are skipped when that directory already exists.
# NOTE(review): only the directory is checked, not the .vec file itself, so a
# partially completed download would presumably not be retried - confirm.
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"

if os.path.isdir(FILE):
    print("fastText exists.")
else:
    !wget -P $FILE $URL
    !unzip $FILE/crawl-300d-2M.vec.zip -d $FILE

## Function 3:

In [None]:
from tqdm import tqdm_notebook

def load_pretrained_vectors(word2idx, fname):
    """Build an embedding matrix for the vocabulary from a text vector file.

    The first line of the file is read as two integers, of which the second is
    the vector dimension `d`. Every following line is split on single spaces
    into a word followed by its vector components.

    Args:
        word2idx: dict mapping token -> row index; must contain '<pad>'.
        fname: path of the vector file.

    Returns:
        np.ndarray of shape (len(word2idx), d). Rows start as uniform random
        values in [-0.25, 0.25), the '<pad>' row is set to zeros, and the row
        of every vocabulary word found in the file is replaced by its vector.
    """
    # The context manager closes the file even if parsing fails part-way.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line: the first number is not needed, the second is the dimension.
        _, d = map(int, fin.readline().split())

        embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
        embeddings[word2idx['<pad>']] = np.zeros((d,))

        for line in tqdm_notebook(fin):
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if word in word2idx:
                embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

    return embeddings
  
# Run the function
# Build the embedding matrix for our vocabulary from the unzipped fastText file,
# then convert the returned NumPy array into a torch tensor.
embeddings = load_pretrained_vectors(word2idx, "fastText/crawl-300d-2M.vec")
embeddings = torch.tensor(embeddings)

## Function 4:

In [None]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,SequentialSampler)

def data_loader(train_inputs, test_inputs, train_labels, test_labels,
                batch_size=50):
    """Wrap the encoded inputs and labels in PyTorch DataLoaders.

    Args:
        train_inputs, test_inputs: array-likes of token ids, one row per example.
        train_labels, test_labels: array-likes of integer class labels, with the
            same number of entries as the corresponding inputs.
        batch_size: number of examples per batch for both loaders.

    Returns:
        A tuple `(train_dataloader, test_dataloader)`. The training loader draws
        examples in random order; the test loader iterates in the given order.
    """
    train_inputs, test_inputs, train_labels, test_labels = tuple(torch.tensor(data) for data in [train_inputs, test_inputs, train_labels, test_labels])

    # The batch_size argument is used as given; it is no longer overwritten here.
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    test_data = TensorDataset(test_inputs, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    return train_dataloader, test_dataloader

# Run the function
# input_ids was built from the training texts followed by the test texts, so the
# split point is the number of training rows (instead of a hard-coded index that
# silently goes stale when the data files change).
num_train = len(train_df)
train_inputs = input_ids[:num_train]
test_inputs = input_ids[num_train:]

train_labels = train_df.LABEL.tolist()
test_labels = test_df.LABEL.tolist()

train_dataloader, test_dataloader = data_loader(train_inputs, test_inputs, train_labels, test_labels, batch_size=50)
  

## Function 5 - CNN Model
In this section, we are going to define a vanilla CNN model.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_classifier(nn.Module):
    """1D convolutional text classifier: embedding -> conv + ReLU -> max-pool -> dropout -> linear.

    Args:
        vocab_size: number of rows in the embedding table. Required unless
            `pretrained_embedding` is given.
        embed_dim: size of each embedding vector. Ignored when
            `pretrained_embedding` is given (its second dimension is used).
        filter_sizes: convolution kernel size, either a single int or a
            sequence of ints (one convolution branch per size).
        num_filters: output channels per branch, either a single int (shared by
            every branch) or a sequence with one entry per filter size.
        num_classes: number of output logits.
        dropout: dropout probability applied before the final linear layer.
        learning_rate: accepted for backward compatibility; the model does not
            use it.
        pretrained_embedding: optional 2-D array/tensor of shape
            (vocab_size, embed_dim) used to initialise the embedding table.
        freeze_embedding: when `pretrained_embedding` is given, whether its
            weights are kept fixed during training.

    Raises:
        ValueError: if neither `vocab_size` nor `pretrained_embedding` is given,
            if `filter_sizes` is empty, or if `filter_sizes` and `num_filters`
            have incompatible lengths.
    """
    def __init__(self,vocab_size=None,embed_dim=300,filter_sizes=2,num_filters=100,num_classes=3,dropout=0.5, learning_rate = 0.25, pretrained_embedding=None, freeze_embedding=False):

        super(CNN_classifier, self).__init__()

        # Normalise the filter configuration to plain Python lists of ints
        kernel_sizes = [int(size) for size in np.atleast_1d(filter_sizes)]
        channel_counts = [int(count) for count in np.atleast_1d(num_filters)]
        if not kernel_sizes:
            raise ValueError("filter_sizes must contain at least one kernel size")
        if len(channel_counts) == 1:
            channel_counts = channel_counts * len(kernel_sizes)
        if len(channel_counts) != len(kernel_sizes):
            raise ValueError("num_filters must be a single int or match the length of filter_sizes")

        # Layer 1: embedding table (row 0 is the padding row; vectors are
        # renormalised to a maximum norm of 5.0 on lookup)
        if pretrained_embedding is not None:
            weights = torch.as_tensor(pretrained_embedding, dtype=torch.float32)
            vocab_size, self.embed_dim = weights.shape
            self.embedding = nn.Embedding.from_pretrained(weights, freeze=freeze_embedding, padding_idx=0, max_norm=5.0)
        else:
            if vocab_size is None:
                raise ValueError("vocab_size is required when no pretrained_embedding is given")
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,embedding_dim=self.embed_dim,padding_idx=0, max_norm=5.0)

        # Layer 2: one Conv1d branch per kernel size
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,out_channels=count,kernel_size=size)
            for size, count in zip(kernel_sizes, channel_counts)
        ])

        # Layer 3: linear layer over the concatenated pooled features
        self.fc = nn.Linear(sum(channel_counts), num_classes)

        # Layer 4: dropout applied to the pooled features before the linear layer
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Compute class logits for a batch of token-id sequences.

        Args:
            input_ids: integer tensor of shape (batch, seq_len). `seq_len` must
                be at least as large as the largest kernel size.

        Returns:
            Float tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        # (batch, seq_len) -> (batch, seq_len, embed_dim)
        x_embed = self.embedding(input_ids).float()
        # Conv1d expects channels first: (batch, embed_dim, seq_len)
        x_reshaped = x_embed.permute(0, 2, 1)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]
        # Max-pool over the whole remaining length, leaving one value per filter
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],dim=1)
        logits = self.fc(self.dropout(x_fc))

        return logits

## Function 6:

In [None]:
import random

# Specify loss function
# Module-level cross-entropy loss; train() and evaluate() both call it as
# loss_fn(logits, labels).
loss_fn = nn.CrossEntropyLoss()

def train(model, optimizer, train_dataloader, test_dataloader=None, epochs=10):
    """Train `model` for a number of epochs, reporting metrics after each one.

    Args:
        model: the network to optimise; it is called on each batch of input ids.
        optimizer: optimizer stepped once per training batch.
        train_dataloader: iterable yielding (input_ids, labels) batches.
        test_dataloader: optional loader; when given, evaluate() is run after
            every epoch and a row with the train loss, test loss and test F1
            is printed. When omitted, no per-epoch row is printed.
        epochs: number of passes over `train_dataloader`.
    """
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Test Loss':^10} | {'Test F1':^9}")
    print("-"*50)

    for epoch_number in range(1, epochs + 1):
        # evaluate() leaves the model in eval mode, so switch back every epoch
        model.train()
        batch_losses = []

        for batch in train_dataloader:
            b_input_ids, b_labels = (tensor.to(device) for tensor in batch)

            # Clear gradients left over from the previous step
            model.zero_grad()
            loss = loss_fn(model(b_input_ids), b_labels)
            batch_losses.append(loss.item())

            loss.backward()
            optimizer.step()

        # Mean of the per-batch losses for this epoch
        avg_train_loss = sum(batch_losses) / len(train_dataloader)

        if test_dataloader is None:
            continue

        test_loss, test_f1_score_mean = evaluate(model, test_dataloader)
        print(f"{epoch_number:^7} | {avg_train_loss:^12.6f} | {test_loss:^10.6f} | {test_f1_score_mean:^9.2f}")
            

def evaluate(model, val_dataloader):
    """Compute the average loss and weighted F1 score over a whole data loader.

    Both metrics are computed over all examples rather than averaged per batch:
    a per-batch average over-weights a smaller final batch, and the mean of
    per-batch weighted F1 scores is not the F1 score of the dataset.

    Args:
        model: the network to evaluate; it is switched to eval mode.
        val_dataloader: iterable yielding (input_ids, labels) batches.

    Returns:
        A tuple `(val_loss, val_f1)`: the per-example mean loss and the weighted
        F1 score over every example. Both are NaN if the loader yields nothing.
    """
    model.eval()

    total_loss = 0.0
    total_examples = 0
    all_labels = []
    all_preds = []

    for batch in val_dataloader:
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids)
            loss = loss_fn(logits, b_labels)

        # loss_fn returns the batch mean, so scale by the batch size to recover
        # the batch sum before accumulating.
        num_examples = b_labels.size(0)
        total_loss += loss.item() * num_examples
        total_examples += num_examples

        preds = torch.argmax(logits, dim=1).flatten()
        all_labels.extend(b_labels.cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

    if total_examples == 0:
        return float("nan"), float("nan")

    val_loss = total_loss / total_examples
    val_f1 = f1_score(all_labels, all_preds, average="weighted")

    return val_loss, val_f1

## Function 7 - Train and evaluate model

In [None]:
import torch.optim as optim

# Define hyperparameters
# vocab_size sizes the embedding table from the vocabulary built by tokenize().
vocab_size=len(word2idx)
embed_dim=300
# filter_sizes is passed to the model as the Conv1d kernel size.
filter_sizes=1
num_filters=100
num_classes=3
dropout = 0.1
# learning_rate takes effect through the Adam optimizer created below
# (lr=learning_rate).
learning_rate = 0.01

# Build the CNN with the hyperparameters above.
cnn_model = CNN_classifier(vocab_size=vocab_size,
                    embed_dim=embed_dim,
                    num_classes= num_classes,
                    filter_sizes = filter_sizes,
                    num_filters = num_filters,
                    dropout = dropout,
                    learning_rate = learning_rate)

# Adam optimizes every parameter of the model.
optimizer = optim.Adam(cnn_model.parameters(),lr=learning_rate)
    
# Move the model to the selected device before training starts.
cnn_model.to(device)

# Train for 20 epochs; the test loader is passed so metrics are printed per epoch.
train(cnn_model, optimizer, train_dataloader, test_dataloader, epochs=20)

## Function 8:


In [None]:
def predict(text, model=cnn_model.to("cpu"), max_len=62):
    """Print the predicted sentiment probabilities for a single piece of text.

    The text is lower-cased, tokenized with `word_tokenize`, padded with
    '<pad>' up to `max_len` tokens, and mapped to ids through the global
    `word2idx` (unknown tokens become '<unk>').

    Args:
        text: the sentence(s) to classify.
        model: classifier to use. The default expression is evaluated once,
            when this function is defined, and moves `cnn_model` to the CPU.
        max_len: minimum length the token list is padded to; longer inputs
            are left as they are.
            NOTE(review): 62 is presumably the max_len found during
            tokenization - confirm.

    Returns:
        None; the probabilities are printed.
    """
    tokens = word_tokenize(text.lower())
    padded_tokens = tokens + ['<pad>'] * (max_len - len(tokens))
    input_id = [word2idx.get(token, word2idx['<unk>']) for token in padded_tokens]

    # Create the input on whatever device the model's parameters live on, so a
    # model that is still on the GPU can be passed in as well.
    model_device = next(model.parameters()).device
    input_id = torch.tensor(input_id, device=model_device).unsqueeze(dim=0)

    # Inference must not depend on leftover state: disable dropout and gradient
    # tracking explicitly, then restore the mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_id)
    finally:
        model.train(was_training)

    probs = F.softmax(logits, dim=1).squeeze(dim=0)

    # Indices 0/1/2 are reported as Negative/Neutral/Positive, the label names
    # the encoder was fitted on.
    print(f"This review is {probs[0] * 100:.5f}% Negative;  {probs[1] * 100:.5f}% Neutral;  {probs[2] * 100:.5f}% Positive.")

# Try the trained model on a few hand-written example sentences; each call
# prints the three class probabilities.
predict("covid 19 is suck. I am fed up of staying at home.")
predict("I feel much better now since the vaccine has been produced.")
predict("Covid 19 is dangerous. I feel unsafe when going out these days.")
predict("It is good that the govenrment starts acting.")

## Exporting your results to PDF
1. Download your notebook with _File -> Download .ipynb_
1. Rename the file with your name, as in the other assignments, for example `lastname_firstname_assignment5.ipynb`
1. Submit the notebook file on Moodle