In [1]:
#!pip install gensim

In [2]:
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support

from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
import functools

In [3]:

np.random.seed(0)

# Task 2: Emoji prediction (Classification task)

#### The dataset consists of crawled tweets from Twitter. Every tweet is labeled with a class corresponding to the emoji the user put after the text of the tweet. Your task is to predict the emoji from a given tweet. This exercise represents a typical application of a classification task. As with the regression task, report all your preprocessing steps and mind their importance. The dataset consists of a separate training and testing dataset. Report your performance, including overall accuracy, precision and recall for all classes and the micro and macro average for precision and recall, on the test dataset!

#### Hint: The train and test datasets are pickle files (.pkl). Use the function pandas.read_pickle(path) to read the files into a pandas data frame.

In [4]:
df_train = pd.read_pickle("./emoji_train.pkl")      # Shape: (42627, 4)
df_test = pd.read_pickle("./emoji_test.pkl")        # (10657, 4)
df_test

Unnamed: 0,tweet,emoji_class,emoji,predicted_class
0,Lmao. My #Bitmoji is so perfect. Looks and act...,1,😂,
1,I like to call this the #tandem because we dec...,0,❤,
2,Crab dip French toast! Yum! I Miss Shirley's! ...,0,❤,
3,Happy Thanksgiving from my family to yours! ️ ...,0,❤,
4,#familynight ️ @ Soho House West Hollywood \n,0,❤,
...,...,...,...,...
10652,Overshine by the sunlight ️ - Golden gate brid...,6,☀,
10653,Those one handed interception drills coming in...,3,🔥,
10654,Can I get a for this good looking group? We're...,0,❤,
10655,"w/ @user : @user with the shots @ Manhattan, N...",2,📸,


In [5]:
n_classes = df_train['emoji_class'].max()
print("Number of Classes:", n_classes)

Number of Classes: 6


In [6]:
X_train = df_train['tweet'].values              # (42627,)
X_test = df_test['tweet'].values                # (10657,)
y_train = df_train['emoji_class'].values        # (42627,)
y_test = df_test['emoji_class'].values          # (10657,)

## Convert Array to Torch Tensor

In [7]:
#X_train = torch.from_numpy(X_train)    	encode the words as some indices similar to the Lang class in the Seq2Seq Tutorial
y_train = torch.from_numpy(y_train)         # Shape 1D
# X_test = torch.from_numpy(X_test)
y_test = torch.from_numpy(y_test)           # Shape 1D 
print("y_train:", y_train.shape, "y_test:", y_test.shape)

y_train: torch.Size([42627]) y_test: torch.Size([10657])


## Create Word Embeddings and convert string to vector

In [8]:
def preprocess(originalString):
  cleanString = originalString.lower() # lowercase
  cleanString = re.sub(r'/(<.*?>)|[@]|[^\w\d\n]/g', ' ', cleanString) # replace non-word chars
  cleanString = re.sub(r'/(ies|y|ed|ing|s)(\s|\b)/g', ' ', cleanString) # stemming of the word endings
  cleanString = re.sub(r'/\s{2,}/g', ' ', cleanString) # replace redundand whitespaces
  # cleanString = cleanString.trim() # trim leading and ending whitespaces

  return cleanString

In [9]:
corpus = api.load('text8')
model = Word2Vec(corpus)
model.wv['tree'].size

100

In [10]:
# TODO: remove stop words
def createCumulativeSentenceEmbedding(accum, word):
  if (len(word) == 0):
    return accum
  if (isinstance(accum, str)):
    if (accum in model.wv):
      accum = model.wv[accum]
    else:
      accum = ''

  if(word in model.wv):
    wordVec = model.wv[word]
    if (isinstance(accum, str)):
      return wordVec
    vSum = accum.copy()
    for index in range(1, accum.size):
      vSum[index] = accum[index] + wordVec[index]
    return vSum
  else:
    return accum

def createEmbeddings(text):
  try:
    cleanedString = preprocess(text)
    words = cleanedString.split(' ')
    vectorSum = functools.reduce(lambda accum, word: createCumulativeSentenceEmbedding(accum, word), words)
    if (isinstance(vectorSum, str)):
      return [0] * 100
    vGetAveragedVector = np.vectorize(lambda value: value / len(words))
    averagedVector = vGetAveragedVector(vectorSum)
    return averagedVector
  except Exception as error:
    print("Error: ", text, error)

In [11]:
X_train = df_train['tweet'].map(createEmbeddings)

In [12]:
X_train=X_train.to_frame()
X_train = X_train.values.tolist()
X_train = torch.Tensor(X_train)

  X_train = torch.Tensor(X_train)


In [13]:
#X_train = X_train.squeeze()
#y_train = y_train.unsqueeze(dim=1)

In [14]:
print(X_train.shape, type(X_train), y_train.shape, type(y_train))

torch.Size([42627, 1, 100]) <class 'torch.Tensor'> torch.Size([42627]) <class 'torch.Tensor'>


## Model

### Can select between different models, Deep Neural Network, SVM, 

In [15]:
## Input 3 Channels two classes, can be changed 
class ConvModel(nn.Module):
    def __init__(self):
        super(ConvModel, self).__init__()
        self.conv1 = nn.Sequential(
                    nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3, padding=1),
                    nn.ReLU(),
                    nn.MaxPool1d(3, stride=2),
                    nn.Conv1d(in_channels=64, out_channels=96, kernel_size=3, padding=1),
                    nn.ReLU(),
                    nn.MaxPool1d(3, stride=2),
                    nn.Conv1d(in_channels=96, out_channels=128, kernel_size=3, padding=1),
                    nn.ReLU(),
                    nn.MaxPool1d(3, stride=2),
                    nn.Conv1d(in_channels=128, out_channels=32, kernel_size=3, padding=1),
                    nn.ReLU(),
                    nn.MaxPool1d(3, stride=2)
                    )       

    def forward(self,x):
        x= self.conv1(x)
        x= x.view(x.size(0), -1)            # Flattening
        return x

In [16]:
# guter test 
class MLP_V2_2(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(100, 16)
        #self.hidden2 = nn.Linear(80, 32)
        self.output = nn.Linear(16, 1)
        #self.relu = nn.ReLU()
        #self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.hidden(x)
        #x = self.sigmoid(x)
        #x = self.hidden2(x)
        #x = self.relu(x)
        x = self.output(x)
        return x

In [17]:
class TinyModel(torch.nn.Module):

    def __init__(self):
        super(TinyModel, self).__init__()

        self.linear1 = torch.nn.Linear(100, 200)
        self.activation = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(200, 1)
        self.softmax = torch.nn.Softmax()

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x

## Training

### Choose different loss (BCE, MSE, CrossEntropy loss, Choose different schedular

In [18]:
def train(model, optimizer, X_train, train_y, max_epoch):

  train_loss = []
  train_acc = []
  y_target =[]

  # Divide the learning rate by 2 at each epoch, as suggested in paper
  scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.5, last_epoch=-1)     # Decays the learning rate of each parameter group by gamma every step_size epochs. 
  #                                                                                   Notice that such decay can happen simultaneously with other changes to the learning rate from outside this scheduler.
  #                                                                                   When last_epoch = -1, sets initial lr as lr.
  epoch = 0                                                                         # Epochs done so far
  stop = False   
  
  #loss_func=torch.nn.BCELoss()                                                                   # Status to know when to stop
  loss_func = nn.CrossEntropyLoss()
  #loss_func = nn.SmoothL1Loss()


  while epoch < max_epoch and not stop:                   
    running_loss = 0.0
    running_acc = 0.0

    optimizer.zero_grad()                                                          # Sets the gradients of all optimized torch.Tensors to zero

    yhat = model(X_train)                     # input x and predict based on 
    #print(yhat.shape, yhat)
    #out = out.squeeze()
    #train_y = train_y.to(torch.float)
    #print(train_y.shape, train_y)
    #yhat = yhat.to(torch.float)
    #yhat =yhat.double()
    loss = loss_func(yhat, train_y)           # must be (1. nn output, 2. target), the target label is NOT one-hotted
    #loss = loss.to(torch.float32)
    loss.backward()                           # backward pass
    optimizer.step()                          # gradient descent; Performs a single optimization step (parameter update)                                                                               
      
    #Print the statistic
    running_loss += loss                                                
    #running_acc += loss

    epoch_loss = running_loss
    epoch_acc = running_acc
    print('Epoch {:d} -- Loss: {:.4f} '.format(epoch+1,epoch_loss))
    #print('Epoch {:d} -- Loss: {:.4f} Acc: {:.4f}'.format(epoch+1,epoch_loss, epoch_acc))
    epoch += 1
    scheduler.step()                                                                 # This change the learning rate at each epoch, otherwise the LR would stays at the initial value                                             
    train_loss.append(loss)
    #train_acc= np.append(train_acc,epoch_acc)
    #y_target.append(target_inds)

  return train_loss, train_acc 

In [19]:
model = ConvModel()
#model = MLP_V2_2()
#model = TinyModel()

max_epoch = 1000
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001,weight_decay=5e-4) #  L2 regularization
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
train_loss, train_acc = train(model, optimizer, X_train, y_train, max_epoch)

Epoch 1 -- Loss: 5.0390 
Epoch 2 -- Loss: 5.0050 
Epoch 3 -- Loss: 4.9723 
Epoch 4 -- Loss: 4.9486 
Epoch 5 -- Loss: 4.9334 
Epoch 6 -- Loss: 4.9243 
Epoch 7 -- Loss: 4.9190 
Epoch 8 -- Loss: 4.9160 
Epoch 9 -- Loss: 4.9144 
Epoch 10 -- Loss: 4.9135 


KeyboardInterrupt: 

## Testing

In [None]:
def test(model, test_x, test_y, test_episode):
  target_test =[]
  pred_test =[]

  running_loss = 0.0
  running_acc = 0.0
  for episode in tnrange(test_episode):                                                  

    ypred = model(X_test)
    running_loss += output['loss']
    running_acc += output['acc']
    target_test.append(target_inds)
    pred_test.append(y_hat)
    
  avg_loss = running_loss / test_episode
  avg_acc = running_acc / test_episode
  #print('Test results -- Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, avg_acc))
  
  return avg_loss, avg_acc, y_target, y_pred

In [None]:
test_episode = 1000

test_loss, test_acct, y_target, y_pred =test(model, X_test, y_test, test_episode)


In [None]:
print( "True set of labels in training:", y_target)
print("Predicted set of labels in training:", y_pred)
print( "True set of labels in test:", y_target_test)
print( "Predicted set of labels in test:", y_pred_test)

## Evaluation on Performance Metrics

In [None]:
plt.rc('text', usetex=True)             # use LaTeX fonts in the plot
plt.rc('font', family='serif')
sns.set_theme(style="darkgrid")

### Train Loss

In [None]:
plt.plot(train_loss, label='loss')
plt.xlabel('Epoch')
plt.ylabel('Loss [Anomaly]')
plt.legend()
plt.grid(True)

### Train Accuracy


In [None]:
train_acc = accuracy_score(y_target_test, y_pred_test)
plt.plot(train_acc, label='accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy [Anomaly]')
plt.legend()
plt.grid(True)

In [None]:
a_score = accuracy_score(y_target_test, y_pred_test)
print(a_score)

### F1-Score & Precision & Recall

In [None]:
f1 = f1_score( y_target_test,y_pred_test, pos_label=1)
precision = precision_score( y_target_test,y_pred_test)
recall = recall_score( y_target_test,y_pred_test)

### Confusion Matrix

In [None]:
def compute_cm(y_target_test, y_pred_test):         
    cm = confusion_matrix(y_pred_test, y_target_test)
    cm = cm.tolist()
    cm_dict = {'class'+str(count): cm[count] for count in range(len(cm))}
    return cm, cm_dict


cm = confusion_matrix( y_target_test,y_pred_test)
group_names = ['True Neg','False Pos','False Neg','True Pos']
categories = ['Negativ(0)','Positiv(1)']
group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cm.flatten()/np.sum(cm)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cm, annot=labels, fmt='', cmap="icefire_r")

## Results

In [None]:
print(" Accuracy:", format(test_acc, ".3f"))
print(" Loss:", format(test_loss, ".3f"))
print(" F1 Score: %.3f" % f1)
print(" Precision :", format(precision, ".3f"))
print(" Recall:", format(recall, ".1f"))

## Save to Dataframe

In [None]:
df = pd.DataFrame({'Train Loss': train_loss,
                   'Train Accuracy': train_acc,
                   'Test Loss': test_loss,
                   'Test Accuracy': test_acc,
                   'F1-Score': f1,
                   'Precision': precision,
                   'Recall': recall,
                   })
                   
df_label = pd.DataFrame({ 'y_pred_test': y_pred_test,
                           'y_target_test': y_target_test
                    })


pd.DataFrame(df).to_csv("/Users/PFR1UL/Desktop/.csv", index = False)
pd.DataFrame(df_label).to_csv("/Users/PFR1UL/Desktop/.csv", index = False)
