In [1]:
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support
np.random.seed(0)

# Task 2: Emoji prediction (Classification task)

#### The dataset consists of crawled tweets from Twitter. Every tweet is labeled with a class corresponding to the emoji the user put after the text of the tweet. Your task is to predict the emoji from a given tweet. This exercise represents a typical application of a classification task. As with the regression task, report all your preprocessing steps and mind their importance. The dataset consists of a separate training and testing dataset. Report your performance, including overall accuracy, precision and recall for all classes and the micro and macro average for precision and recall, on the test dataset!

#### Hint: The train and test datasets are pickle files (.pkl). Use the function pandas.read_pickle(path) to read the files into a pandas data frame.

In [2]:
df_train = pd.read_pickle("assets/emoji_train.pkl")      # Shape: (42627, 4)
df_test = pd.read_pickle("assets/emoji_test.pkl")        # (10657, 4)
df_test

Unnamed: 0,tweet,emoji_class,emoji,predicted_class
0,Lmao. My #Bitmoji is so perfect. Looks and act...,1,😂,
1,I like to call this the #tandem because we dec...,0,❤,
2,Crab dip French toast! Yum! I Miss Shirley's! ...,0,❤,
3,Happy Thanksgiving from my family to yours! ️ ...,0,❤,
4,#familynight ️ @ Soho House West Hollywood \n,0,❤,
...,...,...,...,...
10652,Overshine by the sunlight ️ - Golden gate brid...,6,☀,
10653,Those one handed interception drills coming in...,3,🔥,
10654,Can I get a for this good looking group? We're...,0,❤,
10655,"w/ @user : @user with the shots @ Manhattan, N...",2,📸,


In [3]:
n_classes = df_train['emoji_class'].max()
print("Number of Classes:", n_classes)

Number of Classes: 6


In [4]:
X_train = df_train['tweet'].values              # (42627,)
X_test = df_test['tweet'].values                # (10657,)
y_train = df_train['emoji_class'].values        # (42627,)
y_test = df_test['emoji_class'].values          # (10657,)

### Create Word Embeddings and convert string to vector


In [5]:
def preprocess(originalString):
  cleanString = originalString.lower() # lowercase
  cleanString = re.sub(r'/(<.*?>)|[@]|[^\w\d\n]/g', ' ', cleanString) # replace non-word chars
  cleanString = re.sub(r'/(ies|y|ed|ing|s)(\s|\b)/g', ' ', cleanString) # stemming of the word endings
  cleanString = re.sub(r'/\s{2,}/g', ' ', cleanString) # replace redundand whitespaces
  # cleanString = cleanString.trim() # trim leading and ending whitespaces

  return cleanString

In [6]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
import functools

In [7]:
corpus = api.load('text8')


In [8]:
model = Word2Vec(corpus)

In [9]:
model.wv['tree'].size

100

In [17]:

# TODO: remove stop words
def createCumulativeSentenceEmbedding(accum, word):
  if (len(word) == 0):
    return accum
  if (isinstance(accum, str)):
    if (accum in model.wv):
      accum = model.wv[accum]
    else:
      accum = ''

  if(word in model.wv):
    wordVec = model.wv[word]
    if (isinstance(accum, str)):
      return wordVec
    vSum = accum.copy()
    for index in range(1, accum.size):
      vSum[index] = accum[index] + wordVec[index]
    return vSum
  else:
    return accum

def createEmbeddings(text):
  try:
    cleanedString = preprocess(text)
    words = cleanedString.split(' ')
    vectorSum = functools.reduce(lambda accum, word: createCumulativeSentenceEmbedding(accum, word), words)
    if (isinstance(vectorSum, str)):
      return [0] * 100
    vGetAveragedVector = np.vectorize(lambda value: value / len(words))
    averagedVector = vGetAveragedVector(vectorSum)
    return averagedVector
  except Exception as error:
    print("Error: ", text, error)


In [18]:
X_train = df_train['tweet'].map(createEmbeddings)

In [19]:
X_train

0        [0.002096562832593918, 0.2509743690490723, -0....
1        [-0.17851143223898752, -0.21594536304473877, 0...
2        [-0.0027228393233739412, 0.15442683146550104, ...
3        [-0.17585518143393777, 0.26584709774364124, 0....
4        [0.07648611730999416, -0.6102372275458442, -0....
                               ...                        
42622    [-0.08925571611949376, -0.3218010153089251, -0...
42623    [-0.004910023036328229, -0.4268351034684615, 0...
42624    [-0.032213255763053894, -0.3905642713819231, -...
42625    [0.13893904856273107, 0.10633475439889091, -0....
42626    [0.03814888993899027, 0.2947390874226888, 0.20...
Name: tweet, Length: 42627, dtype: object

KeyError: 'tweet'

## Convert Array to Torch Tensor

In [23]:
X_train = torch.tensor(X_train)
X_train.shape

  X_train = torch.tensor(X_train)


torch.Size([42627, 100])

In [19]:
#X_train = torch.from_numpy(X_train)    	encode the words as some indices similar to the Lang class in the Seq2Seq Tutorial
y_train = torch.from_numpy(y_train)         # Shape 1D
# X_test = torch.from_numpy(X_test)
y_test = torch.from_numpy(y_test)           # Shape 1D 
print("y_train:", y_train.shape, "y_test:", y_test.shape)

y_train: torch.Size([42627]) y_test: torch.Size([10657])


In [None]:
# Shape of X_train: [42627,3,6]

## Model

### Can select between different models, Deep Neural Network, SVM, 

In [None]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

    def forward(self,x):
        return x

## Training

### Choose different loss (BCE, MSE, CrossEntropy loss, Choose different schedular

In [None]:
def train(model, optimizer, X_train, train_y, max_epoch):

  return train_loss, train_acc 


## Testing

In [None]:
def test(model, test_x, test_y, test_episode):
  
  return avg_loss, avg_acc, y_hat, target_inds

## Evaluation on Performance Metrics

### Train Loss

### Train Accuracy


### F1-Score & Precision & Recall

### Confusion Matrix

## Results

## Save to Dataframe