# Data

In [1]:
import pickle
from os import path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('train.txt', sep = '\t')

In [3]:
data.head()

Unnamed: 0,id,turn1,turn2,turn3,label
0,0,Don't worry I'm girl,hmm how do I know if you are,What's ur name?,others
1,1,When did I?,saw many times i think -_-,No. I never saw you,angry
2,2,By,by Google Chrome,Where you live,others
3,3,U r ridiculous,I might be ridiculous but I am telling the truth.,U little disgusting whore,angry
4,4,Just for time pass,wt do u do 4 a living then,Maybe,others


In [4]:
data.label.value_counts()

others    14948
angry      5506
sad        5463
happy      4243
Name: label, dtype: int64

In [5]:
train, test = train_test_split(data, test_size = 0.2, random_state = 123)

In [6]:
#X_train_text = train.apply(lambda x: x[1] + ' ' + x[2] + ' ' + x[3], axis = 1)
X_train_text = train.apply(lambda x: x[1] + ' ' + x[3], axis = 1)
#X_train_text = train.apply(lambda x: x[3], axis = 1)
y_train = train.label

#X_test_text = test.apply(lambda x: x[1] + ' ' + x[2] + ' ' + x[3], axis = 1)
X_test_text = test.apply(lambda x: x[1] + ' ' + x[3], axis = 1)
#X_test_text = test.apply(lambda x: x[3], axis = 1)
y_test = test.label

In [7]:
X_train_text.head()

28481                            upgrade then i'll message
26481                                    cool Cya good nyt
23794    I am having something that u know Jb song i li...
11281    Author name like that Hey that is author name yar
23951                                With me I am Ur crush
dtype: object

# Build features

## TF-IDF features

In [8]:
from nltk.corpus import stopwords
import re

In [9]:
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    text = text.lower()
    text = ' '.join([x for x in text.split(' ') if x not in STOPWORDS and x != ''])
    return text

In [10]:
X_train = X_train_text.apply(text_prepare)
X_test = X_test_text.apply(text_prepare)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_test):
    tfidf_vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 5, ngram_range = (1,2), token_pattern = '(\S+)')
    
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_test = tfidf_vectorizer.transform(X_test)
    
    return X_train, X_test, tfidf_vectorizer.vocabulary_

In [12]:
X_train_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_test)

In [13]:
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [14]:
len(tfidf_reversed_vocab.values())

3296

## NRC corpora features 

In [15]:
from nltk import word_tokenize

In [16]:
nrc_path = '../../semeval/NRC-Sentiment-Emotion-Lexicons/'

nrc_affection_file = 'affectDict.pkl'
nrc_emotion_file = 'emotionDict.pkl'
nrc_vad_file = 'vadDict.pkl'

### Affection

In [17]:
with open(path.join(nrc_path, nrc_affection_file), 'rb') as f:
    affectDict = pickle.load(f)

In [18]:
def apply_corpora(text, corpora, dim):
    res = np.zeros((dim,))
    for word in text:
        if corpora.get(word) is not None:
            res = np.vstack((res, np.array(list(corpora[word].values()))))
    if len(res.shape) < 2:
        return res
    else:
        res = res.mean(axis = 0)
        return res

In [19]:
X_train_text = X_train_text.apply(word_tokenize)
X_test_text = X_test_text.apply(word_tokenize)

In [20]:
X_train_affection = np.stack(X_train_text.apply(apply_corpora, corpora = affectDict, dim = 4))
X_test_affection = np.stack(X_test_text.apply(apply_corpora, corpora = affectDict, dim = 4))

### Emotion

In [21]:
with open(path.join(nrc_path, nrc_emotion_file), 'rb') as f:
    emotionDict = pickle.load(f)

In [22]:
X_train_emotion = np.stack(X_train_text.apply(apply_corpora, corpora = emotionDict, dim = 10))
X_test_emotion = np.stack(X_test_text.apply(apply_corpora, corpora = emotionDict, dim = 10))

### NRC VAD

In [23]:
with open(path.join(nrc_path, nrc_vad_file), 'rb') as f:
    vadDict = pickle.load(f)

In [24]:
X_train_vad = np.stack(X_train_text.apply(apply_corpora, corpora = vadDict, dim = 3))
X_test_vad = np.stack(X_test_text.apply(apply_corpora, corpora = vadDict, dim = 3))

## Manual features

In [25]:
from manual_features import *

In [26]:
def extract_manual_features(text):
    return np.array([count_pattern(text, ALL_CAPS), count_pattern(text, ELONGATED), 
                     count_punctuations(text, QE_MARKS), 
                     ending_punctuation(text, ENDING_WITH_QE)])

In [27]:
X_train_manual_features = np.stack(X_train_text.apply(extract_manual_features))
X_test_manual_features = np.stack(X_test_text.apply(extract_manual_features))

## Emoticon features

## Combine features

In [28]:
from scipy import sparse

In [29]:
X_train_combined = sparse.hstack((X_train_tfidf, X_train_affection, X_train_emotion, 
                                  X_train_vad, X_train_manual_features))
X_test_combined = sparse.hstack((X_test_tfidf, X_test_affection, X_test_emotion, 
                                  X_test_vad, X_test_manual_features))

## Classifier

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.multiclass import OneVsRestClassifier

In [31]:
label_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=4, random_state = 0))
label_classifier.fit(X_train_combined, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=4, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [32]:
y_test_pred = label_classifier.predict(X_test_combined)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_summary = precision_recall_fscore_support(y_test, y_test_pred, labels = ['others', 'angry', 'sad', 'happy'])
print('Test accuracy = {}'.format(test_accuracy))
print('Test precision: {} \nTest recall: {} \nTest F1-score {} \nSupport: {}'
      .format(test_summary[0], test_summary[1], test_summary[2], test_summary[3]))

Test accuracy = 0.8353779840848806
Test precision: [0.82166566 0.86284545 0.86745796 0.81643836] 
Test recall: [0.90465116 0.77983349 0.78866906 0.71893848] 
Test F1-score [0.86116382 0.81924198 0.82618935 0.76459269] 
Support: [3010 1081 1112  829]


In [34]:
pd.Series(y_test_pred).value_counts()

others    3314
sad       1011
angry      977
happy      730
dtype: int64

In [36]:
test.label.value_counts()

others    3010
sad       1112
angry     1081
happy      829
Name: label, dtype: int64

## Results 

all 3 turns
```
Test accuracy = 0.8057029177718833
Test precision: [0.79453263 0.84388186 0.8371134  0.76544944] 
Test recall: [0.89800664 0.7400555  0.73021583 0.65741858] 
Test F1-score [0.84310667 0.7885658  0.78001921 0.7073329 ] 
```

turn 1 + turn 3:
```
Test accuracy = 0.8353779840848806
Test precision: [0.82166566 0.86284545 0.86745796 0.81643836] 
Test recall: [0.90465116 0.77983349 0.78866906 0.71893848] 
Test F1-score [0.86116382 0.81924198 0.82618935 0.76459269] 
```

turn-3 only:
```
Test accuracy = 0.7904509283819628
Test precision: [0.75380022 0.83715596 0.85941043 0.84680135] 
Test recall: [0.92259136 0.67530065 0.68165468 0.60675513] 
Test F1-score [0.82969824 0.74756784 0.76028084 0.70695713] 
```