In [1]:
import pandas as pd
import os
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
# https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

In [2]:
# Both paths keep a trailing '/' so file names can be appended directly.
current_path = f"{os.getcwd()}/"
dataset_path = f"{current_path}Collected Datasets/"

# Load the dataset

In [3]:
# Read the train / devel / test CSV files (in that order) from dataset_path.
df_train, df_dev, df_test = (
    pd.read_csv(dataset_path + csv_name)
    for csv_name in ('train.csv', 'devel.csv', 'test.csv')
)

# Combine train and test into the training set; the dev split serves as the test set

In [4]:
# Stack the train and test rows into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and, like append, keeps each frame's original
# index labels (so labels may repeat across the two parts).
df_all = pd.concat([df_train, df_test])

# Data preprocessing

In [5]:
# Compiled token matcher: a token is a run of 3 to 50 ASCII letters, digits or
# hyphens. Every other character (punctuation, whitespace, ...) acts as a
# separator, and shorter runs are not matched at all.
pattern = re.compile(
    r"[A-Za-z0-9\-]{3,50}"
)

In [6]:
# Add a 'clean_message' column to both frames: keep only the tokens matched by
# `pattern` in each message and join them back together with single spaces.
for frame in (df_all, df_dev):
    frame['clean_message'] = frame['message'].str.findall(pattern).str.join(' ')

In [7]:
# Narrow both frames to the cleaned text and its emotion label, then preview
# three randomly sampled rows of the combined training frame.
selected_columns = ['clean_message', 'emotion']
df_all = df_all.loc[:, selected_columns]
df_dev = df_dev.loc[:, selected_columns]
df_all.sample(n=3)

Unnamed: 0,clean_message,emotion
1583,comcast you charge 150 extra for sending someo...,fear
1958,pronounced ing However pretty sure over excite...,fear
2301,Does anyone know are both Sims dual sim phone ...,fear


In [8]:
# Display the cleaned dev-set messages as a raw array.
df_dev['clean_message'].values

array(['theclobra lol thought maybe couldn decide there was levity not',
       'Nawaz Sharif getting more funnier than kapilsharmak9 day day laughter challenge kashmir baloch',
       'Nawaz Sharif getting more funnier than kapilsharmak9 day day challenge kashmir baloch',
       'tomderivan73 just people watch and enjoy rare show optimism',
       'love family much lucky grateful smartassfamily love',
       'love family much lucky grateful smartassfamily hilarious love',
       'Casper10666 assure you there laughter but increasing anger the costs and arrogance Westminster',
       'any trump supporters and Hillary haters wanna chirp some weak minded pandering liberals just tweet EmmyA2 snickerfritz04',
       'Google caffeine-an sprightly lengthening into the corridor seo WgJ',
       'This tweet dedicated back pain which not understand because youthful and spry Full life Vivacious',
       'Bluebelle89 lsmith855 liking the optimism',
       'rainy day cheerful sunshine Prussian want

# Algorithm -- 1) Conversion to bag of words (TF-IDF weighted) 2) Model generation using K-NN

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Features are the cleaned messages, targets are the emotion labels.
# Training data comes from df_all, evaluation data from df_dev.
X_train, y_train = df_all['clean_message'].values, df_all['emotion'].values
X_test, y_test = df_dev['clean_message'].values, df_dev['emotion'].values

In [12]:
# Two-step pipeline: TF-IDF features over 1- to 3-grams (English stop words
# removed, sublinear term frequency, L2-normalised rows) feeding a
# k-nearest-neighbours classifier left at its default settings.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        sublinear_tf=True,
        min_df=1,
        norm='l2',
        encoding='latin-1',
        stop_words='english',
        ngram_range=(1, 3),
        lowercase=True,
    )),
    ('clf', KNeighborsClassifier()),
])

# Fit on the training arrays, predict the held-out messages, and print
# per-class precision / recall / F1.
predicted = text_clf.fit(X_train, y_train).predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

       anger       0.74      0.61      0.67        82
        fear       0.63      0.80      0.70       108
         joy       0.77      0.48      0.59        71
      normal       0.17      0.24      0.20        17
     sadness       0.58      0.62      0.60        69

    accuracy                           0.63       347
   macro avg       0.58      0.55      0.55       347
weighted avg       0.65      0.63      0.63       347



In [13]:
# Ad-hoc example sentence used to spot-check the fitted pipeline.
text_sentence = "i love you"

In [14]:
# Classify the example sentence; predict() is given a one-element list, so the
# displayed result is an array holding a single predicted label.
text_clf.predict([text_sentence])

array(['joy'], dtype=object)