In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam
from keras.utils.np_utils import to_categorical

In [None]:
import pandas as pd

In [None]:
# Load the raw corpus: semicolon-separated records with no header row.
Emotions_DF = pd.read_csv('emotion.txt', sep=';', header=None)

In [None]:
# Give the two positional CSV columns descriptive labels.
column_labels = {0: 'Sentence', 1: 'Emotion'}
Emotions_DF.rename(columns=column_labels, inplace=True)

In [None]:
# Preview the labelled frame (columns were renamed to Sentence / Emotion above).
Emotions_DF

In [None]:
# Distinct emotion labels, in order of first appearance.
Emotions_DF['Emotion'].unique().tolist()

In [None]:
# Bar chart of how many sentences carry each emotion label.
plt.figure(figsize=(10, 5))
emotion_count_ax = sns.countplot(x='Emotion', data=Emotions_DF)
emotion_count_ax.set_title('Emotion', fontsize=20);

In [None]:
# Pie chart of the share of sentences per emotion (percentages with 2 decimals).
plt.figure(figsize=(10, 7))
emotion_sizes = Emotions_DF.groupby('Emotion').size()
emotion_pie_ax = emotion_sizes.plot.pie(autopct='%.2f', textprops={'fontsize': 16})
emotion_pie_ax.set_title('Emotion', fontsize=20)
emotion_pie_ax.set_ylabel('');

In [None]:
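# Class proportions per emotion (these look like the pie-chart shares above
# expressed as fractions — TODO confirm the source of the numbers):
# {'joy':0.3351,'love':0.0815,'sadness':0.2916,'surprise':0.0358,'anger':0.1349,'fear':0.1211}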

In [None]:
# Print the mean character length of the sentences belonging to each emotion.
# A boolean mask replaces the string-interpolated `query` (which would break on
# a label containing a double quote) and `.str.len()` vectorises the counting.
for emo in Emotions_DF['Emotion'].unique():
    emo_sentences = Emotions_DF.loc[Emotions_DF['Emotion'] == emo, 'Sentence']
    print('Average of sentence length of {} : {}'.format(emo, emo_sentences.str.len().mean()))

In [None]:
from collections import Counter

In [None]:
# For every emotion, plot the ten most frequent "long" words (more than six
# characters, lower-cased) found in its sentences.
#
# NOTE(review): every stop word below is shorter than the 7-character minimum
# enforced by the length filter, so the stop list currently excludes nothing.
# It is kept so the filter stays correct if the length threshold is lowered.
stops = {'the', 'and', 'that', 'for', 'with', 'this', 'was', 'you', 'about'}

for emotion in Emotions_DF['Emotion'].unique():
    # Concatenate all sentences of this emotion into one whitespace-separated text.
    emotion_text = ' '.join(Emotions_DF.loc[Emotions_DF['Emotion'] == emotion, 'Sentence'].values)

    # The length test is applied to the raw word, the stop-word test to its lower-case form.
    lowered_long_words = (word.lower() for word in emotion_text.split() if len(word) > 6)
    tokens = [word for word in lowered_long_words if word not in stops]

    # Only ten bars are drawn, so only the top ten counts are requested.
    DF = pd.DataFrame(Counter(tokens).most_common(10), columns=['Word', 'Freq'])

    plt.figure(figsize=(6, 5))
    sns.barplot(x=DF['Word'], y=DF['Freq'])
    plt.title(f'Most frequent words for {emotion} emotion')
    plt.xticks(rotation=25, ha='center')
    plt.show()

In [None]:
# Hold out 20% of the sentences for testing. The split is stratified on the
# emotion label so that rare classes keep the same share in both partitions;
# `random_state` keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Emotions_DF['Sentence'],
    Emotions_DF['Emotion'],
    test_size=0.2,
    random_state=42,
    stratify=Emotions_DF['Emotion'],
)

In [None]:
# Tag every row that landed in the training split.
train_rows = X_train.index
Emotions_DF.loc[train_rows, 'Train/Test'] = 'train'

In [None]:
# Tag every row that landed in the test split.
test_rows = X_test.index
Emotions_DF.loc[test_rows, 'Train/Test'] = 'test'

In [None]:
# Non-null row counts for every (emotion, split) combination.
Emotions_DF.groupby(by=['Emotion', 'Train/Test']).count()

## Count Vectorizer

In [None]:
# Bag-of-words counts over unigrams and bigrams with English stop words removed.
# The vocabulary is learned from the training sentences only.
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_vectorized = count_vect.fit_transform(X_train)

In [None]:
# Shape of the count-vectorised training matrix.
X_train_vectorized.shape

In [None]:
# Multinomial naive Bayes on the count features; the cell displays test accuracy.
MNB = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)
X_test_counts = count_vect.transform(X_test)
predictions = MNB.predict(X_test_counts)
accuracy_score(y_test, predictions)

In [None]:
# Linear SVC on the count features. `random_state` is fixed (same seed as the
# other seeded estimators in this notebook) so repeated runs give the same fit.
lsvc = LinearSVC(C=0.1, random_state=18)
lsvc.fit(X_train_vectorized, y_train)
predictions = lsvc.predict(count_vect.transform(X_test))
print('Accuracy score: {}'.format(accuracy_score(y_test, predictions)))
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix of the current `predictions` as a labelled DataFrame.
# The label set is computed once (every label seen in either the truth or the
# predictions) and passed to `confusion_matrix`, so the matrix and its axis
# labels always have matching size and order.
lsvc_labels = np.union1d(y_test, predictions)
lsvc_CM = confusion_matrix(y_test, predictions, labels=lsvc_labels)
lsvc_CM_DF = pd.DataFrame(lsvc_CM, index=lsvc_labels, columns=lsvc_labels)
lsvc_CM_DF.index.name = 'Actual'
lsvc_CM_DF.columns.name = 'Predicted'

In [None]:
# Annotated heatmap of the Linear SVC confusion matrix (count features).
plt.figure(figsize=(8, 6))
lsvc_heat_ax = sns.heatmap(lsvc_CM_DF, annot=True, fmt='g', cmap='Blues')
lsvc_heat_ax.set_title('Linear SVC predictions Confusion Matrix\n', fontsize=20)
lsvc_heat_ax.set_xlabel('\nPredicted', fontsize=15)
lsvc_heat_ax.set_ylabel('Actual', fontsize=15)
plt.xticks(ha='center', fontsize=12)
plt.yticks(fontsize=12, rotation=0);

## Tfidf Vectorizer

In [None]:
# TF-IDF weights over unigrams and bigrams with English stop words removed,
# fitted on the training sentences only.
tf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_vectorized = tf_vect.fit_transform(X_train)

In [None]:
# Shape of the TF-IDF training matrix.
X_train_vectorized.shape

In [None]:
# Multinomial naive Bayes on the TF-IDF features; the cell displays test accuracy.
MNB = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)
X_test_tfidf = tf_vect.transform(X_test)
predictions = MNB.predict(X_test_tfidf)
accuracy_score(y_test, predictions)

In [None]:
# Class-weighted logistic regression (SAG solver) on the TF-IDF features.
LR = LogisticRegression(
    solver='sag',
    C=100,
    class_weight='balanced',
    random_state=18,
)
LR.fit(X_train_vectorized, y_train)
X_test_tfidf = tf_vect.transform(X_test)
predictions = LR.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, predictions)
print('Accuracy score: {}'.format(lr_accuracy))
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix of the current `predictions` as a labelled DataFrame.
# One explicit label list drives both the matrix computation and the axis
# labels, so they cannot disagree in size or order.
LR_labels = np.union1d(y_test, predictions)
LR_CM = confusion_matrix(y_test, predictions, labels=LR_labels)
LR_CM_DF = pd.DataFrame(LR_CM, index=LR_labels, columns=LR_labels)
LR_CM_DF.index.name = 'Actual'
LR_CM_DF.columns.name = 'Predicted'

In [None]:
# Annotated heatmap of the logistic-regression confusion matrix (TF-IDF features).
plt.figure(figsize=(8, 6))
lr_heat_ax = sns.heatmap(LR_CM_DF, annot=True, fmt='g', cmap='YlOrBr')
lr_heat_ax.set_title('Logistic Regression predictions Confusion Matrix\n', fontsize=20)
lr_heat_ax.set_xlabel('\nPredicted', fontsize=15)
lr_heat_ax.set_ylabel('Actual', fontsize=15)
plt.xticks(ha='center', fontsize=12)
plt.yticks(fontsize=12, rotation=0);

## Hashing Vectorizer

In [None]:
# Hashed unigram/bigram features with English stop words removed.
hash_vect = HashingVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_vectorized = hash_vect.fit_transform(X_train)

In [None]:
# Shape of the hashed training matrix.
# NOTE(review): the column count is presumably 1048576, the value hard-coded as
# the network's `input_dim` further down — confirm from the cell output.
X_train_vectorized.shape

In [None]:
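# MultinomialNB is left disabled for the hashed features.
# NOTE(review): presumably because HashingVectorizer emits signed values by
# default (alternate_sign=True) and MultinomialNB rejects negative inputs —
# confirm, or re-enable with HashingVectorizer(alternate_sign=False).
# MNB = MultinomialNB(alpha=0.1)
# MNB.fit(X_train_vectorized, y_train)
# predictions = MNB.predict(hash_vect.transform(X_test))
# accuracy_score(y_test, predictions)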

In [None]:
# Class-weighted logistic regression (SAG solver) on the hashed features.
LR = LogisticRegression(
    solver='sag',
    C=100,
    class_weight='balanced',
    random_state=18,
)
LR.fit(X_train_vectorized, y_train)
X_test_hashed = hash_vect.transform(X_test)
predictions = LR.predict(X_test_hashed)
lr_accuracy = accuracy_score(y_test, predictions)
print('Accuracy score: {}'.format(lr_accuracy))
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix of the current `predictions` as a labelled DataFrame.
# The labels are taken from the truth and the predictions together and handed
# to `confusion_matrix` explicitly, keeping matrix and axis labels aligned.
LR_labels = np.union1d(y_test, predictions)
LR_CM = confusion_matrix(y_test, predictions, labels=LR_labels)
LR_CM_DF = pd.DataFrame(LR_CM, index=LR_labels, columns=LR_labels)
LR_CM_DF.index.name = 'Actual'
LR_CM_DF.columns.name = 'Predicted'

In [None]:
# Annotated heatmap of the logistic-regression confusion matrix (hashed features).
plt.figure(figsize=(8, 6))
lr_heat_ax = sns.heatmap(LR_CM_DF, annot=True, fmt='g', cmap='Greens')
lr_heat_ax.set_title('Logistic Regression predictions Confusion Matrix\n', fontsize=20)
lr_heat_ax.set_xlabel('\nPredicted', fontsize=15)
lr_heat_ax.set_ylabel('Actual', fontsize=15)
plt.xticks(ha='center', fontsize=12)
plt.yticks(fontsize=12, rotation=0);

In [None]:
# Seeded Linear SVC on the hashed features, followed by a per-class report.
lsvc = LinearSVC(random_state=18, C=10)
lsvc.fit(X_train_vectorized, y_train)
X_test_hashed = hash_vect.transform(X_test)
predictions = lsvc.predict(X_test_hashed)
lsvc_accuracy = accuracy_score(y_test, predictions)
print('Accuracy score: {}'.format(lsvc_accuracy))
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix of the current `predictions` as a labelled DataFrame.
# Passing the label list explicitly guarantees the matrix rows/columns line up
# with the index/column labels assigned below.
lsvc_labels = np.union1d(y_test, predictions)
lsvc_CM = confusion_matrix(y_test, predictions, labels=lsvc_labels)
lsvc_CM_DF = pd.DataFrame(lsvc_CM, index=lsvc_labels, columns=lsvc_labels)
lsvc_CM_DF.index.name = 'Actual'
lsvc_CM_DF.columns.name = 'Predicted'

In [None]:
# Annotated heatmap of the Linear SVC confusion matrix (hashed features).
plt.figure(figsize=(8, 6))
lsvc_heat_ax = sns.heatmap(lsvc_CM_DF, annot=True, fmt='g', cmap='Greys')
lsvc_heat_ax.set_title('Linear SVC predictions Confusion Matrix\n', fontsize=20)
lsvc_heat_ax.set_xlabel('\nPredicted', fontsize=15)
lsvc_heat_ax.set_ylabel('Actual', fontsize=15)
plt.xticks(ha='center', fontsize=12)
plt.yticks(fontsize=12, rotation=0);

In [None]:
# Small feed-forward classifier over the hashed features: three ReLU hidden
# layers followed by a softmax over the emotion classes.
# Input and output widths are read from the data instead of being hard-coded,
# so the network still matches if the vectoriser or the label set changes.
n_input_features = X_train_vectorized.shape[1]
n_emotion_classes = y_train.nunique()

model = Sequential()
model.add(Dense(8, input_dim=n_input_features, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(n_emotion_classes, activation='softmax'))

# Softmax output with one-hot targets is a single-label multi-class problem,
# so the loss is categorical cross-entropy. With binary cross-entropy each
# output unit is scored independently and the 'accuracy' metric is resolved to
# binary accuracy, which overstates performance.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 10 epochs on the vectorised training data with one-hot labels.
y_train_onehot = pd.get_dummies(y_train)
model.fit(X_train_vectorized, y_train_onehot, batch_size=400, epochs=10, verbose=1)

In [None]:
# Training loss per epoch, read from the history recorded on the model.
fit_history = model.history
plt.plot(fit_history.epoch, fit_history.history['loss'])
plt.xlabel('epochs', fontsize=14)
plt.ylabel('loss', fontsize=14);

In [None]:
# Training accuracy per epoch, read from the history recorded on the model.
fit_history = model.history
plt.plot(fit_history.epoch, fit_history.history['accuracy'])
plt.xlabel('epochs', fontsize=14)
plt.ylabel('Accuracy', fontsize=14);

In [None]:
# Evaluate on the held-out split.
# The one-hot test labels are re-indexed to the training label columns so the
# target layout (column count and order) matches what the network was fitted
# on, even if some emotion is absent from the test split; the cast gives the
# targets one uniform numeric dtype after any filled-in column.
train_label_columns = pd.get_dummies(y_train).columns
y_test_onehot = (
    pd.get_dummies(y_test)
    .reindex(columns=train_label_columns, fill_value=0)
    .astype('float32')
)
model.evaluate(hash_vect.transform(X_test), y_test_onehot)