In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

# Loading the data

In [None]:
# Load training.csv from the emotion-dataset Kaggle input into a DataFrame.
train = pd.read_csv("/kaggle/input/emotion-dataset/training.csv")

In [None]:
# Preview the first rows to see which columns the file provides.
train.head()

In [None]:
# Emotion name for each integer class id stored in the `label` column.
labels_dict = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise',
}
# Attach the emotion name next to the numeric label, then preview the result.
train['description'] = train['label'].map(labels_dict)
train.head()

In [None]:
# (rows, columns) of the frame after adding the `description` column.
train.shape

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# Summary statistics of the frame's columns (pandas describes numeric columns by default).
train.describe()

# Data Pre-processing

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Column names currently present in the frame.
train.columns

In [None]:
# Preview the first rows again before looking at the class distribution.
train.head()

In [None]:
# Share of rows per numeric label (normalize=True returns fractions instead of counts).
train['label'].value_counts(normalize = True) 

In [None]:
# Bar chart of how many rows carry each numeric label; the counts are computed once
# and reused for both axes.
label_counts = train['label'].value_counts()
plt.figure(figsize=(10, 10))
sns.barplot(x=label_counts.index, y=label_counts.values)
plt.show()

In [None]:
# Numeric label -> emotion name; holds the same pairs as `labels_dict` defined earlier.
# NOTE(review): no other cell visible in this notebook reads `emotion_label` - confirm it is still needed.
emotion_label={0:'sadness',1:'joy',2:'love',3:'anger',4:'fear',5:'surprise'}

In [None]:
# Pie chart of the class distribution. Wedge sizes and wedge labels are both taken from
# one value_counts() result, so each wedge is always paired with its own emotion name
# rather than relying on two separate counts happening to share the same order.
emotion_counts = train['description'].value_counts()
plt.pie(emotion_counts, labels=emotion_counts.index, autopct='%.0f%%')
plt.show()

# Word Cloud

Joy texts

In [None]:
from wordcloud import WordCloud
# Word cloud built from every text labelled 'joy'.
joy = train.loc[train['description'] == 'joy', 'text']
joy_string = ' '.join(joy)

# Render the cloud first, then draw it on a square figure with the axes hidden.
wc = WordCloud(
    collocations=False,
    max_words=1000,
    height=600,
    background_color='white',
).generate(joy_string)

plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
 from wordcloud import WordCloud
# Word cloud built from every text labelled 'sadness'.
sadness = train[train.description == 'sadness']['text']
sadness_string = ' '.join(sadness)
plt.figure(figsize=(10,10))
# The cloud must be generated from this cell's own string (sadness_string); feeding it
# joy_string would simply redraw the joy cloud.
wc = WordCloud(collocations = False,max_words=1000,height=600,background_color = 'white').generate(sadness_string)
plt.imshow(wc,interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud
# Word cloud built from every text labelled 'surprise'.
surprise = train[train.description == 'surprise']['text']
surprise_string = ' '.join(surprise)
plt.figure(figsize=(10,10))
# The cloud must be generated from this cell's own string (surprise_string); feeding it
# joy_string would simply redraw the joy cloud.
wc = WordCloud(collocations = False,max_words=1000,height=600,background_color = 'white').generate(surprise_string)
plt.imshow(wc,interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# One sub-frame per numeric class label, 0 through 5, unpacked into train0 ... train5.
train0, train1, train2, train3, train4, train5 = (
    train[train['label'] == class_id] for class_id in range(6)
)

In [None]:
# Shapes of the six per-class frames before they are cut down.
train0.shape , train1.shape , train2.shape, train3.shape, train4.shape ,train5.shape

# Reduction of Data

In [None]:
# Keep only the first half (rounded down) of the rows of every per-class frame.
# Each name is rebound to its own truncated frame, so running this cell a second time
# halves the data again.
train0, train1, train2, train3, train4, train5 = (
    part.iloc[: len(part) // 2]
    for part in (train0, train1, train2, train3, train4, train5)
)

In [None]:
# Shapes of the six per-class frames after halving.
train0.shape, train1.shape, train2.shape, train3.shape, train4.shape, train5.shape

In [None]:
# Shape of the full frame; `train` itself has not been reduced yet at this point.
train.shape

In [None]:
# Stack the six halved per-class frames row-wise and rebind `train` to the reduced data.
halved_parts = [train0, train1, train2, train3, train4, train5]
train = pd.concat(halved_parts, axis=0)

In [None]:
# Shape of the reduced frame built from the halved per-class parts.
train.shape

# Removing hashtags

In [None]:
# Strip hashtags ('#' followed by word characters). The cleaned Series is assigned back
# to the column: calling an in-place method on a column selection is chained assignment,
# which pandas does not guarantee will write through to `train` (and which is a no-op
# under Copy-on-Write).
train['text'] = train['text'].replace({r"#(\w+)": ''}, regex=True)

# Removing mentions

In [None]:
# Strip mentions ('@' followed by word characters). The cleaned Series is assigned back
# to the column: calling an in-place method on a column selection is chained assignment,
# which pandas does not guarantee will write through to `train` (and which is a no-op
# under Copy-on-Write).
train['text'] = train['text'].replace({r"@(\w+)": ''}, regex=True)

# Removing URLs

In [None]:
# Strip URLs ('http' followed by any run of non-whitespace characters).
# astype(str) returns a new Series, so an in-place replace on it would only modify that
# temporary and leave `train` untouched; the result has to be assigned back to the column.
train['text'] = train['text'].astype(str).replace({r"http\S+": ''}, regex=True)

# Lowercase text

In [None]:
# Lower-case every text so later token comparisons are case-insensitive.
train['text']=train['text'].str.lower()

# Stopwords Removal

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop-word list (kept as a list under its original name).
stop = stopwords.words('english')
# Set copy of the same words: membership is tested once per token, and a set lookup is
# constant-time whereas a list lookup scans the whole list.
stop_lookup = set(stop)

# Rebuild each text from its whitespace-separated tokens, dropping the stop words.
train['text'] = train['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

# Randomization

In [None]:
# Shuffle all rows (frac=1 samples the whole frame without replacement). The fixed seed
# makes the row order - and everything derived from it - identical on every run; 42 is
# the same seed the train/test splits in this notebook use.
train = train.sample(frac=1, random_state=42)

# EDA for final dataset

In [None]:
# First rows of the cleaned, shuffled frame.
train.head()

In [None]:
# (rows, columns) of the cleaned, shuffled frame.
train.shape

In [None]:
# Summary statistics of the cleaned frame (pandas describes numeric columns by default).
train.describe()

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
train.info()

In [None]:
# Bar chart of rows per numeric label in the reduced frame; the counts are computed once
# and reused for both axes.
label_counts = train['label'].value_counts()
plt.figure(figsize=(10, 10))
sns.barplot(x=label_counts.index, y=label_counts.values)
plt.show()

In [None]:
# Pie chart of the class distribution in the reduced frame. Wedge sizes and wedge labels
# are both taken from one value_counts() result, so each wedge is always paired with its
# own emotion name rather than relying on two separate counts sharing the same order.
emotion_counts = train['description'].value_counts()
plt.pie(emotion_counts, labels=emotion_counts.index, autopct='%.0f%%')
plt.show()

In [None]:
# Second name for the cleaned text frame (an alias, not a copy). `train` is rebound to the
# TF-IDF feature matrix in the vectorization cell, while the bag-of-words cell still
# reads the `label` and `text` columns through `train_df`.
train_df = train

# Vectorization

# Text Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Target labels and cleaned text of the shuffled frame.
y = train['label']
x = train['text']

# Creating a word corpus for vectorization: the documents as a plain list, in row order.
corpus = x.tolist()

# TF-IDF features, with the vocabulary capped at the 1000 most frequent terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split, so the
# vocabulary and IDF weights also see the rows later used for testing.
vectorizer1 = TfidfVectorizer(max_features=1000)
X1 = vectorizer1.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() and
# fall back to the old method only on releases that predate it.
if hasattr(vectorizer1, 'get_feature_names_out'):
    feature_names1 = list(vectorizer1.get_feature_names_out())
else:
    feature_names1 = vectorizer1.get_feature_names()

# Dense 2-D array of the TF-IDF weights (one row per document, one column per term).
denselist1 = X1.toarray()

# `train` is rebound here from the text frame to the feature matrix; the new frame has a
# fresh 0..n-1 index and no `label`/`text` columns, so this cell cannot be re-run
# without re-running the preprocessing cells first.
train = pd.DataFrame(denselist1, columns=feature_names1)

# Applying the Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import string
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Hold out 30% of the TF-IDF rows for testing; the fixed seed keeps the split repeatable.
X_temp, X_test, y_temp, y_test = train_test_split(train, y, test_size=0.3, random_state=42)
# For creating a table of the accuracies in the end
accuracy = {'TF-IDF':[]}

# Linear Regression
# A regressor predicts real numbers and its .score() is R^2, which is not comparable with
# the classifiers' accuracy. Round each prediction to the nearest class id, clamp it to
# the range of labels seen in training, and record the fraction of exact matches so that
# every entry in the table really is an accuracy.
linearr = LinearRegression()
linearr.fit(X_temp,y_temp)
predict = linearr.predict(X_test)
predicted_labels = np.clip(np.rint(predict), y_temp.min(), y_temp.max()).astype(int)
a = float(np.mean(predicted_labels == y_test.to_numpy()))
linearscore = a
accuracy['TF-IDF'].append(a)

# Logistic Regression
regressor_LR_tf = LogisticRegression(C=1.0,penalty='l2',solver='newton-cg')
regressor_LR_tf.fit(X_temp, y_temp)
y_predict_LR_tf = regressor_LR_tf.predict(X_test)
a=(regressor_LR_tf.score(X_test, y_test))
lrscore=a
accuracy['TF-IDF'].append(a)

# Decision Tree (seeded so the fitted tree, and its score, is the same on every run)
model_DT_tf = DecisionTreeClassifier(criterion = 'gini', max_depth=2, random_state=42)
model_DT_tf.fit(X_temp, y_temp)
y_predict_DT_tf = model_DT_tf.predict(X_test)
a=(model_DT_tf.score(X_test,y_test))
accuracyscoretree=a
accuracy['TF-IDF'].append(a)

# Random Forest (seeded: bootstrap samples and feature subsets are otherwise random)
model_RF_tf = RandomForestClassifier(n_estimators= 100, max_features = 'log2', random_state=42)
model_RF_tf.fit(X_temp, y_temp)
y_predict_RF_tf = model_RF_tf.predict(X_test)
a=(model_RF_tf.score(X_test,y_test))
accuracyscoreforest=a
accuracy['TF-IDF'].append(a)

# K-Neighbors Classifier
model_KN_tf = KNeighborsClassifier(metric= 'manhattan', n_neighbors= 5, weights= 'distance')
model_KN_tf.fit(X_temp, y_temp)
y_predict_KN_tf = model_KN_tf.predict(X_test)
a=(model_KN_tf.score(X_test,y_test))
knnaccuracy=a
accuracy['TF-IDF'].append(a)

# Evaluation: one row per model, in the order the scores were appended above.
model = ['LiR','LR','DT','RF','KNN']
data = {'model':model,'accuracy':accuracy['TF-IDF']}
compare_models = pd.DataFrame(data)
compare_models

In [None]:
# Bar chart of the five scores recorded by the TF-IDF model-comparison cell.
acc = [linearscore, lrscore, accuracyscoretree, accuracyscoreforest, knnaccuracy]
names = [
    'Linear Regression',
    'Logistic Regression',
    'Decision Tree Classification',
    'Random Forest Classifier',
    'KNN',
]

plt.figure(figsize=(10, 8))
plt.bar(names, acc)
# The model names are long, so turn the tick labels vertical to keep them readable.
plt.xticks(rotation=90)
plt.ylabel('Accuracy')

# Bag of Words

In [None]:
# Labels and cleaned text, read through `train_df` because `train` now holds TF-IDF features.
y1 = train_df['label']
x1 = train_df['text']
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Unigram counts with scikit-learn's built-in English stop-word list.
CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
#transform
Count_data = CountVec.fit_transform(x1)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() and
# fall back to the old method only on releases that predate it.
if hasattr(CountVec, 'get_feature_names_out'):
    bow_feature_names = list(CountVec.get_feature_names_out())
else:
    bow_feature_names = CountVec.get_feature_names()

#create dataframe
# `train_df` is rebound from the text frame to the count matrix, so this cell cannot be
# re-run without re-running the earlier cells first.
train_df=pd.DataFrame(Count_data.toarray(),columns=bow_feature_names)
# Show only the first rows, using the notebook's rich table display.
train_df.head()

In [None]:
# Hold out 30% of the bag-of-words rows for testing. The target is `y1`, the label Series
# taken from the same frame the counts were built from, rather than the TF-IDF section's `y`.
x_temp1, x_test1, Y_temp1, Y_test1 = train_test_split(train_df, y1, test_size=0.3, random_state=42)
# For creating a table of the accuracies in the end
# (this rebinds `accuracy`, so the TF-IDF scores are no longer reachable through it)
accuracy = {'Bag of Words':[]}

# NOTE: the model variables below keep the `_tf` suffix of the TF-IDF cell and therefore
# replace those fitted TF-IDF models with their bag-of-words counterparts.

# Linear Regression
# A regressor predicts real numbers and its .score() is R^2, which is not comparable with
# the classifiers' accuracy. Round each prediction to the nearest class id, clamp it to
# the range of labels seen in training, and record the fraction of exact matches so that
# every entry in the table really is an accuracy.
linearr = LinearRegression()
linearr.fit(x_temp1,Y_temp1)
predict = linearr.predict(x_test1)
predicted_labels = np.clip(np.rint(predict), Y_temp1.min(), Y_temp1.max()).astype(int)
a = float(np.mean(predicted_labels == Y_test1.to_numpy()))
linearscore = a
accuracy['Bag of Words'].append(a)

# Logistic Regression
regressor_LR_tf = LogisticRegression(C=1.0,penalty='l2',solver='newton-cg')
regressor_LR_tf.fit(x_temp1, Y_temp1)
y_predict_LR_tf = regressor_LR_tf.predict(x_test1)
a=(regressor_LR_tf.score(x_test1, Y_test1))
lrscore=a
accuracy['Bag of Words'].append(a)

# Decision Tree (seeded so the fitted tree, and its score, is the same on every run)
model_DT_tf = DecisionTreeClassifier(criterion = 'gini', max_depth=2, random_state=42)
model_DT_tf.fit(x_temp1, Y_temp1)
y_predict_DT_tf = model_DT_tf.predict(x_test1)
a=(model_DT_tf.score(x_test1,Y_test1))
accuracyscoretree=a
accuracy['Bag of Words'].append(a)

# Random Forest (seeded: bootstrap samples and feature subsets are otherwise random)
model_RF_tf = RandomForestClassifier(n_estimators= 100, max_features = 'log2', random_state=42)
model_RF_tf.fit(x_temp1, Y_temp1)
y_predict_RF_tf = model_RF_tf.predict(x_test1)
a=(model_RF_tf.score(x_test1,Y_test1))
accuracyscoreforest=a
accuracy['Bag of Words'].append(a)

# K-Neighbors Classifier
model_KN_tf = KNeighborsClassifier(metric= 'manhattan', n_neighbors= 5, weights= 'distance')
model_KN_tf.fit(x_temp1, Y_temp1)
y_predict_KN_tf = model_KN_tf.predict(x_test1)
a=(model_KN_tf.score(x_test1,Y_test1))
knnaccuracy=a
accuracy['Bag of Words'].append(a)

# Evaluation: one row per model, in the order the scores were appended above.
model = ['LiR','LR','DT','RF','KNN']
data = {'model':model,'accuracy':accuracy['Bag of Words']}
compare_models = pd.DataFrame(data)
compare_models

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
# Unconfigured k-NN classifier; it is passed to GridSearchCV as the `estimator` whose
# parameters are searched.
knn=KNeighborsClassifier()

In [None]:
# Search space for the k-NN grid search: every combination of these values is tried.
grid={
    'n_neighbors':[3,5,7],
    'weights':['uniform','distance'],
    'algorithm':['auto','ball_tree','kd_tree'],
    # Metric names must match scikit-learn's spelling exactly; a misspelt name such as
    # 'euclidian' is rejected when the estimator is fitted, so every candidate using it fails.
    'metric':['manhattan','euclidean','minkowski']
}

In [None]:
# Fit the k-NN grid search on the TF-IDF training split (X_temp / y_temp).
grid_search = GridSearchCV(estimator=knn, param_grid=grid)
grid_search.fit(X_temp, y_temp)

# Tabulate the per-candidate cross-validation results; transposed so that each parameter
# combination is a column and each metric a row.
gr = pd.DataFrame(grid_search.cv_results_)
gr.T