In [1]:
import pandas as pd
import numpy as np
import neattext.functions as nfx

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Read the CSV dataset from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="emojiDataset.csv")

In [3]:
# Class distribution: number of rows per label in the 'Emotion' column
data.loc[:, 'Emotion'].value_counts()

joy         11045
sadness      6722
fear         5410
anger        4297
surprise     4062
neutral      2254
disgust       856
shame         146
Name: Emotion, dtype: int64

In [4]:
# Run every raw 'Text' entry through nfx.remove_userhandles and
# store the result in a new 'Clean_Text' column
data['Clean_Text'] = data['Text'].map(nfx.remove_userhandles)

In [5]:
# Run the handle-free text through nfx.remove_stopwords,
# overwriting 'Clean_Text' with the result
data['Clean_Text'] = data['Clean_Text'].map(nfx.remove_stopwords)

In [6]:
# Model inputs (cleaned text) and targets (emotion label), taken from the same frame
Xfeatures, ylabels = data['Clean_Text'], data['Emotion']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the label counts shown earlier are heavily imbalanced (joy 11045 vs
# shame 146) -- consider stratify=ylabels; not applied here because it would change
# the recorded results.
x_train, x_test, y_train, y_test = train_test_split(
    Xfeatures,
    ylabels,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Import Pipeline (used below to chain the vectorizer and the classifier)
from sklearn.pipeline import Pipeline

In [9]:
# Two-stage text classifier: token counts ('cv') feeding logistic regression ('lr').
# max_iter is set explicitly to 1000 for the lbfgs solver.
pipe_lr = Pipeline(
    steps=[
        ('cv', CountVectorizer()),
        ('lr', LogisticRegression(solver='lbfgs', max_iter=1000)),
    ]
)

In [10]:
# Fit the vectorizer and the classifier on the training split
# (fit returns the pipeline itself, so the cell displays its repr)
pipe_lr.fit(X=x_train, y=y_train)

Pipeline(steps=[('cv', CountVectorizer()),
                ('lr', LogisticRegression(max_iter=1000))])

In [11]:
# Accuracy of the fitted pipeline on the held-out test split
pipe_lr.score(X=x_test, y=y_test)

0.6309814628538584

In [12]:
# Make A Prediction
# Sample sentence used to sanity-check the trained pipeline.
# NOTE(review): unlike the training text, this sample is not passed through
# nfx.remove_userhandles / nfx.remove_stopwords before prediction -- confirm
# that skipping the cleaning step at inference time is intended.
ex1 = "This book was so interesting it made me happy"

In [13]:
# The pipeline takes a collection of documents, so the single sample is wrapped in a list
pipe_lr.predict(X=[ex1])

array(['joy'], dtype=object)

In [14]:
# Save Model & Pipeline
import joblib

# Serialize the fitted pipeline (vectorizer + classifier) to disk.
# The with-block guarantees the file handle is closed even if joblib.dump raises,
# which the previous manual open()/close() pair did not.
with open("emojiClassifier.pkl", "wb") as pipeline_file:
    joblib.dump(pipe_lr, pipeline_file)