In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 


In [2]:
# Load the training data: ';'-separated "text;emotion" records with no header row.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [3]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [5]:
# Encode each emotion label as an integer, numbered in order of first appearance.
# The guard keeps this cell safe to re-run: once the column is numeric, running
# the mapping again would rebuild `emotion_numbers` from the integer codes and
# lose the original label names.
if not pd.api.types.is_numeric_dtype(df['emotion']):
    unique_emotions = df['emotion'].unique()
    emotion_numbers = {emo: code for code, emo in enumerate(unique_emotions)}
    df['emotion'] = df['emotion'].map(emotion_numbers)

In [6]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [7]:
# Normalise case with the vectorised string accessor; unlike a per-row lambda it
# passes missing values through instead of raising AttributeError.
df['text'] = df['text'].str.lower()

In [8]:
import string
# Translation table that deletes every character in string.punctuation; built
# once here instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with all characters in ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is covered;
    every other character is left untouched.
    """
    return txt.translate(_PUNCT_TABLE)

In [9]:
# Strip punctuation from every row of the text column.
df['text'] = df['text'].map(remove_punc)

In [10]:
df['text']

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
15995    i just had a very brief time in the beanbag an...
15996    i am now turning and i feel pathetic that i am...
15997                       i feel strong and good overall
15998    i feel like this was such a rude comment and i...
15999    i know a lot but i feel so stupid because i ca...
Name: text, Length: 16000, dtype: object

In [11]:
def remove_numbers(txt):
    """Return ``txt`` with every digit character removed.

    Digits are detected with ``str.isdigit``, so non-ASCII digit characters
    are dropped as well as ``0``-``9``.
    """
    # Join once instead of concatenating character by character, which
    # re-copies the growing string on each step.
    return ''.join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every row of the text column.
df['text'] = df['text'].map(remove_numbers)
     

In [12]:
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Despite the name, this is broader than emoji stripping: accented letters,
    typographic quotes and any other non-ASCII characters are dropped too.
    """
    # Join once instead of growing a string one character at a time.
    return ''.join(ch for ch in txt if ch.isascii())
# Drop non-ASCII characters from every row of the text column.
df['text'] = df['text'].map(remove_emojis)

In [13]:
import nltk 

In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [18]:
# Fetch NLTK resources: the punkt tokenizer data and the stopwords corpus.
# NOTE(review): word_tokenize is imported but never called in the visible cells,
# so the punkt download may be unnecessary — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sumit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sumit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
# English stop words held in a set so the per-word membership test is cheap.
stop_words = set(stopwords.words('english'))

In [20]:
# Single label-based lookup (row 1, column 'text') instead of chained indexing,
# which first materialises the whole row as an intermediate Series.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [22]:
def remove(txt, stop_set=None):
    """Return ``txt`` with stop words dropped and whitespace collapsed.

    Args:
        txt: Text to filter; it is split on whitespace.
        stop_set: Optional collection of words to drop. When omitted, the
            notebook-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_set is None:
        stop_set = stop_words
    return ' '.join(word for word in txt.split() if word not in stop_set)

In [23]:
# Filter stop words out of every row of the text column.
df['text'] = df['text'].map(remove)

In [25]:
# Single label-based lookup (row 1, column 'text') instead of chained indexing,
# which first materialises the whole row as an intermediate Series.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [26]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)
     

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score 


# Bag-of-words features: the vectorizer is fitted on the training split only and
# then reused to encode the test split.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Multinomial Naive Bayes on the count features; `fit` returns the estimator.
nb_model = MultinomialNB().fit(X_train_bow, y_train)

# Accuracy on the held-out split.
pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, pred_bow))

0.768125


In [39]:
pred_bow

array([0, 5, 0, ..., 5, 5, 0], dtype=int64)

In [46]:


# TF-IDF features: fitted on the training split, then applied to the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# Multinomial Naive Bayes trained on the TF-IDF features.
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)

In [47]:
# Predict on the TF-IDF test features and print the accuracy against y_test.
y_pred = nb2_model.predict(X_test_tfidf)
print(accuracy_score(y_test,y_pred))

0.6609375


In [48]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the solver's iteration limit set to 1000.
logistic_model = LogisticRegression(max_iter=1000)

In [49]:
# Train on the TF-IDF training features.
logistic_model.fit(X_train_tfidf, y_train)

In [50]:
# Predict labels for the TF-IDF test features.
log_pred = logistic_model.predict(X_test_tfidf)

In [51]:
# Accuracy of the logistic-regression predictions against y_test.
print(accuracy_score(y_test,log_pred))

0.8628125
