In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the semicolon-separated file; header=None treats the first line as data,
# so the two column names are supplied explicitly.
df = pd.read_csv(
    '/content/train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [3]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
df.shape

(16000, 2)

In [5]:
df.isnull().sum()

Unnamed: 0,0
text,0
emotion,0


In [6]:
# Encode each emotion label as an integer id, numbered in order of first appearance.
# The comprehension keeps the counter and loop variable out of the notebook namespace.
unique_emotions = df['emotion'].unique()
emotion_numbers = {emo: idx for idx, emo in enumerate(unique_emotions)}

df['emotion'] = df['emotion'].map(emotion_numbers)

In [7]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [8]:
# Lowercase with the vectorized string accessor; unlike the per-row lambda,
# missing values pass through instead of raising.
df['text'] = df['text'].str.lower()

In [9]:
import string

def remove_punc(txt):
  """Return `txt` with every character listed in `string.punctuation` removed."""
  punctuation_chars = set(string.punctuation)
  return "".join(ch for ch in txt if ch not in punctuation_chars)

In [10]:
# Strip punctuation from every row of the text column.
df['text'] = df['text'].apply(remove_punc)

In [11]:
def remove_numbers(txt):
  """Return `txt` with every digit character removed.

  A character counts as a digit when `str.isdigit()` is true for it; all other
  characters are kept in their original order.
  """
  # Join once instead of growing the result with repeated string concatenation.
  return "".join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every row of the text column.
df['text'] = df['text'].apply(remove_numbers)

In [12]:
def remove_emojis(txt):
    """Return `txt` with every non-ASCII character removed.

    The filter is `str.isascii()`, so emojis are dropped along with any other
    non-ASCII character (accented letters, typographic quotes, ...).
    """
    # Join once instead of growing the result with repeated string concatenation.
    return "".join(ch for ch in txt if ch.isascii())

# Keep only ASCII characters in every row (this is how emojis get removed).
df['text'] = df['text'].apply(remove_emojis)

In [13]:
import nltk

In [14]:
from nltk.corpus import stopwords
from nltk import word_tokenize

In [15]:
# Fetch the NLTK data packages: the stopword corpus and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
# Build the English stopword lookup as a set, then display how many entries it holds.
stop_words = set(stopwords.words('english'))
len(stop_words)

198

In [17]:
# Peek at one sample before stopword removal, using a single .loc lookup
# rather than chained indexing.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [18]:
def remove(txt):
  """Return `txt` with every whitespace-separated token found in `stop_words` dropped.

  Tokens come from `str.split()`; the surviving tokens are re-joined with single spaces.
  """
  kept = [token for token in txt.split() if token not in stop_words]
  return " ".join(kept)

In [19]:
# Remove stopwords from every row of the text column.
df['text'] = df['text'].apply(remove)

In [20]:
# Same sample after stopword removal, using a single .loc lookup
# rather than chained indexing.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [21]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.2,
    random_state=42,
)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [24]:
# Bag-of-words features: fit the vectorizer on the training text only,
# then encode the test text with that same fitted vectorizer.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

## Using Naive Bayes

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [26]:
# Fit a multinomial Naive Bayes classifier (default settings) on the bag-of-words features.
nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)

In [27]:
# Predict labels for the test split and print the bag-of-words model's accuracy.
pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, pred_bow))

0.768125


In [28]:
pred_bow

array([0, 5, 0, ..., 5, 5, 0])

In [29]:
# TF-IDF features: fit the vectorizer on the training text only,
# then encode the test text with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# A second multinomial Naive Bayes model, trained on the TF-IDF features.
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf, y_train)

In [30]:
# Predict test-split labels with the TF-IDF Naive Bayes model.
y_pred = nb2_model.predict(X_test_tfidf)

In [31]:
# Accuracy of the TF-IDF Naive Bayes model on the test split.
print(accuracy_score(y_test, y_pred))

0.6609375


## Using Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression

In [33]:
# Logistic regression trained on the TF-IDF features, with the iteration cap set to 1000.
# NOTE(review): presumably the cap is there so the solver converges — confirm.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_tfidf, y_train)

In [34]:
# Predict test-split labels with the logistic regression model.
log_pred = logistic_model.predict(X_test_tfidf)

In [35]:
# Accuracy of the logistic regression model on the test split.
print(accuracy_score(y_test,log_pred))

0.8628125
