In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


In [3]:
# Load the semicolon-delimited training file. It has no header row, so the
# column names are supplied here.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [4]:
# Preview the first rows to confirm the two columns parsed as expected.
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
# Count missing values per column.
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [6]:
# Distinct emotion labels found in the `emotion` column.
unique_emot = df['emotion'].unique()

In [47]:
# Display the distinct emotion labels.
unique_emot

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [7]:
# Encode each emotion label as an integer id, numbered in the order the labels
# appear in `unique_emot`.
emotion_numbers = {emo: idx for idx, emo in enumerate(unique_emot)}

# Only encode while the column still holds values that are keys of the mapping.
# Re-running this cell after the column was already encoded would otherwise map
# every value to NaN, because the integer ids are not keys of `emotion_numbers`.
if set(df['emotion'].unique()).issubset(emotion_numbers):
    df['emotion'] = df['emotion'].map(emotion_numbers)


In [8]:
# Inspect the frame after label encoding; `emotion` now holds integer ids.
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [9]:
# Normalise case with pandas' vectorised string accessor instead of a per-row
# Python lambda.
df['text'] = df['text'].str.lower()

In [10]:
import string
def remove_punc(txt):
    """Return `txt` with every character listed in `string.punctuation` removed."""
    # Mapping a code point to None tells str.translate to delete that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return txt.translate(deletions)

In [11]:
# Strip punctuation from every row of the text column.
df['text'] = df['text'].map(remove_punc)


In [12]:
def remove_num(txt):
    """Return `txt` without any character for which `str.isdigit()` is true."""
    non_digits = filter(lambda ch: not ch.isdigit(), txt)
    return ''.join(non_digits)

# Drop digit characters from every row of the text column.
df['text'] = df['text'].map(remove_num)


In [13]:
def remove_emojis(txt):
    """Return `txt` with every non-ASCII character dropped.

    Despite the name, this is not emoji-specific: accented letters and any
    other non-ASCII symbols are removed as well.
    """
    return txt.encode('ascii', errors='ignore').decode('ascii')

# Keep only ASCII characters in every row of the text column.
df['text'] = df['text'].map(remove_emojis)

In [14]:
from  nltk.tokenize import word_tokenize
import nltk

In [15]:
from  nltk.corpus import stopwords

In [16]:
import nltk

# Fetch the required NLTK resources into NLTK's default data directory rather
# than a hard-coded Windows path, so the notebook runs unchanged on any OS.
# NLTK already searches its default data locations, so no manual
# `nltk.data.path` registration is needed.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# English stop words from the NLTK corpus, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))

In [18]:
# Number of distinct stop words loaded.
len(stop_words)

198

In [19]:
# Sample row before stop-word removal (single label-based lookup).
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [20]:
def remove(txt):
    """Drop stop words from `txt`.

    Splits `txt` on whitespace, discards every token found in the module-level
    `stop_words` set, and joins the remaining tokens with single spaces.
    """
    return ' '.join(word for word in txt.split() if word not in stop_words)

In [21]:
# Remove stop words from every row of the text column.
df['text'] = df['text'].map(remove)

In [22]:
# Same sample row after stop-word removal (single label-based lookup).
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [23]:
import sklearn
# Record the scikit-learn version used for this run.
print(sklearn.__version__)


1.7.2


In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)

In [25]:
# Number of training rows.
X_train.shape

(12800,)

In [26]:
# Number of held-out test rows.
X_test.shape

(3200,)

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
# Count vectorizer restricted to bigrams: ngram_range=(2, 2) means two-word
# sequences only, no single words.
vectorizer_bow = CountVectorizer(ngram_range=(2,2))


In [29]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split.
X_train_bow= vectorizer_bow.fit_transform(X_train)
X_test_bow= vectorizer_bow.transform(X_test)


In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [31]:
# Multinomial Naive Bayes with default settings, trained on the bigram counts.
nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [32]:
# Evaluate the bigram Naive Bayes model on the held-out split.
pred_nb = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, pred_nb))


0.6596875


In [33]:
# Number of distinct bigram features learned from the training split.
print("Vocabulary size:", len(vectorizer_bow.vocabulary_))

Vocabulary size: 78267


In [34]:


print("Features:", vectorizer_bow.get_feature_names_out()[:20])
# Densify only the first few rows for display. Calling .toarray() on the whole
# sparse matrix would allocate a dense 12800 x 78267 array just to print a
# truncated view of it.
print("\nBoW Matrix:\n", X_train_bow[:5].toarray())

Features: ['aa full' 'aa meeting' 'aaaaand tis' 'aaaand stealing' 'aac feeling'
 'aahhh work' 'aaron friends' 'abandon project' 'abandon sake'
 'abandoned ask' 'abandoned believe' 'abandoned cant' 'abandoning way'
 'abandonment begun' 'abandonment embrace' 'abated lost'
 'abbigail apraxia' 'abc family' 'abc news' 'abc type']

BoW Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# TF-IDF vectorizer with default settings.
vectorizer_tf = TfidfVectorizer()

In [37]:

# Fit the TF-IDF vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split.
X_train_tf = vectorizer_tf.fit_transform(X_train)
X_test_tf = vectorizer_tf.transform(X_test)


In [38]:
import joblib
# Persist the fitted TF-IDF vectorizer. Note: despite the "tf_model" file name,
# this file holds the vectorizer, not a classifier.
joblib.dump(vectorizer_tf, "tf_model.pkl")

['tf_model.pkl']

In [39]:
# Fit `nb_model` on the TF-IDF features. This is the same estimator object that
# was earlier fit on the bag-of-words features, so later uses of `nb_model`
# refer to this fit. NOTE(review): consider a separate variable so the
# bag-of-words evaluation cell stays re-runnable.
nb_model.fit(X_train_tf, y_train)


0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [40]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the iteration cap set to 1000.
logistic_model = LogisticRegression(max_iter=1000)

In [41]:
# Train the logistic regression classifier on the TF-IDF features.
logistic_model.fit(X_train_tf, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [42]:
# Evaluate the logistic regression model on the held-out split.
pred_log = logistic_model.predict(X_test_tf)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that
# order, as the Naive Bayes evaluation cells do.
print(accuracy_score(y_test, pred_log))

0.8628125


In [43]:
import joblib
# Persist the fitted logistic regression classifier. It was trained on the
# integer emotion ids from `emotion_numbers`, so its predictions are those ids
# rather than label strings.
joblib.dump(logistic_model, "sentiment_model.pkl")


['sentiment_model.pkl']

In [44]:
# Evaluate the Naive Bayes model (as fit on the TF-IDF features) on the
# held-out split.
pred_tf = nb_model.predict(X_test_tf)
print(accuracy_score(y_test, pred_tf))
                    

0.6609375


In [45]:
print("Features:", vectorizer_tf.get_feature_names_out()[:20])
# This matrix holds TF-IDF weights, so it is labelled as such instead of "BoW".
# Only the first few rows are densified, to avoid materialising the whole
# sparse matrix as a dense array just for display.
print("\nTF-IDF Matrix:\n", X_train_tf[:5].toarray())

Features: ['aa' 'aaaaand' 'aaaand' 'aac' 'aahhh' 'aaron' 'ab' 'abandon' 'abandoned'
 'abandoning' 'abandonment' 'abated' 'abbigail' 'abc' 'abdomen'
 'abdominal' 'abducted' 'abhorrent' 'abide' 'abilities']

BoW Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
