In [1]:
import numpy as np
import pandas as pd

In [119]:
import re
import nltk
import gensim
from gensim.models.word2vec import Word2Vec

In [115]:
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.preprocessing import text
from keras.utils import pad_sequences

In [120]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [121]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


In [3]:
data = pd.read_csv('/content/tweet_emotions.csv')

In [4]:
pd.set_option('display.max_colwidth' , None)

In [5]:
data

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,"Happy Mother's Day to all the mommies out there, be you woman or man as long as you're 'momma' to someone this is your day!"
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NEW HIT SINGLES WWW.MYSPACE.COM/IPSOHOT I DEF. WAT U IN THE VIDEO!!


In [6]:
data['tweet_id'].nunique()

40000

In [7]:
#dropping 'tweet id' column

In [8]:
data.drop('tweet_id',axis=1,inplace=True)

In [9]:
data.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,sadness,Layin n bed with a headache ughhhh...waitin on your call...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."


Preprocessing


In [10]:
data.isna().sum()

sentiment    0
content      0
dtype: int64

In [11]:
data['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [12]:
data['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [13]:
# No null values

In [14]:
#cleaning tweets

In [15]:
#url removing

In [16]:
def remove_URL(tweet):
  return re.sub(r"http\S+", "", tweet)

In [17]:
data['tweet_url-less'] = data['content'].apply(lambda x: remove_URL(x))

In [18]:
def remove_name(tweet):
  return re.sub('@[\w]+','',tweet)

In [19]:
data['tweet_name-less'] = data['content'].apply(lambda x: remove_name(x))

In [20]:
data.head()

Unnamed: 0,sentiment,content,tweet_url-less,tweet_name-less
0,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,i know i was listenin to bad habit earlier and i started freakin at his part =[
1,sadness,Layin n bed with a headache ughhhh...waitin on your call...,Layin n bed with a headache ughhhh...waitin on your call...,Layin n bed with a headache ughhhh...waitin on your call...
2,sadness,Funeral ceremony...gloomy friday...,Funeral ceremony...gloomy friday...,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!,wants to hang out with friends SOON!,wants to hang out with friends SOON!
4,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will.","@dannycastillo We want to trade with someone who has Houston tickets, but no one will.","We want to trade with someone who has Houston tickets, but no one will."


In [21]:
#preprocessing using Gensim

In [23]:
data['clean_tweet'] = data['tweet_name-less'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [24]:
data.head(3)

Unnamed: 0,sentiment,content,tweet_url-less,tweet_name-less,clean_tweet
0,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[,i know i was listenin to bad habit earlier and i started freakin at his part =[,"[know, was, listenin, to, bad, habit, earlier, and, started, freakin, at, his, part]"
1,sadness,Layin n bed with a headache ughhhh...waitin on your call...,Layin n bed with a headache ughhhh...waitin on your call...,Layin n bed with a headache ughhhh...waitin on your call...,"[layin, bed, with, headache, ughhhh, waitin, on, your, call]"
2,sadness,Funeral ceremony...gloomy friday...,Funeral ceremony...gloomy friday...,Funeral ceremony...gloomy friday...,"[funeral, ceremony, gloomy, friday]"


In [25]:
data.drop(['tweet_url-less','content','tweet_name-less'],axis=1,inplace=True)

In [26]:
data.head()

Unnamed: 0,sentiment,clean_tweet
0,empty,"[know, was, listenin, to, bad, habit, earlier, and, started, freakin, at, his, part]"
1,sadness,"[layin, bed, with, headache, ughhhh, waitin, on, your, call]"
2,sadness,"[funeral, ceremony, gloomy, friday]"
3,enthusiasm,"[wants, to, hang, out, with, friends, soon]"
4,neutral,"[we, want, to, trade, with, someone, who, has, houston, tickets, but, no, one, will]"


In [27]:
#removing stopwords

In [28]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
  output = [i for i in text if i not in stopwords]
  return output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: remove_stopwords(x))

In [30]:
data.head()

Unnamed: 0,sentiment,clean_tweet
0,empty,"[know, listenin, bad, habit, earlier, started, freakin, part]"
1,sadness,"[layin, bed, headache, ughhhh, waitin, call]"
2,sadness,"[funeral, ceremony, gloomy, friday]"
3,enthusiasm,"[wants, hang, friends, soon]"
4,neutral,"[want, trade, someone, houston, tickets, one]"


In [31]:
#dropping unnecessary emotions

In [32]:
data.drop(data.loc[data['sentiment']=='enthusiasm'].index, inplace=True)

In [33]:
data.shape

(39241, 2)

In [34]:
data.drop(data.loc[data['sentiment']=='love'].index, inplace=True)
data.drop(data.loc[data['sentiment']=='relief'].index, inplace=True)
data.drop(data.loc[data['sentiment']=='empty'].index, inplace=True)
data.drop(data.loc[data['sentiment']=='neutral'].index, inplace=True)
data.drop(data.loc[data['sentiment']=='boredom'].index, inplace=True)

In [35]:
#encoding

In [36]:
#as per test requirement creating new categories from existing related emotional categories

In [37]:
def custom_encoder(data):
  data.replace(to_replace=['fun','happiness'],value=1,inplace=True)#joy
  data.replace(to_replace=['sadness',],value=2,inplace=True)#sad
  data.replace(to_replace=['anger','hate'],value=3,inplace=True)#angry
  data.replace(to_replace=['worry'],value=4,inplace=True)  #fear
  data.replace(to_replace=['surprise'],value=0,inplace=True)#surprise

  return data

In [39]:
data['sentiment'] = custom_encoder(data['sentiment'])

In [40]:
data['sentiment'].unique()

array([2, 4, 0, 1, 3])

In [41]:
data['sentiment'].value_counts()

4    8459
1    6985
2    5165
0    2187
3    1433
Name: sentiment, dtype: int64

In [42]:
data.head()

Unnamed: 0,sentiment,clean_tweet
1,2,"[layin, bed, headache, ughhhh, waitin, call]"
2,2,"[funeral, ceremony, gloomy, friday]"
5,4,"[pinging, go, prom, bc, bf, like, friends]"
6,2,"[sleep, im, thinking, old, friend, want, married, damn, amp, wants, scandalous]"
7,4,"[hmmm, http, www, djhero, com]"


In [45]:
#Tokenization

Vectorisation with WordtoVec

In [49]:
x_train_gen,x_test_gen,y_train_gen,y_test_gen = train_test_split(data['clean_tweet'],data['sentiment'],test_size=0.2,random_state=42)

In [50]:
#word embedding with word2vec

In [51]:
model_gs = Word2Vec(x_train_gen, min_count = 1)

In [52]:
words = model_gs.wv.index_to_key

In [53]:
len(words)

19604

In [54]:
x_train_vec = np.array([np.array([model_gs.wv[i] for i in ls if i in words]) for ls in x_train_gen],dtype=object)
x_test_vec = np.array([np.array([model_gs.wv[i] for i in ls if i in words]) for ls in x_test_gen],dtype=object)

In [55]:
len(x_train_vec[0][0])

100

In [56]:
x_train_vec.ndim

1

In [57]:
x_train_vec_avg = []
for v in x_train_vec:
  if v.size:
    x_train_vec_avg.append(v.mean(axis = 0))
  else:
    x_train_vec_avg.append(np.zeros(100,dtype=float))
x_test_vec_avg = []
for v in x_test_vec:
  if v.size:
    x_test_vec_avg.append(v.mean(axis = 0))
  else:
    x_test_vec_avg.append(np.zeros(100,dtype=float))

In [58]:
#Random Forest Classifier

In [60]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train_vec_avg,y_train_gen)

In [62]:
#training accuracy
y_pred = clf.predict(x_train_vec_avg)
accuracy_score(y_pred,y_train_gen)

0.9908166950420472

In [63]:
#testing

In [64]:
y_pred = clf.predict(x_test_vec_avg)
accuracy_score(y_pred,y_test_gen)

0.38877424680148576

In [117]:
#Descision Tree Classifier

In [66]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train_vec_avg,y_train_gen)

In [67]:
#training accuracy
y_pred = dt_clf.predict(x_train_vec_avg)
accuracy_score(y_pred,y_train_gen)

0.9908166950420472

In [118]:
#testing

In [68]:
y_pred = dt_clf.predict(x_test_vec_avg)
accuracy_score(y_pred,y_test_gen)

0.3027238959966983

In [69]:
#XGB

In [71]:
xgb_clf = xgb.XGBClassifier(n_estimators=20 ,max_depth = 1, random_state = 42, learning_rate = 0.4, gamma = 0.1 )

In [72]:
xgb_clf.fit(x_train_vec_avg,y_train_gen)

In [73]:
y_pred = xgb_clf.predict(x_test_vec_avg)
accuracy_score(y_pred,y_test_gen)

0.40115559224102354

In [80]:
#Neural network

In [81]:
model = Sequential()

In [82]:
model.add(Dense(85,activation = 'relu'))
model.add(Dense(80,activation = 'relu'))
model.add(Dense(5, activation = 'softmax'))

In [83]:
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

In [84]:
x_train_new = np.array(x_train_vec_avg)
y_train_new = pd.get_dummies(y_train_gen)

In [85]:
history = model.fit(x_train_new,y_train_new,epochs=100,validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [86]:
x_test_new = np.array(x_test_vec_avg)
y_test_new = pd.get_dummies(y_test_gen)

In [87]:
model.evaluate(x_test_new,y_test_new)



[1.3828167915344238, 0.40301278233528137]

Picking the better performing model

In [88]:
label_dict = {1 :'Joy',2 : 'Sad', 3:'Angry',4:'Fear',0:'Surprise'}

In [109]:
def sentiment_analyze(tweet):
  tweet = remove_URL(tweet)
  tweetx = remove_name(tweet)
  tweet1 = gensim.utils.simple_preprocess(tweetx)
  tweet_stp = remove_stopwords(tweet1)
  tweet_vec_avg = np.array([model_gs.wv[i] for i in tweet_stp if i in words]).mean(axis = 0)
  test = tweet_vec_avg.reshape(1,100)
  pred = model.predict(test)
  sentiment = np.argmax(pred)
  return (label_dict[sentiment])

In [113]:
sentiment_analyze('I should be sleep, but im not! thinking about an old friend who I want. but he\'s married now.  he wants me 2! scandalous!')



'Fear'