In [1]:
import numpy as np
import pandas as pd

In [65]:
from keras.preprocessing import text
from keras.utils import pad_sequences

In [66]:
from sklearn.model_selection import train_test_split

In [67]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, SimpleRNN
from keras.callbacks import ModelCheckpoint

In [2]:
# Load the tweet/brand-sentiment CSV. The path is absolute (it looks like a
# Colab upload location), so adjust it when running in another environment.
data = pd.read_csv(
    '/content/judge-1377884607_tweet_product_company.csv',
    encoding='cp437',
)

In [3]:
# Show full cell contents (no truncation) so whole tweets are readable in previews.
pd.options.display.max_colwidth = None

In [4]:
# Preview the first rows of the raw data to check the columns loaded as expected.
data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive emotion


In [5]:
# Dimensions (rows, columns) of the raw data.
data.shape

(9093, 3)

In [6]:
# Count the missing values in each column.
data.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

In [7]:
#dropping the 'emotion in tweet is directed at' column as required in the assignment pdf

In [8]:
# Remove the 'emotion_in_tweet_is_directed_at' column. `data` is rebound rather
# than mutated in place, and errors='ignore' makes the cell safe to re-run: a
# second execution no longer raises KeyError for the already-removed column.
data = data.drop(columns=['emotion_in_tweet_is_directed_at'], errors='ignore')

In [9]:
#dropping the single row that has a null value in the 'tweet_text' column

In [10]:
# Drop, in place, every row that has a missing value in any remaining column.
data.dropna(axis='index', how='any', inplace=True)

In [11]:
# Re-check the per-column missing-value counts after the drops.
data.isna().sum()

tweet_text                                            0
is_there_an_emotion_directed_at_a_brand_or_product    0
dtype: int64

In [12]:
# Preview the frame after removing the column and the null row.
data.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",Negative emotion
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,Positive emotion
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,Negative emotion
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Positive emotion


In [13]:
# Dimensions (rows, columns) after cleaning.
data.shape

(9092, 2)

In [14]:
#encoding target with a dictionary

In [15]:
# Class distribution of the target column before it is encoded.
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [16]:
# Integer code for each sentiment label; a label's code is its position in this list.
dict1 = {
    label: code
    for code, label in enumerate([
        'Negative emotion',
        'Positive emotion',
        'I can\'t tell',
        'No emotion toward brand or product',
    ])
}

In [17]:
# Encode the text labels as integers using dict1. Series.map turns any value that
# is not a key of dict1 into NaN, so the result is validated before it overwrites
# the column. This also makes an accidental second run of this cell (when the
# column already holds integers) fail loudly instead of silently erasing every label.
encoded_labels = data['is_there_an_emotion_directed_at_a_brand_or_product'].map(dict1)
assert encoded_labels.notna().all(), 'found target values that are not keys of dict1'
data['is_there_an_emotion_directed_at_a_brand_or_product'] = encoded_labels

In [18]:
# Preview the frame with the target column now integer-encoded.
data.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",0
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",1
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,1
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,0
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",1


In [19]:
# Class distribution again, now keyed by the integer codes from dict1.
data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

3    5388
1    2978
0     570
2     156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [20]:
# Give the target column a short name, 'label', renaming in place.
data.rename(
    {'is_there_an_emotion_directed_at_a_brand_or_product': 'label'},
    axis='columns',
    inplace=True,
)

In [21]:
# Confirm the column names after the rename.
data.columns

Index(['tweet_text', 'label'], dtype='object')

In [22]:
# First three rows, showing the renamed 'label' column.
data.head(3)

Unnamed: 0,tweet_text,label
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",0
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",1
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,1


In [23]:
# Check that no label is missing after the encoding step (expected result: 0).
data['label'].isna().sum()

0

In [24]:
#tokenizing

In [26]:
# Keras text tokenizer created with no arguments, i.e. no vocabulary-size cap
# and no out-of-vocabulary token are configured here.
tokenizer = text.Tokenizer()

In [27]:
# Build the word index from every tweet in the dataset.
# NOTE(review): this runs on the full data, before the train/test split further
# down, so words that occur only in the test rows are part of the vocabulary.
tokenizer.fit_on_texts(data['tweet_text'].tolist())

In [28]:
# Convert every tweet into a sequence of integer word indices using the fitted tokenizer.
tokenized_text = tokenizer.texts_to_sequences(data['tweet_text'])

In [29]:
#padding

In [31]:
# Bring every sequence to a fixed length of 100 so they form one 2-D input array;
# 100 matches the input_length given to the Embedding layer.
x = pad_sequences(tokenized_text,maxlen=100)

In [32]:
#splitting data into training and testing datasets

In [33]:
# One-hot encode the integer labels (one column per class), the target format
# used with the categorical_crossentropy loss at compile time.
# NOTE(review): predict_sentiment maps the argmax index through dict2, which
# presumes the columns come out ordered 0, 1, 2, 3 - confirm.
y = pd.get_dummies(data['label'])

In [35]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is
# reproducible. The classes are heavily imbalanced (the rarest has 156 rows out
# of 9092), so the split is stratified on the integer labels to keep each class's
# share the same in the training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

Creating RNN

In [39]:
# Create an empty Sequential model container.
model = Sequential()

In [40]:
# Define the whole network in one expression. Building a fresh Sequential here
# (instead of calling model.add repeatedly on an existing object) keeps the cell
# idempotent: re-running it no longer stacks a second copy of every layer onto
# the same model.
model = Sequential([
    # Vocabulary size is the tokenizer's word count + 1 (presumably to leave room
    # for index 0, which the padded sequences use); 250-dim vectors, 100 tokens per input.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=250, input_length=100),
    LSTM(5),
    Dropout(0.5),
    Dense(20, activation='relu'),
    Dropout(0.2),
    # Four output units, one per sentiment class in dict1.
    Dense(4, activation='softmax'),
])

In [41]:
# Configure training for one-hot multi-class targets: Adam optimizer,
# categorical cross-entropy loss, accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [42]:
# Save a checkpoint (named after the epoch number) whenever the validation loss improves.
checkpoint = ModelCheckpoint(
    filepath='model-{epoch:03d}.model',
    monitor='val_loss',
    save_best_only=True,
    mode='auto',
    verbose=0,
)

In [43]:
# Shape of the training inputs: (number of samples, sequence length).
x_train.shape

(7273, 100)

In [44]:
# Train for 25 epochs in batches of 100, holding back 20% of the training rows
# for validation; the checkpoint callback monitors that validation loss.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=100,
    epochs=25,
    validation_split=0.2,
    callbacks=[checkpoint],
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [46]:
# Score the model on the held-out test split; the result is [loss, accuracy],
# following the loss and metric set in compile().
# NOTE(review): this evaluates `model` as it stands after the final training
# epoch; the best-val_loss checkpoints written by ModelCheckpoint are not loaded here.
model.evaluate(x_test,y_test)



[1.4691511392593384, 0.6311159729957581]

In [47]:
#reason for overfitting

In [48]:
# Look at the class counts again to help explain the evaluation result.
data['label'].value_counts()

3    5388
1    2978
0     570
2     156
Name: label, dtype: int64

In [49]:
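#the model overfits and generalizes poorly, largely because of the huge imbalance between the classes (class 3 alone is ~59% of the rows, so the ~63% test accuracy is only slightly above always predicting class 3)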

Writing a function for predicting new texts

In [61]:
# Reverse lookup from integer class code back to its label text; a label's
# position in this list is its code.
dict2 = dict(enumerate([
    'Negative emotion',
    'Positive emotion',
    'I can\'t tell',
    'No emotion toward brand or product',
]))

In [62]:
def predict_sentiment(text):
  """Return the sentiment label the trained model predicts for one text.

  The text is converted to word indices with the notebook-level `tokenizer`,
  padded to length 100, scored by `model`, and the position of the highest
  score is translated to its label name through `dict2`.

  Note: inside this function the parameter `text` shadows the imported
  `text` module; the module is not used here, so this is harmless.

  Args:
    text: a single raw text string.

  Returns:
    The label string from `dict2` for the highest-scoring class.
  """
  sequences = tokenizer.texts_to_sequences([text])
  padded = pad_sequences(sequences, maxlen=100)
  scores = model.predict(padded)
  # A single input row is scored, so the flat argmax is the class index.
  return dict2[np.argmax(scores)]


In [63]:
# Sample input for a quick manual check of predict_sentiment.
txt1 = 'its just horrible!'

In [64]:
# Run the prediction on the sample text and display the returned label.
predict_sentiment(txt1)



'Negative emotion'