In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Flatten
from keras.callbacks import EarlyStopping

[nltk_data] Downloading package stopwords to C:\Users\Darryl
[nltk_data]     See\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,content,sentiment
0,1960353321,"@bex_1210 holy crap, I need to see that! Too b...",happy
1,1753218598,HAPPY MOTHER'S DAY to all of the wonderful wom...,happy
2,1961368089,Wishing I could be in NOLA this weekend oh we...,happy
3,1961456147,"What a day! #dayofservice completed, and now a...",happy
4,1962477969,@JamesMurphy anything to sell an album. poor t...,sad


In [4]:
def clean_content(content):
    content = re.sub('@[^\s]+','',content)
    alpha_only = re.sub("[^a-zA-Z]",' ',content) #"[^a-zA-Z]" this regex will remove any non-alphabetical char as they are not significant
    words = alpha_only.lower().split()
    stop = set(stopwords.words('english'))
    #from the dataframe we can see 'user' word is quite common in the tweets, which is basically used for tagging someone in the tweet
    #so I will be removing that

    sig_words = [word for word in words if not word in stop]
    return(" ".join(sig_words))

In [5]:
df['clean_content']  = df['content'].apply(lambda content: clean_content(content))
train = df
train.head()

Unnamed: 0,id,content,sentiment,clean_content
0,1960353321,"@bex_1210 holy crap, I need to see that! Too b...",happy,holy crap need see bad gotta wait tomorrow
1,1753218598,HAPPY MOTHER'S DAY to all of the wonderful wom...,happy,happy mother day wonderful women great relaxfu...
2,1961368089,Wishing I could be in NOLA this weekend oh we...,happy,wishing could nola weekend oh well tuesday
3,1961456147,"What a day! #dayofservice completed, and now a...",happy,day dayofservice completed aching clearing tre...
4,1962477969,@JamesMurphy anything to sell an album. poor t...,sad,anything sell album poor thing


In [6]:
df

Unnamed: 0,id,content,sentiment,clean_content
0,1960353321,"@bex_1210 holy crap, I need to see that! Too b...",happy,holy crap need see bad gotta wait tomorrow
1,1753218598,HAPPY MOTHER'S DAY to all of the wonderful wom...,happy,happy mother day wonderful women great relaxfu...
2,1961368089,Wishing I could be in NOLA this weekend oh we...,happy,wishing could nola weekend oh well tuesday
3,1961456147,"What a day! #dayofservice completed, and now a...",happy,day dayofservice completed aching clearing tre...
4,1962477969,@JamesMurphy anything to sell an album. poor t...,sad,anything sell album poor thing
...,...,...,...,...
23995,1957523762,I have been playing skate for two hours. Now i...,happy,playing skate two hours need get actually skat...
23996,1965999020,im wearing a certain tye dye tshirt at the mom...,sad,im wearing certain tye dye tshirt moment misse...
23997,1694258339,@DeepaPrabhu Thanks and thanks,happy,thanks thanks
23998,1964535265,@scottisafool I had a analog tuner the MC tea...,sad,analog tuner mc team gave years ago inbox driv...


In [7]:
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train['clean_content'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 22328 unique tokens.


In [8]:
X = tokenizer.texts_to_sequences(train['clean_content'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (24000, 250)


In [9]:
Y = pd.get_dummies(df['sentiment']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (24000, 4)


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(21600, 250) (21600, 4)
(2400, 250) (2400, 4)


In [11]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, activation='relu'))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2, activation='relu'))
model.add(Dense(4, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

ValueError: Input 0 is incompatible with layer lstm_2: expected ndim=3, found ndim=2

In [26]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 1.377
  Accuracy: 0.486


In [51]:
embedding_dim = 50

model = Sequential()
model.add(Embedding(input_dim=MAX_NB_WORDS, 
                           output_dim=embedding_dim, 
                           input_length=MAX_SEQUENCE_LENGTH))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(4, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 250, 50)           2500000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 12500)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 10)                125010    
_________________________________________________________________
dense_8 (Dense)              (None, 20)                220       
_________________________________________________________________
dense_9 (Dense)              (None, 30)                630       
_________________________________________________________________
dense_10 (Dense)             (None, 4)                 124       
Total params: 2,625,984
Trainable params: 2,625,984
Non-trainable params: 0
___________________________________________

In [52]:
history = model.fit(X_train, Y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test, Y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, Y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, Y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: ignored