In [1]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding

In [2]:
df = pd.read_csv(r"D:\School\CSIT 321 FINAL YEAR PROJECT\Datasetexploration\tweetsSample.csv", encoding = "ISO-8859-1")

In [3]:
df.head(20)

Unnamed: 0,text,Sentiment,RelaxTense
0,Once again a beautiful morning is bursting ont...,0,0
1,"@alexismmitchell dang, I accidentally unfollow...",0,0
2,was supposed to wake up at 6. woke up at 9. go...,0,4
3,"@DH_NET Sorry to hear about that, Jen. Know ho...",0,0
4,Kate is going to win! If she doesn't then i wi...,0,2
5,@serpah - I haz the geekflu too.,0,0
6,@donotwant interview went well! except i'll ha...,0,0
7,shall miss taping tomorrow... darn school. hm...,0,0
8,"ages since last little Twit, Hello everyone i'...",0,0
9,Everything tastes bland. fever &amp; flu-off ...,0,0


In [4]:
#Sampled 50% of each positive and negative from original kaggle dataset
df.Sentiment.value_counts()

4    50000
0    50000
Name: Sentiment, dtype: int64

In [5]:
df.RelaxTense.value_counts()

0    80095
2    14064
4     5841
Name: RelaxTense, dtype: int64

In [6]:
#Train the first binary model with a sample of 50000 positive tweets and 50000 negative tweets
sentimentBinaryModelTrain = df[['text', 'Sentiment']]

In [7]:
sentimentBinaryModelTrain.text = sentimentBinaryModelTrain.text.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [8]:
sentiment_label = sentimentBinaryModelTrain.Sentiment.factorize()
sentiment_label

(array([0, 0, 0, ..., 1, 1, 1], dtype=int64),
 Int64Index([0, 4], dtype='int64'))

In [29]:
# prepare text for natural language processing
# Assign a number to each word in each sentences and replace each word with their respective assigned numbers
# Use word embedding to capture the context of the word in a sentence

tweet = sentimentBinaryModelTrain.text.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)     # Updates internal vocabulary based on a list of texts

vocab_size = len(tokenizer.word_index) + 1

encoded_docs = tokenizer.texts_to_sequences(tweet) # Replaces words in sentence with their respective numbers

padded_sequence = pad_sequences(encoded_docs, maxlen=200) # Padded sequence is needed as length of tweets varies


In [10]:
# Build model

embedding_vector_length = 32

model = Sequential()

model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
#Long Short-Term Memory Network
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [11]:
history = model.fit(padded_sequence, sentiment_label[0],
                   validation_split=0.2, epochs=3, batch_size=32)


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [33]:
test_word = input("Enter a sentence to test:")

tw = tokenizer.texts_to_sequences([test_word])
tw = pad_sequences(tw,maxlen=200)
tw

Enter a sentence to test:We're the party of love, freedom, liberty, and Americanism to name a few 


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [34]:
prediction = int(model.predict(tw).round().item())
sentiment_label[1][prediction]


4

In [36]:
prediction

1

In [30]:
#Save the model
import joblib
joblib.dump(tokenizer, "PN_data_tokenizer.joblib")
model.save("PNmodel.h5")

In [18]:
#train another binary NN for relex-tense vector
RT =  df[['text', 'RelaxTense']]
RT = RT[RT['RelaxTense'] != 0]
#sample 5000 of each relax and tense labels for a balanced set
RTBinaryModelTrain = pd.concat([RT[RT.RelaxTense==2].sample(5000),
                                RT[RT.RelaxTense==4].sample(5000)])

In [20]:
RT_label = RTBinaryModelTrain.RelaxTense.factorize()
RT_label

(array([0, 0, 0, ..., 1, 1, 1], dtype=int64),
 Int64Index([2, 4], dtype='int64'))

In [31]:
# prepare text for natural language processing

tweet = RTBinaryModelTrain.text.values
tokenizerRT = Tokenizer(num_words=5000)
tokenizerRT.fit_on_texts(tweet)     

vocab_size = len(tokenizerRT.word_index) + 1

encoded_docs = tokenizerRT.texts_to_sequences(tweet) 

padded_sequenceRT = pad_sequences(encoded_docs, maxlen=200) # Padded sequence is needed as length of tweets varies


In [22]:
# Build model

embedding_vector_length = 32

model = Sequential()

model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
#Long Short-Term Memory Network
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
#another binary model to classify between relaxed and tensed
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [25]:
history = model.fit(padded_sequenceRT, RT_label[0],
                   validation_split=0.2, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
test_word = input("Enter a sentence to test:")

tw = tokenizerRT.texts_to_sequences([test_word])
tw = pad_sequences(tw,maxlen=200)
tw

Enter a sentence to test:chilling at home


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [27]:
prediction = int(model.predict(tw).round().item())
RT_label[1][prediction]
#2 means relax 4 means tense

2

In [32]:
joblib.dump(tokenizer, "RT_data_tokenizer.joblib")
model.save("RTmodel.h5")

#to do:

parameter grid for tuning
report on NN architecture