In [1]:
import pandas as pd

## 1. Load the provided CSV file “Sentiment.csv” and process this file as needed to handle text data.

In [132]:
dataset = pd.read_csv("Sentiment.csv")

In [133]:
dataset.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


In [134]:
#Selecting the text and sentiment columns for further analysis

dataset = dataset[['text','sentiment']] 
dataset.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [135]:
# Using lambda expression to convert string to lowercase

dataset['text'] = dataset['text'].apply(lambda x: x.lower()) 
print(dataset['text'])

0        rt @nancyleegrahn: how did everyone feel about...
1        rt @scottwalker: didn't catch the full #gopdeb...
2        rt @tjmshow: no mention of tamir rice and the ...
3        rt @robgeorge: that carly fiorina is trending ...
4        rt @danscavino: #gopdebate w/ @realdonaldtrump...
                               ...                        
13866    rt @cappy_yarbrough: love to see men who will ...
13867    rt @georgehenryw: who thought huckabee exceede...
13868    rt @lrihendry: #tedcruz as president, i will a...
13869    rt @jrehling: #gopdebate donald trump says tha...
13870    rt @lrihendry: #tedcruz headed into the presid...
Name: text, Length: 13871, dtype: object


In [136]:
# importing regular expression library

import re

dataset['text'] = dataset['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x))) 
print(dataset['text'])

0        rt nancyleegrahn how did everyone feel about t...
1        rt scottwalker didnt catch the full gopdebate ...
2        rt tjmshow no mention of tamir rice and the go...
3        rt robgeorge that carly fiorina is trending  h...
4        rt danscavino gopdebate w realdonaldtrump deli...
                               ...                        
13866    rt cappy_yarbrough love to see men who will ne...
13867    rt georgehenryw who thought huckabee exceeded ...
13868    rt lrihendry tedcruz as president i will alway...
13869    rt jrehling gopdebate donald trump says that h...
13870    rt lrihendry tedcruz headed into the president...
Name: text, Length: 13871, dtype: object


In [137]:
# Itering over the data frame 
# using dataset.iterrows()

for idx, row in dataset.iterrows(): 
	row[0] = row[0].replace('rt',' ')
    
print(dataset['text'])

0          nancyleegrahn how did everyone feel about th...
1          scottwalker didnt catch the full gopdebate l...
2          tjmshow no mention of tamir rice and the gop...
3          robgeorge that carly fiorina is trending  ho...
4          danscavino gopdebate w realdonaldtrump deliv...
                               ...                        
13866      cappy_yarbrough love to see men who will nev...
13867      georgehenryw who thought huckabee exceeded t...
13868      lrihendry tedcruz as president i will always...
13869      jrehling gopdebate donald trump says that he...
13870      lrihendry tedcruz headed into the presidenti...
Name: text, Length: 13871, dtype: object


### Preprocessing Data

In [138]:
#Cleaning and removing punctuations

import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
dataset['text']= dataset['text'].apply(lambda x: cleaning_punctuations(x))
dataset['text'].head()

0      nancyleegrahn how did everyone feel about th...
1      scottwalker didnt catch the full gopdebate l...
2      tjmshow no mention of tamir rice and the gop...
3      robgeorge that carly fiorina is trending  ho...
4      danscavino gopdebate w realdonaldtrump deliv...
Name: text, dtype: object

In [139]:
#Cleaning and removing repeating characters

def cleaning_repeating_char(text):
    return re.sub(r'(.)1+', r'1', text)
dataset['text'] = dataset['text'].apply(lambda x: cleaning_repeating_char(x))
dataset['text'].tail()

13866      cappyyarbrough love to see men who will neve...
13867      georgehenryw who thought huckabee exceeded t...
13868      lrihendry tedcruz as president i will always...
13869      jrehling gopdebate donald trump says that he...
13870      lrihendry tedcruz headed into the presidenti...
Name: text, dtype: object

In [140]:
#Cleaning and removing URL’s

def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)
dataset['text'] = dataset['text'].apply(lambda x: cleaning_URLs(x))
dataset['text'].tail()

13866      cappyyarbrough love to see men who will neve...
13867      georgehenryw who thought huckabee exceeded t...
13868      lrihendry tedcruz as president i will always...
13869      jrehling gopdebate donald trump says that he...
13870      lrihendry tedcruz headed into the presidenti...
Name: text, dtype: object

In [141]:
# Removing stopwords

nltk.download('stopwords')
from nltk.corpus import stopwords  

def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split()])
dataset['text'] = dataset['text'].apply(lambda text: cleaning_stopwords(text))
dataset['text'].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vaish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    nancyleegrahn how did everyone feel about the ...
1    scottwalker didnt catch the full gopdebate las...
2    tjmshow no mention of tamir rice and the gopde...
3    robgeorge that carly fiorina is trending hours...
4    danscavino gopdebate w realdonaldtrump deliver...
Name: text, dtype: object

In [142]:
#Cleaning and removing Numeric numbers

def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)
dataset['text'] = dataset['text'].apply(lambda x: cleaning_numbers(x))
dataset['text'].tail()

13866    cappyyarbrough love to see men who will never ...
13867    georgehenryw who thought huckabee exceeded the...
13868    lrihendry tedcruz as president i will always t...
13869    jrehling gopdebate donald trump says that he d...
13870    lrihendry tedcruz headed into the presidential...
Name: text, dtype: object

In [143]:
# Applying Stemming

import nltk
st = nltk.LancasterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data
dataset['text']= dataset['text'].apply(lambda x: stemming_on_text(x))
dataset['text'].head()

0    nancyleegrahn how did everyone feel about the ...
1    scottwalker didnt catch the full gopdebate las...
2    tjmshow no mention of tamir rice and the gopde...
3    robgeorge that carly fiorina is trending hours...
4    danscavino gopdebate w realdonaldtrump deliver...
Name: text, dtype: object

In [144]:
# Applying Lemmatizer

lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data
dataset['text'] = dataset['text'].apply(lambda x: lemmatizer_on_text(x))
dataset['text'].head()

0    nancyleegrahn how did everyone feel about the ...
1    scottwalker didnt catch the full gopdebate las...
2    tjmshow no mention of tamir rice and the gopde...
3    robgeorge that carly fiorina is trending hours...
4    danscavino gopdebate w realdonaldtrump deliver...
Name: text, dtype: object

### Tokenization

In [146]:
import nltk
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import sent_tokenize, word_tokenize

max_features = 2000 
tokenizer = Tokenizer(num_words=max_features, split=' ') 
tokenizer.fit_on_texts(dataset['text'].values) 
X = tokenizer.texts_to_sequences(dataset['text'].values) 

dataset['text'].head()

0    nancyleegrahn how did everyone feel about the ...
1    scottwalker didnt catch the full gopdebate las...
2    tjmshow no mention of tamir rice and the gopde...
3    robgeorge that carly fiorina is trending hours...
4    danscavino gopdebate w realdonaldtrump deliver...
Name: text, dtype: object

In [147]:
# Using pad_sequence to convert reviews into equal length

from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X) 
print('X.shape = ', X.shape)

X.shape =  (13871, 28)


## 2. Build the Keras model that you have in the PPT use case.

In [69]:
# x.shape is a 2-tuple which represents the shape of x. In this case it is (13871, 28).
# x.shape[0] gives the first element in that tuple, which is 13871 (number of rows in an array).

embed_dim = 128
lstm_out = 196

model = keras.Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[0])) 
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) 
model.add(Dense(3,activation='Softmax')) 
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) 
model.summary()


Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 13871, 128)        256000    
                                                                 
 lstm_7 (LSTM)               (None, 196)               254800    
                                                                 
 dense_3 (Dense)             (None, 3)                 591       
                                                                 
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________


In [82]:
# Building Keras Model

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Embedding, Dense
from tensorflow.keras.layers import LSTM

# x.shape[1] gives the second element in the tuple, which is 28 (number of columns in an array).

embed_dim = 128
lstm_out = 196

model = keras.Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1])) 
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) 
model.add(Dense(3,activation='Softmax')) 
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])    
model.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 28, 128)           256000    
                                                                 
 lstm_10 (LSTM)              (None, 196)               254800    
                                                                 
 dense_6 (Dense)             (None, 3)                 591       
                                                                 
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________


## 3. Train and save the model and use the saved model to predict on new text data (ex, “A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump”)

In [70]:
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import model_from_json

labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(dataset['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)


In [83]:
embed_dim = 128
lstm_out = 196

def createmodel():
    model = keras.Sequential()
    model.add(Embedding(max_features, embed_dim,input_length = X.shape[1])) 
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) 
    model.add(Dense(3,activation='Softmax')) 
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])   
    return model

In [84]:
batch_size = 32
model = createmodel()
model.fit(X_train, y_train, epochs = 5, batch_size=batch_size, verbose = 2)

Epoch 1/5
291/291 - 21s - loss: 0.8241 - accuracy: 0.6461 - 21s/epoch - 74ms/step
Epoch 2/5
291/291 - 17s - loss: 0.6794 - accuracy: 0.7097 - 17s/epoch - 57ms/step
Epoch 3/5
291/291 - 16s - loss: 0.6132 - accuracy: 0.7462 - 16s/epoch - 55ms/step
Epoch 4/5
291/291 - 16s - loss: 0.5693 - accuracy: 0.7626 - 16s/epoch - 56ms/step
Epoch 5/5
291/291 - 16s - loss: 0.5232 - accuracy: 0.7785 - 16s/epoch - 55ms/step


<keras.callbacks.History at 0x2840f050c70>

In [89]:
# Evaluating Loss and Accuracy of model

score,acc = model.evaluate(X_test, y_test, verbose = 2, batch_size=batch_size)
print('Loss= ',score)
print('Accuracy= ', acc)

144/144 - 2s - loss: 0.8191 - accuracy: 0.6806 - 2s/epoch - 11ms/step
Loss=  0.8191332817077637
Accuracy=  0.6806465983390808


In [90]:
# Serialize model to JSON

model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
    #serialize weights to HDF5
    model.save_weights("model.h5")
    print("Saved model to disk")
    

Saved model to disk


In [91]:
# Load json and create model

json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Loaded model from disk


In [96]:
tweet = ["A lot of good things are happening. We are respected again throughout the world, and that's a great thing.@realDonaldTrump"]

# Vectorizing the tweet by the r-fitted tokenizer instance
tweet = tokenizer.texts_to_sequences(tweet)

# Padding the tweet to have exactly the same shape as input
tweet  =pad_sequences(tweet, maxlen = 28, dtype= 'int32', value = 0)
print(tweet)


[[  0   0   0   0   0   0   0   0   0   0   0   0   7 442   5 146 288  35
   29  35 367   2 348   8   7 153 264  23]]


In [97]:
sentiment = loaded_model.predict(tweet, batch_size=1, verbose=2)
print(sentiment)

#The conclusion of the sentiment analysis is the tweet is positive with 0.5 and negative with 0.4 and neutral with 0.09.

1/1 - 0s - 41ms/epoch - 41ms/step
[[0.4109609  0.08536554 0.50367355]]


## 4. Apply the code on spam data set available in the source code (text classification on the spam.csv data set)

In [148]:
# Importing spam dataset

data = pd.read_csv("spam.csv", encoding='latin-1')

In [149]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [150]:
# Converting string to lowercase 
data['v2'] = data['v2'].apply(lambda x: x.lower())

# regex to match a string of characters that are not a letters or numbers called regex substitution
data['v2'] = data['v2'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
data['v2'].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: v2, dtype: object

In [151]:
for idx, row in data.iterrows(): 
	row[0] = row[0].replace('rt',' ') 

In [152]:
max_features = 2000 
tokenizer = Tokenizer(num_words=max_features, split=" ") 
tokenizer.fit_on_texts(data['v2'].values) 
X = tokenizer.texts_to_sequences(data['v2'].values)
data['v2'].head(10)

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
5    freemsg hey there darling its been 3 weeks now...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile 11 months or more u r entitled...
Name: v2, dtype: object

In [153]:
from keras.preprocessing.sequence import pad_sequences
X = pad_sequences(X)
print(X)

[[   0    0    0 ...   67   58  137]
 [   0    0    0 ...  443    6 1823]
 [   0    0    0 ...  459   79  382]
 ...
 [   0    0    0 ...   12   19  231]
 [   0    0    0 ...  198   12   50]
 [   0    0    0 ...    1   41  258]]


In [154]:
embed_dim = 128
lstm_out = 196

def spammodel():  
    model = keras.Sequential()
    model.add(Embedding(max_features, embed_dim,input_length = X.shape[1])) 
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)) 
    model.add(Dense(2,activation='Softmax')) 
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy']) 
    return model


In [155]:
print(model.summary())

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_17 (Embedding)    (None, 152, 128)          256000    
                                                                 
 lstm_15 (LSTM)              (None, 196)               254800    
                                                                 
 dense_11 (Dense)            (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [156]:
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['v1'])
y = to_categorical(integer_encoded)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(3733, 152) (3733, 2)
(1839, 152) (1839, 2)


In [157]:
batch_size = 32
model = spammodel()
model.fit(X_train, y_train, epochs = 5, batch_size=batch_size, verbose = 2)

Epoch 1/5
117/117 - 55s - loss: 0.1631 - accuracy: 0.9429 - 55s/epoch - 473ms/step
Epoch 2/5
117/117 - 54s - loss: 0.0405 - accuracy: 0.9869 - 54s/epoch - 462ms/step
Epoch 3/5
117/117 - 59s - loss: 0.0195 - accuracy: 0.9933 - 59s/epoch - 500ms/step
Epoch 4/5
117/117 - 60s - loss: 0.0109 - accuracy: 0.9965 - 60s/epoch - 511ms/step
Epoch 5/5
117/117 - 36s - loss: 0.0054 - accuracy: 0.9992 - 36s/epoch - 308ms/step


<keras.callbacks.History at 0x2841b73dee0>

In [158]:
score,acc = model.evaluate(X_test, y_test, verbose = 2, batch_size=batch_size)
print('Loss= ',score)
print('Accuracy= ', acc)

58/58 - 3s - loss: 0.1039 - accuracy: 0.9837 - 3s/epoch - 46ms/step
Loss=  0.10387454926967621
Accuracy=  0.9836868047714233
