## Header Files

In [26]:
import pandas as pd
import numpy as np
import random

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from collections import Counter

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras import layers, regularizers
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D,Dropout

lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
stop.extend(['@', '.', '#', 'user'])

## Loading Train and Test Data

In [2]:
train_data_add = "./Data/train.csv"
test_data_add = "./Data/test.csv"
train_data = pd.read_csv(train_data_add)
test_data = pd.read_csv(test_data_add)


## Checking Data

In [3]:
print("Training Data")
print(train_data.head(3))
print("Length of Test data = ", len(train_data))
print("\nTest Data")
print(test_data.head(3))
print("Length of Test data = ", len(test_data))

Training Data
   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
Length of Test data =  31962

Test Data
      id                                              tweet
0  31963  #studiolife #aislife #requires #passion #dedic...
1  31964   @user #white #supremacists want everyone to s...
2  31965  safe ways to heal your #acne!!    #altwaystohe...
Length of Test data =  17197


In [4]:
# Converting to Lower Case
for i in range(len(train_data)):
    train_data.iloc[i, 2] = train_data.iloc[i, 2].lower()
print(train_data.head(5))

   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation


## Tokenization

In [5]:
tokens = []
for i in range(len(train_data)):
    tokens.append(nltk.word_tokenize(train_data.iloc[i, 2]))
print("After Tokenizing          : ", tokens[0])

After Tokenizing          :  ['@', 'user', 'when', 'a', 'father', 'is', 'dysfunctional', 'and', 'is', 'so', 'selfish', 'he', 'drags', 'his', 'kids', 'into', 'his', 'dysfunction', '.', '#', 'run']


## Stop Word Removal

In [27]:
filtered_tokens = []
for i in range(len(tokens)):
    filtered_tokens.append([word for word in tokens[i] if word not in stop])
print("After Removing Stop Words : ", filtered_tokens[0])

tokenizer = Tokenizer(
    num_words = max_words,
    filters = '"#$%&()*+-/:;<=>@[\]^_`{|}~'
)
tokenizer.fit_on_texts(filtered_tokens)

After Removing Stop Words :  ['father', 'dysfunctional', 'selfish', 'drags', 'kids', 'dysfunction', 'run']


NameError: name 'max_words' is not defined

## Lemmatization

In [7]:
lemmatizers = []
for i in range(len(filtered_tokens)):
    lemmatizers.append([lemmatizer.lemmatize(word) for word in filtered_tokens[i]])
print("After Lemmatization       : ", lemmatizers[0])

After Lemmatization       :  ['father', 'dysfunctional', 'selfish', 'drag', 'kid', 'dysfunction', 'run']


In [8]:
"""
 The next step currently is converting these words into numbers based on occurances and padding with zeros for length  
"""
a=0
for i in range(len(lemmatizers)):
    a=max(a,len(lemmatizers[i]))

In [9]:
print(a)

114


## Finding Highest frequency of words and creating the int dict

In [11]:
lemmatizers_to_int = Counter(total_words)
total_word_count = len(total_words)
sorted_order = lemmatizers_to_int.most_common(total_word_count)
print(lemmatizers_to_int)
vocab_to_index = {w: i+1 for i, (w, c) in enumerate(sorted_order)}
print(vocab_to_index)





## Encoding to numbers

In [12]:
num_encoded_reviews = []
for i in range(len(lemmatizers)):
    num_encoded_reviews.append([vocab_to_index[word] for word in lemmatizers[i]])
print(num_encoded_reviews)

[[28, 15345, 2549, 3411, 149, 7456, 361], [122, 5279, 2281, 39, 11, 368, 551, 11, 1099, 7457, 4091, 7458, 15346, 9704], [33, 3159], [94, 5, 13, 43, 13, 16, 110, 1, 1, 1, 111, 108], [2716, 4, 1278, 241], [427, 9705, 437, 955, 258, 6165, 210, 631, 313, 3707, 132, 15347, 23, 15348], [2070, 76, 15349], [102, 184, 62, 62, 15350, 39, 11, 78, 904, 184, 1100, 163, 1279, 3708, 15351, 45], [1, 1, 1, 5, 1220, 1, 1, 1, 9706, 1077, 2411, 1363, 15352, 18], [647, 1, 22, 3, 3412, 1], [119, 1172, 2282, 1173, 1824, 37, 306, 21, 9707, 987, 5280, 183, 3709, 183, 195, 90, 123, 113, 148], [2549, 84, 9708, 2943, 717, 15353, 2549, 1419, 1364, 5], [23, 38, 440, 20, 1, 1, 15354, 15355], [1614, 307, 7459, 1500, 184, 9709, 1244, 85, 5281, 25, 1825], [441, 1, 1174, 9710, 1678, 7460, 6166, 7460], [3710, 9, 2944, 15356, 7461, 2944, 15357, 352], [26, 1679, 26, 27], [389, 820, 1], [57, 1, 603, 36, 211, 133, 1826, 4, 2412, 32, 30], [68, 2, 1737, 760, 177, 7462], [325, 30, 4092, 1827, 15358, 496, 1280, 5282, 873, 252, 1

## Padding and truncating

In [22]:
padded_reviews = pad_sequences(num_encoded_reviews, maxlen=90)
print(padded_reviews)
y_train = np.asarray(np.array(train_data.iloc[:,1])).astype('float32').reshape((-1,1))


[[    0     0     0 ...   149  7456   361]
 [    0     0     0 ...  7458 15346  9704]
 [    0     0     0 ...     0    33  3159]
 ...
 [    0     0     0 ...  7208    44    82]
 [    0     0     0 ...  1490  1491   491]
 [    0     0     0 ...     0   121   131]]


In [23]:
ping=[]
for i in range(len(num_encoded_reviews)):
    ping.append((num_encoded_reviews[i],y_train[i]))
random.shuffle(ping)

training_set = ping[:20000]
testing_set = ping[20000:]

In [24]:
# Now we are ready to train our model through LSTM network
model = Sequential()
model.add(layers.Embedding(len(total_words),100))
model.add(layers.LSTM(30))
model.add(layers.Dense(1,activation='softmax', input_shape=(50,)))

model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

checkpoint1 = ModelCheckpoint("best_mode1_11-02-2021.hdf5",monitor='val-accuracy',save_best_only = True, save_weights_only= False)

# Our vectorized labels
model.fit(np.array(padded_reviews), y_train, validation_split=0.2 , epochs=50, callbacks=[checkpoint1], verbose=1 )

Epoch 1/50
 15/800 [..............................] - ETA: 4:43 - loss: 0.6684 - accuracy: 0.0735

KeyboardInterrupt: 

In [None]:
count=0
for i in range(len(train_data.iloc[:,1])):
    if(train_data.iloc[i,1]==1):
        count=count+1

In [None]:
count,len(train_data.iloc[:,1])-count

In [None]:
len(train_data.iloc[:,1])


In [None]:
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim = len(vocab_to_index)+1, output_dim = 256, input_length = 70))
model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(1, activation = 'softmax'))
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

In [None]:
batch_size = 512
epochs = 8
X_train= np.array(padded_reviews)
y_train = np.array(train_data.iloc[:,1])

In [None]:
history = model_lstm.fit(
    X_train,
    y_train,
    validation_split = 0.1,
    epochs = 8,
    batch_size = 512
)

In [None]:
X_train[0]