## Keras Libraries

In [18]:

from keras.preprocessing.text import Tokenizer,one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding



## Plotly Libraries

In [19]:
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

## Othe Libraries

In [55]:
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.manifold import TSNE

## Reading the dataset

In [97]:
from pandas import read_csv
df = pd.read_csv('sample.csv',encoding='latin-1', sep = ',', names = ['label', 'content'], error_bad_lines=False)

In [98]:
df.head(2)

Unnamed: 0,label,content
0,label,content
1,Bachelors Degree,A


## Cleanning

In [99]:
#df= df.dropna()
df[df["label"].apply(lambda x: x.isnumeric())] #dropping null values
df[df.label.apply(lambda x: x !="")] #filtering out rows with non-numeric characters in the "label" column
df[df.content.apply(lambda x: x !="")] #filterin out rows with empty comments

Unnamed: 0,label,content
0,label,content
1,Bachelors Degree,A
2,Bachelors Degree,B
3,Bachelors Degree,S
4,Bachelors Degree,BS
...,...,...
56,Associate Degree,HISTORY
57,Associate Degree,NON MAT
58,Associate Degree,PHYSICS
59,Associate Degree,SPANISH


## Tokenizing and creating sentences
* In this part I divide a sentence into a list of words. 

* The "Tokenizer" function in KLeras, as it name says, tokenizes sentences.

* The "texts_to_sequences" makes a sentence of words.

In [124]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['content'])
tokenizer

<keras_preprocessing.text.Tokenizer at 0x1c8944a8388>

In [125]:
num_words = len(tokenizer.word_index) + 1
num_words

67

In [126]:
sequences = tokenizer.texts_to_sequences(df['content'])
print(len(sequences))
print(sequences)

61
[[15], [2], [4], [6], [9], [16], [17], [18], [19], [20], [21], [22], [9, 10], [23], [24], [25], [26], [27], [28], [29], [30], [7, 31], [32], [7, 11], [33], [34], [35], [3, 36], [3, 37], [3, 38], [3, 39], [3, 40], [1, 2, 8], [1, 2, 6], [1, 2, 41], [1, 42], [1, 43], [1, 4, 2], [1, 4, 8], [1, 4, 11], [1, 12, 8], [1, 12, 10], [1, 44, 6], [13, 45], [13, 46], [47, 48], [49, 50], [51, 14], [7, 52], [5, 53], [5, 54], [5, 55], [5, 56], [57, 14], [58], [59], [60], [61, 62], [63], [64], [65, 66]]


In [127]:
data = pad_sequences(sequences, num_words)
print(data)

[[ 0  0  0 ...  0  0 15]
 [ 0  0  0 ...  0  0  2]
 [ 0  0  0 ...  0  0  4]
 ...
 [ 0  0  0 ...  0  0 63]
 [ 0  0  0 ...  0  0 64]
 [ 0  0  0 ...  0 65 66]]


In [129]:
np.set_printoptions(threshold=np.inf)


In [130]:
encoded_docs = tokenizer.sequences_to_matrix(sequences, mode='binary')
print(encoded_docs)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 

## Building the LSTM Neural Network

In [33]:
model = Sequential()
model.add(Embedding(20000, 100, input_length=50)) 
### an embedding layer to expand tokens, allowing NN to represent the word in a meaningful way ###
######Embedding(vocabolary size, dimension of embedding, MAX length of each sentence)####
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2)) # 
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
model.fit(data, np.array(labels), validation_split=0.4, epochs=3)


Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Train on 6397 samples, validate on 4265 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x1c8876f2048>