In [53]:
from numpy import array
from numpy import asarray
from numpy import zeros

import pandas as pd
import numpy as np
import string, os 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# load Data

In [2]:
# Toy corpus: ten short feedback phrases used to exercise the pipeline.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
    'Weak',
    'Poor effort!',
    'not good',
    'poor work',
    'Could have done better.',
]

# Class labels aligned with docs: the first five are 1, the last five are 0.
labels = array([1] * 5 + [0] * 5)

In [151]:
from pandas import read_csv
# Load the CSV, dropping malformed rows (with a warning) instead of failing.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`; try the current keyword first and fall back for old installs.
# on_bad_lines='warn' matches the old error_bad_lines=False / warn_bad_lines=True
# default behaviour.
read_options = dict(encoding='latin-1', header=0, sep=',', names=['label', 'content'])
try:
    df = pd.read_csv('sample.csv', on_bad_lines='warn', **read_options)
except TypeError:
    # Older pandas rejects the unknown keyword with TypeError.
    df = pd.read_csv('sample.csv', error_bad_lines=False, **read_options)
df.head(3)

Unnamed: 0,label,content
0,Bachelors Degree,A
1,Bachelors Degree,B
2,Bachelors Degree,S


In [142]:
#cleaning
def clean_text(txt):
    """Return ``txt`` lower-cased, without punctuation or non-ASCII characters.

    Characters found in ``string.punctuation`` are dropped first, the remainder
    is lower-cased, and a UTF-8 encode / ASCII decode round trip (errors
    ignored) discards everything that is not plain ASCII.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", 'ignore')

# NOTE(review): the three filter expressions below each build a filtered frame
# but never assign it, so `df` is left unchanged by this cell. Assign the result
# (df = df[...]) if the filtering is actually wanted -- and note that the
# isnumeric() filter would then drop rows with text labels such as
# "Bachelors Degree".
#df= df.dropna()
df[df["label"].apply(lambda x: x.isnumeric())] # selects rows whose "label" is numeric; result discarded
df[df.label.apply(lambda x: x !="")] # selects rows whose "label" is not the empty string; result discarded
df[df.content.apply(lambda x: x !="")] # selects rows whose "content" is not the empty string; result discarded
df.head(3)

Unnamed: 0,label,content
0,Bachelors Degree,A
1,Bachelors Degree,B
2,Bachelors Degree,S


## Remove the stop words

In [143]:
def filter_stop_words(sentences, stop_words):
    """Remove stop words from every sentence.

    Parameters
    ----------
    sentences : pandas.Series or mutable sequence of str
        Sentences to filter; each one is split on whitespace.
    stop_words : container of str
        Words to drop (membership is tested with ``in``).

    Returns
    -------
    pandas.Series or sequence
        For a Series, a new Series with the same index (the input is left
        untouched and missing values are passed through). For any other
        mutable sequence, the same object, updated in place as before.
    """
    def _strip(sentence):
        return ' '.join(word for word in sentence.split() if word not in stop_words)

    if isinstance(sentences, pd.Series):
        # Series.map works element-wise, so the result is correct for any index.
        # Writing sentences[i] with the enumerate position would treat i as a
        # *label* and corrupt or enlarge a Series whose index is not 0..n-1.
        return sentences.map(_strip, na_action='ignore')

    for i, sentence in enumerate(sentences):
        sentences[i] = _strip(sentence)
    return sentences

# English stop words minus a handful of short tokens that must be kept
# (presumably because they occur as degree codes in the data -- TODO confirm).
stop_words = set(stopwords.words("english")).difference(
    {'a', 'A', 'i', 'I', 't', 'T', 'as', 'AS', 'S', 's'}
)
# Lower-case the text, strip the stop words and store it back in the column.
df['content'] = filter_stop_words(sentences=df['content'].str.lower(), stop_words=stop_words)
df['content']

0             a
1             b
2             s
3            bs
4            bt
5            bw
6            cs
7            dm
8           ene
9           zbm
10          zbt
11         bs/p
12         bsac
13         bsad
14         bsfs
15        aasts
16        aasvt
17        aaswe
18        aauct
19        appit
20        as.ds
21        asche
22        as-ed
23        aslas
24        asmlt
25        assoc
26       bsb/pj
27       bsb/pm
28       bsb/ps
29       bsb/rf
30       bsb/rm
31       m.a.e.
32       m.a.s.
33       m.a.t.
34       m.acc.
35       m.arch
36       m.b.a.
37       m.b.e.
38       m.b.ed
39       m.c.e.
40       m.c.p.
41       m.d.s.
42      aati&la
43      aati&tm
44      aatm&nt
45      aats&et
46      adn/aas
47      as.math
48      ata cis
49      ata cst
50      ata ece
51      ata obt
52      eet.aas
53      english
54      geology
55      history
56      non mat
57      physics
58      spanish
59    elder law
Name: content, dtype: object

# lemmatization

## In contrast to stemming, lemmatization is a lot more powerful. It looks beyond word reduction and considers a language’s full vocabulary to apply a morphological analysis to words, aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.
### https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/

# Note: it removes a trailing "s", treating it as the plural marker!


In [149]:
# import the stemming and lemmatization libraries to be used

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# initialise the lemmatizer


def lemmatize(sentences):
    """Lemmatize every whitespace-separated word of every sentence.

    Parameters
    ----------
    sentences : pandas.Series or mutable sequence of str
        Sentences to process with NLTK's ``WordNetLemmatizer``.

    Returns
    -------
    pandas.Series or sequence
        For a Series, a new Series with the same index (the input is left
        untouched and missing values are passed through). For any other
        mutable sequence, the same object, updated in place as before.
    """
    lemmatizer = WordNetLemmatizer()

    def _lemmatize_sentence(sentence):
        return ' '.join(lemmatizer.lemmatize(w) for w in sentence.split())

    if isinstance(sentences, pd.Series):
        # Series.map works element-wise, so the result is correct for any index.
        # Writing sentences[i] with the enumerate position would treat i as a
        # *label* and corrupt or enlarge a Series whose index is not 0..n-1.
        return sentences.map(_lemmatize_sentence, na_action='ignore')

    for i, sentence in enumerate(sentences):
        sentences[i] = _lemmatize_sentence(sentence)
    return sentences


# Apply the lemmatizer to the lower-cased content column and store the result
# back in df['content'].
# NOTE(review): this overwrites the column, so any cell run afterwards that
# reads df['content'] sees the lemmatized text rather than the original.

df['content'] = lemmatize(df['content'].str.lower())
df['content']

0             a
1             b
2             s
3             b
4            bt
5            bw
6             c
7            dm
8           ene
9           zbm
10          zbt
11         bs/p
12         bsac
13         bsad
14         bsfs
15        aasts
16        aasvt
17        aaswe
18        aauct
19        appit
20        as.ds
21        asche
22        as-ed
23        aslas
24        asmlt
25        assoc
26       bsb/pj
27       bsb/pm
28       bsb/ps
29       bsb/rf
30       bsb/rm
31       m.a.e.
32       m.a.s.
33       m.a.t.
34       m.acc.
35       m.arch
36       m.b.a.
37       m.b.e.
38       m.b.ed
39       m.c.e.
40       m.c.p.
41       m.d.s.
42      aati&la
43      aati&tm
44      aatm&nt
45      aats&et
46      adn/aas
47      as.math
48       ata ci
49      ata cst
50      ata ece
51      ata obt
52      eet.aas
53      english
54      geology
55      history
56      non mat
57       physic
58      spanish
59    elder law
Name: content, dtype: object

# Lemmatization is better than Stemming

## check https://www.guru99.com/stemming-lemmatization-python-nltk.html

In [152]:
# Stemming is a word-normalization method in Natural Language Processing. It reduces the words in a sentence to their stems so that lookups become shorter: words that share a meaning but vary in form depending on the context or sentence are normalized to the same stem.
# https://www.guru99.com/stemming-lemmatization-python-nltk.html

def portStemmer(sentences):
    """Stem every whitespace-separated word of every sentence.

    Parameters
    ----------
    sentences : pandas.Series or mutable sequence of str
        Sentences to process with NLTK's ``PorterStemmer``.

    Returns
    -------
    pandas.Series or sequence
        For a Series, a new Series with the same index (the input is left
        untouched and missing values are passed through). For any other
        mutable sequence, the same object, updated in place as before.
    """
    porter = PorterStemmer()

    def _stem_sentence(sentence):
        return ' '.join(porter.stem(w) for w in sentence.split())

    if isinstance(sentences, pd.Series):
        # Series.map works element-wise, so the result is correct for any index.
        # Writing sentences[i] with the enumerate position would treat i as a
        # *label* and corrupt or enlarge a Series whose index is not 0..n-1.
        return sentences.map(_stem_sentence, na_action='ignore')

    for i, sentence in enumerate(sentences):
        sentences[i] = _stem_sentence(sentence)
    return sentences


# Apply the Porter stemmer to the lower-cased content column and store the
# result back in df['content'].
# NOTE(review): this overwrites the column; whatever df['content'] holds when
# this cell runs (raw or already lemmatized text) is what gets stemmed.

df['content'] = portStemmer(df['content'].str.lower())
df['content']

0             a
1             b
2             s
3            bs
4            bt
5            bw
6            cs
7            dm
8           ene
9           zbm
10          zbt
11         bs/p
12         bsac
13         bsad
14          bsf
15         aast
16        aasvt
17         aasw
18        aauct
19        appit
20         as.d
21         asch
22          as-
23         asla
24        asmlt
25        assoc
26       bsb/pj
27       bsb/pm
28        bsb/p
29       bsb/rf
30       bsb/rm
31       m.a.e.
32       m.a.s.
33       m.a.t.
34       m.acc.
35       m.arch
36       m.b.a.
37       m.b.e.
38       m.b.ed
39       m.c.e.
40       m.c.p.
41       m.d.s.
42      aati&la
43      aati&tm
44      aatm&nt
45      aats&et
46       adn/aa
47      as.math
48       ata ci
49      ata cst
50      ata ece
51      ata obt
52       eet.aa
53      english
54       geolog
55      histori
56      non mat
57       physic
58      spanish
59    elder law
Name: content, dtype: object

# Tokenize - integer encode - Pad

In [139]:
def get_sequence_of_tokens(docs, tokenizer=None):
    """Tokenize, integer-encode and post-pad a collection of texts.

    Parameters
    ----------
    docs : sequence of str
        Texts to encode. The collection is iterated more than once.
    tokenizer : keras Tokenizer, optional
        Tokenizer to fit on ``docs`` (``fit_on_texts`` is called on it). Pass
        one in when the fitted vocabulary (``tokenizer.word_index``) is needed
        after the call, e.g. to build an embedding matrix. When omitted, a
        tokenizer with this notebook's default settings is created and is not
        reachable once the function returns.

    Returns
    -------
    padded_docs : array
        Integer sequences, zero-padded / truncated at the end to ``max_length``.
    max_length : int
        Length of the longest encoded document.
    vocab_size : int
        Number of distinct words plus one (index 0 is used for padding).
    """
    # prepare tokenizer (only when the caller did not supply one)
    if tokenizer is None:
        tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ',
                              char_level=False, oov_token=None, document_count=0)

    # build the word index from the documents
    tokenizer.fit_on_texts(docs)

    # word indices start at 1, so reserve one extra slot for index 0
    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size: '+str(vocab_size))

    # integer encode the documents
    encoded_docs = tokenizer.texts_to_sequences(docs)
    print(encoded_docs)

    # longest encoded sequence determines the padded width
    max_length = max(len(seq) for seq in encoded_docs)
    print('max_length: '+str(max_length))

    # pad (and, if ever needed, truncate) at the end of each sequence
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post', truncating='post')
    print(padded_docs)

    return padded_docs, max_length, vocab_size


In [140]:
# Encode and pad the toy corpus; binds the module-level padded_docs,
# max_length and vocab_size that later cells read.
padded_docs, max_length, vocab_size= get_sequence_of_tokens(docs)


vocab_size: 15
[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
max_length: 4
[[ 6  2  0  0]
 [ 3  1  0  0]
 [ 7  4  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 5  4  0  0]
 [11  3  0  0]
 [ 5  1  0  0]
 [12 13  2 14]]


In [130]:
# Encode and pad the CSV content column.
# NOTE(review): this rebinds padded_docs, max_length and vocab_size, replacing
# the values computed from the toy `docs` corpus.
padded_docs, max_length, vocab_size= get_sequence_of_tokens(df['content'])

None
vocab_size: 66
[[2], [4], [6], [9], [15], [16], [17], [18], [19], [20], [21], [9, 10], [22], [23], [24], [25], [26], [27], [28], [29], [7, 30], [31], [7, 11], [32], [33], [34], [3, 35], [3, 36], [3, 37], [3, 38], [3, 39], [1, 2, 8], [1, 2, 6], [1, 2, 40], [1, 41], [1, 42], [1, 4, 2], [1, 4, 8], [1, 4, 11], [1, 12, 8], [1, 12, 10], [1, 43, 6], [13, 44], [13, 45], [46, 47], [48, 49], [50, 14], [7, 51], [5, 52], [5, 53], [5, 54], [5, 55], [56, 14], [57], [58], [59], [60, 61], [62], [63], [64, 65]]
max_length: 3
[[ 2  0  0]
 [ 4  0  0]
 [ 6  0  0]
 [ 9  0  0]
 [15  0  0]
 [16  0  0]
 [17  0  0]
 [18  0  0]
 [19  0  0]
 [20  0  0]
 [21  0  0]
 [ 9 10  0]
 [22  0  0]
 [23  0  0]
 [24  0  0]
 [25  0  0]
 [26  0  0]
 [27  0  0]
 [28  0  0]
 [29  0  0]
 [ 7 30  0]
 [31  0  0]
 [ 7 11  0]
 [32  0  0]
 [33  0  0]
 [34  0  0]
 [ 3 35  0]
 [ 3 36  0]
 [ 3 37  0]
 [ 3 38  0]
 [ 3 39  0]
 [ 1  2  8]
 [ 1  2  6]
 [ 1  2 40]
 [ 1 41  0]
 [ 1 42  0]
 [ 1  4  2]
 [ 1  4  8]
 [ 1  4 11]
 [ 1 12  8]
 

# Load the entire pretrained word-embedding dictionary into memory - glove.6B.100d

In [153]:
# Display the tokenizer object.
# NOTE(review): no visible cell assigns a module-level `t` (the tokenizer created
# inside get_sequence_of_tokens is local to that function), so this relies on
# kernel state from a cell that is not shown here.
t

<keras_preprocessing.text.Tokenizer at 0x26788095b08>

In [8]:
# load the whole embedding into memory
# Each line of the GloVe file is "<word> <coef> <coef> ...".
embeddings_index = dict()
# `with` guarantees the file is closed even if a line fails to parse.
# NOTE(review): absolute, machine-specific path -- consider a configurable location.
with open(r'C:\Users\voghoei\Python\glove.6B/glove.6B.100d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
# Row i holds the pretrained vector of the word with tokenizer index i; words
# missing from the pretrained vocabulary keep an all-zero row.
# NOTE(review): `t` and `vocab_size` must come from the same fitted tokenizer,
# otherwise an index can fall outside the matrix; no visible cell defines `t`.
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Loaded 400000 word vectors.


# Create the model with an embedding layer - adds more dimensions to the data

In [24]:
# define model: frozen pretrained embedding -> flatten -> single sigmoid unit
model = Sequential()
# The embedding layer is initialised from embedding_matrix and not trained.
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summarize the model
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# fit the model
# NOTE(review): `labels` has 10 entries, so padded_docs must be the encoding of
# the toy `docs` corpus (not the CSV column) when this cell runs.
model.fit(padded_docs, labels, epochs=50, verbose=0)

# evaluate the model (on the same data it was trained on)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
flatten_4 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 401       
Total params: 1,901
Trainable params: 401
Non-trainable params: 1,500
_________________________________________________________________
None
Accuracy: 80.000001


In [28]:
# define model: frozen pretrained embedding -> Conv1D -> max-pool -> LSTM -> sigmoid
model = Sequential()
# The embedding layer is initialised from embedding_matrix and not trained.
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
model.add(e)

# convolution over the sequence axis, then halve its length
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))


# dropout before and after the recurrent layer
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summarize the model
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# fit the model
# NOTE(review): `labels` has 10 entries, so padded_docs must be the encoding of
# the toy `docs` corpus (not the CSV column) when this cell runs.
model.fit(padded_docs, labels, epochs=50, verbose=0)

# evaluate the model (on the same data it was trained on)
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 4, 100)            1500      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4, 32)             9632      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2, 32)             0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 2, 32)             0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                

# using text to matrix


In [58]:
def get_matrix_of_tokenSequence(doc):
    """Encode texts as a binary word-presence matrix.

    Parameters
    ----------
    doc : sequence of str
        Texts to encode. The collection is iterated more than once.

    Returns
    -------
    sequence_matrix_encode : array
        One row per text; a 1 in column j means the word with index j occurs
        in that text.
    max_length : int
        Length of the longest integer-encoded text.
    vocab_size : int
        Number of distinct words plus one (index 0 is never assigned to a word).
    """
    # prepare tokenizer
    Tok = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

    # Fit on the argument; the previous code fitted on the global `docs` and
    # silently ignored whatever was passed in.
    Tok.fit_on_texts(doc)

    # Use this function's own tokenizer, not the unrelated global `t`.
    vocab_size = len(Tok.word_index) + 1
    print('vocab_size: '+str(vocab_size))

    # integer encode the documents
    encoded_docs = Tok.texts_to_sequences(doc)
    print(encoded_docs)

    # Computed locally so the returned value no longer depends on a global
    # `max_length` left behind by another cell.
    max_length = max(len(seq) for seq in encoded_docs)

    # each row flags which vocabulary words appear in the corresponding text
    sequence_matrix_encode = Tok.sequences_to_matrix(encoded_docs, mode='binary')
    print(sequence_matrix_encode[1:4])

    return sequence_matrix_encode, max_length, vocab_size

In [59]:
# Build the word-presence matrix for the toy corpus.
# NOTE(review): this rebinds the module-level max_length and vocab_size that
# other cells read.
sequence_matrix_encode, max_length, vocab_size = get_matrix_of_tokenSequence(docs)

vocab_size: 15
[[6, 2], [3, 1], [7, 4], [8, 1], [9], [10], [5, 4], [11, 3], [5, 1], [12, 13, 2, 14]]
[[0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


In [60]:
# Another frozen embedding layer built from the pretrained weight matrix.
# NOTE(review): e1 is not used by any visible cell; vocab_size and max_length
# are whatever the most recently run cell left in the kernel.
e1 = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)

# compare to one_hot 
### We cannot count the vocabulary size because we process the text sentence by sentence

In [14]:
# integer encode the documents with one_hot, one document at a time
# NOTE(review): this cell rebinds the module-level vocab_size, max_length and
# padded_docs used by other cells.
vocab_size = 15   # our guess, or the value calculated earlier

# NOTE: in the printed output different words share an index (e.g. 'Excellent!'
# and 'Weak' both encode to [7]), so these codes are not a one-to-one word index.
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

# pad documents to a max length of 4 words (zeros appended at the end)
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[2, 8], [7, 6], [13, 13], [1, 6], [7], [7], [7, 13], [1, 7], [7, 6], [8, 5, 8, 7]]
[[ 2  8  0  0]
 [ 7  6  0  0]
 [13 13  0  0]
 [ 1  6  0  0]
 [ 7  0  0  0]
 [ 7  0  0  0]
 [ 7 13  0  0]
 [ 1  7  0  0]
 [ 7  6  0  0]
 [ 8  5  8  7]]
