In [1]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import nltk
import re
import string
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import KeyedVectors

import warnings
  
warnings.filterwarnings(action = 'ignore')

In [2]:
data = pd.read_csv('../input/training1600000processednoemoticoncsv/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [3]:
data.columns = ['Label', 'ids','date','flag','user','Text']
data['Label'][data['Label']==4]=1
del data['ids']
del data['date']
del data['flag']
del data['user']

#data.drop('ids',axis=1)
#data.drop('date',axis=1)
#data.drop('flag',axis=1)
#data.drop('user',axis=1)

In [4]:
data.head()


Unnamed: 0,Label,Text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [5]:
data.Label.unique()


array([0, 1])

In [6]:
data = data[:-600000]
data=data[600000:]
data.shape
#(39999,2)


(399999, 2)

In [7]:
pos = []
neg = []
for l in data.Label:
    if l == 0:
        pos.append(0)
        neg.append(1)
    elif l == 1:
        pos.append(1)
        neg.append(0)

In [8]:
data['Pos']= pos
data['Neg']= neg
#data_neg = data[data['Label'] == 0]
#data_pos = data[data['Label'] == 1]
#data['Pos']=data['Label'][:1000]
#data['Neg']=data['Label'][:1000]

In [9]:
data.head()


Unnamed: 0,Label,Text,Pos,Neg
600000,0,@kikialakiki me too! and got socks on,0,1
600001,0,must stop watching the machinist as i fall asl...,0,1
600002,0,Proper want to be on LiveLounge one day,0,1
600003,0,FREE Fourstar jacket courtesy of Spike Jonze! ...,0,1
600004,0,Ohh boy. Today is going to be interesting. Why...,0,1


In [10]:
def remove_punct(text):
    text_nopunct = ''
    text_nopunct = re.sub('['+string.punctuation+']', '', text)
    return text_nopunct

data['Text_Clean'] = data['Text'].apply(lambda x: remove_punct(x))

In [11]:
from nltk import word_tokenize, WordNetLemmatizer
tokens = [word_tokenize(sen) for sen in data.Text_Clean]

In [12]:
def lower_token(tokens): 
    return [w.lower() for w in tokens]    
    
lower_tokens = [lower_token(token) for token in tokens]

In [13]:
#from nltk.corpus import stopwords
#stoplist = stopwords.words('english')

nltk.download("stopwords")
stop_words = set(nltk.corpus.stopwords.words('english'))
exclude_words = set(("never", "not","nor"))
new_stop_words = stop_words.difference(exclude_words)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
def remove_stop_words(tokens): 
    return [word for word in tokens if word not in new_stop_words]

In [15]:
filtered_words = [remove_stop_words(sen) for sen in lower_tokens]

In [16]:
result = [' '.join(sen) for sen in filtered_words]


In [17]:
data['Text_Final'] = result


In [18]:
data['tokens'] = filtered_words


In [19]:
data = data[['Text_Final', 'tokens', 'Label', 'Pos', 'Neg']]


In [20]:
data[:4]


Unnamed: 0,Text_Final,tokens,Label,Pos,Neg
600000,kikialakiki got socks,"[kikialakiki, got, socks]",0,0,1
600001,must stop watching machinist fall asleepcant h...,"[must, stop, watching, machinist, fall, asleep...",0,0,1
600002,proper want livelounge one day,"[proper, want, livelounge, one, day]",0,0,1
600003,free fourstar jacket courtesy spike jonze real...,"[free, fourstar, jacket, courtesy, spike, jonz...",0,0,1


In [21]:
data_train, data_test = train_test_split(data, test_size=0.25, random_state=42)


In [22]:
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

2380884 words total, with a vocabulary size of 248986
Max sentence length is 51


In [23]:
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

796350 words total, with a vocabulary size of 110277
Max sentence length is 53


In [24]:

from nltk.data import find
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
models = KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
word2vec=models

#word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
#word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [25]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

In [26]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)


In [27]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

In [28]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 248955 unique tokens.


In [29]:
num_words=len(TRAINING_VOCAB)
print(num_words)

248986


In [30]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [31]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(248956, 300)


In [32]:
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [33]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [34]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)
    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [35]:
label_names = ['Pos', 'Neg']


In [36]:
y_train = data_train[label_names].values


In [37]:
x_train = train_cnn_data
y_tr = y_train

In [38]:
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 50, 300)      74686800    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 49, 200)      120200      embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 48, 200)      180200      embedding[0][0]                  
______________________________________________________________________________________________

In [39]:
num_epochs = 8
batch_size = 38

#num_epochs = 15
#batch_size = 25

In [40]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [41]:
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)
#input=["i am sad"]
#[n p]
#[0.56 0.9]




In [42]:
labels = [1, 0]


In [43]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [44]:
print(test_cnn_data[:10])

[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     4 19936  6155  1277   287   190     6  1915   108  2572
    169   115]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
   1693  1466]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0 65991    55    13    22  7577
    996  2067]
 [    0     0     0     0     0     0     0     0     0     0     0

In [45]:
from sklearn.metrics import classification_report

acc=sum(data_test.Label==prediction_labels)/len(prediction_labels)

y_pred = model.predict(test_cnn_data, batch_size=1024, verbose=1)
y_pred_bool = []
for p in y_pred:
    y_pred_bool.append(labels[np.argmax(p)])
    
y_test = data_test['Label']

# y_test are actual values
# y_pred_bool are predicted values

print(classification_report(y_test, y_pred_bool))

              precision    recall  f1-score   support

           0       0.76      0.78      0.77     50025
           1       0.77      0.75      0.76     49975

    accuracy                           0.77    100000
   macro avg       0.77      0.77      0.77    100000
weighted avg       0.77      0.77      0.77    100000



In [46]:
data_test.Label.value_counts()

0    50025
1    49975
Name: Label, dtype: int64

In [47]:
import joblib
import pickle
model.save("mymodel.h5")

In [48]:
from keras.models import load_model
m=load_model("mymodel.h5")

In [49]:
pred=["i am happy"]

pred_sequences = tokenizer.texts_to_sequences(pred)
pred_cnn_data = pad_sequences(pred_sequences, maxlen=MAX_SEQUENCE_LENGTH)

pred_res=model.predict(pred_cnn_data)
pred_labels=[]
for p in pred_res:
    #print(p)
    pred_labels.append(labels[np.argmax(p)])

print(pred_labels)

if pred_labels[0]==0:
    print('Negative')
    
if pred_labels[0]==1:
    print('Positive')

[1]
Positive
