In [0]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
import bz2
import re
from tqdm import tqdm
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dropout,Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
# Any results you write to the current directory are saved as output.

In [0]:
def reviewToY(review):
    """Return the one-hot label encoded in the first token of a raw line.

    The text before the first space is treated as the label token:
    '__label__1' maps to [1, 0]; anything else maps to [0, 1].
    """
    label_token = review.partition(' ')[0]
    if label_token == '__label__1':
        return [1, 0]
    return [0, 1]
def reviewToX(review):
    """Extract and normalise the review text from a raw '<label> <text>' line.

    Steps:
      1. Drop everything up to and including the first space (the label token).
      2. Strip one trailing newline, if present.
      3. Lower-case the text.
      4. Replace every digit with '0'.
      5. If the text looks like it contains a link, replace each
         whitespace-delimited token ending in '.xxx' (three lower-case
         letters) with the placeholder '<url>'.

    Args:
        review: one raw line, e.g. '__label__2 Great product\\n'.

    Returns:
        The cleaned review text. A line without any text after the label
        yields an empty string.
    """
    # partition() never raises: a label-only line gives '' instead of the
    # IndexError that split(' ', 1)[1] produced.
    text = review.partition(' ')[2]
    # Only remove the line terminator when it is really there; a blind [:-1]
    # chops a real character off a final line that has no trailing newline.
    if text.endswith('\n'):
        text = text[:-1]
    text = text.lower()
    # Raw string avoids the invalid-escape warning of the non-raw '\d'.
    text = re.sub(r'\d', '0', text)
    if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
        text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
    return text
def splitReviewsLabels(lines, max_chars=512):
    """Turn raw labelled lines into parallel lists of texts and labels.

    Args:
        lines: iterable of raw lines; each one is passed to reviewToX for
            the cleaned text and to reviewToY for the one-hot label.
        max_chars: maximum number of characters kept from each cleaned
            review. Defaults to 512, the previously hard-coded cut-off;
            pass None to keep the full text.

    Returns:
        A tuple (reviews, labels) of two lists in the same order as `lines`.
    """
    reviews = []
    labels = []
    # tqdm only wraps the iterable to show a progress bar.
    for line in tqdm(lines):
        reviews.append(reviewToX(line)[:max_chars])
        labels.append(reviewToY(line))
    return reviews, labels

In [0]:
# Read and decode every line of the bz2-compressed review file.
# The context manager closes the handle as soon as the lines are read
# (previously the BZ2File stayed open for the lifetime of the kernel).
with bz2.BZ2File('/content/drive/My Drive//train/test.ft.txt.bz2') as train_file:
    train_lines = [x.decode('utf-8') for x in train_file.readlines()]
# Split each raw line into its cleaned text and one-hot label.
reviews_train, y_train = splitReviewsLabels(train_lines)
# Shuffle texts and labels in unison. The seed keeps the run reproducible;
# without it the seeded train/test split further down still changes per run.
reviews_train, y_train = shuffle(reviews_train, y_train, random_state=43)
y_train = np.array(y_train)
print(y_train[1])


In [0]:
# Peek at one decoded raw line (label token followed by the review text)
# to sanity-check the input format.
print(train_lines[7])


In [0]:
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the
# input dimension of the Embedding layer.
max_features = 8192
# Passed as `maxlen` to pad_sequences for every tokenised review.
maxlen = 128
# Output dimension of the Embedding layer.
embed_size = 64

In [0]:
# Hold out 20% of the reviews for evaluation; random_state fixes the split.
# NOTE(review): this rebinds y_train to the 80% subset, so re-running this cell
# without first re-running the loading cell pairs reviews_train with a shorter
# label array — confirm the cells are always run top to bottom.
X_train,X_test,y_train,y_test=train_test_split(reviews_train,y_train,train_size=0.80,test_size=0.20,random_state=43)

In [0]:
# Fit the vocabulary on the TRAINING texts only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
print("tokenizing done!!")
token_train = tokenizer.texts_to_sequences(X_train)
print("sequencing done!")
X_train = pad_sequences(token_train, maxlen=maxlen)
print("padding done!!")

# Encode the test texts with the SAME fitted tokenizer. Fitting a second
# Tokenizer on X_test (as before) assigns different integer ids to the same
# words, so the model would be evaluated on inputs it cannot interpret. It
# also left `tokenizer` bound to the test vocabulary for later predictions.
token_test = tokenizer.texts_to_sequences(X_test)
print("sequencing done!")
X_test = pad_sequences(token_test, maxlen=maxlen)
print("padding  test data done!!")
# NOTE(review): X_train / X_test are rebound from text lists to padded integer
# arrays here, so this cell must not be re-run without re-running the split.
X_train[0]



In [0]:

def createLSTM():
    """Build the (uncompiled) stacked-LSTM classifier.

    Architecture, in order:
      * Embedding(max_features, embed_size)
      * LSTM(256) returning the full sequence, feeding LSTM(512)
      * Dense(500, relu) -> Dropout(0.2) -> Dense(100, relu) -> Dropout(0.2)
      * Dense(2, softmax) output

    The labels are mutually exclusive one-hot pairs ([1, 0] / [0, 1]), so the
    output layer uses softmax to produce a distribution over the two classes.
    The previous sigmoid scored the two units independently, so the outputs
    did not have to sum to 1. For two one-hot classes, binary cross-entropy
    on a softmax output equals categorical cross-entropy, so the existing
    compile settings remain valid.

    Returns:
        A keras Sequential model; the caller is responsible for compiling it.
    """
    model=Sequential()
    model.add(Embedding(max_features,embed_size))
    # return_sequences=True so the second LSTM receives one vector per timestep.
    model.add(LSTM(256,return_sequences=True))
    model.add(LSTM(512))
    model.add(Dense(500,activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(100,activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(2,activation='softmax'))
    return model

In [0]:
# Instantiate the network and print its layer-by-layer summary.
model=createLSTM()
model.summary()

In [0]:
# Sanity check: show the shapes of the training inputs and labels before fitting.
np.shape(X_train),np.shape(y_train)

In [0]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
weight_path="early_weights.hdf5"
# Save a checkpoint every time the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Stop once the validation loss has not improved for 3 consecutive epochs.
early_stopping=EarlyStopping(patience=3,monitor='val_loss',mode="min",verbose=1)
callbackslist = [checkpoint, early_stopping]
# Keep the History object so the loss/accuracy curves can be inspected later.
history = model.fit(X_train,y_train,batch_size=1024,epochs=20, shuffle = True, validation_split=0.20,verbose=1,callbacks=callbackslist)
# When early stopping fires, the in-memory weights come from `patience` epochs
# after the best one. Reload the best checkpoint so evaluation and prediction
# use the weights with the lowest validation loss (previously it was saved
# but never read back).
model.load_weights(weight_path)

In [0]:
# Score the model on the held-out split; the result holds the loss followed by
# the metric requested at compile time ('acc').
model.evaluate(X_test,y_test,batch_size=1024)

In [0]:

# Ad-hoc check: run a single hand-written sentence through the model.
test=["it is the right choice"]
print(test)
print('RESULT:')
# Encode the sentence with the notebook's tokenizer, pad it to the model's
# input length, then predict. Column 0 of the output corresponds to
# '__label__1' and column 1 to every other label (see reviewToY).
test_sequences = tokenizer.texts_to_sequences(test)
test_padded = pad_sequences(test_sequences, maxlen=maxlen)
pred = model.predict(test_padded)
print(pred)