In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in the ../input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

import re
from bs4 import BeautifulSoup
from nltk.corpus import RegexpTokenizer
from nltk.corpus import stopwords


['unlabeledTrainData.tsv', 'labeledTrainData.tsv', 'testData.tsv', 'sampleSubmission.csv']


In [2]:
# Read the labelled training file and the test file; both are tab-separated
# and decoded as UTF-8.
train, test = (
    pd.read_csv(tsv_path, sep='\t', encoding='utf-8')
    for tsv_path in ('../input/labeledTrainData.tsv', '../input/testData.tsv')
)


In [3]:
# Pull the 'review' column out of the training frame for cleaning.
review = train.loc[:, 'review']

In [4]:
# Remove HTML tags from every training review.
#
# The pattern matches one tag at a time: '<', any run of characters that are
# neither '<' nor '>', then '>'. A greedy '<.*>' would instead swallow
# everything between the first '<' and the last '>' on a line, deleting the
# review text that sits between two tags.
# Each tag is replaced by a space so the words on either side of it stay
# separate tokens.
review_text = [re.sub(r'<[^<>]+>', " ", raw_review) for raw_review in review]

In [5]:
# Tokenise each review into runs of word characters (\w+); anything that is
# not a word character, punctuation included, is left out of the tokens.
filter_punc = RegexpTokenizer(r'\w+')
list_filter = list(map(filter_punc.tokenize, review_text))
    

In [6]:
# Lower-case every token of every tokenised review.
review_lower = [[token.lower() for token in tokens] for tokens in list_filter]

In [7]:
# Lemmatization: create the WordNet lemmatizer instance that later cells use
# through lemmatizer.lemmatize(word).
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


In [8]:
# Drop English stop words and purely numeric tokens, then lemmatize whatever
# remains. The result keeps one token list per review.
stop_words = set(stopwords.words('english'))
text = [
    [
        lemmatizer.lemmatize(token)
        for token in tokens
        if not (token in stop_words or token.isdigit())
    ]
    for tokens in review_lower
]

In [9]:
# Turn each cleaned token list back into a single space-separated string,
# which is the form passed to the Keras tokenizer afterwards.
# str.join builds each string in one pass and emits no leading space, unlike
# growing the string with repeated '+' inside a loop.
string_review = [" ".join(words) for words in text]
    

In [10]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

max_features = 6000  # vocabulary cap handed to the tokenizer as num_words

# Build the word index from the cleaned training reviews ...
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(string_review)

# ... then encode those same reviews as integer sequences and bring every
# sequence to a fixed length of 130.
list_seq = tokenizer.texts_to_sequences(string_review)
X = pad_sequences(list_seq, maxlen=130)

Using TensorFlow backend.


In [11]:
# Training labels taken from the 'sentiment' column.
# Kept as a NumPy array rather than a Python list: Model.fit documents array
# inputs, and validation_split in newer Keras releases rejects plain lists.
y = np.asarray(train['sentiment'])

In [12]:
from keras.layers import Dense, LSTM, Bidirectional, GlobalMaxPool1D, Embedding, Dropout
from keras.models import Sequential 

# The embedding's input dimension has to cover every index the tokenizer can
# emit, so it is tied to the tokenizer's vocabulary cap (max_features) rather
# than repeating the literal and risking the two drifting apart.
max_feat = max_features
embd_size = 128

# Embedding -> bidirectional LSTM (per-timestep outputs) -> max over time ->
# small dense layer with light dropout -> single sigmoid unit.
model = Sequential()
model.add(Embedding(max_feat, embd_size))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 100
epochs = 3
# 20% of the training rows are held out for validation during fitting.
model.fit(X, y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f43d834eb38>

In [13]:
# Save the fitted model to 'ReviewPredictor.h5' (path relative to the current
# working directory).
model.save('ReviewPredictor.h5')

In [14]:
# Cleaning the test reviews: start from the raw 'review' column.
reviews_y = test['review']
# Remove HTML tags one tag at a time ('<', any run of characters that are
# neither '<' nor '>', then '>'). A greedy '<.*>' would delete everything
# between the first '<' and the last '>' on a line, wiping out the review text
# between two tags. Each tag becomes a space so neighbouring words stay
# separate tokens.
review_text_y = [re.sub(r'<[^<>]+>', " ", raw_review) for raw_review in reviews_y]


In [15]:
# Tokenise the test reviews with the existing filter_punc tokenizer, then
# lower-case every token.
list_filter_y = list(map(filter_punc.tokenize, review_text_y))
review_lower_y = [[token.lower() for token in tokens] for tokens in list_filter_y]

    

In [16]:
# Test reviews: drop stop words and purely numeric tokens, lemmatize the rest.
# Note that this rebinds `text`, replacing the training token lists.
text = [
    [
        lemmatizer.lemmatize(token)
        for token in tokens
        if not (token in stop_words or token.isdigit())
    ]
    for tokens in review_lower_y
]

In [17]:
# Convert each cleaned test token list into one space-separated string.
# Note that this rebinds `string_review`, replacing the training strings.
# str.join builds each string in one pass and emits no leading space, unlike
# growing the string with repeated '+' inside a loop.
string_review = [" ".join(words) for words in text]

In [18]:
# Encode the cleaned test reviews with the tokenizer fitted earlier, then bring
# each sequence to the same length (130) used for the training matrix.
list_tokenized_test = tokenizer.texts_to_sequences(string_review)
X_test = pad_sequences(sequences=list_tokenized_test, maxlen=130)

In [19]:
# Score the test matrix, then flag every output above 0.5 as True.
prediction = model.predict(X_test)
y_pred = np.greater(prediction, 0.5)

In [20]:
# Flatten the boolean predictions to one dimension before converting to plain
# Python ints. Calling int() on each length-1 row relies on an array-to-scalar
# conversion that NumPy has deprecated for arrays with ndim > 0.
y_predictions = np.asarray(y_pred).reshape(-1).astype(int).tolist()

In [21]:
# Assemble the submission table: one row per test review with its id and the
# predicted sentiment label.
dataframe = {'id':test['id'], 'sentiment':y_predictions}
submission = pd.DataFrame(dataframe)
# index=False keeps the DataFrame's row index out of the file, so the CSV
# holds exactly the two columns 'id' and 'sentiment'.
submission.to_csv('submissionData.csv', index=False)