In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# "header=0" indicates that the first line of the file contains column names, 
# "delimiter="\t" indicates that the fields are separated by tabs
train = pd.read_csv('labeledTrainData.tsv', header=0 , delimiter='\t')
test = pd.read_csv('testData.tsv', header=0, delimiter='\t')
print(train.shape)
print(train.columns.values)
train.head()

(25000, 3)
['id' 'sentiment' 'review']


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
print(test.shape)
print(test.columns.values)
test.head()

(25000, 2)
['id' 'review']


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [4]:
print(train['review'][0])

With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally star

# Data Cleaning and Text Preprocessing

In [5]:
from bs4 import BeautifulSoup # for removing HTML
import re # for using regular expressions
import nltk
nltk.download('stopwords') # Download text data sets, including stop words
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yenni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def review_to_words(raw_review):
    # Remove HTML
    review = BeautifulSoup(raw_review).get_text()
    # Remove anything not letters
    review = re.sub('[^a-zA-Z]',' ', review)
    # Convert to lower case, split into individual words
    review = review.lower().split()
    
    # Remove stopwords
    # Convert a list of stopwords to a set (compute faster)
    stop_words = set(stopwords.words('English'))
    modified_review = [w for w in review if not w in stop_words ]
    
    # Join the words back into one string separated by space, and return the result.
    return(" ".join(modified_review))

In [7]:
num_review = train['review'].size
num_review

25000

In [8]:
new_review = []

for i in range(0, num_review):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print("Review %d of %d" % ( i+1, num_review))
        
    new_review.append(review_to_words(train['review'][i]))

Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 25000 of 25000


# Model

In [80]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(new_review)
list_tokenized_train = tokenizer.texts_to_sequences(new_review)

maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = train['sentiment']

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
#model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(64, activation="relu")) #20
model.add(Dropout(0.05)) #0.05
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 100
epochs = 3
model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.1) #0.2

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x283e3218358>

# Submission

In [81]:
# Create an empty list and append the clean reviews one by one
num_test_reviews = len(test["review"])
clean_test_reviews = [] 

for i in range(0,num_test_reviews):
    if( (i+1) % 1000 == 0 ):
        print("Review %d of %d" % (i+1, num_test_reviews))
    clean_review = review_to_words( test["review"][i] )
    clean_test_reviews.append( clean_review )



Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Review 25000 of 25000


In [82]:
list_tokenized_test = tokenizer.texts_to_sequences(clean_test_reviews)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)
prediction = model.predict(X_te)
y_pred = (prediction > 0.5)

'''
from sklearn.metrics import f1_score, confusion_matrix
print('F1-score: {0}'.format(f1_score(y_pred, test["sentiment"])))
print('Confusion matrix:')
confusion_matrix(y_pred, y_test)
'''


'\nfrom sklearn.metrics import f1_score, confusion_matrix\nprint(\'F1-score: {0}\'.format(f1_score(y_pred, test["sentiment"])))\nprint(\'Confusion matrix:\')\nconfusion_matrix(y_pred, y_test)\n'

In [83]:
y_pred.astype(int)

array([[1],
       [0],
       [1],
       ...,
       [0],
       [1],
       [1]])

In [84]:
from pandas import Series
print(type(prediction))
print(type(y_pred))
print(y_pred.shape)
df_y_pred = pd.DataFrame(y_pred.astype(int))
print(type(test))
print(type(test["id"]))
df_y_pred.head()

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(25000, 1)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


Unnamed: 0,0
0,1
1,0
2,1
3,1
4,1


In [85]:
# Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
output = pd.DataFrame( data={"id":test["id"], "sentiment":df_y_pred[0]} )

# Use pandas to write the comma-separated output file
output.to_csv( "submission_deep_learning.csv", index=False)