In [0]:
#libraries imported

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from keras.utils import to_categorical
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from sklearn.model_selection import train_test_split
from google.colab import drive

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
#reading dataset
dataset = pd.read_csv("https://raw.githubusercontent.com/cacoderquan/Sentiment-Analysis-on-the-Rotten-Tomatoes-movie-review-dataset/master/train.tsv", sep = '\t')
dataset = dataset.loc[:, 'Phrase':'Sentiment']
# dataset = dataset.drop(columns = ['PhraseId', 'SentenceId'])
documents = dataset.values.tolist()

In [0]:
porter = PorterStemmer()
lancaster = LancasterStemmer()

wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
punctuations = '-\'?:!.,;\"()'

remove_stopwords = True
useStemming = False
useLemma = True
removePuncs = True

for l in range(len(documents)):
  label = documents[l][1]
  tempReview = []
  for w in documents[l][0].split(' '):
    newWord = w
    if remove_stopwords and (w in stop_words):
      continue
    if removePuncs and (w in punctuations):
      continue
    if useStemming:
      newWord = porter.stem(newWord)
    if useLemma:
      newWord = wordnet_lemmatizer.lemmatize(newWord)
    tempReview.append(newWord)
    
    documents[l] = (' '.join(tempReview), label)

In [0]:
#bag of words . TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words = 'english', ngram_range= (1,1), max_features=1500)

In [0]:
#Splitting of raw data 
all_data = pd.DataFrame(documents, columns=['Phrase', 'Sentiment'])
x_train, x_test, y_train, y_test = train_test_split(all_data['Phrase'], all_data['Sentiment'], train_size = 0.7, shuffle = True, random_state = 2003)

In [0]:
#vectorization
X = vectorizer.fit_transform(all_data['Phrase'])
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)

In [0]:

x_train_np = x_train.toarray()
y_train_np = to_categorical(y_train)
x_test_np = x_test.toarray()
y_test_np = to_categorical(y_test)

In [0]:
x_train = np.expand_dims(x_train_np, axis=2)
x_test = np.expand_dims(x_test_np, axis=2)

In [0]:
#model
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D

batch_size = 128
# inputs = inputs
# outputs =outputs
#create model
model = Sequential()
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu', input_shape=(x_train_np.shape[1],1)))
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
model.add(Conv1D(filters = 128, kernel_size=1, activation='relu'))
model.add(MaxPooling1D(pool_size =2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(5, activation='softmax'))


In [77]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_14 (Conv1D)           (None, 1500, 128)         256       
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 1500, 128)         16512     
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 1500, 128)         16512     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 750, 128)          0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 96000)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               9600100   
_________________________________________________________________
dense_7 (Dense)              (None, 5)                

In [69]:
#training

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.fit(x_train, y_train_np, validation_data=(x_test, y_test_np), epochs = 10, batch_size = 128)

Train on 109242 samples, validate on 46818 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa318a0a898>

In [0]:
train_accuracy = model.evaluate(x_train, y_train_np, verbose=0) 
test_accuracy = model.evaluate(x_test, y_test_np, verbose=0)

In [71]:
print("train accuracy: %.2f%%" % (train_accuracy[1]*100))
print("test_accuracy: %.2f%%" % (test_accuracy[1]*100))

train accuracy: 72.52%
test_accuracy: 60.43%


In [0]:

# predict probabilities for test set
#yhat_probs = model.predict(x_test, verbose=0)
# predict crisp classes for test set
#yhat_classes = model.predict_classes(x_test, verbose=0)
model.save("1110790_1dconv_reg.h5")

In [0]:
from keras.models import load_model
model = load_model("1110790_1dconv_reg.h5")

# New Section

In [74]:
#testing
model.summary()
y_pred = model.predict_classes(x_test, batch_size=128, verbose=0)


Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_14 (Conv1D)           (None, 1500, 128)         256       
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 1500, 128)         16512     
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 1500, 128)         16512     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 750, 128)          0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 96000)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               9600100   
_________________________________________________________________
dense_7 (Dense)              (None, 5)                

In [0]:
ro=np.argmax(y_test_np, axis=1)


In [76]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

accuracy = accuracy_score(ro, y_pred)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(ro, y_pred, average='weighted')
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(ro, y_pred, average='weighted')
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(ro, y_pred, average='weighted')
print('F1 score: %f' % f1)

Accuracy: 0.604340
Precision: 0.581449
Recall: 0.604340
F1 score: 0.579907
