In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from html import unescape
from nltk.corpus import stopwords
import string
import tensorflow as tf
import tensorflow_hub as hub
from nltk.tokenize import RegexpTokenizer

In [None]:
data = pd.read_csv('IMDB Dataset.csv')
# Keeping only the neccessary columns
X_train, X_test, Y_train, Y_test = train_test_split(data.review,data.sentiment, test_size = 0.1, random_state = 42)

In [None]:
df=pd.DataFrame([],columns=['review','sentiment'])
df.review=X_train
df.sentiment=Y_train

In [None]:
def clean_sentence(sentence):
      emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+", flags=re.UNICODE)
      #HappyEmoticons
      emoticons_happy = set([
      ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
      ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
      '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
      'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
      '<3'
      ])
      # Sad Emoticons
      emoticons_sad = set([
      ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
      ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
      ':c', ':{', '>:\\', ';('
      ])
      #combine sad and happy emoticons
      emoticons = emoticons_happy.union(emoticons_sad)
      # remove mention
      sentence=re.sub('@\w*', '', sentence)
      # remove hashtag
      sentence = re.sub('#', '', sentence)
      # remove retweet
      sentence = re.sub('RT[\s]+', '', sentence) 
      # remove hyperlink
      sentence = re.sub('https?:\/\/\S+', '', sentence)
      # remove special string of html content
      sentence=BeautifulSoup(unescape(sentence)).text
      # remove icon
      for icon in emoticons:
          sentence=sentence.replace(icon,'')
      # remove emoji
      sentence=emoji_pattern.sub(r'', sentence)
      sentence = re.sub(r"what's", "what is ", sentence)
      sentence = re.sub(r"\'s", " ", sentence)
      sentence = re.sub(r"\'ve", " have ", sentence)
      sentence = re.sub(r"can't", "cannot ", sentence)
      sentence = re.sub(r"n't", " not ", sentence)
      sentence = re.sub(r"i'm", "i am ", sentence)
      sentence = re.sub(r"\'re", " are ", sentence)
      sentence = re.sub(r"\'d", " would ", sentence)
      sentence = re.sub(r"\'ll", " will ", sentence)
      sentence = re.sub(r"\'scuse", " excuse ", sentence)
      sentence = re.sub('\W', ' ', sentence)
      sentence = re.sub('\s+', ' ', sentence)
      sentence=sentence.strip(' ')
      
      return sentence

In [None]:
def sen_tokenize(sentence):
    #Emoji patterns
    tokenizer = RegexpTokenizer(r"[A-Za-z0-9]\w*(?:['?]\w+)?")
    tokens=[]
    tokens+=tokenizer.tokenize(clean_sentence(sentence))
    # lower case token
    tokens=[w.lower() for w in tokens]
    return tokens

In [None]:
def filter_tokens(sentence):
    stop_words = set(stopwords.words('english'))
    tokens=sen_tokenize(sentence)
    filtered_tokens=[]
    # remove punctation
    for w in tokens:
        if w not in string.punctuation and w not in stop_words and len(w)>=3 and not(w.isnumeric()) :
            filtered_tokens.append(w)
    return filtered_tokens

In [None]:
 import nltk
 nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
for idx in range(len(df)):
  df.iloc[idx,0]=np.array(filter_tokens(df.iloc[idx,0]))
for i in range(len(df)):
    df.iloc[i,0]=' '.join(df.iloc[i,0])

In [None]:
df.review

40877    recently started watching show say really made...
18057    return jedi often remembered wrong rather righ...
19066    remember loved movie came years old commodore ...
20525    know last reviewer talking show pure entertain...
5847     beginning excited see movie poster possibly fu...
                               ...                        
11284    shadow magic recaptures joy amazement first mo...
44732    found movie quite enjoyable fairly entertainin...
38158    avoid one terrible movie exciting pointless mu...
860      production quite surprise absolutely love obsc...
15795    decent movie although little bit short time pa...
Name: review, Length: 45000, dtype: object

In [None]:
max_features = 5000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df['review'].values)
X = tokenizer.texts_to_sequences(df['review'].values,)
X = pad_sequences(X)

# reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

# # Function takes a tokenized sentence and returns the words
# def sequence_to_text(list_of_indices):
#     # Looking up words in dictionary
#     words = [reverse_word_map.get(letter) for letter in list_of_indices]
#     return(words)

# # Creating texts 
# df.review = list(map(sequence_to_text, X))

In [None]:
# Load Pretrained Word2Vec
embed = hub.load("https://tfhub.dev/google/Wiki-words-250/2")

In [None]:
def get_word2vec_enc(reviews):
    """
    get word2vec value for each word in sentence.
    concatenate word in numpy array, so we can use it as RNN input
    """
    encoded_reviews = []
    for review in reviews:
        word2vec_embedding = np.array(embed(review))
        encoded_reviews.append(word2vec_embedding)
    return encoded_reviews


def get_padded_encoded_reviews(encoded_reviews):
    """
    for short sentences, we prepend zero padding so all input to RNN has same length
    """
    padded_reviews_encoding = []
    for enc_review in encoded_reviews:
        zero_padding_cnt = 640 - enc_review.shape[0]
        pad = np.zeros((zero_padding_cnt, 250))
        enc_review = np.concatenate((pad, enc_review), axis=0)
        padded_reviews_encoding.append(enc_review)
    return np.array(padded_reviews_encoding)


In [None]:
def get_weight_matrix(embedding, vocab):
	# total vocabulary size plus 0 for unknown words
	vocab_size = len(vocab) + 1
	# define weight matrix dimensions with all 0
	weight_matrix = np.zeros((vocab_size, 250))
	# step vocab, store vectors using the Tokenizer's integer mapping
	for word, i in vocab.items():
		weight_matrix[i] = embedding([word])
	return weight_matrix

In [None]:
weight_matrix=get_weight_matrix(embed, tokenizer.word_index)

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf_model = TfidfVectorizer(token_pattern=r"\w+['\w]*",ngram_range=(1,2),min_df=0.01,max_df=0.99) # specify parameters here
# tfidf_model.fit(data['review'])

In [None]:
# raw_data_tfidf=tfidf_model.transform(data['review'])
# vocab=tfidf_model.vocabulary_

# X=raw_data_tfidf
# X=X.toarray()
# X=[sorted(X[idx],reverse=True)[:20] for idx in range(len(X))]
# X=np.array(X)
# X = np.expand_dims(X, 2)

In [None]:
# print('Preparing embedding matrix')
# EMBEDDING_DIM = 250
# embedding_matrix = np.zeros((len(vocab), EMBEDDING_DIM))
# for word, i in vocab.items():
#         embedding_matrix[i] = np.array(embed([word]))

In [None]:
# trainsum=np.sum(raw_data_tfidf.toarray(),axis=1)
# w2v_tfidf=raw_data_tfidf.dot(embedding_matrix)
# for i in range(len(data)):
#     w2v_tfidf[i]=w2v_tfidf[i]/trainsum[i]

In [None]:
# X=[sorted(X[idx],reverse=True)[:20] for idx in range(len(X))]

In [None]:
# model = Sequential()
# model.add(LSTM(units=64,input_shape=(20, 1 ),dropout=0.2, recurrent_dropout=0.2))
# model.add(Dense(2,activation='softmax'))
# model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# print(model.summary())

In [None]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
Y=le.fit_transform(df['sentiment'])
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(33750, 909) (33750,)
(11250, 909) (11250,)


In [None]:
from sklearn.utils import resample
X_train=pd.DataFrame(data=X_train[0:,0:],
    index=[i for i in range(X_train.shape[0])],
    columns=['f'+str(i) for i in range(X_train.shape[1])])
Y_train=pd.DataFrame(data=Y_train)

df = pd.concat([X_train, Y_train], axis=1)


# Separate majority and minority classes
# Separate majority and minority classes
df_majority = df[df.iloc[:,18]==0]
df_minority = df[df.iloc[:,18]==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=1234) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
X_train=np.array(df_upsampled.iloc[:,:18])
Y_train=np.array(df_upsampled.iloc[:,18])

In [None]:
X_train

array([[   0,    0,    0, ...,  234,  500,  389],
       [   0,    0,    0, ...,    6,    1,  654],
       [   0,    0,    0, ...,    1,   50,  205],
       ...,
       [   0,    0,    0, ...,  976, 1468,  452],
       [   0,    0,    0, ...,  147,  131,   27],
       [   0,    0,    0, ...,  345,   13, 1820]], dtype=int32)

In [None]:
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
vocab_size = len(tokenizer.word_index) + 1



model = Sequential()
# embedding_layer = Embedding(vocab_size, 250, weights=[weight_matrix], input_length=909, trainable=False)
embedding_layer = Embedding(vocab_size, 128,  input_length=909)
model.add(embedding_layer)
model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
from keras.callbacks import ModelCheckpoint
batch_size = 32

checkpoint = ModelCheckpoint('model_best_weights.h5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max', period=1)

model.fit(X_train, Y_train, epochs =5,validation_data = (X_test, 
Y_test), callbacks=[checkpoint] ,batch_size=batch_size, verbose = 2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 909, 128)          12585728  
_________________________________________________________________
conv1d (Conv1D)              (None, 905, 64)           41024     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 452, 64)           0         
_________________________________________________________________
flatten (Flatten)            (None, 28928)             0         
_________________________________________________________________
dense (Dense)                (None, 10)                289290    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 12,916,053
Trainable params: 12,916,053
Non-trainable params: 0
____________________________________________

KeyboardInterrupt: ignored

In [None]:
X_train

40877    I recently started watching this show, and I h...
18057    "Return of the Jedi" is often remembered for w...
19066    I remember I loved this movie when it came out...
20525    I don't know what the last reviewer is talking...
5847     From the very beginning I was so excited to se...
                               ...                        
11284    `Shadow Magic' recaptures the joy and amazemen...
44732    I found this movie to be quite enjoyable and f...
38158    Avoid this one! It is a terrible movie. So wha...
860      This production was quite a surprise for me. I...
15795    This is a decent movie. Although little bit sh...
Name: review, Length: 45000, dtype: object

In [None]:
embed_dim = 128
# vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
embedding_layer = Embedding(2000, 250,input_length=909, trainable=False)
model.add(embedding_layer)

model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2,return_sequences=True,input_shape=(909,250)))
model.add(Dense(128,kernel_initializer='HeNormal', input_dim=196,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 909, 250)          500000    
_________________________________________________________________
lstm (LSTM)                  (None, 909, 196)          350448    
_________________________________________________________________
dense (Dense)                (None, 909, 128)          25216     
_________________________________________________________________
dense_1 (Dense)              (None, 909, 1)            129       
Total params: 875,793
Trainable params: 375,793
Non-trainable params: 500,000
_________________________________________________________________
None


In [None]:

from keras.callbacks import ModelCheckpoint
batch_size = 32

checkpoint = ModelCheckpoint('model_best_weights.h5', monitor='val_accuracy', verbose=2, save_best_only=True, mode='max', period=1)

model.fit(X_train, Y_train, epochs =10,validation_data = (X_test, 
Y_test), callbacks=[checkpoint] ,batch_size=batch_size, verbose = 2)





Epoch 1/10


In [None]:
from sklearn.metrics import roc_auc_score
from keras.models import load_model

# load model
model = load_model('model_best_weights.h5')

In [None]:
model.evaluate(test,y_t,batch_size=32,verbose=2)

157/157 - 1s - loss: 0.3012 - accuracy: 0.8818


[0.3012043833732605, 0.8817999958992004]

In [None]:
roc_auc_score(y_test,y_pred)

0.7402548840048839

In [None]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)

111/111 - 1s - loss: 0.6869 - accuracy: 0.8226


In [None]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for idx in range(len(X_test)):
    
    result = model.predict_classes(X_test[idx].reshape(1,X_test.shape[1]))[0][0]
   
    if result == Y_test[idx]:
        if result == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if Y_test[idx] == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")



pos_acc 64.58333333333334 %
neg_acc 85.32435306628855 %




array([[1]], dtype=int32)