In [1]:
# import library
import pandas as pd
import string
import emoji
import re
import nltk
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# import dataset
data = pd.read_csv("IMDB Dataset.csv", on_bad_lines='skip')

In [3]:
# display some data sample
data.sample(10)

Unnamed: 0,review,sentiment
32328,It's been over 30 years now but I still rememb...,negative
44102,When I watched this movie it was an afternoon ...,negative
43239,And I love it!!! Wonder Showzen will pick up a...,positive
29993,"Wow, this was another good spin off of the ori...",positive
18590,Drew Barrymore keeps seeing her alter-ego all ...,negative
27163,When you come across a gem of a movie like thi...,positive
13850,I have not seen this movie! At least not in it...,positive
1557,This movie is one of the most provocative Jesu...,positive
22034,I am a big Gone With The Wind nut but I was di...,positive
23293,"A trio of low-life criminals, led by Matt Dill...",negative


In [4]:
data['label'] = data['sentiment'].replace(['positive', 'negative'],['1', '0'])
data.sample(n=10)

Unnamed: 0,review,sentiment,label
21089,First I liked that movie. It seemed to me a ni...,negative,0
20050,This is a very cheaply made werewolf flick. Th...,negative,0
32716,"At initial thought, the concept of this show s...",positive,1
39761,"I loved this movie, I'll admit it. This has to...",negative,0
47829,Never viewed this film and enjoyed the singing...,positive,1
44113,While the story of a troubled kid turning to b...,positive,1
14543,"By no means a masterpiece, and far from Errol ...",positive,1
16944,So I was sick with the flu one Saturday and th...,negative,0
2358,"I'm not usually given to hyperbole, but after ...",negative,0
32437,My original review of this film was simply the...,negative,0


In [5]:
data['label'].value_counts()

label
1    25000
0    25000
Name: count, dtype: int64

#### Preprocessing Function

In [6]:
#Text Preprocessing for sentiment analysis
stopwords = nltk.corpus.stopwords.words('english')

def preprocess(text):
    
    # Generate list of words in the review (hashtags and punctuation removed)
    text = re.sub(r'https?://\S+|www\.\S+|@\w+|#\w+', '', text)
    
    # Remove number
    text = re.sub(r'\d+', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Convert emoji to text
    text = emoji.demojize(text)
    
    # Remove puctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    
    # Tokenize the text
    text = word_tokenize(text)
    
    # Remove empty and non-alphabetical tokens, and stopwords
    text = [t for t in text if t.isalpha() and t not in stopwords]
    
    # Replace negation token
    replacer = AntonymReplacer()
    text = replacer.replace_negations(text)
    
    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(w) for w in text]
    
    return text
    

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        antonyms = set()

        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())

        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None

    def replace_negations(self, sent):
        i, l = 0, len(sent)
        words = []

        while i < l:
            word = sent[i]

            if word == 'not' and i+1 < l:
                ant = self.replace(sent[i+1])

                if ant:
                    words.append(ant)
                    i += 2
                    continue

            words.append(word)
            i += 1

        return words

In [7]:
print(data['review'].iloc[12])
print(preprocess(data['review'].iloc[12]))

So im not a big fan of Boll's work but then again not many are. I enjoyed his movie Postal (maybe im the only one). Boll apparently bought the rights to use Far Cry long ago even before the game itself was even finsished. <br /><br />People who have enjoyed killing mercs and infiltrating secret research labs located on a tropical island should be warned, that this is not Far Cry... This is something Mr Boll have schemed together along with his legion of schmucks.. Feeling loneley on the set Mr Boll invites three of his countrymen to play with. These players go by the names of Til Schweiger, Udo Kier and Ralf Moeller.<br /><br />Three names that actually have made them selfs pretty big in the movie biz. So the tale goes like this, Jack Carver played by Til Schweiger (yes Carver is German all hail the bratwurst eating dudes!!) However I find that Tils acting in this movie is pretty badass.. People have complained about how he's not really staying true to the whole Carver agenda but we on

#### SVM using TfidVectorizer

In [8]:
# initialize tfidf vectorizer
vec = TfidfVectorizer(
    analyzer=preprocess,
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    use_idf=True
)

# fit reviews data into vectorizer
tfidf_model = vec.fit(data['review'])
train_vec = vec.transform(data['review'])

In [9]:
# save trained vectorizer
vectorizer_file = 'tfid_vectorizer.sav'
pickle.dump(tfidf_model, open(vectorizer_file, 'wb'))

In [10]:
# split data to train and test
SEED = 4000
X_train, X_test, y_train, y_test = train_test_split(train_vec, data.label, test_size = 0.2, random_state = SEED)

In [11]:
# display the size of data
print('X_train shape: {}'.format(X_train.shape))
print('X_test shape: {}'.format(X_test.shape))
print('y_train shape: {}'.format(y_train.shape))
print('y_test shape: {}'.format(y_test.shape))

X_train shape: (40000, 68317)
X_test shape: (10000, 68317)
y_train shape: (40000,)
y_test shape: (10000,)


In [12]:
#Create a svm Classifier
tfidfvec_svm = SVC(random_state=39)
#Train the model using the training sets
tfidfvec_svm.fit(X_train, y_train.values.ravel())

In [13]:
# evaluate trained svm on test data
print('Result of SVM with TFIDF Vectorizer\n')
predictions = tfidfvec_svm.predict(X_test)
print(classification_report(predictions, y_test))
print('\n')
print('Confusion matrix: \n', confusion_matrix(predictions, y_test))
print('\n')
print('Accuracy score: ', accuracy_score(predictions, y_test))

Result of SVM with TFIDF Vectorizer



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


              precision    recall  f1-score   support

           0       0.89      0.91      0.90      4881
           1       0.91      0.89      0.90      5119

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



Confusion matrix: 
 [[4444  437]
 [ 562 4557]]


Accuracy score:  0.9001


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [14]:
# save model
with open('tfidf_svm', 'wb') as f:
    pickle.dump(tfidfvec_svm, f)

#### SVM using Count Vectorizer

In [15]:
# initialize count vectorizer
countvec = CountVectorizer(
    analyzer = preprocess,
    max_features = 20000,
    min_df = 2,
    max_df = 0.9,
    binary = True
)

# fit reviews data into count vectorizer
count_model = countvec.fit(data['review'])
train_countvec = countvec.transform(data['review'])

In [16]:
# split data into train and test data
SEED = 4000
X_train, X_test, y_train, y_test = train_test_split(train_countvec, data.label, test_size = 0.2, random_state = SEED)

In [17]:
# save trained count vectorizer
count_vectorizer_file = 'count_vectorizer.sav'
pickle.dump(count_model, open(count_vectorizer_file, 'wb'))

In [21]:
# train count svm using train data
countvec_svm = SVC(random_state = 39)
countvec_svm.fit(X_train, y_train.values.ravel())

In [22]:
# Evaluate trained count svm using test dataS
print("Result of SVM with Count Vectorizer\n")
predictions = countvec_svm.predict(X_test)
print(classification_report(predictions, y_test))
print('\n')
print('Confusion matrix: \n', confusion_matrix(predictions, y_test))
print('\n')
print('Accuracy score: ', accuracy_score(predictions, y_test))

Result of SVM with Count Vectorizer

              precision    recall  f1-score   support

           0       0.87      0.90      0.89      4852
           1       0.90      0.88      0.89      5148

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



Confusion matrix: 
 [[4366  486]
 [ 640 4508]]


Accuracy score:  0.8874


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [23]:
# save model
with open('count_svm', 'wb') as f:
    pickle.dump(countvec_svm, f)