In [1]:
# importing the necessary packages
import pandas as pd
import numpy as np

In [2]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [4]:
from sklearn.model_selection import train_test_split

In [35]:
from sklearn.metrics import classification_report, confusion_matrix

In [71]:
import os
import re
import pickle
from tqdm import tqdm

In [11]:
# Load the movie-review dataset from CSV (read as UTF-8) into a DataFrame,
# then preview its first rows.
df = pd.read_csv('data/movie_data.csv', encoding='utf-8')
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


#### Preprocessing

In [12]:
# Inspect the raw text of the review stored at row label 0.
df.loc[0,'review']

'In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich famil

In [13]:
# Summarize the frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
review       50000 non-null object
sentiment    50000 non-null int64
dtypes: int64(1), object(1)
memory usage: 781.3+ KB


In [14]:
# Keep the first 50 characters of the review at row label 0 as a small sample.
sample_txt = df.loc[0,'review'][0:50]

In [15]:
# Display the sample text.
sample_txt

'In 1974, the teenager Martha Moxley (Maggie Grace)'

In [38]:
class Preprocess:
    """Text-cleaning pipeline for review strings.

    The steps, in the order applied by ``preprocess_text``:
    strip tags/punctuation -> lowercase -> whitespace tokenize ->
    drop English stop words -> Porter-stem -> re-join with single spaces.
    """

    def __init__(self):
        self.porter = PorterStemmer()
        # Held as a set so each membership test in remove_stop_words is O(1)
        # instead of a scan over the whole stop-word list.
        self.stop_words = set(stopwords.words('english'))

    # 1
    def remove_special_char(self, text):
        """Replace tag-like spans (``<...>``) and the characters ``. , & : '``
        with a single space each.

        ``text`` is coerced with ``str()`` first, so non-string input is
        accepted. Returns the cleaned string.
        """
        reg_pattern = re.compile('<.*?>|[.,&:\\\']')
        return reg_pattern.sub(' ', str(text))

    # 2
    def to_lower_case(self, text):
        """Return ``text`` lowercased."""
        return text.lower()

    # 3
    def tokenizer_simple(self, text):
        """Split ``text`` on runs of whitespace and return the token list."""
        return text.split()

    # 4
    def remove_stop_words(self, text):
        """Return the tokens of ``text`` (an iterable of tokens) that are not
        in ``self.stop_words``, preserving order.
        """
        return [word for word in text if word not in self.stop_words]

    # 5
    def stem_porter(self, text, make_token_first=False):
        """Porter-stem every token and return the stems as a list.

        text: an iterable of tokens, or a raw string when
            ``make_token_first`` is True. A raw string passed with
            ``make_token_first=False`` is iterated character by character.
        make_token_first: when True, ``text`` is first split with
            ``tokenizer_simple``.
        """
        # Must go through ``self``: ``tokenizer_simple`` is a method, not a
        # module-level function, so a bare call raises NameError.
        tokens = self.tokenizer_simple(text) if make_token_first else text
        return [self.porter.stem(token) for token in tokens]

    # final
    def preprocess_text(self, text):
        """Run the whole pipeline on ``text`` and return the stemmed tokens
        joined by single spaces.
        """
        cleaned = self.remove_special_char(text)
        lowered = self.to_lower_case(cleaned)
        tokens = self.tokenizer_simple(lowered)
        kept = self.remove_stop_words(tokens)
        return ' '.join(self.stem_porter(kept))

In [17]:
# Build the preprocessing pipeline and try it on the short sample text;
# the last line displays the cleaned result.
preprocess = Preprocess()
processed_sample_text = preprocess.preprocess_text(sample_txt)
processed_sample_text

'1974 teenag martha moxley (maggi grace)'

In [21]:
# First 20 rows of the raw frame.
# NOTE(review): `test` is not referenced again in the visible cells — confirm it is still needed.
test = df.head(20)

In [22]:
# Clean every review in row order, showing a progress bar while doing so.
rev = [preprocess.preprocess_text(raw_review) for raw_review in tqdm(df['review'])]

100%|███████████████████████████████████████████████████████████████████████████| 50000/50000 [03:17<00:00, 252.74it/s]


In [23]:
# Convert the cleaned texts to a NumPy array and store them as a new
# 'reviews' column next to the raw 'review' column.
rev = np.array(rev)
df['reviews'] = rev

In [24]:
# The raw text is no longer needed once the cleaned 'reviews' column exists.
df.drop(columns=['review'], inplace=True)

In [25]:
# Preview the frame after swapping the raw column for the cleaned one.
df.head()

Unnamed: 0,sentiment,reviews
0,1,1974 teenag martha moxley (maggi grace) move h...
1,0,ok realli like kri kristofferson usual easi go...
2,0,spoiler read think watch movi although would w...
3,1,hi peopl seen wonder movi im sure thet would l...
4,0,recent bought dvd forget much hate movi versio...


In [26]:
# Persist the cleaned dataset (UTF-8, without the index column) so the
# preprocessing does not have to be repeated.
df.to_csv('data/movie_reviews_new.csv', encoding='utf-8', index=False)

In [6]:
# Reload the cleaned dataset written by the previous step.
# NOTE(review): with pandas' default NA handling an empty 'reviews' field would
# be read back as NaN rather than '' — confirm no review is empty after cleaning.
df_new = pd.read_csv('data/movie_reviews_new.csv')

In [7]:
# Preview the reloaded frame.
df_new.head()

Unnamed: 0,sentiment,reviews
0,1,1974 teenag martha moxley (maggi grace) move h...
1,0,ok realli like kri kristofferson usual easi go...
2,0,spoiler read think watch movi although would w...
3,1,hi peopl seen wonder movi im sure thet would l...
4,0,recent bought dvd forget much hate movi versio...


In [77]:
# English stop-word list from NLTK's stopwords corpus; read by `tokenizer`
# and pickled at the end of the notebook.
stop = stopwords.words('english')

In [78]:
def tokenizer(text, stop_words=None):
    """Turn a review string into a list of lowercase tokens.

    Steps: strip ``<...>`` tags, collect emoticons, replace every run of
    non-word characters with a space, append the emoticons (with any ``-``
    nose removed) and drop stop words.

    text: the string to tokenize.
    stop_words: container of tokens to discard; defaults to the module-level
        ``stop`` list.

    Returns the remaining tokens in order, emoticons last.
    """
    if stop_words is None:
        stop_words = stop

    text = re.sub(r'<[^>]*>', '', text)
    lowered = text.lower()
    # NOTE(review): the pattern is applied to lowercased text without
    # re.IGNORECASE, so the uppercase `D` / `P` alternatives cannot match
    # (":D" has already become ":d") — confirm whether that is intended.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', lowered)
    # The explicit ' ' separator keeps the first emoticon from being glued
    # onto the last word when the text ends in a word character.
    text = (re.sub(r'[\W]+', ' ', lowered)
            + ' ' + ' '.join(emoticons).replace('-', ''))
    return [w for w in text.split() if w not in stop_words]

In [54]:
# Hashing vectorizer: 2**21 feature columns, tokens produced by the custom
# `tokenizer`, no extra preprocessor, undecodable bytes ignored. It is only
# ever used through `transform` in the cells below (never fitted).

vect = HashingVectorizer(decode_error='ignore',
                            n_features=2**21,
                            preprocessor=None,
                            tokenizer=tokenizer)

In [55]:
# Stochastic Gradient Descent classifier with logistic (log) loss; it is
# trained incrementally with partial_fit on mini-batches below.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm the name against the installed version.
clf = SGDClassifier(loss='log', random_state=1,
                    max_iter=1)

In [56]:
# Hold out 10% of the rows for testing. shuffle=False keeps the rows in file
# order, so the split is not randomized.
df_train, df_test = train_test_split(df_new, test_size=0.1,
                                     shuffle=False)

In [57]:
# Renumber both splits in place so their row labels start again from 0.
for split_frame in (df_train, df_test):
    split_frame.reset_index(drop=True, inplace=True)

In [58]:
def stream_docs(data=None):
    """Yield ``(text, label)`` pairs from a review frame, one row at a time.

    data: DataFrame with a 'reviews' column (text) and a 'sentiment' column
        (label). Defaults to the module-level ``df_train``, looked up when
        the generator starts rather than when the function is defined.

    Yields the review text and the label converted to a plain ``int``, in row
    order. Rows are read positionally, so the frame's index labels do not
    matter.
    """
    if data is None:
        data = df_train

    for text, label in zip(data['reviews'], data['sentiment']):
        yield text, int(label)

In [59]:
# Create the generators that feed documents to training and evaluation.
# Each one is single-use: once consumed it must be re-created.
train_docs = stream_docs(df_train)
test_docs = stream_docs(df_test)

In [60]:
def get_minibatch(text_stream,batch_size):
    """Pull up to ``batch_size`` documents from an iterator of (text, label).

    text_stream: iterator yielding ``(text, label)`` pairs; it is advanced
        with ``next``.
    batch_size: maximum number of documents to collect.

    Returns ``(X, y)``: two parallel lists of texts and labels. If the stream
    runs out part-way, the documents collected so far are returned as a
    shorter batch. ``(None, None)`` is returned only when the stream was
    already exhausted and nothing could be read.
    """
    X, y = [], []
    for _ in range(batch_size):
        try:
            txt, lbl = next(text_stream)
        except StopIteration:
            if not X:
                return None, None
            # Keep the partial batch instead of throwing it away.
            break
        X.append(txt)
        y.append(lbl)

    return X, y

In [61]:
# Number of documents per mini-batch
BATCH_SIZE=1000

# Class labels handed to partial_fit
classes = np.array([0, 1])

In [62]:
# Number of mini-batches needed to cover the training set once. Ceiling
# division, so a final batch smaller than BATCH_SIZE is counted too.
# (Despite the name, this counts batches, not epochs; the name is kept
# because the training loop refers to it.)
EPOCHS = -(-df_train.shape[0] // BATCH_SIZE)

In [63]:
# Incremental training: one partial_fit call per mini-batch.
for _ in tqdm(range(EPOCHS)):
    X_train, y_train = get_minibatch(train_docs, batch_size=BATCH_SIZE)
    # Stop early when get_minibatch hands back no documents.
    if not X_train:
        break
    # Hash the raw texts into the feature matrix expected by the classifier.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)



  0%|                                                                                           | 0/45 [00:00<?, ?it/s]

  2%|█▊                                                                                 | 1/45 [00:00<00:23,  1.91it/s]

  4%|███▋                                                                               | 2/45 [00:01<00:22,  1.93it/s]

  7%|█████▌                                                                             | 3/45 [00:01<00:21,  1.94it/s]

  9%|███████▍                                                                           | 4/45 [00:02<00:21,  1.94it/s]

 11%|█████████▏                                                                         | 5/45 [00:02<00:20,  1.93it/s]

 13%|███████████                                                                        | 6/45 [00:03<00:20,  1.93it/s]

 16%|████████████▉                                                                      | 7/45 [00:03<00:19,  1.93it/s]

 18%|██████████████▊          

In [64]:
# Pull the whole held-out set as one batch. The size comes from the frame
# itself so it stays correct if the split ratio or dataset changes.
X_test, y_test = get_minibatch(test_docs, batch_size=len(df_test))

In [65]:
# Check the container type returned for the test texts.
type(X_test)

list

In [66]:
# Hash the test texts into features. X_test is overwritten, so this cell must
# not be run twice without re-creating X_test first.
X_test = vect.transform(X_test)

In [67]:
# Predicted class label for every test document.
pred = clf.predict(X_test)

In [68]:
# Confusion matrix followed by per-class precision / recall / F1,
# comparing the true test labels with the predictions.
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[2172  296]
 [ 334 2198]]
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      2468
           1       0.88      0.87      0.87      2532

   micro avg       0.87      0.87      0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



In [69]:
# Classifier score on the test set, printed with three decimals.
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.874


In [73]:
# Directory that receives the pickled objects. exist_ok=True creates it when
# missing and is a no-op when it already exists, without a separate
# check-then-create step.
mvi_clf = os.path.join('movieclassifier', 'pkl_objects')
os.makedirs(mvi_clf, exist_ok=True)

In [80]:
# Pickle the stop-word list. The context manager closes the file handle,
# which guarantees the data is flushed to disk.
with open(os.path.join(mvi_clf, 'stopwords.pkl'), 'wb') as stopwords_file:
    pickle.dump(stop, stopwords_file, protocol=4)

In [81]:
# Pickle the trained classifier. The context manager closes the file handle,
# which guarantees the data is flushed to disk.
# NOTE(review): the extension is '.plk' while the stop words use '.pkl' —
# looks like a typo; confirm what the loading code expects before renaming.
with open(os.path.join(mvi_clf, 'classifier.plk'), 'wb') as classifier_file:
    pickle.dump(clf, classifier_file, protocol=4)