In [19]:
import pandas as pd
movies_df = pd.read_csv('../data/movie_data.csv', encoding='utf-8')
# df = df.rename(columns={"0": "review", "1": "sentiment"})
df = movies_df.copy()

df.head(3)

Unnamed: 0,review,sentiment
0,"Often tagged as a comedy, The Man In The White...",1
1,After Chaplin made one of his best films: Doug...,0
2,I think the movie was one sided I watched it r...,0


In [20]:
# defining a tokenizer func. that cleans the unprocessed text data 

import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words("english")

def tokenizer(text):
    # substituting/Removing any html tag elments alongs with it's contents
    # in our text
    text = re.sub('<[^>]*>', '', text)
    
    # getting all emotions signs
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    
    # removing all emoticons and appending at the end, also removing the
    # nose '-' symbol in ':-)' from consistensy
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    
    # tokenization
    tokenized = [w for w in text.split() if w not in stop]
    
    return tokenized


# define a generator func., stream_docs, that reads in and 
# returns one document at a time:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  text = re.sub('[\W]+', ' ', text.lower()) +\


In [21]:
next(stream_docs(path='../data/movie_data.csv'))

('"Often tagged as a comedy, The Man In The White Suit is laying out far more than a chuckle here and there.<br /><br />Sidney Stratton is an eccentric inventor who isn\'t getting the chances to flourish his inventions on the world because nobody pays him notice, he merely is the odd ball odd job man about the place as it were. After bluffing his way into Birnley\'s textile mill, he uses their laboratory to achieve his goal of inventing a fabric that not only never wears out, but also never needs to be cleaned!. He is at first proclaimed a genius and those who ignored him at first suddenly want a big piece of him, but then the doom portents of an industry going bust rears its head and acclaim quickly turns to something far more scary.<br /><br />Yes the film is very funny, in fact some scenes are dam hilarious, but it\'s the satirical edge to the film that lifts it way above the ordinary to me. The contradictions about the advent of technology is a crucial theme here, do we want invent

In [22]:
# define a func., get_minibatch, that will take a document stream
# from the stream_docs func. and return a particular number of 
# documents specified by the size parameter:

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [23]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

In [24]:
# loss='log_loss' for logistic regression
clf = SGDClassifier(loss='log_loss', random_state=1)

doc_stream = stream_docs(path='../data/movie_data.csv')

### Training Loop

1. CSV File (movie_data.csv)
   │
   ▼
2. stream_docs → Yields one document at a time
   │
   ▼
3. get_minibatch → Groups documents into mini-batches (e.g., 1000 docs)
   │
   ▼
4. HashingVectorizer → Converts mini-batch to sparse matrix
   │
   ▼
5. SGDClassifier → Updates model weights with partial_fit
   │
   ▼
6. Repeat for all mini-batches → Trained model
   │
   ▼
7. Evaluate on test set → Accuracy score


In [28]:
from sklearn import __version__ as sklearn_version
import pyprind
pbar = pyprind.ProgBar(45)

classes = np.array([0,1])

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    
    if not X_train:
        break
        
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
    
    
# we iterated over 45 mini-batches of documents where each mini-batch
# consists of 1,000 documents. 



In [26]:
X_test, y_test = get_minibatch(doc_stream, size=5000)

X_test = vect.transform(X_test)

print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

Accuracy: 0.871


In [34]:
clf = clf.partial_fit(X_test, y_test)