In [1]:
#Train a logistic regression model (using partial fit and hashing vectorizer)
#serialize/save the model in a pickle file
#write an sqlite database
#based on Python Machine Learning Book

In [2]:
#import movie review data

import pandas as pd
# Load the review CSV and give the two data columns readable names
# (rename ignores keys that are not present, so this is harmless when the
# header already uses these names).
column_names = {"0": "review", "1": "sentiment"}
df = pd.read_csv('movie_data.csv', encoding='utf-8').rename(columns=column_names)

# Preview the first three rows.
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [3]:
#bag of words concepts
#transform sentences into bag of words vocabulary and sparse vectors
#call fit_transform on the count vectorizer

import numpy as np
import re
from nltk.corpus import stopwords

# English stop-word list from NLTK; tokenizer() below drops every token found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded already -- confirm.
stop = stopwords.words('english')

def tokenizer(text):
    """Clean one raw review and split it into word and emoticon tokens.

    Steps: strip HTML-like tags, collect emoticons such as ``:)`` or ``;-(``
    from the lowercased text, replace every run of non-word characters with a
    single space, append the emoticons (with the ``-`` "nose" removed), then
    split on whitespace and drop tokens found in the module-level ``stop``
    list.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Lowercased word tokens followed by the emoticon tokens.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being treated as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    lowered = text.lower()
    # NOTE(review): the pattern lists 'D' and 'P' but is applied to lowercased
    # text, so ':D' / ':P' are never matched -- confirm whether that is intended.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', lowered)
    words = re.sub(r'[\W]+', ' ', lowered)
    # The explicit separator matters: without it the last word and the first
    # emoticon fuse into one token whenever the text ends in a word character.
    # NOTE(review): any separately stored copy of this tokenizer (e.g. in the
    # imported `vectorizer` module) should presumably be kept in sync.
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

#reads and returns one movie review at a time
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV, one line at a time.

    The first line is treated as a header and skipped. Every other non-blank
    line is split at its last comma: everything before it is returned verbatim
    as the review text (surrounding CSV quotes are not removed) and the part
    after it is parsed as the integer label.

    Parameters
    ----------
    path : str
        Path to a UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw review text and its label.

    Raises
    ------
    ValueError
        If the text after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        next(csv_file, None)  # skip header; tolerate a completely empty file
        for line in csv_file:
            row = line.rstrip('\n')
            if not row:
                continue  # ignore blank lines, e.g. a trailing one at EOF
            # Splitting at the last comma works whether or not the final line
            # of the file ends with a newline.
            text, _, label = row.rpartition(',')
            yield text, int(label)

In [4]:
#test stream_docs, next(stream...) returns the text of the review
# Pull only the first (text, label) pair from a fresh generator as a sanity check.
next(stream_docs(path='movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [5]:
#returns a specific number of docs (size) from the doc stream
def get_minibatch(doc_stream, size):
    """Collect up to ``size`` documents from an iterator of ``(text, label)`` pairs.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` tuples; it is advanced in place.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. When the stream ends early, the
        documents gathered so far are returned instead of being thrown away.
        ``(None, None)`` is returned only when the stream was already
        exhausted and nothing could be collected.
    """
    docs, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        docs.append(text)
        y.append(label)
    if exhausted and not docs:
        return None, None
    return docs, y

In [18]:
#in out-of-core learning we cannot use CountVectorizer or TfidfVectorizer,
#because both need the complete vocabulary in memory, so we use HashingVectorizer instead

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


# Configure the vectorizer: undecodable bytes are ignored, tokens are hashed
# into 2**21 feature columns, and text is split by the tokenizer defined above.
hashing_options = dict(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)
vect = HashingVectorizer(**hashing_options)

In [9]:
#out-of-core learning: use sklearn's SGDClassifier with a logistic loss,
#because it supports partial_fit on minibatches
#(sklearn's LogisticRegression has no partial_fit)

from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

# scikit-learn renamed the logistic loss from 'log' to 'log_loss' in 1.1 and
# removed the old spelling in 1.3, so pick the name the installed version accepts.
_sk_major, _sk_minor = (int(part) for part in sklearn_version.split('.')[:2])
logistic_loss = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'

# SGDClassifier with a logistic loss, seeded for reproducibility.
clf = SGDClassifier(loss=logistic_loss, random_state=1)
# One shared generator: later cells keep pulling from where the previous one stopped.
doc_stream = stream_docs(path='movie_data.csv')

In [10]:
#train the classifier in batches

import pyprind
# Train on up to N_BATCHES minibatches of BATCH_SIZE reviews each.
N_BATCHES = 45
BATCH_SIZE = 1000
pbar = pyprind.ProgBar(N_BATCHES)

# The full set of labels is passed to every partial_fit call.
classes = np.array([0, 1])
for _ in range(N_BATCHES):
    X_train, y_train = get_minibatch(doc_stream, size=BATCH_SIZE)
    if not X_train:
        # Nothing was returned: stop before the planned number of batches.
        break
    # Hash the raw texts into feature vectors, then update the model with them.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:23


In [11]:
# Take the next 5000 streamed reviews as a hold-out set; doc_stream continues
# from wherever the training loop stopped reading.
X_test, y_test = get_minibatch(doc_stream, size=5000)
# NOTE(review): get_minibatch can return (None, None) when the stream is
# exhausted, in which case the transform below would fail.
X_test = vect.transform(X_test) # apply the hashing vectorizer from above
print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

Accuracy: 0.868


In [13]:
#create a pickle file to save classifier in current state
#to serialize/deserialize python object structures

import pickle
import os

#create folder, make if doesn't exist
# Folder that will hold the pickled artifacts. exist_ok=True creates it (and
# any missing parents) only when needed, without a separate check-then-create
# step that could race with another process.
dest = os.path.join('movieclassifier', 'pkl_objects')
os.makedirs(dest, exist_ok=True)

In [14]:
#dump the stopwords into a pickle file
# Write the stop-word list to stopwords.pkl. The context manager flushes and
# closes the file even if pickling raises.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stopwords_file:
    pickle.dump(stop, stopwords_file, protocol=4)

In [15]:
#dump the classifier into a pickle file
# Write the trained classifier to classifier.pkl. The context manager flushes
# and closes the file even if pickling raises.
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as classifier_file:
    pickle.dump(clf, classifier_file, protocol=4)

In [16]:
#test the pickle file

import os
# Switch the working directory so the relative paths used in the following
# cells ('pkl_objects', 'reviews.sqlite') resolve inside movieclassifier/.
# NOTE(review): this changes process-wide state and is not idempotent --
# re-running the cell looks for movieclassifier/ inside movieclassifier/.
os.chdir('movieclassifier')

In [17]:
import pickle
import re
import os
from vectorizer import vect

# NOTE: pickle.load can execute arbitrary code -- only open pickles you produced yourself.
# The context manager closes the file handle once the object is restored.
with open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb') as classifier_file:
    clf2 = pickle.load(classifier_file)

In [18]:
# Display the repr of the classifier restored from the pickle file.
clf2

SGDClassifier(loss='log', random_state=1)

In [19]:
import numpy as np
# Map the numeric classes to readable names and score one hand-written review.
label = {0:'negative', 1:'positive'}
example = ["I love this movie. It's amazing."]
X = vect.transform(example)

# Predicted class name, and the larger of the two class probabilities as a percentage.
predicted_sentiment = label[clf2.predict(X)[0]]
confidence_pct = np.max(clf2.predict_proba(X)) * 100
print('Prediction: %s\nProbability: %.2f%%' % (predicted_sentiment, confidence_pct))


Prediction: positive
Probability: 95.55%


In [22]:
#create new sqlite db

import sqlite3
import os

# Build a small demo database in the current working directory.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()

    # Recreate the table so the cell can be re-run from a clean state.
    c.execute('DROP TABLE IF EXISTS review_db')
    c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')

    example1 = 'I love this movie'
    example2 = 'I disliked this movie'
    # One parameterized statement for all rows; SQLite fills in the timestamp.
    c.executemany(
        "INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))",
        [(example1, 1), (example2, 0)],
    )

    conn.commit()
finally:
    # Release the database handle even when one of the statements above fails.
    conn.close()

In [23]:
#test connection to sqlite database
# Re-open the database and read back every row stored between the fixed start
# timestamp and the current time.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()
    c.execute("SELECT * FROM review_db WHERE date BETWEEN '2017-01-01 10:10:10' AND DATETIME('now')")
    results = c.fetchall()
finally:
    # Close the connection even if the query fails (e.g. the table is missing).
    conn.close()

In [24]:
# Show the rows fetched by the query cell above as a list of (review, sentiment, date) tuples.
print(results)

[('I love this movie', 1, '2022-01-17 16:40:34'), ('I disliked this movie', 0, '2022-01-17 16:40:34')]


In [None]:
#run this movie review trained model on twitter data as this data had emoticons
#later replace model with twitter trained model

In [9]:
import os
import tweepy as tweepy
import pandas as pd

In [117]:
#test model on twitter
######## Classify the list of live tweets

def classify(skiresort, clf):
    """Fetch recent tweets about a ski resort and score their sentiment.

    Parameters
    ----------
    skiresort : str
        Search term handed to ``get_tweets``.
    clf : estimator
        Fitted classifier exposing ``predict`` and ``predict_proba``.
        Assumes column 0 / column 1 of ``predict_proba`` correspond to the
        labels 0 (negative) / 1 (positive) -- TODO confirm against ``clf.classes_``.

    Returns
    -------
    tuple
        ``(label, proba, live_tweets)`` where ``label`` is ``'negative'`` or
        ``'positive'`` (the class with the higher mean probability over all
        tweets), ``proba`` is that mean probability, and ``live_tweets`` is the
        tweets DataFrame with two added columns: ``'sentiment prediction'``
        and ``'probability'`` (the per-tweet maximum class probability).

    Raises
    ------
    ValueError
        If no tweets were returned for ``skiresort``.
    """
    # Get live tweets for the resort entered.
    live_tweets = get_tweets(skiresort)
    if live_tweets.empty:
        # Fail with a clear message instead of an obscure error further down.
        raise ValueError(f'no tweets found for {skiresort!r}; nothing to classify')

    # Turn the tweet texts into feature vectors with the module-level vectorizer.
    X = vect.transform(live_tweets['tweet'].values)

    label = {0: 'negative', 1: 'positive'}

    # Compute the class probabilities once and reuse them below.
    class_proba = clf.predict_proba(X)

    # Per-tweet prediction and confidence, stored on the frame for display.
    live_tweets['sentiment prediction'] = clf.predict(X)
    live_tweets['probability'] = class_proba.max(axis=1)

    # Overall verdict: the class whose mean probability across tweets is larger.
    prediction_result_meanvals = [np.mean(class_proba[:, 0]),
                                  np.mean(class_proba[:, 1])]
    y_pred = int(np.argmax(prediction_result_meanvals))
    proba = prediction_result_meanvals[y_pred]
    return label[y_pred], proba, live_tweets

In [118]:
######## Get live tweets using twitter api
def _get_client():
    """Build a tweepy client authenticated with a bearer token.

    The token is read from the ``TWITTER_BEARER_TOKEN`` environment variable so
    it never has to be written into the notebook. When the variable is unset,
    an empty token is used, exactly as before.
    """
    return tweepy.Client(bearer_token=os.environ.get('TWITTER_BEARER_TOKEN', ''))

def get_tweets(skiresort, max_results=10):
    """Search recent English, non-retweet tweets with images that mention a resort.

    Parameters
    ----------
    skiresort : str
        Search term placed at the start of the query.
    max_results : int, optional
        Upper bound on the number of tweets requested (default 10, the value
        that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``'tweet'`` (text) and ``'created'`` (creation timestamp);
        empty, but with both columns, when the search matches nothing.
    """
    query = f'{skiresort} has:images lang:en -is:retweet'
    response = _get_client().search_recent_tweets(
        query=query,
        tweet_fields=['context_annotations', 'created_at'],
        max_results=max_results,
    )
    # tweepy leaves ``data`` as None when the search matches nothing.
    found = response.data or []
    live_data = [(tweet.text, tweet.created_at) for tweet in found]
    return pd.DataFrame(live_data, columns=['tweet', 'created'])

In [123]:
# Run the whole pipeline for one resort, then show the overall label, its mean
# probability and the first scored tweets.
# NOTE(review): `c` held the SQLite cursor in the database cells; it is rebound
# to the tweets DataFrame here.
a,b,c = classify('verbier', clf2)
print(a)
print(b)
c.head()

positive
0.6098229018312917


Unnamed: 0,tweet,created,sentiment prediction,probability
0,Verbier in #Switzerland is named the World's B...,2022-01-18 12:15:01+00:00,1,0.832967
1,Alice Michel shredding the mountain waves at V...,2022-01-18 11:57:57+00:00,1,0.642169
2,@ohreallytruly @Resister_1776 @lisapodcasts @b...,2022-01-18 09:36:56+00:00,1,0.555941
3,@cornes_nicole We'll always have Verbier. X ht...,2022-01-18 09:30:29+00:00,1,0.656413
4,Check out our website https://t.co/0ObG5pNQNX ...,2022-01-18 07:37:00+00:00,1,0.507262


In [113]:
# Scratch inspection of the intermediate values behind the classification.
# NOTE(review): `y_pred` is only assigned inside classify() in the cells visible
# here, and `prediction_result_meanvals` is assigned at top level only in cells
# further down, so this cell depends on out-of-order kernel state.
print(y_pred, proba)
print(clf.predict(X))
print(clf.predict_proba(X))
print(prediction_result_meanvals)
live_tweets

1 0.6098229018312917
[1 1 1 1 1 1 1 1 1 1]
[[0.16703268 0.83296732]
 [0.35783142 0.64216858]
 [0.44405927 0.55594073]
 [0.34358685 0.65641315]
 [0.49273772 0.50726228]
 [0.42652214 0.57347786]
 [0.3870116  0.6129884 ]
 [0.42840131 0.57159869]
 [0.42806584 0.57193416]
 [0.42652214 0.57347786]]
[0.3901770981687082, 0.6098229018312917]


Unnamed: 0,tweet,created,sentiment prediction,probability
0,Verbier in #Switzerland is named the World's B...,2022-01-18 12:15:01+00:00,1,0.832967
1,Alice Michel shredding the mountain waves at V...,2022-01-18 11:57:57+00:00,1,0.642169
2,@ohreallytruly @Resister_1776 @lisapodcasts @b...,2022-01-18 09:36:56+00:00,1,0.555941
3,@cornes_nicole We'll always have Verbier. X ht...,2022-01-18 09:30:29+00:00,1,0.656413
4,Check out our website https://t.co/0ObG5pNQNX ...,2022-01-18 07:37:00+00:00,1,0.507262
5,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 19:21:10+00:00,1,0.573478
6,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 19:07:39+00:00,1,0.612988
7,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:31:48+00:00,1,0.571599
8,https://t.co/8GhgJPVmih A Sustainable Swiss Ch...,2022-01-17 18:12:50+00:00,1,0.571934
9,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:06:35+00:00,1,0.573478


In [108]:
# Scratch check: per-row class probabilities and the larger value of each row.
# NOTE(review): relies on whatever `clf` and `X` currently hold in the kernel.
print(clf.predict(X))
vals = clf.predict_proba(X)
print(vals)
[max(i) for i in vals]

[1 1 1 1 1 1 1 1 1 1]
[[0.16703268 0.83296732]
 [0.35783142 0.64216858]
 [0.44405927 0.55594073]
 [0.34358685 0.65641315]
 [0.49273772 0.50726228]
 [0.42652214 0.57347786]
 [0.3870116  0.6129884 ]
 [0.42840131 0.57159869]
 [0.42806584 0.57193416]
 [0.42652214 0.57347786]]


[0.8329673163888086,
 0.6421685757493234,
 0.555940733691984,
 0.6564131509245278,
 0.5072622764776225,
 0.5734778574964404,
 0.6129883974048228,
 0.5715986916146272,
 0.5719341610683206,
 0.5734778574964404]

In [100]:
# Scratch check of the overall probability / class index and the scored tweets.
# NOTE(review): `y_pred` has no top-level assignment in the visible cells.
print(proba, y_pred)
live_tweets

0.6098229018312917 1


Unnamed: 0,tweet,created,sentiment prediction,probability
0,Verbier in #Switzerland is named the World's B...,2022-01-18 12:15:01+00:00,1,0.832967
1,Alice Michel shredding the mountain waves at V...,2022-01-18 11:57:57+00:00,1,0.832967
2,@ohreallytruly @Resister_1776 @lisapodcasts @b...,2022-01-18 09:36:56+00:00,1,0.832967
3,@cornes_nicole We'll always have Verbier. X ht...,2022-01-18 09:30:29+00:00,1,0.832967
4,Check out our website https://t.co/0ObG5pNQNX ...,2022-01-18 07:37:00+00:00,1,0.832967
5,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 19:21:10+00:00,1,0.832967
6,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 19:07:39+00:00,1,0.832967
7,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:31:48+00:00,1,0.832967
8,https://t.co/8GhgJPVmih A Sustainable Swiss Ch...,2022-01-17 18:12:50+00:00,1,0.832967
9,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:06:35+00:00,1,0.832967


In [86]:
# Unpack classify()'s three return values into top-level names.
# NOTE(review): this rebinds the global `label` from the {0/1: name} dict to a plain string.
label, proba, live_tweets = classify('verbier',clf2)

[[0.44405927 0.55594073]
 [0.34358685 0.65641315]
 [0.49273772 0.50726228]
 [0.42652214 0.57347786]
 [0.3870116  0.6129884 ]
 [0.42840131 0.57159869]
 [0.42806584 0.57193416]
 [0.42652214 0.57347786]
 [0.48899765 0.51100235]
 [0.36077374 0.63922626]]


In [83]:
# Print the overall label and probability, then display the scored tweets.
print('label', label, 'proba', proba)
live_tweets

positive 0.5773321731514809


Unnamed: 0,tweet,created,sentiment prediction
0,@ohreallytruly @Resister_1776 @lisapodcasts @b...,2022-01-18 09:36:56+00:00,1
1,@cornes_nicole We'll always have Verbier. X ht...,2022-01-18 09:30:29+00:00,1
2,Check out our website https://t.co/0ObG5pNQNX ...,2022-01-18 07:37:00+00:00,1
3,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 19:21:10+00:00,1
4,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 19:07:39+00:00,1
5,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:31:48+00:00,1
6,https://t.co/8GhgJPVmih A Sustainable Swiss Ch...,2022-01-17 18:12:50+00:00,1
7,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:06:35+00:00,1
8,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:04:07+00:00,1
9,Violinist on the slopes today in #Verbier #LaV...,2022-01-17 16:29:51+00:00,1


In [73]:
# Display the raw tweets DataFrame returned for one resort (performs a live API call).
get_tweets('verbier')

Unnamed: 0,tweet,created
0,@ohreallytruly @Resister_1776 @lisapodcasts @b...,2022-01-18 09:36:56+00:00
1,@cornes_nicole We'll always have Verbier. X ht...,2022-01-18 09:30:29+00:00
2,Check out our website https://t.co/0ObG5pNQNX ...,2022-01-18 07:37:00+00:00
3,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 19:21:10+00:00
4,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 19:07:39+00:00
5,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:31:48+00:00
6,https://t.co/8GhgJPVmih A Sustainable Swiss Ch...,2022-01-17 18:12:50+00:00
7,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:06:35+00:00
8,A Sustainable Swiss Chalet That’s Modern Yet C...,2022-01-17 18:04:07+00:00
9,Violinist on the slopes today in #Verbier #LaV...,2022-01-17 16:29:51+00:00


In [28]:
import pickle
import re
import os


# NOTE: pickle.load can execute arbitrary code -- only open pickles you produced yourself.
# The path is relative to the current working directory; the context manager
# closes the file handle once the object is restored.
with open(os.path.join('ski_resort_webapp', 'pkl_objects', 'classifier.pkl'), 'rb') as classifier_file:
    clf2 = pickle.load(classifier_file)

In [74]:
# Call classify() and display whatever it returns as the cell output.
classify('verbier',clf2)

[1 1 1 1 1 1 1 1 1 1]
[[0.44405927 0.55594073]
 [0.34358685 0.65641315]
 [0.49273772 0.50726228]
 [0.42652214 0.57347786]
 [0.3870116  0.6129884 ]
 [0.42840131 0.57159869]
 [0.42806584 0.57193416]
 [0.42652214 0.57347786]
 [0.48899765 0.51100235]
 [0.36077374 0.63922626]]
  (0, 52833)	-0.22941573387056174
  (0, 246455)	-0.22941573387056174
  (0, 318163)	-0.22941573387056174
  (0, 519290)	0.22941573387056174
  (0, 824797)	-0.22941573387056174
  (0, 1001239)	0.22941573387056174
  (0, 1061207)	0.22941573387056174
  (0, 1104880)	-0.22941573387056174
  (0, 1202116)	0.22941573387056174
  (0, 1238013)	0.4588314677411235
  (0, 1240366)	-0.22941573387056174
  (0, 1462582)	-0.22941573387056174
  (0, 1743017)	-0.22941573387056174
  (0, 1779114)	-0.22941573387056174
  (0, 1810592)	-0.22941573387056174
  (0, 1893891)	0.22941573387056174   (0, 519290)	0.3779644730092272
  (0, 945236)	-0.3779644730092272
  (0, 1104880)	-0.3779644730092272
  (0, 1240366)	-0.3779644730092272
  (0, 1684928)	-0.377964473

('positive', 0.5773321731514809)

In [30]:
# Top-level replay of the first steps of classify(), so the intermediate values
# can be inspected in the cells that follow.
skiresort = 'verbier'
# NOTE(review): rebinds the global `clf` (the freshly trained model) to the unpickled one.
clf = clf2

#get live tweets for skiresort entered
live_tweets = get_tweets(skiresort)

#convert the tweet texts to feature vectors
X = vect.transform(live_tweets['tweet'].values)

#assign labels for 0/1
label = {0: 'negative', 1: 'positive'}



In [32]:
# Display the predicted class (0/1) for every row of X.
clf.predict(X)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [35]:
# Keep the per-row class probabilities for the inspection cells below.
prediction_result = clf.predict_proba(X)

In [40]:
# Display the probability matrix (one row per tweet, one column per class).
prediction_result

array([[0.44405927, 0.55594073],
       [0.34358685, 0.65641315],
       [0.49273772, 0.50726228],
       [0.42652214, 0.57347786],
       [0.3870116 , 0.6129884 ],
       [0.42840131, 0.57159869],
       [0.42806584, 0.57193416],
       [0.42652214, 0.57347786],
       [0.48899765, 0.51100235],
       [0.36077374, 0.63922626]])

In [95]:
# Mean probability of each class (column 0, then column 1) across all rows.
prediction_result_meanvals = [np.mean(prediction_result[:,0]),\
                              np.mean(prediction_result[:,1])]

print(prediction_result_meanvals)

# The larger of the two means is what classify() reports as its probability.
print(np.max(prediction_result_meanvals))

[0.4223735949244718, 0.5776264050755281]
0.5776264050755281


In [67]:
# Display the two per-class mean probabilities currently held in the kernel.
prediction_result_meanvals

[0.4226678268485191, 0.5773321731514809]

In [70]:
# Find the larger mean probability and its position in the list; the position
# is the class index (0 or 1) used to look up the label.
max_value = max(prediction_result_meanvals)
max_index = prediction_result_meanvals.index(max_value)
print(max_value)
print(max_index)

0.5773321731514809
1
