In [1]:
# Download data from site -- http://ai.stanford.edu/~amaas/data/sentiment/
# p.234
#import pyprind # may need to do >sudo easy_install pip, then >pip install pyprind --user
import pandas as pd
import os
import pyprind

In [2]:
pwd = os.getcwd()
print(pwd+'/aclImdb/')
file = os.listdir(pwd)

/Users/whs/Documents/Data Journalism/Congressional Tweets/whs2k.github.io/aclImdb/


In [3]:
#################
# Start here    #
#################
#import pyprind
import pandas as pd
import os
pwd = os.getcwd()

df = pd.read_csv( 'movie_data.csv', encoding='utf-8')
df.columns = ['review', 'sentiment']

print(df.shape)

(49999, 2)


In [4]:
pwd+'/movie_data.csv'

'/Users/whs/Documents/Data Journalism/Congressional Tweets/whs2k.github.io/movie_data.csv'

In [5]:
df.head(5)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [6]:
# Bag of Word model
# 1. create a vocabulary of unique tokens (or words)
# 2. construct a feature vector for each document, features store count
#    of words per document

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer() #Instanstiate the count array

docs = np.array(['The sun is shining', 
                 'The weather is sweet',
                 'The sun is shining and the weather is sweet'])

bag = count.fit_transform(docs)

print(count.vocabulary_)
print(bag.toarray())

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [7]:
# tf(t,d) - raw term frequencies (t: term, d: nos times term t appears in doc d)
# tf-idf(t,d) - term frequency inverse document frequency
# tf-idf = tf(t,d) * idf(t,x)  = tf(t,d) * log( [1+nd]/[1+df(d,t)] ) 


In [8]:
# TfidTransformer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer() #Instantiate Term Frequency invers

np.set_printoptions(precision=2)

print(tfidf.fit_transform(count.fit_transform(docs)).toarray()) #How much did the term appear in other documents?


[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [9]:
# so if the term "the" shows up lots of times, does that mean it's important?
# how can we make terms that shows up lots of times across documents, less important
# let's normalize by the times these terms show up across documents.

# employ : [nos of docs containing term "the" ]/[total nos of documents]

# if term appear often, give it less emphasis

# tf-idf(t,d) = tf(t,2)*(idf(t,d)+1)
# with idf(t,d) = log ([1+total nos of docs]/[1+nos of docs containing term t])


In [10]:
#Reg functions...to get rid of HTML Tags and emoticons

import re
def preprocessor(text): 
# find '<' then anything not '>' [^>], [^>]* 0 or more prefix, then close with '>'    
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) 
    # eyes[:,;,=], optional nose [-], and mouth[),(,D,P)]
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [11]:
tmp = 'is ;) :) seven.<br /><br />Title (Brazil): Not Available'

print(preprocessor(tmp))
#print(preprocessor('</a>This :) is :( a test :-)!' ))
#print(re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', '</a>This :) is :( a test :-)!'))

is seven title brazil not available;) :)


In [12]:
df['review'] = df['review'].apply(preprocessor) #use the apply method and send in the preprocessor function (applys the function to each row)


In [13]:
df.shape

(49999, 2)

In [14]:
df.tail(3)

Unnamed: 0,review,sentiment
49996,i don t even know where to begin on this one i...,0
49997,richard tyler is a little boy who is scared of...,0
49998,i waited long to watch this movie also because...,1


In [15]:
# p.242 Processing documents into tokens
# split the sentence/corpora into individual elements
def tokenizer(text):
    return text.split()

In [16]:
tokenizer('running like running and thus they run')

['running', 'like', 'running', 'and', 'thus', 'they', 'run']

In [17]:
# word stemming, tranforming word into their root form
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [18]:
tokenizer_porter('running like running and thus they run')


['run', 'like', 'run', 'and', 'thu', 'they', 'run']

In [19]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/whs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
from nltk.corpus import stopwords
stop = stopwords.words('english')  # stop words have little meaning eg. a, is, and, has, etc. 
[w for w in tokenizer_porter('a runner likes running and runs a lot') 
 if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

In [21]:
# pg. 244
# Training a Logistic Regression model for document classification
# (X,y)
#X_train = df.loc[:25000, 'review'].values
#y_train = df.loc[:25000, 'sentiment'].values

#X_test  = df.loc[25000:, 'review'].values
#y_test  = df.loc[25000:, 'sentiment'].values

X_train = df.loc[:2500, 'review'].values
y_train = df.loc[:2500, 'sentiment'].values

X_test  = df.loc[2500:5000, 'review'].values
y_test  = df.loc[2500:5000, 'sentiment'].values

print(y_test.shape)

(2501,)


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tfidf = TfidfVectorizer(strip_accents = None, 
                       lowercase = False)


In [24]:
param_grid = [
              {'vect__ngram_range':[(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer], #, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [100]}, # 0.1, 1.0, 10.0, 100.0]},
              
              {'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer], #, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C':[100]} #[0.1, 1.0,10.0,100.0]}
                ]


In [25]:
lr_tfidf = Pipeline([ ('vect', tfidf) ,
                      ('clf',  LogisticRegression(random_state=0))])



In [26]:
gs_lr_tfidf = GridSearchCV( lr_tfidf, param_grid, #sends each subset to a different core
                          scoring = 'accuracy',
                          cv = 5, verbose = 1,
                          n_jobs = -1) # n_jobs -1 uses all computer cores

In [27]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(2501,) (2501,)
(2501,) (2501,)


In [28]:
#Takes like 60 seconds

gs_lr_tfidf.fit(X_train, y_train) 

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   13.8s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', '...0>], 'vect__use_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
    

In [29]:
print('The Best parameter set: %s' % gs_lr_tfidf.best_params_)


The Best parameter set: {'clf__C': 100, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x118d75730>}


In [30]:
print('CV Accuracy: %.3f'
     % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))


CV Accuracy: 0.855
Test Accuracy: 0.848


# Shiz getting real - Tweet Time

In [31]:
import json
import urllib
import urllib.request
data = urllib.request.urlopen(url_json).read()
output = json.loads(data)
dfJ = pd.DataFrame(output)
#print (output)
dfJ.head()

for date in dates:
    date=date.strftime("%Y-%m-%d")
    date_json=date+'.json'
    url='https://alexlitel.github.io/congresstweets/data/'
    url_json=url+date_json
    data = urllib.request.urlopen(url_json).read()
    output = json.loads(data)
    dfDate = pd.DataFrame(output)
    predict=np.mean(clf.predict(dfDate['text']))
    print('sucess'+date)
    

    


NameError: name 'url_json' is not defined

In [None]:
#####Predict Time 
#https://alexlitel.github.io/congresstweets/



### Let's import our data
df_20170629 = pd.read_json('2017-06-29.json')
df_20170628 = pd.read_json('2017-06-28.json')
df_20170627 = pd.read_json('2017-06-27.json')
df_20170626 = pd.read_json('2017-06-26.json')
df_20170625 = pd.read_json('2017-06-25.json')
df_20170624 = pd.read_json('2017-06-24.json')
df_20170623 = pd.read_json('2017-06-23.json')
df_20170622 = pd.read_json('2017-06-22.json')

#clf.predict(X_test[:10])
#X_test[:10]
predict_20170629=np.mean(clf.predict(df_20170629['text']))
predict_20170628=np.mean(clf.predict(df_20170628['text']))
predict_20170627=np.mean(clf.predict(df_20170627['text']))
predict_20170626=np.mean(clf.predict(df_20170626['text']))
predict_20170625=np.mean(clf.predict(df_20170625['text']))
predict_20170624=np.mean(clf.predict(df_20170624['text']))
predict_20170623=np.mean(clf.predict(df_20170623['text']))
predict_20170622=np.mean(clf.predict(df_20170622['text']))

df_plot=[predict_20170629, predict_20170628,
         predict_20170627, predict_20170626,
         predict_20170625,predict_20170624,
         predict_20170623,predict_20170622]

dates = pd.date_range('20170622', periods=8)


dates
df_plot

In [None]:
#set pictue
#Picture Cedits Alina Oleynik
from IPython.display import Image
smiley=Image("Smiley.png")
frowney=Image("Frowney.png")

if predict_20170629 > .5:
    facePic = smiley 
else:
    facePic = frowney
    
facePic

import urllib.request
with urllib.request.urlopen('https://github.com/whs2k/whs2k.github.io/blob/master/Frowney.png?raw=true') as url:
    facePic = url.read()
#I'm guessing this would output the html source code?
#print(s)
outfile = open('facePic.png','wb')
outfile.write(facePic)
outfile.close()
#facePic
 

In [95]:
#X-Axis - Days
#max_year=df['fiscal_year'].max()
#min_year=df['fiscal_year'].min()
#years=np.linspace(min_year, max_year, (max_year-min_year+1))

import matplotlib.pyplot as plt
thfont = {'fontname':'Tahoma'}
x=linspace(1,6,7)

plt.plot(dates, df_plot,'#daccc9', label='EMZ Estimates')
plt.xlabel('Date',**thfont)
plt.ylabel('Sentiment',**thfont)


plt.savefig('todaysMood.png')


NameError: name 'linspace' is not defined

# STop Boi

In [None]:
#IMpotant Git Commands
#git add 

In [None]:
from datetime import datetime
from threading import Timer

x=datetime.today()
y=x.replace(day=x.day+1, hour=1, minute=0, second=0, microsecond=0)
delta_t=y-x

secs=delta_t.seconds+1

def hello_world():
    print ("hello world")
    #...

t = Timer(secs, hello_world)
t.start()

# Skip this part of the model

In [None]:
# p.246
# Working with bigger data -- online algos and out-of-core learning

import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
def tokenizer(text):  # converts reviews to lower case, take out non-words, and put emoticons at the end
    text = re.sub('<[^>]*>', '', text) #
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path): #brings in the reviews
    with open(path, 'r', encoding='utf-8')  as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2]) 
            yield text, label 


In [None]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y
        

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer #New trick with hashing vectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error = 'ignore',
                         n_features = 2**21,
                         preprocessor = None,
                         tokenizer = tokenizer)

clf = SGDClassifier(loss='log', random_state = 1, n_iter = 1)

doc_stream = stream_docs(path ='/Users/whs/Documents/Grad/Machine Learning/Class 4/movie_data.csv')


In [None]:
#import pyprind
classes = np.array([0, 1])
print(classes)
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes) 


In [None]:
X_test, y_test = get_minibatch(doc_stream, size = 500)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))



In [None]:
df_20170628 = pd.read_json('2017-06-28.json')

# Shiz getting real - Tweet Time

In [32]:
import datetime as dt
from datetime import date, timedelta

#Ceate a vaiarble: todays_tweets = 
#today=dt.datetime.today().strftime("%m/%d/%Y")
today = dt.date.today().strftime("%Y-%m-%d")
yesterday = dt.date.today() - timedelta(1)
yesterday=yesterday.strftime("%Y-%m-%d")

In [33]:
#Ceate a vaiarble: todays_tweets = 
today=dt.datetime.today().strftime("%Y-%m-%d")
yesterday_json=yesterday+'.json'
url='https://alexlitel.github.io/congresstweets/data/'
url_json=url+yesterday_json
#https://alexlitel.github.io/congresstweets/data/2017-07-04.json
print(url_json)

https://alexlitel.github.io/congresstweets/data/2017-07-04.json


In [47]:
#Create Dates / A-axis
from datetime import date
d1 = date(2017, 6, 22)
d0 = date.today()
delta = d0 - d1
periods=delta.days
dates = pd.date_range('20170622', periods=periods)

for date in dates:
    print(date)
print(dates)



2017-06-22 00:00:00
2017-06-23 00:00:00
2017-06-24 00:00:00
2017-06-25 00:00:00
2017-06-26 00:00:00
2017-06-27 00:00:00
2017-06-28 00:00:00
2017-06-29 00:00:00
2017-06-30 00:00:00
2017-07-01 00:00:00
2017-07-02 00:00:00
2017-07-03 00:00:00
2017-07-04 00:00:00
DatetimeIndex(['2017-06-22', '2017-06-23', '2017-06-24', '2017-06-25',
               '2017-06-26', '2017-06-27', '2017-06-28', '2017-06-29',
               '2017-06-30', '2017-07-01', '2017-07-02', '2017-07-03',
               '2017-07-04'],
              dtype='datetime64[ns]', freq='D')


In [38]:
dfJson.head()

Unnamed: 0,id,link,screen_name,source,text,time
0,882086618294755328,https://www.twitter.com/brianschatz/statuses/8...,brianschatz,Twitter for iPhone,This is worth your time. I promise. https://tw...,2017-07-04T00:00:12-04:00
1,882096723123986436,https://www.twitter.com/TheNightGallery/status...,auctnr1,Twitter for iPhone,RT @TheNightGallery Almost time for me to star...,2017-07-04T00:40:22-04:00
2,882096666257629185,https://www.twitter.com/FSMidwest/statuses/882...,auctnr1,Twitter for iPhone,RT @FSMidwest Luke Voit after his first Major ...,2017-07-04T00:40:08-04:00
3,882096316117135360,https://www.twitter.com/RonHarrisMN/statuses/8...,Grace4NY,Twitter for iPhone,RT @RonHarrisMN Our standards for our children...,2017-07-04T00:38:44-04:00
4,882094376201011202,https://www.twitter.com/NateSilver538/statuses...,RepJayapal,Twitter for iPhone,RT @NateSilver538 It's almost as though local ...,2017-07-04T00:31:02-04:00


In [41]:
#Test Before Loop
#predict=np.mean(clf.predict(dfJson['text']))
import json
import urllib
import urllib.request
data = urllib.request.urlopen(url_json).read()
output = json.loads(data)
dfJson = pd.DataFrame(output)
#print (output)
dfJson.head()
dfJson['text'].head(10)

0    And that's a wrap, folks! #FY18NDAA is approve...
1    #FY18NDAA passes out of committee at 11:58 pm ...
2    RT @reporterjoe HASC overwhelmingly passes 201...
3    This fight to defeat the GOP “health care” pla...
4    @HASCRepublicans Proud many priorities for #IN...
5    Just finished marking up the NDAA in record ti...
6    .@realDonaldTrump lost the popular vote in Ame...
7    Very proud to be part of Congress that is focu...
8    RT @davepell Don’t believe the fear mongers. D...
9    Pleased to see @realDonaldTrump lawyers conced...
Name: text, dtype: object

In [93]:
#Cread new pivot dfPlot
dfPlot = pd.DataFrame()

In [None]:
yesterday_json=yesterday+'.json'
url='https://alexlitel.github.io/congresstweets/data/'
url_json=url+yesterday_json

In [89]:
#dates=dates.strftime("%Y-%m-%d")
date_json=date+'.json'
url='https://alexlitel.github.io/congresstweets/data/'
url_json=url+date_json
data = urllib.request.urlopen(url_json).read()
 

In [94]:
import json
import urllib
import urllib.request
data = urllib.request.urlopen(url_json).read()
output = json.loads(data)
dfJson = pd.DataFrame(output)
#print (output)
dfJson.head()

for date in dates:
    #dates=dates.strftime("%Y-%m-%d")
    date_json=date+'.json'
    url='https://alexlitel.github.io/congresstweets/data/'
    url_json=url+date_json
    data = urllib.request.urlopen(url_json).read()
    output = json.loads(data)
    dfJson = pd.DataFrame(output)
    predict = np.mean(clf.predict(dfJson['text']))
    dfPlot.append(predict)
    #dfPlot[date] = predict#add to dfPlot
    #dfPlot.set_value(1, dfPlot[date], 'date')
    
    print('sucess'+date)



TypeError: cannot concatenate a non-NDFrame object

In [91]:
dfPlot
#del dfPlot[:]
#dates

[0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871,
 0.49258908861557871]

In [39]:
#####Predict Time 
#https://alexlitel.github.io/congresstweets/



### Let's import our data
df_20170629 = pd.read_json('2017-06-29.json')
df_20170628 = pd.read_json('2017-06-28.json')
df_20170627 = pd.read_json('2017-06-27.json')
df_20170626 = pd.read_json('2017-06-26.json')
df_20170625 = pd.read_json('2017-06-25.json')
df_20170624 = pd.read_json('2017-06-24.json')
df_20170623 = pd.read_json('2017-06-23.json')
df_20170622 = pd.read_json('2017-06-22.json')

#clf.predict(X_test[:10])
#X_test[:10]
predict_20170629=np.mean(clf.predict(df_20170629['text']))
predict_20170628=np.mean(clf.predict(df_20170628['text']))
predict_20170627=np.mean(clf.predict(df_20170627['text']))
predict_20170626=np.mean(clf.predict(df_20170626['text']))
predict_20170625=np.mean(clf.predict(df_20170625['text']))
predict_20170624=np.mean(clf.predict(df_20170624['text']))
predict_20170623=np.mean(clf.predict(df_20170623['text']))
predict_20170622=np.mean(clf.predict(df_20170622['text']))

df_plot=[predict_20170629, predict_20170628,
         predict_20170627, predict_20170626,
         predict_20170625,predict_20170624,
         predict_20170623,predict_20170622]

dates = pd.date_range('20170622', periods=8)


dates
df_plot

[0.4967698061883713,
 0.47105357784969437,
 0.48583523282318464,
 0.47185493010955798,
 0.48214285714285715,
 0.59193954659949621,
 0.49723756906077349,
 0.49258908861557871]