In [1]:
import requests
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# inline plotting
%matplotlib inline


# Vectorize, Transform
def make_X(data, min_df_in, stopwords, lowercase, ngram):
    """Convert raw documents into a dense TF-IDF feature table.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorize.
    min_df_in : int or float
        Forwarded to ``CountVectorizer(min_df=...)``.
    stopwords : str, list or None
        Forwarded to ``CountVectorizer(stop_words=...)``.
    lowercase : bool
        Forwarded to ``CountVectorizer(lowercase=...)``.
    ngram : tuple of (int, int)
        Forwarded to ``CountVectorizer(ngram_range=...)``.

    Returns
    -------
    tfidf : pandas.DataFrame
        TF-IDF weights, one row per document, with default integer row and
        column labels.
    vectorizer : CountVectorizer
        The vectorizer fitted on ``data``.
    transformer : TfidfTransformer
        The transformer fitted on the term counts.
    """
    vectorizer = CountVectorizer(
        min_df=min_df_in,
        stop_words=stopwords,
        lowercase=lowercase,
        ngram_range=ngram,
    )
    # Keep the counts sparse: TfidfTransformer works on sparse input directly,
    # so a dense copy of the full document-term matrix is never needed here.
    term_doc_matrix = vectorizer.fit_transform(data)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(term_doc_matrix)
    # Densify only once, for the DataFrame handed back to the caller.
    tfidf = pd.DataFrame(tfidf.toarray())
    return tfidf, vectorizer, transformer

# Send scores to a Google Form
def post_score(name, your_score):
    """Submit a name and score to a Google Form and print the HTTP response.

    Parameters
    ----------
    name : str
        Value sent as form field ``entry.278237990``.
    your_score : object
        Value sent as form field ``entry.415269798`` after ``str()`` conversion.

    Returns
    -------
    None
        The ``requests`` response object is printed, not returned.
    """
    send = 'https://docs.google.com/forms/d/e/1FAIpQLSfOdYRNhf_z3PsHDxMu-IoqaUbUaI9uSHflExgZuBoC1HNvtQ/formResponse?'
    # Let requests build the query string so both values are percent-encoded;
    # a name containing a space, '&', '=' or '#' would otherwise corrupt the
    # request. A list of pairs keeps the field order fixed.
    fields = [
        ('entry.278237990', name),
        ('entry.415269798', str(your_score)),
    ]
    r = requests.post(send, params=fields)
    print(r)

In [34]:
# Preview the first rows of the review frame.
# NOTE(review): `df` is only assigned in the data-loading cell further down,
# so this cell fails on a fresh kernel when the notebook is run top to bottom.
df.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
1,Derek Adams,fresh,114709,Time Out,"So ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
2,Richard Corliss,fresh,114709,TIME Magazine,The year's most inventive comedy.,2008-08-31,9559,Toy story
3,David Ansen,fresh,114709,Newsweek,A winning animated feature that has something ...,2008-08-18,9559,Toy story
4,Leonard Klady,fresh,114709,Variety,The film sports a provocative and appealing st...,2008-06-09,9559,Toy story
5,Jonathan Rosenbaum,fresh,114709,Chicago Reader,"An entertaining computer-generated, hyperreali...",2008-03-10,9559,Toy story


In [2]:
# Load the critic reviews and discard rows whose rating is the literal 'none'.
critics = (
    pd.read_csv('resources/critics.csv')
    .loc[lambda frame: frame['fresh'].ne('none')]
)
# Work on an independent copy with incomplete rows removed.
df = critics.copy().dropna()
# Features are the review text; the target is True for 'fresh' reviews.
X = df['quote']
y = df['fresh'].eq('fresh')

In [26]:
# Lower-case every quote with the vectorised string accessor instead of a
# per-row Python lambda.
X_lower = X.str.lower()

In [35]:
import spacy
# Alternative model: `import en_core_web_md` (that package has to be downloaded separately first).
nlp = spacy.load('en_core_web_sm')
# NOTE(review): plain assignment, not a copy. `data_spacy` and `df` are the
# same DataFrame object, so columns added or overwritten through `data_spacy`
# in later cells also change `df`.
data_spacy = df

In [37]:
# Lower-case the review text using the vectorised string accessor.
# `data_spacy` was bound to `df` by plain assignment, so this also rewrites
# `df['quote']`.
data_spacy['quote'] = data_spacy['quote'].str.lower()
data_spacy.head()

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title
1,Derek Adams,fresh,114709,Time Out,"so ingenious in concept, design and execution ...",2009-10-04,9559,Toy story
2,Richard Corliss,fresh,114709,TIME Magazine,the year's most inventive comedy.,2008-08-31,9559,Toy story
3,David Ansen,fresh,114709,Newsweek,a winning animated feature that has something ...,2008-08-18,9559,Toy story
4,Leonard Klady,fresh,114709,Variety,the film sports a provocative and appealing st...,2008-06-09,9559,Toy story
5,Jonathan Rosenbaum,fresh,114709,Chicago Reader,"an entertaining computer-generated, hyperreali...",2008-03-10,9559,Toy story


In [44]:
# Stream every quote through the spaCy pipeline; `nlp.pipe` yields one parsed
# document per input text. No `n_threads` argument is passed: spaCy deprecated
# it in v2.1 (it has no effect there) and v3 does not accept it.
parsed = nlp.pipe(data_spacy['quote'], batch_size=1)

# Tokens to strip when building `tokens_stopwords`. The list is empty, so that
# column currently ends up identical to `tokens`.
stop = []

# Create Features: one inner list per quote for each token attribute.
tokens, lemma, parts, stopword = [], [], [], []

for parsed_doc in parsed:
    tokens.append([n.text for n in parsed_doc])        # surface form
    lemma.append([n.lemma_ for n in parsed_doc])       # lemma string
    parts.append([n.pos_ for n in parsed_doc])         # coarse part-of-speech tag
    stopword.append([n.is_stop for n in parsed_doc])   # spaCy stop-word flag

# Assign Parsed into Dataframe (each list has one entry per row, in row order).
data_spacy['tokens'] = tokens
data_spacy['tokens_stopwords'] = data_spacy['tokens'].apply(lambda x: [item for item in x if item not in stop])
data_spacy['lemma'] = lemma
data_spacy['pos'] = parts
data_spacy['stopword'] = stopword

# Describe resulting frame
data_spacy.head(2)

Unnamed: 0,critic,fresh,imdb,publication,quote,review_date,rtid,title,tokens,tokens_stopwords,lemma,pos,stopword
1,Derek Adams,fresh,114709,Time Out,"so ingenious in concept, design and execution ...",2009-10-04,9559,Toy story,"[so, ingenious, in, concept, ,, design, and, e...","[so, ingenious, in, concept, ,, design, and, e...","[so, ingenious, in, concept, ,, design, and, e...","[ADV, ADJ, ADP, NOUN, PUNCT, NOUN, CCONJ, NOUN...","[True, False, True, False, False, False, True,..."
2,Richard Corliss,fresh,114709,TIME Magazine,the year's most inventive comedy.,2008-08-31,9559,Toy story,"[the, year, 's, most, inventive, comedy, .]","[the, year, 's, most, inventive, comedy, .]","[the, year, 's, most, inventive, comedy, .]","[DET, NOUN, PART, ADV, ADJ, NOUN, PUNCT]","[True, False, False, True, False, False, False]"


In [47]:
# Custom stop words handed to the vectorizer; each word is listed exactly once.
# Extend this list to experiment with other stop words.
stopwords_list = ['is', 'the', 'a', 'an', 'has', 'have', 'of', 'to', 'and', 'in']

In [62]:
# Text column fed to the vectorizer; swap in engineered features here.
x_in = df['quote']

# Build the TF-IDF features. Every argument below is a tunable option:
# minimum document frequency of 10, the custom stop-word list defined above,
# no lower-casing inside the vectorizer, and unigrams only.
X2, vectorizer_2, transformer_2 = make_X(
    data=x_in,
    min_df_in=10,
    stopwords=stopwords_list,
    lowercase=False,
    ngram=(1, 1),
)

In [74]:
# One-hot encode the critic names and show the size of the resulting frame.
temp = pd.get_dummies(df['critic'])
temp.shape

(14770, 621)

In [76]:
# Size of the working frame as (rows, columns). The column count includes the
# token columns added through `data_spacy`, which is the same object as `df`.
df.shape

(14770, 13)

In [None]:
# Hold out 15% of the rows so the model can be judged on data it has not seen.
# The fixed seed makes the split reproducible.
seed = 100
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y,
    test_size=0.15,
    random_state=seed,
)
# Show the shapes in the order: train features, train labels, test features, test labels.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

In [55]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class weights balanced against class frequency.
clf = LogisticRegression(class_weight='balanced')
# Alternative model:
# clf = MultinomialNB(alpha=1) # A smoothing parameter.

# Fit on the training split only; the held-out rows stay untouched for scoring.
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [56]:
# Mean accuracy of the fitted classifier on the rows it was trained on ...
print("Accuracy on Training Data: %f" % clf.score(X_train,y_train))
# ... and on the held-out rows; the gap between the two hints at overfitting.
print("Accuracy on Test Data: %f" % clf.score(X_test,y_test))

Accuracy on Training Data: 0.833997
Accuracy on Test Data: 0.753159


In [52]:
# Predict labels for the held-out rows.
y_pred = clf.predict(X_test)
# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Divide by the grand total so each cell is a fraction of the test set.
cm / cm.sum()

array([[ 0.20126354,  0.19314079],
       [ 0.06001805,  0.54557762]])

In [53]:
# Per-class precision, recall, F1 and support on the held-out rows.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      False       0.77      0.51      0.61       874
       True       0.74      0.90      0.81      1342

avg / total       0.75      0.75      0.73      2216



In [54]:
# Label for this run and its accuracy on the held-out rows.
name = "Baseline"
result = clf.score(X_test,y_test)
# NOTE(review): `post_score(name, result)` is defined at the top of the
# notebook but is not called here; the score is only printed.
print('%s, Accuracy score of: %f' % (name, result))

Baseline, Accuracy score of: 0.746841
