# Baseline Bag of Words Classifier

In [92]:
import pandas as pd
import numpy as np
import psycopg2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import nltk
import sys
from sqlalchemy import create_engine
import string
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import pickle
from nltk import sent_tokenize, pos_tag, wordpunct_tokenize
from nltk.corpus import wordnet as wn
from nltk import NaiveBayesClassifier
from sklearn.linear_model import SGDClassifier

In [2]:
# Load the articles from the local Postgres database, keeping one row per
# distinct title, and preview the first row.
engine = create_engine('postgresql://teresaborcuch@localhost:5433/capstone')
query = (
    "SELECT DISTINCT ON(title) "
    "title, date, author, body, link, section "
    "FROM ny_times;"
)
data = pd.read_sql(sql=query, con=engine)
data.head(n=1)

Unnamed: 0,title,date,author,body,link,section
0,$5 Million for a Super Bowl Ad. Another Millio...,20170129,Sapna Maheshwari,"This month, Anheuser-Busch InBev hosted a doze...",http://www.nytimes.com/2017/01/29/business/5-m...,business


In [77]:
# make documents
def make_documents(col, labels):
    """Pair every text in ``col`` with its label as an NLTK-style featureset.

    Each text is tokenized with ``NLTKPreprocessor().transform`` and converted
    into a ``{token: True}`` presence dictionary; the dictionaries are then
    zipped with the corresponding entries of ``labels``.

    NOTE: ``NLTKPreprocessor`` is defined in a later cell of this notebook, so
    that cell has to be executed before this function is called.

    Returns whatever ``zip`` returns for (featureset, label) pairs.
    """
    token_lists = NLTKPreprocessor().transform(col)
    featuresets = [dict.fromkeys(tokens, True) for tokens in token_lists]
    return zip(featuresets, labels)

In [78]:
# Binary target: the 'opinion' section keeps its name, every other section
# is collapsed into 'non-opinion'. Featuresets are built from the titles.
labels = ['non-opinion' if section != 'opinion' else 'opinion' for section in data['section']]
documents = make_documents(col=data['title'], labels=labels)

In [79]:
# identify all opinions and non
# Route each (featureset, label) pair into the list for its label in a
# single pass; pairs carrying any other label are ignored.
op_docs, non_op_docs = [], []
buckets = {'opinion': op_docs, 'non-opinion': non_op_docs}

for doc in documents:
    bucket = buckets.get(doc[1])
    if bucket is not None:
        bucket.append(doc)

In [80]:
# Number of (featureset, label) pairs labelled 'opinion'.
len(op_docs)

192

In [81]:
# Number of (featureset, label) pairs labelled 'non-opinion'.
len(non_op_docs)

877

In [90]:
# Majority-class baseline: about 82% of articles are NON-opinion, so always
# predicting 'non-opinion' already scores ~0.82 accuracy -- the number to beat.
# Computed from the actual class counts instead of hard-coded literals.
1 - float(len(op_docs)) / (len(op_docs) + len(non_op_docs))

0.8203928905519177

In [82]:
# make train and test sets
# The first 75% of each class goes to training and the remainder to testing.
# The split is positional (no shuffling): it follows the order of the lists.
# Floor division keeps the cutoffs integers so they remain valid slice
# indices even where `/` is true division (Python 3 or
# `from __future__ import division`).
opcutoff = len(op_docs) * 3 // 4
nonopcutoff = len(non_op_docs) * 3 // 4

train_docs = op_docs[:opcutoff] + non_op_docs[:nonopcutoff]
test_docs = op_docs[opcutoff:] + non_op_docs[nonopcutoff:]

In [83]:
# Train NLTK's Naive Bayes classifier on the training featuresets and report
# its accuracy on the held-out featuresets.
classifier = NaiveBayesClassifier.train(train_docs)
# A single pre-formatted string is valid both as a Python 2 print statement
# and as a Python 3 print() call.
print('accuracy:  {}'.format(nltk.classify.util.accuracy(classifier, test_docs)))

accuracy:  0.664179104478


In [84]:
# Print the features NLTK's Naive Bayes classifier ranks as most informative.
classifier.show_most_informative_features()

Most Informative Features
                  muslim = True           opinio : non-op =     11.8 : 1.0
                  donald = True           opinio : non-op =      8.7 : 1.0
                 housing = True           opinio : non-op =      7.6 : 1.0
                    deal = True           opinio : non-op =      7.6 : 1.0
                  toward = True           opinio : non-op =      7.6 : 1.0
                  public = True           opinio : non-op =      7.6 : 1.0
                    neil = True           opinio : non-op =      6.4 : 1.0
                  bannon = True           opinio : non-op =      6.4 : 1.0
                  thomas = True           opinio : non-op =      4.5 : 1.0
                    hate = True           opinio : non-op =      4.5 : 1.0


In [85]:
# try for article bodies: rebuild the featuresets from the body text instead
# of the titles, reusing the same binary labels.
documents = make_documents(data['body'], labels)

In [86]:
# identify all opinions and non
op_docs = []
non_op_docs = []

# Single pass over the (featureset, label) pairs; other labels are ignored.
for i in documents:
    if i[1] == 'opinion':
        op_docs.append(i)
    elif i[1] == 'non-opinion':
        non_op_docs.append(i)

# make train and test sets
# The first 75% of each class goes to training and the remainder to testing
# (positional split, no shuffling). Floor division keeps the cutoffs integers
# so they remain valid slice indices even where `/` is true division
# (Python 3 or `from __future__ import division`).
opcutoff = len(op_docs) * 3 // 4
nonopcutoff = len(non_op_docs) * 3 // 4

train_docs = op_docs[:opcutoff] + non_op_docs[:nonopcutoff]
test_docs = op_docs[opcutoff:] + non_op_docs[nonopcutoff:]

In [87]:
# Retrain the Naive Bayes classifier on the current train_docs, report
# held-out accuracy, then list its most informative features.
classifier = NaiveBayesClassifier.train(train_docs)
# A single pre-formatted string is valid both as a Python 2 print statement
# and as a Python 3 print() call.
print('accuracy:  {}'.format(nltk.classify.util.accuracy(classifier, test_docs)))
classifier.show_most_informative_features()

accuracy:  0.858208955224
Most Informative Features
            incompetence = True           opinio : non-op =     16.6 : 1.0
                 foolish = True           opinio : non-op =     16.6 : 1.0
              politicize = True           opinio : non-op =     13.6 : 1.0
               misguided = True           opinio : non-op =     13.6 : 1.0
                     alt = True           opinio : non-op =     13.6 : 1.0
                     ore = True           opinio : non-op =     13.6 : 1.0
                 obesity = True           opinio : non-op =     13.6 : 1.0
                  modify = True           opinio : non-op =     13.6 : 1.0
                 cruelty = True           opinio : non-op =     13.6 : 1.0
               disregard = True           opinio : non-op =     13.6 : 1.0


# Using NLTKPreprocessor + SGDClassifier

In [95]:
def identity(arg):
    """Return ``arg`` unchanged.

    Passed as the ``tokenizer`` of the TfidfVectorizer in this notebook so
    that documents which are already lists of tokens are used as they are.
    """
    return arg

In [5]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that turns raw documents into lemma lists.

    Each document is split into sentences, tokenized, POS-tagged, optionally
    lower-cased and stripped, filtered against stopwords and punctuation, and
    finally lemmatized with WordNet.

    Parameters
    ----------
    stopwords : collection of str or None
        Tokens to drop. ``None`` selects NLTK's English stopword list.
    punct : collection of str or None
        Characters regarded as punctuation; a token made up only of these
        characters is dropped. ``None`` selects ``string.punctuation``.
    lower : bool
        Lower-case every token before filtering.
    strip : bool
        Strip surrounding whitespace, underscores and asterisks from tokens.
    """
    def __init__(self, stopwords = None, punct = None, lower = True, strip = True):
        self.lower = lower
        self.strip = strip
        # Honour caller-supplied collections; the defaults are used only when
        # the argument is None (previously both arguments were ignored).
        if stopwords is None:
            stopwords = set(nltk.corpus.stopwords.words('english'))
        if punct is None:
            punct = set(string.punctuation)
        self.stopwords = stopwords
        self.punct = punct
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the object fits in a Pipeline."""
        return self

    def inverse_transform(self, X):
        """Join each token list in ``X`` back into a space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``."""
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        """Yield the lemma of every token of ``document`` that passes the filters."""
        # Non-ASCII characters are replaced with '?' by the 'replace' handler.
        document = document.encode('ascii', errors = 'replace')
        if not isinstance(document, str):
            # Python 3: encode() returned bytes; decode so NLTK receives text.
            # On Python 2 the encoded value is already a str and is left as is.
            document = document.decode('ascii')
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                if self.lower:
                    token = token.lower()
                if self.strip:
                    token = token.strip().strip('_').strip('*')
                if token in self.stopwords:
                    continue
                # Also true for an empty token, so those are skipped as well.
                if all(char in self.punct for char in token):
                    continue
                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """Lemmatize ``token`` using the WordNet POS matching the tag's first letter.

        Tags that do not start with N, V, R or J are treated as nouns.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)

In [93]:
def build_and_evaluate(X, y,
                       classifier = SGDClassifier, outpath = None, verbose = True,
                       random_state = None):
    """Evaluate a text-classification pipeline on a hold-out split, then refit it on all data.

    The pipeline is NLTKPreprocessor -> TfidfVectorizer (fed pre-tokenized
    documents through ``identity``) -> ``classifier``.

    Parameters
    ----------
    X : iterable of raw documents.
    y : iterable of labels; encoded to integers with a LabelEncoder.
    classifier : estimator class or instance; a class is instantiated with
        its default arguments each time a pipeline is built.
    outpath : path or None; when given, the final model is pickled there.
    verbose : bool; print progress messages.
    random_state : seed forwarded to ``train_test_split``; the default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    The pipeline fitted on all of ``X``/``y``, with the fitted LabelEncoder
    attached as ``model.labels_``.
    """
    def build(classifier, X, y=None):
        # Accept either an estimator class or an already constructed instance.
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
                ('preprocessor', NLTKPreprocessor()),
                ('vectorizer', TfidfVectorizer(
                    tokenizer = identity, preprocessor=None,
                        lowercase = False)),
                ('classifier', classifier)
            ])

        model.fit(X,y)
        return model

    labels = LabelEncoder()
    y = labels.fit_transform(y)

    if verbose:
        print("Building for evaluation")
    # Hold out 20% of the data for the classification report.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, random_state = random_state)
    model = build(classifier, X_train, y_train)

    if verbose:
        print("Classification Report: \n")

    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, target_names = labels.classes_))

    if verbose:
        print("Building complete model and saving ...")

    # The returned model is refit on every sample, not only the training split.
    model = build(classifier, X, y)
    model.labels_ = labels

    print("Done")

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model

In [102]:
from operator import itemgetter

In [105]:
def show_most_informative_features(model, text=None, n=20):
    """Format the ``n`` highest and ``n`` lowest weighted features as a two-column table.

    Parameters
    ----------
    model : fitted pipeline exposing ``steps``/``named_steps`` with a
        'vectorizer' step and a 'classifier' step.
    text : str or None
        When None, features are ranked by the first row of the classifier's
        ``coef_``. When given, they are ranked by the weights the pipeline's
        feature-extraction steps assign to ``text``, and the model's
        prediction for ``text`` is added to the top of the output.
    n : int
        Number of rows (top and bottom features) to show.

    Returns
    -------
    str : one line per row, "<weight><feature>    <weight><feature>".

    Raises
    ------
    TypeError : if the classifier has no ``coef_`` attribute.
    """
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps['vectorizer']
    classifier = model.named_steps['classifier']

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Vectorize the text with every step except the final classifier.
        # model.transform() would also call transform on the classifier step,
        # so its output would not line up with the vectorizer's feature names.
        features = [text]
        for _, step in model.steps[:-1]:
            features = step.transform(features)
        tvec = features.toarray()
    else:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_

    # Zip the feature names with the weights and sort, largest weight first.
    # Only row 0 is used: the single document, or the first row of coef_.
    coefs = sorted(
        zip(tvec[0], vectorizer.get_feature_names()),
        key=itemgetter(0), reverse=True
    )

    # Pair the i-th largest entry with the i-th smallest one
    topn  = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append(
            "Classified as: {}".format(model.predict([text]))
        )
        output.append("")

    # Left column: largest weights; right column: smallest weights.
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(
                cp, fnp, cn, fnn
            )
        )

    return "\n".join(output)

In [96]:
# Article titles as the input documents, binary opinion/non-opinion labels
# as the target; the fitted pipeline is kept in `model` (nothing is pickled).
X = data['title']
y = labels

model = build_and_evaluate(X, y)

Building for evaluation
Classification Report: 

             precision    recall  f1-score   support

non-opinion       0.84      0.88      0.86       174
    opinion       0.32      0.25      0.28        40

avg / total       0.74      0.76      0.75       214

Building complete model and saving ...
Done


In [107]:
# Table of the highest and lowest weighted features of the fitted pipeline.
features = show_most_informative_features(model)
# Parenthesised single argument: valid as a Python 2 print statement and as
# a Python 3 print() call.
print(features)

4.3877       disorder    -4.1639            say
4.0675        protect    -3.4971      nominates
3.6424          laura    -3.1159              u
3.5982     antibiotic    -3.0363      sometimes
3.5460         donald    -2.9509          fight
3.4113       together    -2.7617          robot
3.3582       election    -2.6408           amid
3.3207        housing    -2.5663         review
3.2687          sugar    -2.3755            try
3.2282        tension    -2.3661            end
3.1741        prepare    -2.3563            far
3.1130     journalist    -2.3146           plan
3.0835            lie    -2.3108           role
3.0449          march    -2.2773           town
3.0400            lab    -2.2701        couture
3.0364        calming    -2.2552         mexico
3.0222    embarrassed    -2.2144        meeting
3.0213           mile    -2.2110            get
2.9883           cede    -2.1925          black
2.9656       everyone    -2.1907           visa


In [None]:
# try with article bodies
# NOTE(review): the target here is the raw `section` column (one class per
# section), not the binary opinion/non-opinion `labels` used for the titles,
# so the two reports are not directly comparable -- confirm this is intended.
X = data['body']
y = data['section']
model = build_and_evaluate(X, y)

Building for evaluation
Classification Report: 



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


               precision    recall  f1-score   support

         arts       0.75      0.64      0.69        14
  automobiles       0.00      0.00      0.00         1
        books       0.33      0.50      0.40         2
     business       0.44      0.57      0.50         7
       dining       0.00      0.00      0.00         0
    education       0.90      0.82      0.86        11
      fashion       0.67      0.67      0.67         3
       health       0.00      0.00      0.00         1
      insider       0.00      0.00      0.00         2
         jobs       0.90      0.75      0.82        12
     magazine       0.00      0.00      0.00         6
       movies       0.86      0.74      0.79        42
     nyregion       0.50      0.33      0.40         3
   obituaries       0.83      1.00      0.91        30
      opinion       1.00      0.20      0.33         5
   realestate       0.20      1.00      0.33         1
      science       1.00      0.75      0.86         8
       sp

In [110]:
# Table of the highest and lowest weighted features of the current `model`.
features = show_most_informative_features(model)
# Parenthesised single argument: valid as a Python 2 print statement and as
# a Python 3 print() call.
print(features)

4.7358         editor    -9.6527            say
3.9959          trump    -4.1980             mr
3.3537      president    -2.6711              m
2.9853         macron    -2.1833        twitter
2.9059          brave    -2.0960         monday
2.5556          voter    -2.0685        company
2.4291   chlorpyrifos    -1.9425          movie
2.3064        vietnam    -1.8501           plan
2.2595          right    -1.8163        senator
2.1817            jan    -1.8110      statement
2.1280         agency    -1.6829        premium
2.1177          truth    -1.6529            add
2.0745        spanish    -1.6000         accord
2.0030         bannon    -1.5571     university
1.9893          sugar    -1.5512          group
1.9612     bangladesh    -1.5510           visa
1.9352          skull    -1.5009           also
1.9247      mikhalkov    -1.4838            ice
1.8651        blowout    -1.4658     remittance
1.8529        reality    -1.4528             dr


# Bigrams

In [121]:
from nltk import BigramCollocationFinder
# Association-measure helper; its raw_freq scorer is used to score bigrams below.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [112]:
# Stand-alone preprocessor instance with the default settings.
pr = NLTKPreprocessor()

In [114]:
# One list of lemmas per document in X.
X_t = pr.transform(X)

In [116]:
# Number of preprocessed documents.
len(X_t)

1069

In [128]:
# For every document in X, score each adjacent token pair by its raw
# relative frequency; bigrams[i] holds the scored pairs of the i-th document.
bigrams = []
for text in X:
    # Drop any non-ASCII characters.
    text = text.encode('ascii','ignore')
    if not isinstance(text, str):
        # Python 3: encode() returned bytes; decode so word_tokenize gets text.
        # On Python 2 the encoded value is already a str and is left as is.
        text = text.decode('ascii')
    tokens = nltk.word_tokenize(text)
    finder = BigramCollocationFinder.from_words(tokens)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    bigrams.append(scored)

In [129]:
# Scored bigrams of the first document (the full list is displayed).
bigrams[0]

[(('Super', 'Bowl'), 0.009796533534287867),
 ((',', 'the'), 0.006782215523737754),
 (('.', 'The'), 0.006028636021100226),
 ((',', 'which'), 0.00452147701582517),
 (('the', 'Super'), 0.00452147701582517),
 (('the', 'ad'), 0.00452147701582517),
 (('the', 'ads'), 0.00452147701582517),
 ((',', 'said'), 0.0037678975131876413),
 (('of', 'a'), 0.0037678975131876413),
 (('the', 'company'), 0.0037678975131876413),
 (('the', 'game'), 0.0037678975131876413),
 (('Bowl', 'ad'), 0.003014318010550113),
 (('last', 'year'), 0.003014318010550113),
 (('to', 'the'), 0.003014318010550113),
 ((',', 'and'), 0.002260738507912585),
 ((',', 'as'), 0.002260738507912585),
 (('Bowl', 'ads'), 0.002260738507912585),
 (('about', 'the'), 0.002260738507912585),
 (('companies', 'to'), 0.002260738507912585),
 (('for', 'companies'), 0.002260738507912585),
 (('for', 'the'), 0.002260738507912585),
 (('from', 'the'), 0.002260738507912585),
 (('said', ','), 0.002260738507912585),
 (('social', 'media'), 0.002260738507912585),
