In [2]:
import pandas as pd
import numpy as np
import psycopg2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import nltk
import sys
from sqlalchemy import create_engine
import string
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split, cross_val_score

In [3]:
engine = create_engine('postgresql://teresaborcuch@localhost:5433/capstone')

In [4]:
query = "SELECT DISTINCT ON(title) title, date, author, body, link, section FROM ny_times;"

In [5]:
data = pd.read_sql(query, engine)

In [6]:
data.head()

Unnamed: 0,title,date,author,body,link,section
0,$5 Million for a Super Bowl Ad. Another Millio...,20170129,Sapna Maheshwari,"This month, Anheuser-Busch InBev hosted a doze...",http://www.nytimes.com/2017/01/29/business/5-m...,business
1,"$60,000 in Tuition, and My Son Wants to Become...",20170112,Philip Galanes,My wife and I are spending a fortune to send o...,http://www.nytimes.com/2017/01/12/fashion/farm...,fashion
2,"1 Patient, 7 Tumors and 100 Billion Cells Equa...",20161207,Denise Grady,The remarkable recovery of a woman with advanc...,http://www.nytimes.com/2016/12/07/health/cance...,health
3,10 Unexpected Styling Tricks for Men,20170125,Alex Tudela,,http://www.nytimes.com/2017/01/25/t-magazine/f...,t-magazine
4,15 of the Best Journals by Our Reporters Aroun...,20161230,Barbara Tierney,Our foreign correspondents wrote about dozens ...,http://www.nytimes.com/2016/12/30/world/15-of-...,world


In [7]:
data.shape

(751, 6)

In [8]:
data.to_csv('/Users/teresaborcuch/Desktop/nyt_backup.csv', encoding = 'utf8')

In [9]:
for i in data['title']:
    i = i.encode('utf-8')

In [10]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
chars = string.punctuation + '’'

In [12]:
chars

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\xe2\x80\x99'

In [13]:
'’'

'\xe2\x80\x99'

In [14]:
print 0xe2

226


In [15]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = text.encode('ascii', errors = 'replace')
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [16]:
c_vec = TfidfVectorizer(decode_error = 'replace', tokenizer = tokenize, strip_accents = 'unicode', stop_words = 'english')
c_vec.fit(data['title'])

TfidfVectorizer(analyzer=u'word', binary=False, decode_error='replace',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize at 0x11a2798c0>, use_idf=True,
        vocabulary=None)

In [17]:
v_titles = pd.DataFrame(c_vec.transform(data['title']).todense(), columns = c_vec.get_feature_names())

In [18]:
v_titles.head()

Unnamed: 0,1,10,100,1000,106,11th,15,16,16000,18900,...,youv,yuliya,zealand,zealou,zenbanx,zero,zhiqiang,zimmerman,zombi,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.5218,0.0,0.289898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.425791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.519427,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
v_titles.sum(axis=0).sort_values(ascending = False).head(5)

trump     29.455227
ban       10.114447
new        9.358870
immigr     9.314399
order      8.455221
dtype: float64

In [20]:
# make y opinion vs non-opinion
y = [1 if i == 'opinion' else 0 for i in data['section']]

In [21]:
# feature selection
rfecv = RFECV(estimator = DecisionTreeClassifier(), cv = 3, scoring = 'mean_squared_error')
rfecv.fit(v_titles, y)

RFECV(cv=3,
   estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
   estimator_params=None, scoring='mean_squared_error', step=1, verbose=0)

In [22]:
rfecv_cols = v_titles.columns[rfecv.support_]

In [23]:
len(rfecv_cols)

2041

In [24]:
X = v_titles[rfecv_cols]

In [28]:
# make train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

TypeError: Expected sequence or array-like, got estimator       america  american  amid  amour  amplifi  analog  ancestor  ancient  \
0    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
1    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
2    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
3    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
4    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
5    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
6    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
7    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
8    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
9    0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
10   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
11   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
12   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
13   0.361474  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
14   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
15   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
16   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
17   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
18   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
19   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
20   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
21   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
22   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
23   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
24   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
25   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
26   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
27   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
28   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
29   0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
..        ...       ...   ...    ...      ...     ...       ...      ...   
721  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
722  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
723  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
724  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
725  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
726  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
727  0.000000  0.391207   0.0    0.0      0.0     0.0       0.0      0.0   
728  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
729  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
730  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
731  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
732  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
733  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
734  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
735  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
736  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
737  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
738  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
739  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
740  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
741  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
742  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
743  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
744  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
745  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
746  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
747  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
748  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
749  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   
750  0.000000  0.000000   0.0    0.0      0.0     0.0       0.0      0.0   

     andrew      anew   ...        yemen  yen  yield  york     young  younger  \
0       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
1       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
2       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
3       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
4       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
5       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
6       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
7       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
8       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
9       0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
10      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.423583      0.0   
11      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
12      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
13      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
14      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
15      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
16      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
17      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
18      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
19      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
20      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
21      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
22      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
23      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
24      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
25      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
26      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
27      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
28      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
29      0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
..      ...       ...   ...          ...  ...    ...   ...       ...      ...   
721     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
722     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
723     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
724     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
725     0.0  0.000000   ...     0.351819  0.0    0.0   0.0  0.000000      0.0   
726     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
727     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
728     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
729     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
730     0.0  0.362871   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
731     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
732     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
733     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
734     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
735     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
736     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
737     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
738     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
739     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
740     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
741     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
742     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
743     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
744     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
745     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
746     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
747     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
748     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
749     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   
750     0.0  0.000000   ...     0.000000  0.0    0.0   0.0  0.000000      0.0   

        youth  youv    yuliya  zealand  
0    0.000000   0.0  0.000000      0.0  
1    0.000000   0.0  0.000000      0.0  
2    0.000000   0.0  0.000000      0.0  
3    0.000000   0.0  0.000000      0.0  
4    0.000000   0.0  0.000000      0.0  
5    0.000000   0.0  0.000000      0.0  
6    0.000000   0.0  0.000000      0.0  
7    0.000000   0.0  0.000000      0.0  
8    0.000000   0.0  0.000000      0.0  
9    0.000000   0.0  0.000000      0.0  
10   0.000000   0.0  0.000000      0.0  
11   0.000000   0.0  0.000000      0.0  
12   0.000000   0.0  0.000000      0.0  
13   0.000000   0.0  0.000000      0.0  
14   0.000000   0.0  0.000000      0.0  
15   0.000000   0.0  0.000000      0.0  
16   0.000000   0.0  0.000000      0.0  
17   0.000000   0.0  0.000000      0.0  
18   0.000000   0.0  0.000000      0.0  
19   0.000000   0.0  0.000000      0.0  
20   0.000000   0.0  0.000000      0.0  
21   0.000000   0.0  0.000000      0.0  
22   0.000000   0.0  0.000000      0.0  
23   0.000000   0.0  0.000000      0.0  
24   0.000000   0.0  0.000000      0.0  
25   0.000000   0.0  0.000000      0.0  
26   0.000000   0.0  0.000000      0.0  
27   0.000000   0.0  0.000000      0.0  
28   0.000000   0.0  0.000000      0.0  
29   0.000000   0.0  0.000000      0.0  
..        ...   ...       ...      ...  
721  0.000000   0.0  0.000000      0.0  
722  0.000000   0.0  0.000000      0.0  
723  0.000000   0.0  0.000000      0.0  
724  0.000000   0.0  0.000000      0.0  
725  0.000000   0.0  0.000000      0.0  
726  0.000000   0.0  0.000000      0.0  
727  0.000000   0.0  0.000000      0.0  
728  0.000000   0.0  0.000000      0.0  
729  0.414721   0.0  0.000000      0.0  
730  0.000000   0.0  0.362871      0.0  
731  0.000000   0.0  0.000000      0.0  
732  0.000000   0.0  0.000000      0.0  
733  0.000000   0.0  0.000000      0.0  
734  0.000000   0.0  0.000000      0.0  
735  0.000000   0.0  0.000000      0.0  
736  0.000000   0.0  0.000000      0.0  
737  0.000000   0.0  0.000000      0.0  
738  0.000000   0.0  0.000000      0.0  
739  0.000000   0.0  0.000000      0.0  
740  0.000000   0.0  0.000000      0.0  
741  0.000000   0.0  0.000000      0.0  
742  0.000000   0.0  0.000000      0.0  
743  0.000000   0.0  0.000000      0.0  
744  0.000000   0.0  0.000000      0.0  
745  0.000000   0.0  0.000000      0.0  
746  0.000000   0.0  0.000000      0.0  
747  0.000000   0.0  0.000000      0.0  
748  0.000000   0.0  0.000000      0.0  
749  0.000000   0.0  0.000000      0.0  
750  0.000000   0.0  0.000000      0.0  

[751 rows x 2041 columns]

In [None]:
# fit decision tree
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
score = np.mean(cross_val_score(dt, X_test, y_test, cv = 5, scoring = "accuracy"))

In [None]:
score

In [None]:
# 80% of the articles are opinion
1- 58.0/len(data)

# Naive Bayes

In [None]:
from nltk.corpus import movie_reviews
import random
documents = [(list(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [None]:
documents[0]

In [None]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [None]:
all_words

In [None]:
word_features = list(all_words)[:2000]

In [None]:
word_features

In [None]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [None]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
train_set[0]

## Preprocessing titles for NLTK Naive Bayes

In [29]:
# make these functions part of the list of words for documents
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        if stemmer.stem(item) not in nltk.corpus.stopwords.words('english'):
            stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = text.encode('ascii', errors = 'replace')
    text = ''.join([ch.lower() for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [30]:
# preprocessing for NLTK Naive Bayes
# titles have been tokenized, accents/punctuation/stopwords removed
#documents = [(tokenize(title), label) for title in data['title'] for label in y]
labels = ["opinion" if i == 'opinion' else "not opinion" for i in data['section']]
tokenized_titles = [tokenize(title) for title in data['title']]
documents = zip(tokenized_titles, labels)

In [31]:
# make a giant list of all the words in the titles
word_list = []
for title in data['title']:
    word_list.extend(tokenize(title))
all_words = nltk.FreqDist(w.lower() for w in word_list)

In [32]:
all_words.most_common(5)

[(u'trump', 156), (u'new', 37), (u'ban', 33), (u'say', 32), (u'immigr', 28)]

In [33]:
word_features = list(all_words)[:2000]

In [34]:
def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in word_features:
        features[word] = (word in doc_words)
    return features

In [35]:
feature_sets = [(doc_features(d), c) for (d,c) in documents]

In [36]:
len(feature_sets)

751

In [37]:
# split into train and tests
train_set, test_set = feature_sets[90:], feature_sets[:90]

In [38]:
len(train_set)

661

In [39]:
# train the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [40]:
# test it on test set
print(nltk.classify.accuracy(classifier, test_set))

0.655555555556


In [41]:
# get most informative features
classifier.show_most_informative_features(50)

Most Informative Features
                  donald = True           opinio : not op =     17.1 : 1.0
                    dont = True           opinio : not op =     11.7 : 1.0
                   fraud = True           opinio : not op =     11.7 : 1.0
                   march = True           opinio : not op =      8.4 : 1.0
                    moor = True           opinio : not op =      8.4 : 1.0
                    neil = True           opinio : not op =      7.9 : 1.0
                  nomine = True           opinio : not op =      7.0 : 1.0
                    hate = True           opinio : not op =      5.0 : 1.0
                  outcri = True           opinio : not op =      5.0 : 1.0
                     way = True           opinio : not op =      5.0 : 1.0
                 assault = True           opinio : not op =      5.0 : 1.0
                  harlem = True           opinio : not op =      5.0 : 1.0
                   thoma = True           opinio : not op =      5.0 : 1.0

# Pipeline

In [171]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import pickle
from nltk import sent_tokenize, pos_tag, wordpunct_tokenize
from nltk.corpus import wordnet as wn

In [172]:
def identity(arg):
    return arg

In [173]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords = None, punct = None, lower = True, strip = True):
        self.lower = lower
        self.strip = strip
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.punct = set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        
    def fit(self, X, y=None):
        return self
    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]
    def transform(self, X):
        return[
            list(self.tokenize(doc)) for doc in X
        ]
    def tokenize(self, document):
        document = document.encode('ascii', errors = 'replace')
        #document = ''.join([ch.lower() for ch in text if ch not in string.punctuation])
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)): 
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token
                if token in self.stopwords:
                    continue
                if all(char in self.punct for char in token):
                    continue
                lemma = self.lemmatize(token, tag)
                yield lemma
                
    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)

In [183]:
def build_and_evaluate(X, y, 
                       classifier = MultinomialNB, outpath = None, verbose = True):
    def build(classifier, X, y=None):
        if isinstance(classifier, type):
            classifier = classifier()
            
        model = Pipeline([
                ('preprocessor', NLTKPreprocessor()),
                ('vectorizer', TfidfVectorizer(
                    tokenizer = identity, preprocessor=None, 
                        lowercase = False)),
                ('classifier', classifier)
            ])

        model.fit(X,y)
        return model
    
    labels = LabelEncoder()
    y = labels.fit_transform(y)
    
    if verbose: 
        print "Building for evaluation"
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    model = build(classifier, X_train, y_train)
    
    if verbose:
        print "Classification Report: \n"
        
    y_pred = model.predict(X_test)
    print classification_report(y_test, y_pred, target_names = labels.classes_)
    
    if verbose:
        print "Building complete model and saving ..."
        
    model = build(classifier, X, y)
    model.labels_ = labels
    
    print "Done"
    
    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))
        
    return model

In [193]:
def show_most_informative_features(model, text=None, n=20):
    # Extract the vectorizer and the classifier from the pipeline
    vectorizer = model.named_steps['vectorizer']
    classifier = model.named_steps['classifier']

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {}.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Compute the coefficients for the text
        tvec = model.transform([text]).toarray()
    else:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_

    # Zip the feature names with the coefs and sort
    coefs = sorted(
        zip(tvec[0], vectorizer.get_feature_names()),
        key=itemgetter(0), reverse=True
    )

    # Get the top n and bottom n coef, name pairs
    topn  = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append(
            "Classified as: {}".format(model.predict([text]))
        )
        output.append("")

    # Create two columns with most negative and most positive features.
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(
                cp, fnp, cn, fnn
            )
        )

    return "\n".join(output)

In [175]:
X = data['body']
y = ['opinion' if x == 'opinion' else 'non-opinion' for x in data['section']]
model = build_and_evaluate(X, y, outpath = None)

Building for evaluation
Classification Report: 

             precision    recall  f1-score   support

non-opinion       0.86      1.00      0.93       130
    opinion       0.00      0.00      0.00        21

avg / total       0.74      0.86      0.80       151

Building complete model and saving ...


In [185]:
X = data['title']
y = ['opinion' if x == 'opinion' else 'non-opinion' for x in data['section']]
model = build_and_evaluate(X, y, outpath = '/Users/teresaborcuch/capstone_project/model.pkl')

Building for evaluation
Classification Report: 

             precision    recall  f1-score   support

non-opinion       0.78      1.00      0.88       118
    opinion       0.00      0.00      0.00        33

avg / total       0.61      0.78      0.69       151

Building complete model and saving ...
Done
Model written out to /Users/teresaborcuch/capstone_project/model.pkl


In [194]:
with open('/Users/teresaborcuch/capstone_project/model.pkl', 'rb') as f:
    model = pickle.load(f)

show_most_informative_features(model, text = data['title'], n = 20)

AttributeError: 'MultinomialNB' object has no attribute 'transform'

# Naive Bayes on Article Body

In [61]:
# 83% of the articles are not opinion
1 - 127.0/751

0.8308921438082557

In [49]:
# preprocessing for NLTK Naive Bayes
# titles have been tokenized, accents/punctuation/stopwords removed
#documents = [(tokenize(title), label) for title in data['title'] for label in y]
labels = ["opinion" if i == 'opinion' else "not opinion" for i in data['section']]
tokenized_articles = [tokenize(body) for body in data['body']]
documents = zip(tokenized_articles, labels)

In [50]:
# make a giant list of all the words in the articles
word_list = []
for title in data['body']:
    word_list.extend(tokenize(title))
all_words = nltk.FreqDist(w.lower() for w in word_list)

In [51]:
print all_words.most_common(5)
word_features = list(all_words)[:2000]

[(u'wa', 4947), (u'said', 4379), (u'mr', 4066), (u'hi', 3405), (u'ha', 2878)]


In [52]:
feature_sets = [(doc_features(d), c) for (d,c) in documents]

In [53]:
# split into train and tests
train_set, test_set = feature_sets[90:], feature_sets[:90]

In [54]:
# train the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [55]:
# test it on test set
print(nltk.classify.accuracy(classifier, test_set))

0.722222222222


In [56]:
# get most informative features
classifier.show_most_informative_features(50)

Most Informative Features
        japaneseamerican = True           opinio : not op =      9.0 : 1.0
          counterproduct = True           opinio : not op =      8.4 : 1.0
                    tame = True           opinio : not op =      8.4 : 1.0
                   troll = True           opinio : not op =      7.0 : 1.0
                 prosper = True           opinio : not op =      6.5 : 1.0
                selfserv = True           opinio : not op =      5.0 : 1.0
             assemblyman = True           opinio : not op =      5.0 : 1.0
                   intak = True           opinio : not op =      5.0 : 1.0
                 perpetr = True           opinio : not op =      5.0 : 1.0
                 tyranni = True           opinio : not op =      5.0 : 1.0
                   ulcer = True           opinio : not op =      5.0 : 1.0
                 particl = True           opinio : not op =      5.0 : 1.0
               honeymoon = True           opinio : not op =      5.0 : 1.0

# Word Cloud

In [63]:
data.head(2)

Unnamed: 0,title,date,author,body,link,section
0,$5 Million for a Super Bowl Ad. Another Millio...,20170129,Sapna Maheshwari,"This month, Anheuser-Busch InBev hosted a doze...",http://www.nytimes.com/2017/01/29/business/5-m...,business
1,"$60,000 in Tuition, and My Son Wants to Become...",20170112,Philip Galanes,My wife and I are spending a fortune to send o...,http://www.nytimes.com/2017/01/12/fashion/farm...,fashion


In [None]:
# make file of opinion titles
f = open('/Users/teresaborcuch/capstone_project/opinion_titles.txt', 'w')
for i, row in data.iterrows():
    if row[5] == 'opinion':
        f.write(row[0].encode('utf-8'))
        f.write('\n')

In [64]:
# make file of opinion bodies
f = open('/Users/teresaborcuch/capstone_project/opinion_bodies.txt', 'w')
for i, row in data.iterrows():
    if row[5] == 'opinion':
        f.write(row[3].encode('utf-8'))
        f.write('\n')

In [65]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
text = open('/Users/teresaborcuch/capstone_project/opinion_bodies.txt').read()

wordcloud = WordCloud().generate(text)

plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('/Users/teresaborcuch/capstone_project/opinion__body_big.png')


wordcloud = WordCloud(max_font_size = 40).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('/Users/teresaborcuch/capstone_project/opinion_body_sm.png')




In [None]:
# make file of non-opinion titles
f = open('/Users/teresaborcuch/capstone_project/nonopinion_titles.txt', 'w')
for i, row in data.iterrows():
    if row[5] != 'opinion':
        f.write(row[0].encode('utf-8'))
        f.write('\n')

In [66]:
# make file of non-opinion bodies
f = open('/Users/teresaborcuch/capstone_project/nonopinion_bodies.txt', 'w')
for i, row in data.iterrows():
    if row[5] != 'opinion':
        f.write(row[3].encode('utf-8'))
        f.write('\n')

In [67]:
# make non-opinion word cloud
text = open('/Users/teresaborcuch/capstone_project/nonopinion_bodies.txt').read()
wordcloud = WordCloud().generate(text)

plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('/Users/teresaborcuch/capstone_project/non_opinion_body_big.png')

wordcloud = WordCloud(max_font_size = 40).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('/Users/teresaborcuch/capstone_project/non_opinion_body_sm.png')
