In [1]:
import nltk

In [2]:
import textblob

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline



In [4]:
# read yelp.csv into a DataFrame
url = 'fake_or_real_news.csv'
news = pd.read_csv(url, encoding='unicode-escape')

In [5]:
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillaryâs Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"â Kaydee King (@KaydeeKing) November 9, 2016...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [6]:
# define X and y
X = news.title
y = news.label
print y.value_counts(normalize=True)

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

REAL    0.500552
FAKE    0.499448
Name: label, dtype: float64


In [7]:
X_train[0]

u'You Can Smell Hillary\xe2\x80\x99s Fear'

In [8]:
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [9]:
print X_train_dtm.shape
print X_test_dtm.shape

(4751, 9163)
(1584, 9163)


In [10]:
print vect.get_feature_names()[:50]

[u'000', u'00pm', u'01', u'10', u'100', u'1000', u'100k', u'100m', u'100percentfedup', u'101', u'106', u'10k', u'11', u'116', u'117', u'11th', u'12', u'120', u'122', u'1227', u'124th', u'12th', u'13', u'130', u'1390', u'14', u'140', u'141', u'147', u'15', u'150', u'159', u'16', u'1612', u'16th', u'16\xe2', u'17', u'170', u'179', u'18', u'180', u'1862', u'19', u'1940s', u'1946', u'1951', u'1980', u'1980s', u'1984', u'199']


In [None]:
#df["text"] = df.text.str.decode("utf-8")

In [11]:
vect

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape # has more features

(4751, 9163)

In [13]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
vect = CountVectorizer(ngram_range=(2, 2))
Xdtm = vect.fit_transform(X)
nb = MultinomialNB()
nb.fit(Xdtm, y)
nb.score(Xdtm, y)

0.99573796369376477

In [18]:
# make a countvectorizer for a train test split
vect = CountVectorizer(ngram_range=(2, 2))
# create document-term matrices
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# use multinomial naive bayes with document feature matrix, NOT the text column
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
# calculate accuracy
print metrics.accuracy_score(y_test, y_pred_class)

0.717171717172


In [22]:
from sklearn.grid_search import GridSearchCV



In [23]:
#make a pipeline 
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

In [36]:
#make a pipeline 
from sklearn.pipeline import make_pipeline
pipe1 = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [63]:
pipe1.steps

[('tfidfvectorizer',
  TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]

In [26]:
def word_tokenize_lemma(text):
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]

In [27]:
#Set range of parameters
param_grid = {}
param_grid["countvectorizer__max_features"] = [1000,5000,10000]
param_grid["countvectorizer__ngram_range"] = [(1,1), (1,2), (2,2)]
param_grid["countvectorizer__lowercase"] = [True, False]
param_grid["countvectorizer__analyzer"] = ["word", word_tokenize_lemma]

In [64]:
param_grid1 = {}
param_grid1["tfidfvectorizer__max_features"] = [1000,5000,10000]
param_grid1["tfidfvectorizer__ngram_range"] = [(1,1), (1,2), (2,2)]
param_grid1["tfidfvectorizer__lowercase"] = [True, False]

In [65]:
from sklearn.grid_search import GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')

In [61]:
from sklearn.grid_search import GridSearchCV
grid1 = GridSearchCV(pipe1, param_grid1, cv=5, scoring='accuracy')

In [66]:
#This will take a while
grid.fit(X,y)

ValueError: Invalid parameter TfidfVectorizer for estimator Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [67]:
#This will take a while
grid1.fit(X,y)

ValueError: Invalid parameter tfidfVectorizer for estimator Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smoo...   vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [68]:
#Look at the best parameters and the best scores
print(grid.best_params_)
print(grid.best_score_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [44]:
#Look at the best parameters and the best scores
print(grid1.best_params_)
print(grid1.best_score_)

{}
0.813101815312


In [32]:
grid.get_params().keys()

['n_jobs',
 'verbose',
 'estimator__countvectorizer__vocabulary',
 'estimator__countvectorizer',
 'estimator__countvectorizer__token_pattern',
 'estimator__steps',
 'param_grid',
 'cv',
 'estimator__countvectorizer__binary',
 'scoring',
 'estimator__countvectorizer__analyzer',
 'estimator__multinomialnb__alpha',
 'estimator__countvectorizer__max_features',
 'pre_dispatch',
 'estimator__countvectorizer__strip_accents',
 'estimator__countvectorizer__stop_words',
 'estimator__multinomialnb__fit_prior',
 'estimator__countvectorizer__input',
 'fit_params',
 'estimator__countvectorizer__preprocessor',
 'refit',
 'iid',
 'estimator__countvectorizer__encoding',
 'estimator__countvectorizer__decode_error',
 'estimator__countvectorizer__tokenizer',
 'estimator__countvectorizer__dtype',
 'estimator__countvectorizer__ngram_range',
 'estimator__countvectorizer__min_df',
 'estimator__multinomialnb__class_prior',
 'estimator__countvectorizer__lowercase',
 'estimator',
 'error_score',
 'estimator__mul