# Fake News Assignment
**Authors**: Vilhelm Stiernstedt & Sharon Marín Salazar
<br>
**Date**: 20/05/2018

### Description
Classification problem of News Report (document) for classes (FAKE, REAL). Try text-related classifiers such as Naive Bayes, MaxEnt, SVM. Use NLTK+SKLearn, NLP Pre-processing, Classifiers and CV-evaluation.

#### Dataset
**fake_or_real_news_training:**
- ID: ID of the tweet
- Title: Title of the news report
- Text: Textual content of the news report
- Label: Target Variable [FAKE, REAL]
- X1, X2 additional fields

**fake_or_real_news_test:**
- ID, title and text
- Predict Label

#### Advices
- Take a look to the data
- Try the pre-processing methodologies we have seen in class
- TF-IDF seems to be better (but try it!)
- N-grams pay the effort
- Less than 90-92%? -> Try again

#### Plan
1. Variable analysis
    - Features
    - Other insight
2. Data Processing
    - Drop features
    - Label
3. Modelling
    - Navie Bayes
    - SGD
    - SVM
        - linear
        - rgb
        - poly
        - sigmod
4. Evaluation

## Import Libraries

In [1]:
import collections
import matplotlib.pyplot as plt
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.classify import MaxentClassifier
import numpy as np
import pandas as pd
import seaborn as sns
import re
import PipelineHelper # https://github.com/bmurauer/pipelinehelper/blob/master/pipelinehelper.py
from scipy.sparse import hstack
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud, STOPWORDS
import warnings

# One-off download of the NLTK corpora/models used below (stop words, WordNet, ...).
# Left commented out so the notebook does not open the downloader on every run.
# nltk.download()

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# pandas display setting: do not truncate long cell contents when printing frames
# NOTE(review): newer pandas releases expect None instead of -1 here -- confirm
# against the installed version before changing it.
pd.set_option('display.max_colwidth', -1)

# silence DeprecationWarning messages for the rest of the session
warnings.filterwarnings('ignore', category=DeprecationWarning)



## Function Definitions

### Pipeline 

In [2]:
# function to build pipeline with multiple models
# https://github.com/bmurauer/pipelinehelper/blob/master/pipelinehelper.py

from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin
from collections import defaultdict
import itertools

class PipelineHelper(BaseEstimator, TransformerMixin, ClassifierMixin):
    """Pipeline stage that switches between several alternative estimators.

    The helper holds a dictionary of named models (``available_models``) and
    delegates ``fit`` / ``transform`` / ``predict`` to the one currently
    chosen (``selected_model``).  A hyper-parameter search selects a model by
    setting the single parameter ``selected_model`` to one of the tuples
    produced by :meth:`generate`:

    * ``(model_name, params)`` -- use ``available_models[model_name]`` after
      applying ``params`` to it with ``set_params``;
    * ``(None, {}, True)`` -- bypass this stage (only produced when
      ``include_bypass`` is true).

    Parameters
    ----------
    available_models : dict, iterable of (key, model) pairs, or None
        The selectable models.  A dict is stored as-is (this is what the
        constructor receives when the helper is re-created from
        ``get_params()``); any other iterable is converted to a dict;
        ``None`` yields an empty dict.
    selected_model : object or None
        The currently active model, normally assigned through ``set_params``.
    include_bypass : bool
        Whether ``generate`` should also emit a "skip this stage" candidate.
    """

    def __init__(self, available_models=None, selected_model=None, include_bypass=False):
        self.include_bypass = include_bypass
        self.selected_model = selected_model
        if isinstance(available_models, dict):
            # re-construction from get_params(): keep the dict object itself
            self.available_models = available_models
        else:
            # initial construction from (key, model) pairs; the string key is
            # what parameter names are prefixed with.  None means "no models".
            self.available_models = dict(available_models or ())

    def generate(self, param_dict=None):
        """Expand per-model parameter grids into ``selected_model`` candidates.

        Parameters
        ----------
        param_dict : dict or None
            Maps ``'<model_name>__<param_name>'`` to an iterable of values.
            ``<param_name>`` may itself contain ``__`` (nested parameters).

        Returns
        -------
        list
            One ``(model_name, params)`` tuple per element of the cartesian
            product of each model's value lists, one ``(model_name, {})``
            tuple per model without any listed parameter, and, when
            ``include_bypass`` is set, a final ``(None, {}, True)`` tuple.

        Raises
        ------
        ValueError
            If a key refers to a model name that is not available.
        """
        if param_dict is None:
            param_dict = {}

        per_model_parameters = defaultdict(lambda: defaultdict(list))

        # group the flat grid by model name (text before the first '__')
        for key, values in param_dict.items():
            model_name, _, param_name = key.partition('__')
            if model_name not in self.available_models:
                raise ValueError('no such model: {0}'.format(model_name))
            per_model_parameters[model_name][param_name] = values

        ret = []

        # one candidate per combination of values, separately for every model
        for model_name, model_grid in per_model_parameters.items():
            param_names = list(model_grid)
            for combination in itertools.product(*model_grid.values()):
                ret.append((model_name, dict(zip(param_names, combination))))

        # models without an explicit grid are tried once with their current settings
        for model_name in self.available_models:
            if model_name not in per_model_parameters:
                ret.append((model_name, dict()))

        # optionally offer "skip this stage" as one more configuration
        if self.include_bypass:
            ret.append((None, dict(), True))
        return ret

    def get_params(self, deep=False):
        """Return the three constructor parameters as a dict (``deep`` is ignored)."""
        return {'available_models': self.available_models,
                'selected_model': self.selected_model,
                'include_bypass': self.include_bypass}

    def set_params(self, selected_model, available_models=None, include_bypass=False):
        """Activate one of the candidates produced by :meth:`generate`.

        Parameters
        ----------
        selected_model : tuple
            ``(model_name, params)`` or ``(None, {}, True)`` for a bypass.
        available_models : dict or None
            If truthy, replaces the stored model dictionary first.
        include_bypass : bool
            Accepted for signature compatibility; the effective value is
            derived from ``selected_model`` (third element, if present).

        Returns
        -------
        PipelineHelper
            ``self``, following the scikit-learn ``set_params`` convention.

        Raises
        ------
        ValueError
            If the requested model name is not available.
        """
        include_bypass = len(selected_model) == 3 and selected_model[2]

        if available_models:
            self.available_models = available_models

        model_name = selected_model[0]
        if model_name is None and include_bypass:
            self.selected_model = None
            self.include_bypass = True
        else:
            if model_name not in self.available_models:
                raise ValueError('no such model available: {0}'.format(model_name))
            # the stored instance itself is configured and used (no copy is made)
            self.selected_model = self.available_models[model_name]
            self.selected_model.set_params(**selected_model[1])
        return self

    def fit(self, X, y=None):
        """Fit the selected model.

        Returns ``self`` when the stage is bypassed, otherwise whatever the
        selected model's ``fit`` returns.  Raises ``RuntimeError`` when no
        model is selected and bypassing is not enabled.
        """
        if self.selected_model is None:
            if not self.include_bypass:
                raise RuntimeError('no model was set')
            return self
        return self.selected_model.fit(X, y)

    def transform(self, X, y=None):
        """Transform ``X`` with the selected model.

        Returns ``X`` unchanged when the stage is bypassed.  Raises
        ``RuntimeError`` when no model is selected and bypassing is not
        enabled.  ``y`` is accepted but not used.
        """
        if self.selected_model is None:
            if not self.include_bypass:
                raise RuntimeError('no model was set')
            return X
        return self.selected_model.transform(X)

    def predict(self, x):
        """Predict with the selected model.

        Raises ``RuntimeError`` when ``include_bypass`` is set (a classifier
        stage must not be skipped) or when no model is selected.
        """
        if self.include_bypass:
            raise RuntimeError('bypassing classifier is not allowed')
        if self.selected_model is None:
            raise RuntimeError('no model was set')
        return self.selected_model.predict(x)


## Import Data

In [28]:
# directory that holds the assignment's CSV files
data_path = 'data/'

# load the training set and the test set
df_train = pd.read_csv(data_path+'training_clean.csv')
df_test = pd.read_csv(data_path+'fake_or_real_news_test.csv')

# use the 'ID' column as the row index of both frames (modified in place)
df_train.set_index('ID', inplace=True)
df_test.set_index('ID', inplace=True)

# stack the training rows on top of the test rows, keeping the ID index.
# pd.concat is used instead of DataFrame.append, which pandas 2.0 removed;
# for two frames it yields the same combined frame.
all_data = pd.concat([df_train, df_test])

## Data Processing 

### Text Processing
#### Stemmers & Lemmatizer

In [29]:
# plain bag-of-words vectorizer with default settings; the pipelines below
# create their own CountVectorizer instances and tune them through the search grid
count_vectorizer = CountVectorizer()

# shared English Snowball stemmer used by SnowballCountVectorizer
snowball_stemmer = SnowballStemmer("english")


class SnowballCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is passed through the Snowball stemmer."""

    def build_analyzer(self):
        """Return a callable mapping a document to a list of stemmed analyzer items."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # NOTE(review): the stemmer is applied to each item the parent
            # analyzer yields; with ngram_range above (1, 1) those items
            # presumably include multi-word n-grams -- confirm this is intended.
            return [snowball_stemmer.stem(item) for item in base_analyzer(doc)]

        return stemmed_analyzer
    
# shared Porter stemmer (NLTK_EXTENSIONS mode) used by PorterCountVectorizer
porter_stemmer = PorterStemmer(mode='NLTK_EXTENSIONS')


class PorterCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is passed through the Porter stemmer."""

    def build_analyzer(self):
        """Return a callable mapping a document to a list of stemmed analyzer items."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            # stem every item produced by the stock analyzer, keeping the order
            return [porter_stemmer.stem(item) for item in base_analyzer(doc)]

        return stemmed_analyzer
    
# shared WordNet lemmatizer used by LemmatizerCountVectorizer
lemmatizer = WordNetLemmatizer()


class LemmatizerCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is passed through the WordNet lemmatizer."""

    def build_analyzer(self):
        """Return a callable mapping a document to a list of lemmatized analyzer items."""
        base_analyzer = super().build_analyzer()

        def lemmatized_analyzer(doc):
            # lemmatize every item produced by the stock analyzer, keeping the order
            return [lemmatizer.lemmatize(item) for item in base_analyzer(doc)]

        return lemmatized_analyzer

### Model Pipeline 1
In our first pipeline we will try our simple count vectorizer but also different type of stemmers with various inputs such as n-grams, remove stopword and convert to lowercase. We will also see if using tf-idf(Term Frequency times inverse document frequency) can increase our scroe along with some parameter tuning for our models. We hope that of these stemmers will be better than our baseline for all tested models. 

We will assess the following combinations:
    - count vectorizer
    - count vectorizer w. snowball stemmer
    - count vectorizer w. porter stemmer
    - count vectorizer w. lemmatizer
    
TF-IDF = If we want to reduce the weightage of more common words, we deploy our vectorizer into the TF-IDF transformer, which will assign more weight to less common words.

#### Variable Selection

In [30]:
# model input for pipeline 1: the 'text' attribute (article body) of the training frame
df_model = df_train.text

#### Label

In [31]:
# target variable: the 'label' attribute of the training frame
label = df_train.label

#### Split training data

In [32]:
# hold out 20% of the training rows as a validation set; the fixed
# random_state makes the split reproducible
# NOTE(review): train_test_split is imported from sklearn.cross_validation at
# the top of the notebook; scikit-learn 0.20 removed that module in favour of
# sklearn.model_selection -- confirm against the installed version.
x_train, x_validation, y_train, y_validation = train_test_split(df_model, label,
                                                                test_size=0.2, random_state=42)

In [58]:
# Three-stage pipeline: vectorizer -> TF-IDF transformer -> classifier.
# The 'vect' and 'clf' stages are PipelineHelper wrappers, so the search can
# pick one of the listed alternatives per candidate configuration.
# NOTE(review): SGDClassifier(n_iter=...) is the older spelling of the epoch
# count; newer scikit-learn releases use max_iter -- confirm against the
# installed version.
pipeline = Pipeline([('vect', PipelineHelper([
                            ('counter', CountVectorizer()),
                            ('snowball_stemmer', SnowballCountVectorizer()),
                            ('porter_stemmer', PorterCountVectorizer()),
                            ('lemmatizer', LemmatizerCountVectorizer()),
                        ])),
                     ('tf-idf', TfidfTransformer()),
                     ('clf', PipelineHelper([
                            ('sgd', SGDClassifier(n_iter=1000)),
                            # SVM alternatives are currently disabled:
                            #('svm-lin', LinearSVC()),
                            #('svm-ker', SVC()),
                            ('multi_nb', MultinomialNB()),
                        ])),
                       ])

#### Parameters
We will extend the model parameters and hope to imporve our score.

In [59]:
# Search space for the pipeline.
# Every vectorizer alternative gets the same three options; the keys are
# '<model name>__<parameter>' as expected by PipelineHelper.generate.
vectorizer_grid = {}
for vect_name in ('counter', 'snowball_stemmer', 'porter_stemmer', 'lemmatizer'):
    vectorizer_grid[vect_name + '__ngram_range'] = [(1, 2), (1, 3), (1, 4)]
    vectorizer_grid[vect_name + '__stop_words'] = ('english', None)
    vectorizer_grid[vect_name + '__lowercase'] = (True, False)

# options per classifier alternative
classifier_grid = {
    'sgd__alpha': (1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3),
    'sgd__loss': ('hinge', 'squared_hinge', 'log'),
    'sgd__l1_ratio': (0, 0.1, 0.25, 0.5, 0.75, 1.0),
    # grids for the disabled SVM alternatives, kept for reference:
    #  'svm-lin__penalty': ('l1', 'l2'),
    #  'svm-lin__loss': ('hinge', 'squared_hinge'),
    #  'svm-lin__C': (0.1, 1, 10, 50, 100, 500, 1000),
    #  'svm-ker__kernel': ('rbf', 'poly', 'sigmoid'),
    #  'svm-ker__C': (0.1, 1, 10, 50, 100, 500, 1000),
    'multi_nb__alpha': (0.1, 0.25, 0.5, 0.75, 1),
}

# the helpers expand their grids into lists of selectable (model, params) tuples
parameters = {
    'vect__selected_model': pipeline.named_steps['vect'].generate(vectorizer_grid),
    'tf-idf__use_idf': (True, False),
    'clf__selected_model': pipeline.named_steps['clf'].generate(classifier_grid),
}

#### GridSearch

In [60]:
# randomized search over `parameters`: 20 sampled candidates, each scored with
# 3-fold cross-validation on 3 parallel workers; fixed seed for reproducibility
rscv_clf = RandomizedSearchCV(estimator=pipeline, verbose=4,
                              param_distributions=parameters,
                              n_jobs=3, n_iter=20, cv=3, 
                              random_state=42)

# run the search on the training split; fit() returns the fitted search object
rscv_clf_mod = rscv_clf.fit(x_train, y_train)

# best cross-validation score found (displayed as the cell output)
rscv_clf_mod.best_score_

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 4), 'stop_words': None, 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 1.0, 'loss': 'hinge', 'l1_ratio': 0.5}) 
[CV] vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 4), 'stop_words': None, 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 1.0, 'loss': 'hinge', 'l1_ratio': 0.5}) 
[CV] vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 4), 'stop_words': None, 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 1.0, 'loss': 'hinge', 'l1_ratio': 0.5}) 
[CV]  vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 4), 'stop_words': None, 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 1.0, 'loss': 'hinge', 'l1_ratio': 0.5}), score=0.507029053420806, total=11.4min
[CV] vect__selected_model=('lemmatizer', {'

[CV]  vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 2), 'stop_words': 'english', 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 100.0, 'loss': 'log', 'l1_ratio': 0.75}), score=0.5028142589118199, total= 2.5min
[CV] vect__selected_model=('counter', {'ngram_range': (1, 3), 'stop_words': 'english', 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 100.0, 'loss': 'squared_hinge', 'l1_ratio': 0.5}) 
[CV]  vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 2), 'stop_words': 'english', 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 100.0, 'loss': 'log', 'l1_ratio': 0.75}), score=0.5028142589118199, total= 2.6min
[CV] vect__selected_model=('counter', {'ngram_range': (1, 3), 'stop_words': 'english', 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 100.0, 'loss': 'squared_hinge', 'l1_ratio': 0.5}) 
[CV]  vect__selected_model=('counter',

[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed: 37.9min


[CV] vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 4), 'stop_words': None, 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 0.1, 'loss': 'hinge', 'l1_ratio': 1.0}) 
[CV]  vect__selected_model=('counter', {'ngram_range': (1, 3), 'stop_words': 'english', 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 100.0, 'loss': 'squared_hinge', 'l1_ratio': 0.5}), score=0.4971857410881801, total= 1.3min
[CV] vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 4), 'stop_words': None, 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 0.1, 'loss': 'hinge', 'l1_ratio': 1.0}) 
[CV]  vect__selected_model=('counter', {'ngram_range': (1, 3), 'stop_words': 'english', 'lowercase': True}), tf-idf__use_idf=False, clf__selected_model=('sgd', {'alpha': 100.0, 'loss': 'squared_hinge', 'l1_ratio': 0.5}), score=0.4971857410881801, total= 1.3min
[CV] vect__selected_model=('snowball_stemmer', {'n

[CV]  vect__selected_model=('counter', {'ngram_range': (1, 4), 'stop_words': None, 'lowercase': True}), tf-idf__use_idf=True, clf__selected_model=('sgd', {'alpha': 0.1, 'loss': 'hinge', 'l1_ratio': 0.75}), score=0.49765698219306465, total= 3.0min
[CV] vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 4), 'stop_words': 'english', 'lowercase': True}), tf-idf__use_idf=True, clf__selected_model=('sgd', {'alpha': 100.0, 'loss': 'log', 'l1_ratio': 0.1}) 
[CV]  vect__selected_model=('counter', {'ngram_range': (1, 4), 'stop_words': None, 'lowercase': True}), tf-idf__use_idf=True, clf__selected_model=('sgd', {'alpha': 0.1, 'loss': 'hinge', 'l1_ratio': 0.75}), score=0.7345215759849906, total= 2.9min
[CV] vect__selected_model=('snowball_stemmer', {'ngram_range': (1, 4), 'stop_words': 'english', 'lowercase': True}), tf-idf__use_idf=True, clf__selected_model=('sgd', {'alpha': 100.0, 'loss': 'log', 'l1_ratio': 0.1}) 
[CV]  vect__selected_model=('counter', {'ngram_range': (1, 4), 'stop_wo

KeyboardInterrupt: 

In [61]:
# parameter combination that achieved the best cross-validation score
# NOTE(review): the recorded output below names 'svm-ker', which is commented
# out of the current pipeline, and the search cell above ends in
# KeyboardInterrupt -- this output appears to come from an earlier run.
rscv_clf_mod.best_params_

{'clf__selected_model': ('svm-ker', {'C': 10000.0, 'kernel': 'rbf'}),
 'vect__selected_model': ('porter_stemmer',
  {'lowercase': True, 'ngram_range': (1, 4), 'stop_words': None})}

In [50]:
# predict the validation split with the best pipeline found by the search
rscv_clf_pred = rscv_clf_mod.best_estimator_.predict(x_validation)

# per-class precision / recall / F1 on the validation split, three decimals
print(metrics.classification_report(y_validation, rscv_clf_pred, digits=3))

             precision    recall  f1-score   support

       FAKE       0.86      0.94      0.90       383
       REAL       0.94      0.86      0.90       417

avg / total       0.90      0.90      0.90       800



### Model Pipeline 2
Our second pipeline will introduce an tf-idf to see if some of our models improve.

#### Variable Selection

In [10]:
# model input for pipeline 2: the 'title' column of the training frame
# (not the article text used for pipeline 1)
df_model = df_train['title']

#### Label

In [12]:
# target variable: the 'label' attribute of the training frame
label = df_train.label

#### Split training data

In [13]:
# hold out 20% of the training rows as a validation set; the fixed
# random_state makes the split reproducible
x_train, x_validation, y_train, y_validation = train_test_split(df_model, label,
                                                                test_size=0.2, random_state=42)

In [14]:
# Three-stage pipeline: vectorizer -> TF-IDF transformer -> classifier.
# The 'vect' and 'clf' stages are PipelineHelper wrappers, so the search can
# pick one of the listed alternatives per candidate configuration.
pipeline = Pipeline([('vect', PipelineHelper([
                            ('counter', CountVectorizer()),
                            ('snowball_stemmer', SnowballCountVectorizer()),
                            ('porter_stemmer', PorterCountVectorizer()),
                            ('lemmatizer', LemmatizerCountVectorizer()),
                        ])),
                     ('tf-idf', TfidfTransformer()),
                     ('clf', PipelineHelper([
                            # registered under the key 'svm', but the estimator is an SGDClassifier
                            ('svm', SGDClassifier()),
                            ('multi_nb', MultinomialNB()),
                        ])),
                       ])

#### Parameters
We will extend the model parameters and hope to imporve our score.

In [15]:
# Search space for the pipeline.
# Every vectorizer alternative gets the same three options; the keys are
# '<model name>__<parameter>' as expected by PipelineHelper.generate.
vectorizer_grid = {}
for vect_name in ('counter', 'snowball_stemmer', 'porter_stemmer', 'lemmatizer'):
    vectorizer_grid[vect_name + '__ngram_range'] = [(1, 1), (1, 2)]
    vectorizer_grid[vect_name + '__stop_words'] = ('english', None)
    vectorizer_grid[vect_name + '__lowercase'] = (True, False)

# options per classifier alternative
classifier_grid = {
    'svm__alpha': (0.5, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
    'svm__loss': ('hinge', 'squared_hinge'),
    'svm__l1_ratio': (0, 0.1, 0.25, 0.5, 0.75, 1.0),
    'multi_nb__alpha': (0.1, 0.25, 0.5, 0.75, 1),
}

# the helpers expand their grids into lists of selectable (model, params) tuples
parameters = {
    'vect__selected_model': pipeline.named_steps['vect'].generate(vectorizer_grid),
    'tf-idf__use_idf': (True, False),
    'clf__selected_model': pipeline.named_steps['clf'].generate(classifier_grid),
}

#### GridSearch

In [16]:
# randomized search over `parameters`: 30 sampled candidates, each scored with
# 3-fold cross-validation on 3 parallel workers; fixed seed for reproducibility
rscv_clf = RandomizedSearchCV(estimator=pipeline, verbose=1,
                              param_distributions=parameters,
                              n_jobs=3, n_iter=30, cv=3, 
                              random_state=42)

# run the search on the training split; fit() returns the fitted search object
rscv_clf_mod = rscv_clf.fit(x_train, y_train)

# best cross-validation score found (displayed as the cell output)
rscv_clf_mod.best_score_

Fitting 3 folds for each of 30 candidates, totalling 90 fits






[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   31.2s




[Parallel(n_jobs=3)]: Done  90 out of  90 | elapsed:   55.1s finished


0.8227571115973742

In [49]:
# parameter combination that achieved the best cross-validation score
rscv_clf_mod.best_params_

{'clf__selected_model': ('svm',
  {'alpha': 0.01, 'l1_ratio': 0.25, 'loss': 'hinge'}),
 'vect__selected_model': ('lemmatizer',
  {'lowercase': True, 'ngram_range': (1, 2), 'stop_words': 'english'})}

In [50]:
# predict the validation split with the best pipeline found by the search
rscv_clf_pred = rscv_clf_mod.best_estimator_.predict(x_validation)

# per-class precision / recall / F1 on the validation split
print(metrics.classification_report(y_validation, rscv_clf_pred))

             precision    recall  f1-score   support

       FAKE       0.89      0.92      0.90       383
       REAL       0.93      0.89      0.91       417

avg / total       0.91      0.91      0.91       800

