# Fake News Assignment
**Authors**: Vilhelm Stiernstedt & Sharon Marín Salazar
<br>
**Date**: 20/05/2018

### Description
Classification problem of News Report (document) for classes (FAKE, REAL). Try text-related classifiers such as Naive Bayes, MaxEnt, SVM. Use NLTK+SKLearn, NLP Pre-processing, Classifiers and CV-evaluation.

#### Dataset
**fake_or_real_news_training:**
- ID: ID of the tweet
- Title: Title of the news report
- Text: Textual content of the news report
- Label: Target Variable [FAKE, REAL]
- X1, X2 additional fields

**fake_or_real_news_test:**
- ID, title and text
- Predict Label

#### Advice
- Take a look at the data
- Try the pre-processing methodologies we have seen in class
- TF-IDF seems to be better (but try it!)
- N-grams are worth the effort
- Less than 90-92%? -> Try again

#### Plan
1. Variable analysis
    - Features
    - Other insight
2. Data Processing
    - Drop features
    - Label
3. Modelling
    - Naive Bayes
    - MaxEnt
    - SVM
4. Evaluation

## Import Libraries

In [1]:
import collections
import matplotlib.pyplot as plt
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.classify import MaxentClassifier
import numpy as np
import pandas as pd
import seaborn as sns
import re
import PipelineHelper # https://github.com/bmurauer/pipelinehelper/blob/master/pipelinehelper.py
from scipy.sparse import hstack
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud, STOPWORDS
import warnings

# download required nltk packages (NB. commented out)
# nltk.download()

# plot settings
%matplotlib inline

# pandas view settings -> see all contents of column
# NOTE(review): -1 is the legacy "do not truncate" value; newer pandas releases
# reportedly expect None instead -- confirm against the installed pandas version
pd.set_option('display.max_colwidth', -1)

# Warning settings -> suppress deprecation warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)



## Function Definitions
### Analysis 

In [2]:
# define stop words - english
# (kept as a set; normalizer() tests every token for membership in it)
stop_words = set(stopwords.words('english'))

# define lemmatizer for simple analysis (used by normalizer())
wordnet_lemmatizer = WordNetLemmatizer()

# create normalization function for analysis of title and text
def normalizer(text):
    """Clean *text* and return its lemmatized, lower-cased, stop-word-free tokens.

    The substitutions are applied in order; the last one replaces every
    remaining non-letter character with a space before tokenizing.
    """
    substitutions = (
        ('[^\x00-\x7F]+', ""),        # drop non-ascii characters
        ('(\r)+', ""),                # drop carriage-return characters
        (r'@([A-Za-z0-9_]+)', ""),    # drop twitter handles
        (r"(https|http)\S+", ""),     # drop hyperlinks
        ("[^a-zA-Z]", " "),           # keep letters only
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # NOTE(review): the first two tokens are discarded here; the reason is not
    # visible in this file -- confirm this is intended for news text
    lowered = (token.lower() for token in nltk.word_tokenize(cleaned)[2:])

    # filter stop words, then reduce each remaining word to its lemma
    return [wordnet_lemmatizer.lemmatize(word)
            for word in lowered
            if word not in stop_words]

# define function to construct our ngrams for analysis of title and text
# NOTE: this definition shadows `ngrams` imported from nltk at the top of the file
def ngrams(input_list):
    """Return all bigrams, trigrams and quadgrams of *input_list* as strings.

    Parameters
    ----------
    input_list : sliceable sequence of str
        Tokens in document order.

    Returns
    -------
    list of str
        Space-joined bigrams first, then trigrams, then quadgrams, each group
        in document order. Inputs too short for a given size contribute
        nothing for that size.
    """
    grams = []
    for size in (2, 3, 4):
        # one shifted view per position in the window; zip stops at the shortest,
        # so every window holds exactly `size` consecutive tokens
        windows = zip(*(input_list[offset:] for offset in range(size)))
        grams.extend(' '.join(window) for window in windows)
    return grams

# define function to count words for analysis of ngrams (bi, tri, quad) for title and text
def count_words(input):
    """Count how often each item occurs across all rows of *input*.

    *input* is an iterable of iterables (one row per document); the result is
    a ``collections.Counter`` keyed by item.
    """
    return collections.Counter(token for tokens in input for token in tokens)

# exclamation counter for analysis of text (potentially introduce as new feature)
def exclaimation_counter(article):
    """Return the fraction of characters in *article* that are '!'.

    Parameters
    ----------
    article : str
        Text to inspect.

    Returns
    -------
    float
        Number of '!' characters divided by the text length; 0.0 for an
        empty text (avoids a ZeroDivisionError).
    """
    text_len = len(article)
    if text_len == 0:
        return 0.0
    return article.count('!') / text_len

### Pipeline 

In [4]:
# function to build pipeline with multiple models
# https://github.com/bmurauer/pipelinehelper/blob/master/pipelinehelper.py

from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin
from collections import defaultdict
import itertools

class PipelineHelper(BaseEstimator, TransformerMixin, ClassifierMixin):
    """Pipeline stage that delegates to one of several named candidate models.

    The helper holds a mapping ``name -> model`` and a currently selected
    model. A parameter search picks the model (and its parameters) through the
    ``selected_model`` parameter, whose candidate values are produced by
    :meth:`generate`.

    Parameters
    ----------
    available_models : dict or iterable of (name, model) pairs, optional
        Candidate models. A dict is stored as-is (this is what cloning passes
        back in); any other iterable of pairs is converted to a dict.
        ``None`` yields an empty mapping.
    selected_model : object, optional
        The model currently delegated to.
    include_bypass : bool, default False
        Whether "skip this stage" is offered as one extra configuration.
    """

    def __init__(self, available_models=None, selected_model=None, include_bypass=False):
        self.include_bypass = include_bypass
        self.selected_model = selected_model
        # this is required for the clone operator used in gridsearch
        if isinstance(available_models, dict):
            self.available_models = available_models
        # this is the case for constructing the helper initially
        else:
            # a string identifier is required for assigning parameters
            self.available_models = dict(available_models or ())

    def generate(self, param_dict=None):
        """Build the list of candidate values for the ``selected_model`` parameter.

        Parameters
        ----------
        param_dict : dict, optional
            Maps ``'<model_name>__<param>'`` to an iterable of candidate
            values. ``<param>`` may itself contain ``'__'`` (nested params).

        Returns
        -------
        list of tuple
            ``(model_name, params)`` for every combination of the given values
            per model, ``(model_name, {})`` for every model without any
            specified parameter, and ``(None, {}, True)`` when
            ``include_bypass`` is set.

        Raises
        ------
        Exception
            If a key refers to a model name that is not available.
        """
        if param_dict is None:
            param_dict = {}

        per_model_parameters = defaultdict(lambda: defaultdict(list))

        # collect parameters for each specified model
        for key, values in param_dict.items():
            model_name = key.split('__')[0]
            param_name = key[len(model_name)+2:]  # might be nested
            if model_name not in self.available_models:
                raise Exception('no such model: {0}'.format(model_name))
            per_model_parameters[model_name][param_name] = values

        ret = []

        # create instance for cartesian product of all available parameters for each model
        for model_name, model_params in per_model_parameters.items():
            for combination in itertools.product(*model_params.values()):
                ret.append((model_name, dict(zip(model_params, combination))))

        # for every model that has no specified parameters, add the default model
        for model_name in self.available_models:
            if model_name not in per_model_parameters:
                ret.append((model_name, dict()))

        # check if the stage is to be bypassed as one configuration
        if self.include_bypass:
            ret.append((None, dict(), True))
        return ret

    def get_params(self, deep=False):
        """Return the three constructor parameters as a dict (``deep`` is ignored)."""
        return {'available_models': self.available_models,
                'selected_model': self.selected_model,
                'include_bypass': self.include_bypass}

    def set_params(self, selected_model, available_models=None, include_bypass=False):
        """Select a model from a tuple produced by :meth:`generate`.

        Parameters
        ----------
        selected_model : tuple
            ``(model_name, params)`` or ``(None, params, True)`` for bypass.
        available_models : dict, optional
            Replaces the stored candidates when truthy.
        include_bypass : bool
            Ignored; the bypass flag is derived from ``selected_model``.

        Returns
        -------
        PipelineHelper
            ``self``, so that calls can be chained.

        Raises
        ------
        Exception
            If the named model is not among the available models.
        """
        include_bypass = len(selected_model) == 3 and selected_model[2]

        if available_models:
            self.available_models = available_models

        if selected_model[0] is None and include_bypass:
            self.selected_model = None
            self.include_bypass = True
        else:
            if selected_model[0] not in self.available_models:
                raise Exception('no such model available: {0}'.format(selected_model[0]))
            self.selected_model = self.available_models[selected_model[0]]
            self.selected_model.set_params(**selected_model[1])
        return self

    def fit(self, X, y=None):
        """Fit the selected model.

        Returns ``self`` when the stage is bypassed, otherwise whatever the
        selected model's ``fit`` returns. Raises ``Exception`` if no model is
        selected and bypassing is not enabled.
        """
        if self.selected_model is None and not self.include_bypass:
            raise Exception('no model was set')
        elif self.selected_model is None:
            return self
        else:
            return self.selected_model.fit(X, y)

    def transform(self, X, y=None):
        """Transform *X* with the selected model, or return it unchanged on bypass.

        Raises ``Exception`` if no model is selected and bypassing is not enabled.
        """
        if self.selected_model is None and not self.include_bypass:
            raise Exception('no model was set')
        elif self.selected_model is None:
            return X
        else:
            return self.selected_model.transform(X)

    def predict(self, x):
        """Predict with the selected model.

        Raises ``Exception`` when ``include_bypass`` is set (a classifier stage
        may not be bypassed) or when no model is selected.
        """
        if self.include_bypass:
            raise Exception('bypassing classifier is not allowed')
        if self.selected_model is None:
            raise Exception('no model was set')
        return self.selected_model.predict(x)


## Import Data

In [6]:
# set path to data
data_path = 'data/'

# load test and train
df_train = pd.read_csv(data_path+'fake_or_real_news_training.csv')
df_test = pd.read_csv(data_path+'fake_or_real_news_test.csv')

# set index
df_train.set_index('ID', inplace=True)
df_test.set_index('ID', inplace=True)

# define combined df (train rows followed by test rows, original indexes kept)
# pd.concat replaces DataFrame.append, which newer pandas versions no longer provide
all_data = pd.concat([df_train, df_test])

## Data Processing 

### Text Processing
#### Stemmers

In [12]:
# define count vectorizer for modelling (different parameter inputs will be given in modelling)
count_vectorizer = CountVectorizer()

# define Snowball stemmer (different parameter inputs will be given in modelling)
snowball_stemmer = SnowballStemmer("english")

# define new vectorizer class with snowball stemmer
class SnowballCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is passed through the Snowball stemmer."""

    def build_analyzer(self):
        """Wrap the parent analyzer so every item it yields is stemmed."""
        base_analyzer = super(SnowballCountVectorizer, self).build_analyzer()

        # NOTE(review): the parent analyzer presumably already emits n-grams when
        # ngram_range goes beyond 1, so stem() would receive multi-word strings -- confirm
        def stemmed_analyzer(doc):
            return [snowball_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# define Porter stemmer in NLTK_EXTENSIONS mode
# (different parameter inputs will be given in modelling)
porter_stemmer = PorterStemmer(mode='NLTK_EXTENSIONS')

# define new vectorizer class with porter stemmer
class PorterCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is passed through the Porter stemmer."""

    def build_analyzer(self):
        """Wrap the parent analyzer so every item it yields is stemmed."""
        base_analyzer = super(PorterCountVectorizer, self).build_analyzer()

        # NOTE(review): the parent analyzer presumably already emits n-grams when
        # ngram_range goes beyond 1, so stem() would receive multi-word strings -- confirm
        def stemmed_analyzer(doc):
            return [porter_stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer
    
# define lemmatizer
lemmatizer = WordNetLemmatizer()

# define new vectorizer class with lemmatizer
class LemmatizerCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer output is passed through the WordNet lemmatizer."""

    def build_analyzer(self):
        """Wrap the parent analyzer so every item it yields is lemmatized."""
        base_analyzer = super(LemmatizerCountVectorizer, self).build_analyzer()

        # NOTE(review): the parent analyzer presumably already emits n-grams when
        # ngram_range goes beyond 1, so lemmatize() would receive multi-word strings -- confirm
        def lemmatized_analyzer(doc):
            return [lemmatizer.lemmatize(token) for token in base_analyzer(doc)]

        return lemmatized_analyzer

### Model Preprocessing

#### Variable Selection

In [13]:
# model input: only the 'text' column of the training data is used as feature
df_model = df_train['text']

#### Label

In [14]:
# save label (target column of the training data)
label = df_train.label

#### Split training data

In [15]:
# split training data and labels into train and validation 80/20
# (random_state is fixed so the split is the same on every run)
x_train, x_validation, y_train, y_validation = train_test_split(df_model, label,
                                                                test_size=0.2, random_state=42)

### Model Pipeline 1
In our first pipeline we will try different types of vectorizers and stemmers:
    - count vectorizer
    - count vectorizer w. snowball stemmer
    - count vectorizer w. porter stemmer
    - count vectorizer w. lemmatizer

In [16]:
# define pipeline (vectorizer, models)
# both steps are PipelineHelper instances holding several named candidates;
# the concrete vectorizer / classifier is chosen through the
# '<step>__selected_model' search parameter
pipeline = Pipeline([('vect', PipelineHelper([
                            ('counter', CountVectorizer()),
                            ('snowball_stemmer', SnowballCountVectorizer()),
                            ('porter_stemmer', PorterCountVectorizer()),
                            ('lemmatizer', LemmatizerCountVectorizer()),
                        ])),
                     ('clf', PipelineHelper([
                            ('svm', SGDClassifier()),
                            ('multi_nb', MultinomialNB()),
                        ])),
                       ])

#### Parameters

In [17]:
# define pipeline parameters
# every vectorizer variant is searched over the same option grid, so the grid is
# written once and expanded into '<vectorizer>__<option>' keys for each variant
parameters = {
    'vect__selected_model': pipeline.named_steps['vect'].generate({
        '{0}__{1}'.format(vectorizer_name, option): candidates
        for vectorizer_name in ('counter', 'snowball_stemmer',
                                'porter_stemmer', 'lemmatizer')
        for option, candidates in (
            ('ngram_range', [(1, 2), (1, 3), (1, 4), (1, 5)]),
            ('stop_words', ('english', None)),
            ('binary', (True, False)),
            ('lowercase', (True, False)),
            ('max_df', (0.1, 0.25, 0.5, 0.75, 1.0)),
            ('max_features', (5, 10, 20, 50, 100, None)),
        )
    }),
    # classifier candidates: SGD-trained linear model and multinomial Naive Bayes
    'clf__selected_model': pipeline.named_steps['clf'].generate({
        'svm__alpha': (0.5, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6),
        'svm__loss': ('hinge', 'squared_hinge'),
        'svm__l1_ratio': (0, 0.1, 0.25, 0.5, 0.75, 1.0),
        'multi_nb__alpha': (0.1, 0.25, 0.5, 0.75, 1),
    }),
}

#### Randomized Search

In [18]:
# define random search grid with cv
# (100 sampled configurations, 3-fold CV each -> 300 fits, run on 3 parallel jobs;
#  random_state is fixed so the same configurations are sampled on every run)
rscv_clf = RandomizedSearchCV(estimator=pipeline, verbose=1,
                              param_distributions=parameters,
                              n_jobs=3, n_iter=100, cv=3, 
                              random_state=42)

# fit the search on the training split
rscv_clf_mod = rscv_clf.fit(x_train, y_train)

# get best score from CV
rscv_clf_mod.best_score_

Fitting 3 folds for each of 100 candidates, totalling 300 fits






[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 36.2min














KeyboardInterrupt: 