# NLP Assignment 1 (40% of grade): Text classification for Fake News Detection - SOLUTION Q5

In [1]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier, NaiveBayesClassifier
from sklearn.linear_model import LogisticRegression
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support # to report on precision and recall
import numpy as np

In [2]:
def convert_label(label):
    """Converts the multiple classes into two,
    making it a binary distinction between fake news and real."""
    # Converting the multiclass labels to binary labels
    labels_map = {
        'true': 'REAL',
        'mostly-true': 'REAL',
        'half-true': 'REAL',
        'false': 'FAKE',
        'barely-true': 'FAKE',
        'pants-fire': 'FAKE'
    }
    return labels_map[label]


def parse_data_line(data_line):
    # Should return a tuple of the label as just FAKE or REAL and the statement
    #print(data_line)
    return (convert_label(data_line[1]), data_line[2])

In [3]:
# for now use same functions as Q1-4
def load_data(path):
    """Load data from a tab-separated file and append it to raw_data."""
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            if line[0] == "Id":  # skip header
                continue
            (label, text) = parse_data_line(line)
            raw_data.append((text, label))

def split_and_preprocess_data(percentage):
    """Split the data between train_data and test_data according to the percentage
    and performs the preprocessing."""
    num_samples = len(raw_data)
    num_training_samples = int((percentage * num_samples))
    for (text, label) in raw_data[:num_training_samples]:
        train_data.append((to_feature_vector(pre_process(text)),label))
    for (text, label) in raw_data[num_training_samples:]:
        test_data.append((to_feature_vector(pre_process(text)),label))

# Q5: Trying different combinations of different preprocessing and feature-extraction techniques and select optimal combination
* For preprocessing:
    * Switch different techniques on and off for pre-processing, find best combination from 2 to the N possible combinations of technique, including:
        * separating out punctuation
        * punctuation removal
        * lowercasing
        * stopword removal
        * lemmatization
        * stemming
        * convert number words to integers or vice-versa
        * replacing numbers with NUM
        * replacing generic names like username handles with single token e.g. @username
* For features:
    * See which count/weighting for the tokens works best, including binary and BoW already tried in Q2 to weighted by sentence length, possibly using PPMI too.
    * Try different values of N for n-gram, starting with 1, then adding in 2 etc. incrementally (keep the lower orders)
    * Try different vocab sizes using min-df and max-df (swap out-of-vocab items with UNK token to preserve order/syntactic information?)
    

In [4]:
# some useful libraries
from nltk.stem import WordNetLemmatizer  # lemmatization
import nltk # for accessing the stopwords etc.
import re # regex
import string # other string operations
from textblob import TextBlob # for spelling correction

In [5]:
# preprocessing techniques which can be turned off and on:
preprocessing_switches = {
        "convert_usernames" : False,
        "separate_out_punctuation" : False,
        "convert_number_words_to_digits": False,
        "convert_numbers" : False,
        "remove_punctuation" : False,
        "convert_to_lowercase" : False,
        "remove_stopwords" : False,
        "apply_lemmatization" : False
    }

In [6]:
# different pre-processing techniques which get called altogether by pre_process

# method to deal with number words being normalized to digits 
# taken from https://github.com/ShailChoksi/text2digits
def text2int(textnum, numwords={}):
    if not numwords:
        units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):  numwords[word] = (1, idx)
        for idx, word in enumerate(tens):       numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ""
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                if onnumber:
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
            else:
                scale, increment = numwords[word]

                current = current * scale + increment
                if scale > 100:
                    result += current
                    current = 0
                onnumber = True

    if onnumber:
        curstring += repr(result + current)

    return curstring

def tokenize_text(text):
    if preprocessing_switches["separate_out_punctuation"]:
        text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text) # separates punctuation at ends of strings
        text = re.sub(r"([.,;:!?'\"“\(\)])(\w)", r"\1 \2", text) # separates punctuation at beginning of strings
    if preprocessing_switches["convert_numbers"]:
        text = re.sub('\d+', 'NUMBER',text)
    # print("tokenising:", text) # uncomment for debugging
    tokens = text.split()
    return tokens

def remove_characters_after_tokenization(tokens):
    # note preserving critical social media/twitter characters @ and #
    p = '[{}]'.format(re.escape(string.punctuation)+'\…').replace("@", "").replace("\#", "")
    #print(p)
    pattern = re.compile(p)
    filtered_tokens = [f for f in filter(None, [pattern.sub('', token) for token in tokens])]
    return filtered_tokens

def convert_to_lowercase(tokens):
    return [token.lower() for token in tokens if token.isalpha()]

def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens

def apply_lemmatization(tokens, wnl=WordNetLemmatizer()):   
    return [wnl.lemmatize(token) for token in tokens]

def pre_process(text):
    """ Technique which will apply the techniques if they are set to 
    True in the global dict ::preprocessing_switches::
    """
    if preprocessing_switches["convert_usernames"]:
        text = re.sub("@[a-zA-Z0-9:.]+", "@username", text)
    if preprocessing_switches["convert_number_words_to_digits"]:
        text = text2int(text)
    tokens = tokenize_text(text)
    if preprocessing_switches["remove_punctuation"]:
        tokens = remove_characters_after_tokenization(tokens)
    if preprocessing_switches["convert_to_lowercase"]:
        tokens = convert_to_lowercase(tokens)
    if preprocessing_switches["remove_stopwords"]:
        tokens = remove_stopwords(tokens)
    if preprocessing_switches["apply_lemmatization"]:
        tokens = apply_lemmatization(tokens)
    return tokens

In [7]:
# try some of these combos
text = "RT @colonelkickhead: Another bloody instant restaurant week?!?! Seriously! They just jumped the shark riding two other sharks powered by sh…"
print(pre_process(text))
preprocessing_switches["convert_usernames"] = True
print(pre_process(text))
preprocessing_switches["convert_number_words_to_digits"] = True
print(pre_process(text))
preprocessing_switches["separate_out_punctuation"] = True
print(pre_process(text))
preprocessing_switches["convert_numbers"] = True
print(pre_process(text))
preprocessing_switches["remove_punctuation"] = True
print(pre_process(text))
preprocessing_switches["convert_to_lowercase"] = True
print(pre_process(text))
preprocessing_switches["remove_stopwords"] = True
print(pre_process(text))
preprocessing_switches["apply_lemmatization"] = True
print(pre_process(text))

['RT', '@colonelkickhead:', 'Another', 'bloody', 'instant', 'restaurant', 'week?!?!', 'Seriously!', 'They', 'just', 'jumped', 'the', 'shark', 'riding', 'two', 'other', 'sharks', 'powered', 'by', 'sh…']
['RT', '@username', 'Another', 'bloody', 'instant', 'restaurant', 'week?!?!', 'Seriously!', 'They', 'just', 'jumped', 'the', 'shark', 'riding', 'two', 'other', 'sharks', 'powered', 'by', 'sh…']
['RT', '@username', 'Another', 'bloody', 'instant', 'restaurant', 'week?!?!', 'Seriously!', 'They', 'just', 'jumped', 'the', 'shark', 'riding', '2', 'other', 'sharks', 'powered', 'by', 'sh…']
['RT', '@username', 'Another', 'bloody', 'instant', 'restaurant', 'week', '?!?!', 'Seriously', '!', 'They', 'just', 'jumped', 'the', 'shark', 'riding', '2', 'other', 'sharks', 'powered', 'by', 'sh…']
['RT', '@username', 'Another', 'bloody', 'instant', 'restaurant', 'week', '?!?!', 'Seriously', '!', 'They', 'just', 'jumped', 'the', 'shark', 'riding', 'NUMBER', 'other', 'sharks', 'powered', 'by', 'sh…']
['RT', 

In [8]:
# reset all to false to search for finding best combination
preprocessing_switches = {k: False for k in preprocessing_switches.keys()}

In [9]:
# Change cross-val function to allow first fold only option
from sklearn.metrics import classification_report

def cross_validate(dataset, folds, first_fold_only=False):
    results = []
    fold_size = int(len(dataset)/folds) + 1
    
    for i in range(0,len(dataset),int(fold_size)):
        # insert code here that trains and tests on the 10 folds of data in the dataset
        print("Fold start on items %d - %d" % (i, i+fold_size))
        
        fold_test_data = dataset[i:i+fold_size]   # get test split on this fold
        fold_train_data = dataset[:i] + dataset[i+fold_size:] # get train split on this fold
        classifier = train_classifier(fold_train_data) # train classifier on the training data
        y_true = [x[1] for x in fold_test_data] # get ground-truth labels
        y_pred = predict_labels([x[0] for x in fold_test_data], classifier) # use classifier to predict
        results.append(precision_recall_fscore_support(y_true, y_pred, average='weighted')) # get results
        # print(classification_report(y_true,y_pred))  # see classification report for fold
        
        #alternative: focus on the FAKE label accuracy only
        #report = classification_report(y_true, y_pred, output_dict=True)
        #results.append([report["FAKE"]['precision'], report["FAKE"]['recall'], report["FAKE"]['f1-score']]) # focus on FAKE
        if first_fold_only:
            break # quicker version only using one fold
        
    avg_results = [np.mean([x[0] for x in results]),
                   np.mean([x[1] for x in results]),
                   np.mean([x[2] for x in results])
                ]
    return avg_results

In [10]:
# For now just use the answer to Q2 for feature extraction (unigram bow binary)
global_feature_dict = {} # A global dictionary of features

# Solution
from collections import Counter

def to_feature_vector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    # Just returning all words in the 
    #feature_vector = Counter(tokens) # Bag-of-Words counts
    feature_vector = {x:1 for x in Counter(tokens).keys()}  # binary Set-of-Words
    global_feature_dict.update(feature_vector) # just add all counts
    return feature_vector

In [11]:
def train_classifier(data):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(data)

# PREDICTING LABELS GIVEN A CLASSIFIER

def predict_labels(samples, classifier):
    """Assuming preprocessed samples, return their predicted labels from the classifier model."""
    return classifier.classify_many(samples)

def predict_label_from_raw(sample, classifier):
    """Assuming raw text, return its predicted label from the classifier model."""
    return classifier.classify(to_feature_vector(preProcess(reviewSample)))

In [12]:
# Iterate over all combinations of pre-processing technique
from itertools import chain, combinations  # for powerset, to get all combinations

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

combos = [list(p) for p in powerset(preprocessing_switches.keys())]
best_f_score = 0  # initial best mean accuracy to beat
best_switches = []
results = []

if False:  # Takes some time to run- set to True to reproduce fully
    for switches in combos:
        preprocessing_switches = {k : False for k in preprocessing_switches.keys()}
        for switch in switches:
            preprocessing_switches[switch] = True
        print("*" * 30)
        print(preprocessing_switches)
        # loading reviews
        # initialize global lists that will be appended to by the methods below
        raw_data = []          # the filtered data from the dataset file
        train_data = []        # the pre-processed training data as a percentage of the total dataset
        test_data = []         # the pre-processed test data as a percentage of the total dataset


        # references to the data files
        data_file_path = 'fake_news.tsv'

        # Do the actual stuff (i.e. call the functions we've made)
        # We parse the dataset and put it in a raw data list
        print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
              "Preparing the dataset...",sep='\n')

        load_data(data_file_path) 

        # We split the raw dataset into a set of training data and a set of test data (80/20)
        # You do the cross validation on the 80% (training data)
        # We print the number of training samples and the number of features before the split
        print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
              "Preparing training and test data...",sep='\n')

        global_feature_dict = {} # A global dictionary of features

        split_and_preprocess_data(0.8)

        # let's look at the representation of the first instance of training:
        print(train_data[0])

        # We print the number of training samples and the number of features after the split
        print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
              "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


        all_scores = cross_validate(train_data, 10, first_fold_only=False)
        f_score = all_scores[2]
        print(f_score)
        results.append([(k,v) for k,v in preprocessing_switches.items()] + all_scores)
        print("*" * 40)
        #plot_heat_map_similarity(df)
        if f_score >= best_f_score:
            best_f_score = f_score
            best_switches = switches

    # make the preprocessing switches the best one:
    best_preprocessing_switches = {k : False for k in preprocessing_switches.keys()}
    for switch in best_switches:
        best_preprocessing_switches[switch] = True
    print("*" * 50)
    print("best f-score", best_f_score)
    print("best combo", best_preprocessing_switches) 

In [18]:
# let's display all the results in a pandas dataframe
import pandas as pd
if False:  # set as False if above search not run
    results = sorted(results, key=lambda x:x[-1], reverse=True) # sort results from best to worst f-score
    df = pd.DataFrame([[x[1] for x in row[:-3]] + row[-3:] for row in results],
                  columns=[x[0] for x in results[0][:-3]] + ["p", "r", "f-score"])
    display(df)

Unnamed: 0,convert_usernames,separate_out_punctuation,convert_number_words_to_digits,convert_numbers,remove_punctuation,convert_to_lowercase,remove_stopwords,apply_lemmatization,p,r,f-score
0,False,False,False,False,False,False,False,False,0.574330,0.574486,0.574071
1,True,False,False,False,False,False,False,False,0.574330,0.574486,0.574071
2,True,False,True,True,False,False,False,True,0.571949,0.573245,0.572172
3,False,False,True,True,False,False,False,True,0.571950,0.573245,0.572170
4,True,False,True,True,False,False,False,False,0.571698,0.572633,0.571775
5,False,False,True,True,False,False,False,False,0.571309,0.572267,0.571402
6,False,False,False,True,False,False,False,False,0.571051,0.571802,0.571096
7,True,False,False,True,False,False,False,False,0.571040,0.571802,0.571081
8,True,False,False,True,False,False,False,True,0.570599,0.571800,0.570781
9,False,False,False,True,False,False,False,True,0.569952,0.571190,0.570137


In [19]:
# which feature being true tends to help more?
if False:  # set as False if above search not run
    for key in preprocessing_switches.keys():
        print(key, sum(df[df[key]==True]['f-score']) / len(df[df[key]==True]))

convert_usernames 0.5611854745741123
separate_out_punctuation 0.5614473546115063
convert_number_words_to_digits 0.5609313888789945
convert_numbers 0.5622447694084758
remove_punctuation 0.560395938235528
convert_to_lowercase 0.5598332727974165
remove_stopwords 0.5591552225659394
apply_lemmatization 0.5602393378389651


In [20]:
# best options found for binary bag-of-words (by running above search)
preprocessing_switches = {'convert_usernames': True, 'separate_out_punctuation': False,
                          'convert_number_words_to_digits': False, 'convert_numbers': False,
                          'remove_punctuation': False, 'convert_to_lowercase': False,
                          'remove_stopwords': False, 'apply_lemmatization': False}

# Comments on pre-processing (for report)
* This cross-validation set-up for binary bag/set-of-words approach only had one setting which equalled the performance of the base line of a simple split on white space tokenization, which was using the **replacing usernames with @username** as the only preprocessing technique (with no other techniques). Given this reduces the number of features, all-else-being equal can be seen as better than no preprocessing at all despite the identical performance.
* However, it seems the **conversion of numbers (from word numbers to integers)** is the best preprocessing technique in combination with the others, followed by the separation of punctuation, then converting the usernames, so all three of these could be useful.
* This result may not hold when using other features/weightings and may need to be re-done.

# Q5: Get best feature extraction technique

In [23]:
# try different settings
count_weights = ["binary", "counts", "weighted"]
ngram_feature_range = range(1,6)
feature_set_sizes = [500,1000,100000,100000,1000000]

# global variables to be used by to_feature_vector
_WEIGHT_ = "binary"  # starting value/baseline
_N_ = 1  # starting value/baseline

In [49]:
def to_feature_vector(tokens):
    # SOLUTION: a method to extract different ngram sequences from tokens
    # and different weighting on those counts
    
    feature_vector_dict =  Counter()  # local feature vector for counts
    
    # collect the counts for all n in range (1,_N_)
    for n in range(1,_N_+1):
        new_tokens = ["<s>"]*(n-1) + tokens + ["</s>"]
        for i in range(n-1, len(new_tokens)):
            raw_ngram = " ".join(new_tokens[i-(n-1):i+1])
            #print(raw_ngram)
            n_gram = "{}@{}".format(n, raw_ngram)
            #print(n_gram)
            feature_vector_dict[n_gram]+=1
    
    # if _WEIGHT_ is 'counts' then this has already been done
    if _WEIGHT_ == "binary":
        feature_vector_dict = {x:1 for x in feature_vector_dict.keys()}  # binary Set-of-Words
    elif _WEIGHT_ == "weighted":
        # bag-of-words counts 
        feature_vector_dict = {x:feature_vector_dict[x]/(len(tokens)+1) for x in feature_vector_dict.keys()}
    
    for feat,v in feature_vector_dict.items():
        if not feat in global_feature_dict:
            global_feature_dict[feat] = 1
        else:
            global_feature_dict[feat] +=1
            
    return feature_vector_dict

In [27]:
results_feature_extraction = []
best_f_score = 0  # initial best mean accuracy to beat
best_feature_extraction = {"n": None, "w": None} # best settings

if False: # set to True to run. Takes a little bit of time n * w settings
    print(preprocessing_switches)
    for w in count_weights:
        for n in ngram_feature_range:
            #for k in feature_set_sizes:
            #    print(w,n,k)
            _N_ = n
            _WEIGHT_ = w
            raw_data = []          # the filtered data from the dataset file
            train_data = []        # the pre-processed training data as a percentage of the total dataset
            test_data = []         # the pre-processed test data as a percentage of the total dataset

            # references to the data files
            data_file_path = 'fake_news.tsv'

            # Do the actual stuff (i.e. call the functions we've made)
            # We parse the dataset and put it in a raw data list
            print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
                  "Preparing the dataset...",sep='\n')

            load_data(data_file_path) 

            # We split the raw dataset into a set of training data and a set of test data (80/20)
            # You do the cross validation on the 80% (training data)
            # We print the number of training samples and the number of features before the split
            print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
                  "Preparing training and test data...",sep='\n')
            global_feature_dict = {}
            split_and_preprocess_data(0.8)

            # We print the number of training samples and the number of features after the split
            print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
                  "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


            all_scores = cross_validate(train_data, 10, first_fold_only=True)
            f_score = all_scores[2]
            print(f_score)
            print(w, n, all_scores)
            results_feature_extraction.append((w, n, all_scores))
            if f_score > best_f_score:
                best_f_score = f_score
                best_feature_extraction['w'] = w
                best_feature_extraction['n'] = n
    print("best f-score", best_f_score, "with best settings", best_feature_extraction)

In [31]:
if False:  # set to False if above search is not run
    # sort the results by best f-score:
    results_feature_extraction = sorted(results_feature_extraction, key=lambda x:x[-1][-1], reverse=True)
    df = pd.DataFrame([[x for x in row[:-1]] + list(row[-1]) for row in results_feature_extraction],
                  columns=["weight", "n", "p", "r", "f-score"])
    display(df)

Unnamed: 0,weight,n,p,r,f-score
0,counts,4,0.603976,0.607569,0.604007
1,binary,4,0.602991,0.606954,0.602984
2,binary,3,0.602647,0.605488,0.602902
3,binary,5,0.603206,0.60793,0.602482
4,counts,5,0.602194,0.606475,0.601909
5,counts,3,0.598229,0.60086,0.598437
6,binary,2,0.590362,0.59205,0.590649
7,weighted,4,0.603066,0.608789,0.588534
8,counts,2,0.588137,0.589492,0.588354
9,weighted,5,0.603644,0.609153,0.587914


In [54]:
# which feature counting/weights technique is most useful (in combination with other params)?
if False:  # set to False if above search is not run
    for w in count_weights:
        print(w, sum(df[df["weight"]==w]['f-score'])/5)

binary 0.5946175136448995
counts 0.5923506330604917
weighted 0.5868226306604838


In [32]:
# which ngram n value is most useful (in combination with other params)?
if False:  # set to False if above search is not run
    for n in range(1,6):
        print(n, sum(df[df["n"]==n]['f-score'])/3)

1 0.5754796790376412
2 0.5886115914396407
3 0.5962832258837417
4 0.5985085325336533
5 0.5974349333817814


# Comments on feature extraction (for report)
*  Using the best preprocessing technique found in the baseline unigram binary set-of-words setting, the best feature extraction techqniues were using an **n-gram range of (1,4)** and using **bag-of-words counts**. The f-score in cross-val has improved from the baseline 0.574 (unigram binary set-of-words) to **0.604**.
* On average in combination with all values of n-gram range, **binary set-of-words** settings on average did best, closely followed by bag-of-words counts, and with weighting by sentence length a bit further back.
* On average in combination with all values of weights, **n-gram range of (1,4)** settings on average did best, followed by (1,5), then (1,3).
* This result may not hold when using other preprocessing and may need to be re-done.

# Q5: Try joint pre-preprocessing and feature extraction optimization at once
* Note these are done on single fold (first fold) due to the number of combinations (256 * 4 * 3)

In [97]:
# change ngram range to avoid 1 as bigram+ clearly helpful
# will take a long time (256 * 4 * 3) settings, so will just do it on first fold
ngram_feature_range = range(2,6)  

results_feature_extraction_preprocess = []
best_f_score = 0
best_switches = []

if False: # takes time, set to True to run
    for w in count_weights[-1:]:
        for n in ngram_feature_range:
            #for k in feature_set_sizes:
            #    print(w,n,k)
            _N_ = n
            _WEIGHT_ = w

            for switches in combos:
                preprocessing_switches = {k : False for k in preprocessing_switches.keys()}
                for switch in switches:
                    preprocessing_switches[switch] = True
                print("*" * 30)
                print(preprocessing_switches)
                # loading reviews
                # initialize global lists that will be appended to by the methods below
                raw_data = []          # the filtered data from the dataset file
                train_data = []        # the pre-processed training data as a percentage of the total dataset
                test_data = []         # the pre-processed test data as a percentage of the total dataset


                # references to the data files
                data_file_path = 'fake_news.tsv'

                # Do the actual stuff (i.e. call the functions we've made)
                # We parse the dataset and put it in a raw data list
                print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
                      "Preparing the dataset...",sep='\n')

                load_data(data_file_path) 

                # We split the raw dataset into a set of training data and a set of test data (80/20)
                # You do the cross validation on the 80% (training data)
                # We print the number of training samples and the number of features before the split
                print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
                      "Preparing training and test data...",sep='\n')

                global_feature_dict = {} # A global dictionary of features

                split_and_preprocess_data(0.8)

                # let's look at the representation of the first instance of training:
                print(train_data[0])

                # We print the number of training samples and the number of features after the split
                print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
                      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


                all_scores = cross_validate(train_data, 10, first_fold_only=True)

                f_score = all_scores[2]
                print(w, n, preprocessing_switches, f_score)
                results_feature_extraction_preprocess.append([w, n] + [(k,v) for k,v in preprocessing_switches.items()] + all_scores)
                if f_score > best_f_score:
                    best_f_score = f_score
                    best_feature_extraction['w'] = w
                    best_feature_extraction['n'] = n
                    best_switches = switches
   
    best_preprocessing_switches = {k : False for k in preprocessing_switches.keys()}
    for switch in best_switches:
        best_preprocessing_switches[switch] = True
    print("best f-score", best_f_score, "with best feature settings", best_feature_extraction,
              "and best preprocessing settings", best_preprocessing_switches)

In [84]:
# sort results by best f-score
if False:  # set to False if above search is not run
    # sort the results by best f-score:
    results_feature_extraction_preprocess = sorted(results_feature_extraction_preprocess,
                                                   key=lambda x:x[-1], reverse=True)
    df = pd.DataFrame([[x for x in row[:2]] + [x[1] for x in row[2:-3]] + row[-3:] for row in results_feature_extraction_preprocess],
                  columns=["weight", "n"] + [x[0] for x in results_feature_extraction_preprocess[0][2:-3]] +  ["p", "r", "f-score"])
    display(df)


Unnamed: 0,weight,n,convert_usernames,separate_out_punctuation,convert_number_words_to_digits,convert_numbers,remove_punctuation,convert_to_lowercase,remove_stopwords,apply_lemmatization,p,r,f-score
0,counts,4,False,False,False,False,True,False,True,True,0.623970,0.630488,0.624761
1,binary,5,False,False,True,False,True,True,False,True,0.622798,0.629268,0.623668
2,binary,5,True,False,True,False,True,True,False,True,0.622798,0.629268,0.623668
3,binary,3,False,False,True,True,True,True,False,True,0.622188,0.625610,0.623408
4,binary,3,True,False,True,True,True,True,False,True,0.622188,0.625610,0.623408
5,binary,4,False,False,False,False,True,False,False,False,0.622028,0.626829,0.623348
6,binary,4,True,False,False,False,True,False,False,False,0.622028,0.626829,0.623348
7,binary,5,False,True,False,False,True,False,False,False,0.621106,0.626829,0.622313
8,binary,5,True,True,False,False,True,False,False,False,0.621106,0.626829,0.622313
9,counts,5,False,False,True,False,True,True,False,False,0.621106,0.626829,0.622313


In [85]:
# which feature counting/weights technique is most useful (in combination with other params)?
if False:  # set to False if above search is not run
    for w in count_weights:
        print(w, sum(df[df["weight"]==w]['f-score']) / len(df[df["weight"]==w]))

binary 0.5998666354471951
counts 0.5967058465038103
weighted 0.5977354419194072


In [87]:
# which ngram n value is most useful (in combination with other params)?
if False:  # set to False if above search is not run
    for n in range(2,6):
        print(n, sum(df[df["n"]==n]['f-score']) / len(df[df["n"]==n]))

2 0.5931914577999501
3 0.5990706244681215
4 0.6002498616327715
5 0.5998986212597056


In [88]:
# which feature being true tends to help more?
if False:  # set as False if above search not run
    for key in preprocessing_switches.keys():
        print(key, sum(df[df[key]==True]['f-score']) / len(df[df[key]==True]))

convert_usernames 0.5980895269054742
separate_out_punctuation 0.5981813991781045
convert_number_words_to_digits 0.5970608680605519
convert_numbers 0.5970483434596471
remove_punctuation 0.6017270909473051
convert_to_lowercase 0.5979548945941139
remove_stopwords 0.5949706694511793
apply_lemmatization 0.5976712061553849


# Comments on joint feature extraction and preprocessing optimizatino (for report)
* The result from the independent optimization on the full x-val is the same in that the best feature extraction settings were **n-gram range of (1,4)** and using **bag-of-words counts**
* However for feature preprocessing, the best techqniues were found to be **remove_punctuation**, **remove_stopwords** and **apply_lemmatization**, somewhat more preprocessing and normalization thatn before.
* On average in combination with all values of weights and preprocessing, the result is unchanged that **n-gram range of (1,4)** settings on average did best, followed by (1,5), then (1,3).
* On average in combination with all values of n-gram range, again, **binary set-of-words** settings on average did best, closely followed by bag-of-words counts, and with weighting by sentence length a bit further back.
* This result may not hold when using other preprocessing and may need to be re-done.

In [89]:
# Best settings from joint optimization:
_WEIGHT_ = 'counts'
_N_ =  4
preprocessing_switches = {'convert_usernames': False,
  'separate_out_punctuation': False,
  'convert_number_words_to_digits': False,
  'convert_numbers': False,
  'remove_punctuation': True,
  'convert_to_lowercase': False,
  'remove_stopwords': True,
  'apply_lemmatization': True}

In [90]:
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'fake_news.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

global_feature_dict = {} # A global dictionary of features

split_and_preprocess_data(0.8)

# let's look at the representation of the first instance of training:
print(train_data[0])

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')


all_scores = cross_validate(train_data, 10, first_fold_only=False)
f_score = all_scores[2]
print(f_score)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10241 rawData, 0 trainData, 0 testData
Preparing training and test data...
(Counter({'1@Says': 1, '1@Annies': 1, '1@List': 1, '1@political': 1, '1@group': 1, '1@support': 1, '1@thirdtrimester': 1, '1@abortion': 1, '1@demand': 1, '1@</s>': 1, '2@<s> Says': 1, '2@Says Annies': 1, '2@Annies List': 1, '2@List political': 1, '2@political group': 1, '2@group support': 1, '2@support thirdtrimester': 1, '2@thirdtrimester abortion': 1, '2@abortion demand': 1, '2@demand </s>': 1, '3@<s> <s> Says': 1, '3@<s> Says Annies': 1, '3@Says Annies List': 1, '3@Annies List political': 1, '3@List political group': 1, '3@political group support': 1, '3@group support thirdtrimester': 1, '3@support thirdtrimester abortion': 1, '3@thirdtrimester abortion demand': 1, '3@abortion demand </s>': 1, '4@<s> <s> <s> Says': 1, '4@<s> <s> Says Annies': 1, '4@<s> Says Annies List': 1, '4@Says Annies List political': 1, '4@Annies List political group': 1

Note there is a slight improvement on the full cross-val experiment f-score to **0.605**

# Q5 Hyperparameter optimization of the linear SVC

In [91]:
# Final hyperparameter tuning of the linearSVC on cross-val across the training data
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import DictVectorizer
parameters = [{
#'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], 
'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
'max_iter': [1,5,10, 50, 100, 500, 1000, 5000]}]

clf = GridSearchCV(
        LinearSVC(), parameters, scoring='accuracy'
    )

clf.fit(DictVectorizer().fit_transform([x[0] for x in train_data]), [x[1] for x in train_data])




GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
                          'max_iter': [1, 5, 10, 50, 100, 500, 1000, 5000]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [92]:
print(clf.best_params_)

{'C': 0.01, 'max_iter': 10}


In [93]:
_C_ = clf.best_params_["C"]
_MAX_ITER_ = clf.best_params_["max_iter"]

In [94]:
def train_classifier(data):
    print("Training Classifier...")
    pipeline =  Pipeline([
     ('svc', LinearSVC(C=_C_, max_iter=_MAX_ITER_))])
    return SklearnClassifier(pipeline).train(data)

In [95]:
# QUESTION 3 - Make sure there is a function call here to the
# crossValidate function on the training set to get your results
print(preprocessing_switches)
print("weights", _WEIGHT_)
print("n", _N_)
# loading reviews
# initialize global lists that will be appended to by the methods below
raw_data = []          # the filtered data from the dataset file
train_data = []        # the pre-processed training data as a percentage of the total dataset
test_data = []         # the pre-processed test data as a percentage of the total dataset


# references to the data files
data_file_path = 'fake_news.tsv'

# Do the actual stuff (i.e. call the functions we've made)
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing the dataset...",sep='\n')

load_data(data_file_path) 

# We split the raw dataset into a set of training data and a set of test data (80/20)
# You do the cross validation on the 80% (training data)
# We print the number of training samples and the number of features before the split
print("Now %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Preparing training and test data...",sep='\n')

split_and_preprocess_data(0.8)

# let's look at the representation of the first instance of training:
print(train_data[0])

# We print the number of training samples and the number of features after the split
print("After split, %d rawData, %d trainData, %d testData" % (len(raw_data), len(train_data), len(test_data)),
      "Training Samples: ", len(train_data), "Features: ", len(global_feature_dict), sep='\n')

cross_validate(train_data, 10)


{'convert_usernames': False, 'separate_out_punctuation': False, 'convert_number_words_to_digits': False, 'convert_numbers': False, 'remove_punctuation': True, 'convert_to_lowercase': False, 'remove_stopwords': True, 'apply_lemmatization': True}
weights counts
n 4
Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 10241 rawData, 0 trainData, 0 testData
Preparing training and test data...
(Counter({'1@Says': 1, '1@Annies': 1, '1@List': 1, '1@political': 1, '1@group': 1, '1@support': 1, '1@thirdtrimester': 1, '1@abortion': 1, '1@demand': 1, '1@</s>': 1, '2@<s> Says': 1, '2@Says Annies': 1, '2@Annies List': 1, '2@List political': 1, '2@political group': 1, '2@group support': 1, '2@support thirdtrimester': 1, '2@thirdtrimester abortion': 1, '2@abortion demand': 1, '2@demand </s>': 1, '3@<s> <s> Says': 1, '3@<s> Says Annies': 1, '3@Says Annies List': 1, '3@Annies List political': 1, '3@List political group': 1, '3@political group support': 1, '3@group support thirdtrimester'



Fold start on items 820 - 1640
Training Classifier...
Fold start on items 1640 - 2460
Training Classifier...
Fold start on items 2460 - 3280
Training Classifier...
Fold start on items 3280 - 4100
Training Classifier...
Fold start on items 4100 - 4920
Training Classifier...
Fold start on items 4920 - 5740
Training Classifier...
Fold start on items 5740 - 6560
Training Classifier...
Fold start on items 6560 - 7380
Training Classifier...
Fold start on items 7380 - 8200
Training Classifier...


[0.61408808399468, 0.6195500420521447, 0.6073098127444935]

Note there is another slight improvement on the full cross-val experiment f-score to **0.607**

# Evaluate on test set

In [96]:
# Finally, check the accuracy of your classifier by training on all the tranin data
# and testing on the test set
# Will only work once all functions are complete
functions_complete = True  # set to True once you're happy with your methods for cross val
if functions_complete:
    print(test_data[0])   # have a look at the first test data instance
    classifier = train_classifier(train_data)  # train the classifier
    test_true = [t[1] for t in test_data]   # get the ground-truth labels from the data
    test_pred = predict_labels([x[0] for x in test_data], classifier)  # classify the test data to get predicted labels
    final_scores = precision_recall_fscore_support(test_true, test_pred, average='weighted') # evaluate
    report = classification_report(test_true, test_pred, output_dict=True)
    print(classification_report(test_true, test_pred))
    print("Done training!")
    print("Precision: %f\nRecall: %f\nF Score:%f" % final_scores[:3])

(Counter({'1@The': 1, '1@Bush': 1, '1@tax': 1, '1@cut': 1, '1@helped': 1, '1@create': 1, '1@substantial': 1, '1@part': 1, '1@deficit': 1, '1@</s>': 1, '2@<s> The': 1, '2@The Bush': 1, '2@Bush tax': 1, '2@tax cut': 1, '2@cut helped': 1, '2@helped create': 1, '2@create substantial': 1, '2@substantial part': 1, '2@part deficit': 1, '2@deficit </s>': 1, '3@<s> <s> The': 1, '3@<s> The Bush': 1, '3@The Bush tax': 1, '3@Bush tax cut': 1, '3@tax cut helped': 1, '3@cut helped create': 1, '3@helped create substantial': 1, '3@create substantial part': 1, '3@substantial part deficit': 1, '3@part deficit </s>': 1, '4@<s> <s> <s> The': 1, '4@<s> <s> The Bush': 1, '4@<s> The Bush tax': 1, '4@The Bush tax cut': 1, '4@Bush tax cut helped': 1, '4@tax cut helped create': 1, '4@cut helped create substantial': 1, '4@helped create substantial part': 1, '4@create substantial part deficit': 1, '4@substantial part deficit </s>': 1}), 'REAL')
Training Classifier...
              precision    recall  f1-score   

