In [None]:
# Vijay Venkatesan
# CS 6120 (Natural Language Processing) 

In [1]:
# libraries

import pandas as pd
import numpy as np
import re
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from tqdm.auto import tqdm
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read in each of the csv files and store it in a dataframe
# Add a column to each dataframe called "RealNews?"
# Concatenate both dataframes in a new dataframe called df

df_real = pd.read_csv('True.csv')
df_real['RealNews?'] = True
df_fake = pd.read_csv('Fake.csv')
df_fake['RealNews?'] = False
df = pd.concat([df_real, df_fake], ignore_index = True)

In [3]:
# Verification that the combined dataframe was correctly created

df.head()

Unnamed: 0,title,text,subject,date,RealNews?
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True


In [4]:
# Verification that the combined dataframe contains the correct total number of rows -> 44,898

len(df)

44898

In [5]:
# Selection of only the rows where 'RealNews?' column is set to True

df[df['RealNews?']]

Unnamed: 0,title,text,subject,date,RealNews?
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",True
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",True
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",True
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",True


In [6]:
# Selection of only the rows where 'RealNews?' column is set to False

df[~df['RealNews?']]

Unnamed: 0,title,text,subject,date,RealNews?
21417,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",False
21418,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",False
21419,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",False
21420,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",False
21421,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",False
...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",False
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",False
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",False
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",False


In [7]:
# Selection of only the 'title' column

df['title']

0        As U.S. budget fight looms, Republicans flip t...
1        U.S. military to accept transgender recruits o...
2        Senior U.S. Republican senator: 'Let Mr. Muell...
3        FBI Russia probe helped by Australian diplomat...
4        Trump wants Postal Service to charge 'much mor...
                               ...                        
44893    McPain: John McCain Furious That Iran Treated ...
44894    JUSTICE? Yahoo Settles E-mail Privacy Class-ac...
44895    Sunnistan: US and Allied ‘Safe Zone’ Plan to T...
44896    How to Blow $700 Million: Al Jazeera America F...
44897    10 U.S. Navy Sailors Held by Iranian Military ...
Name: title, Length: 44898, dtype: object

In [8]:
# Creation of a new column called 'document' which combines the content in the 'title' and 'text' columns

df['document'] = df[['title', 'text']].agg(' '.join, axis = 1)

In [9]:
# Normalization of the data in the 'document' column by lowercasing the text which helps reduce the feature space

df['document'] = df['document'].apply(lambda x : x.lower())

In [10]:
# Verification of the 'document' column in the dataframe

df['document'][0]

'as u.s. budget fight looms, republicans flip their fiscal script washington (reuters) - the head of a conservative republican faction in the u.s. congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on sunday and urged budget restraint in 2018. in keeping with a sharp pivot under way among republicans, u.s. representative mark meadows, speaking on cbs’ “face the nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in january. when they return from the holidays on wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the november congressional election campaigns approach in which republicans will seek to keep control of congress. president donald trump and his republicans want a big budget increase in military spending, while democrats also want proportional increases for non

In [12]:
# Performing the train-test split on the dataframe

df_train, df_test = train_test_split(df, test_size = 0.2, random_state = 123, shuffle = True)

In [13]:
# Verification of df_train

df_train

Unnamed: 0,title,text,subject,date,RealNews?,document
25870,Andrea Tantaros Just Buried Fox News With Swo...,Fox News needs to take Andrea Tantaros sexual...,News,"September 28, 2016",False,andrea tantaros just buried fox news with swo...
24267,CNN Smacks Trump With Inauguration Day Rating...,Donald Trump just got humiliated by CNN with f...,News,"January 25, 2017",False,cnn smacks trump with inauguration day rating...
42157,OBAMA PRESSURED U.S. SHOE COMPANY To Keep Thei...,"Of course, putting America first is a totally ...",left-news,"Apr 14, 2016",False,obama pressured u.s. shoe company to keep thei...
18210,"French, Nigerien forces operating where three ...",NIAMEY (Reuters) - French and Nigerien troops ...,worldnews,"October 5, 2017",True,"french, nigerien forces operating where three ..."
2956,Congress split over privatizing air traffic co...,WASHINGTON (Reuters) - The U.S. Congress is di...,politicsNews,"June 29, 2017",True,congress split over privatizing air traffic co...
...,...,...,...,...,...,...
7763,North Carolina offers reward in arson at local...,"WINSTON-SALEM, N.C. (Reuters) - North Carolina...",politicsNews,"October 19, 2016",True,north carolina offers reward in arson at local...
15377,"Kremlin says 'logical' that Putin, Trump will ...",MOSCOW (Reuters) - A Kremlin aide said on Wedn...,worldnews,"November 8, 2017",True,"kremlin says 'logical' that putin, trump will ..."
17730,Mattis looking to see if changes need to be ma...,"TAMPA, Fla. (Reuters) - Defense Secretary Jim ...",worldnews,"October 11, 2017",True,mattis looking to see if changes need to be ma...
28030,Long Beach Police Have Discriminatory Policy ...,"On April 29, Los Angeles County Superior Court...",News,"May 1, 2016",False,long beach police have discriminatory policy ...


In [14]:
# Verification of df_test

df_test

Unnamed: 0,title,text,subject,date,RealNews?,document
41918,"ESPN SENIOR WRITER Says COPS, SOLDIERS Singing...",What happens when a black police officer sings...,left-news,"May 30, 2016",False,"espn senior writer says cops, soldiers singing..."
8149,Lawmaker says he misspoke about Republican Par...,WASHINGTON (Reuters) - Republican congressman ...,politicsNews,"September 14, 2016",True,lawmaker says he misspoke about republican par...
29658,A**hole Of The Day – Michele Bachmann: Muslim...,Michele Bachmann has been pretty quiet since l...,News,"February 9, 2016",False,a**hole of the day – michele bachmann: muslim...
41855,New Book Reveals HILLARY’S ANTI-SEMITIC SIDE: ...,"Hillary doesn t recall saying it, but Bill s c...",left-news,"Jun 11, 2016",False,new book reveals hillary’s anti-semitic side: ...
41890,CAN YOU GUESS THE ONE THING Majority Of Bernie...,The function of socialism is to raise sufferin...,left-news,"Jun 5, 2016",False,can you guess the one thing majority of bernie...
...,...,...,...,...,...,...
538,Justice Department says White House may name n...,WASHINGTON (Reuters) - The White House may nam...,politicsNews,"November 26, 2017",True,justice department says white house may name n...
42648,"LEFTIST FOR A LIVING Changes Position, Turns O...",FLASHBACK: Watch the video below to see Kudlow...,left-news,"Dec 12, 2015",False,"leftist for a living changes position, turns o..."
34186,WOW! 1996 NYT’s Stunning Article SLAYS First L...,How did we ever get to the point where we woul...,politics,"Oct 10, 2016",False,wow! 1996 nyt’s stunning article slays first l...
2073,Afghan president 'grateful' for Trump's commit...,KABUL (Reuters) - Afghan leader Ashraf Ghani o...,politicsNews,"August 22, 2017",True,afghan president 'grateful' for trump's commit...


# Part 1: Naive Bayes "by hand"

In [16]:
# Computing the Priors for each class

# Store the total number of instances in a variable called total_instances
total_instances = len(df_train)

# Store the total number of real instances in a variable called real_instances
real_instances = len(df_train[df_train['RealNews?']])

# Store the total number of fake instances in a variable called fake_instances
fake_instances = len(df_train[~df_train['RealNews?']])

# Compute the percentage of real instances to total instances and store it in a variable called prior_real
prior_real = real_instances / total_instances

# Compute the percentage of fake instances to total instances and store it in a variable called prior_fake
prior_fake = fake_instances / total_instances

In [17]:
# Print out the Priors for each class

print(f'{prior_real} is the prior for the real class')
print(f'{prior_fake} is the prior for the fake class')

0.48115151177682497 is the prior for the real class
0.518848488223175 is the prior for the fake class


In [18]:
# Generating the vocabulary, real tokens and their frequencies, and fake tokens and their frequencies
# The above items will be used later to compute the likelihood for each document given the class (during testing)

# store the distinct tokens in a hashset called vocabulary
vocabulary = set()

# store the distinct tokens of the real class as well as their frequencies in a hashmap called real_tokens
real_tokens = dict()

# store the distinct tokens of the fake class as well as their frequencies in a hashmap called fake_tokens 
fake_tokens = dict()

tokens = list()

for _, row in df_train.iterrows():
    tokens = re.split(r"\W+", row["document"])
    tokens = [token for token in tokens if token]
    for token in tokens:
        if row['RealNews?']:
            real_tokens[token] = real_tokens.get(token, 0) + 1
        else:
            fake_tokens[token] = fake_tokens.get(token, 0) + 1
        vocabulary.add(token)

# Storing the size of the vocabulary in a variable called vocabulary_size
vocabulary_size = len(vocabulary)

# Storing the total number of real tokens in a variable called count_real_tokens
count_real_tokens = len(real_tokens)

# Storing the total number of fake tokens in a variable called count_fake_tokens
count_fake_tokens = len(fake_tokens)

In [19]:
# Perform inference on the test set
# The marginal, which is used for normalization in Bayes Theorem, will be omitted
# The above statemet is valid becasue it is not needed when comparing posterior probabilities with each other

# y_pred is a list which stores the predictions for each instance in the test set
y_pred = list()

for _, row in df_test.iterrows():
    tokens = re.split(r"\W+", row["document"])
    tokens = [token for token in tokens if token]

    # Case 1: Compute posterior probability for the real class
    # The likelihood for the document given the real class must be computed
    # The prior for the real class has already been computed during training
    likelihood_real = 0

    # Iterte through each token of this document and add its log probability to likelihood_real
    for token in tokens:
        # Set the log_probability for the token to a default value of 0
        log_probability = 0
        if token in real_tokens:
            log_probability = math.log(real_tokens[token] / count_real_tokens)
        else:
            # Laplace smoothing is applied here
            log_probability = math.log(1 / (count_real_tokens + vocabulary_size))
        likelihood_real += log_probability

    # Compute the posterior probability that the document belongs to the real class
    # Store the result in a variable called posterior_real
    posterior_real = likelihood_real * prior_real
    
    # Case 2: Compute the posterior probability for the fake class 
    # The likelihood for the document given the fake class must be computed
    # The prior for the fake class has already been computed during training
    likelihood_fake = 0

    # Iterate through each token of this document and add its log probability to likelihood_fake
    for token in tokens:
        # Set the log_probability for the token to a default value of 0
        log_probability = 0
        if token in fake_tokens:
            log_probability = math.log(fake_tokens[token] / count_fake_tokens)
        else:
            # Laplace smoothing is applied here
            log_probability = math.log(1 / (count_fake_tokens + vocabulary_size))
        likelihood_fake += log_probability

    # Compute the posterior probability that the document belongs to the fake class
    # Store the result in a variable called posterior_fake
    posterior_fake = likelihood_fake * prior_fake

    # Take the maximum posterior probability between the two classes (Real and Fake)
    # Append the corresponding boolean prediction to y_pred
    y_pred.append(posterior_real >= posterior_fake)

# Store the gold labels in a variable called y_true
y_true = df_test["RealNews?"]

In [20]:
# Compute Metrics: Precision, Recall, F1 Score, and Support based on the Naive Bayes Classifier

precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred)

print(f'Precision : {precision}')
print(f'Recall : {recall}')
print(f'F1 Score : {f1_score}')
print(f'Support : {support}')

Precision : [0.9995777  0.62522686]
Recall : [0.48854489 0.99975816]
F1 Score : [0.65631499 0.76933098]
Support : [4845 4135]


# Part 2: Tf-idf "by hand"

In [22]:
# Map each document to the terms that appear in it along with their frequencies

def compute_document_to_term_map(dataframe):
    document_to_term_map = dict()
    
    for doc_id, row in tqdm(dataframe.iterrows()):
        document_to_term_map[doc_id] = dict()
        tokens = re.split(r"\W+", row["document"])
        tokens = [token for token in tokens if token]
        for token in tokens:
            document_to_term_map[doc_id][token] = document_to_term_map[doc_id].get(token, 0) + 1
    return document_to_term_map

In [23]:
# Populate vocabulary

# Maps each token to its overall frequency in the corpus

def generate_vocabulary_list(document_to_term_map):
    token_frequency_map = dict()
    
    for doc_id, _ in tqdm(df_train.iterrows()):
        for token in document_to_term_map[doc_id]:
            token_frequency_map[token] = token_frequency_map.get(token, 0) + document_to_term_map[doc_id][token]
    
    # Store the vocabulary in a list where each token appears at least twice throughout the entire corpus
    vocabulary_list = [token for token in token_frequency_map if token_frequency_map[token] >= 2]
    
    return vocabulary_list

In [25]:
# Compute the term frequency for each term given the document

def compute_term_frequency_map(X_batch, document_to_term_map, vocabulary_list):
    term_frequency_map = dict()

    vocabulary_set = set(vocabulary_list)
    
    for doc_id, row in tqdm(X_batch.iterrows()):
        term_frequency_map[doc_id] = dict()
        for token in document_to_term_map[doc_id]:
            if token in vocabulary_set:
                term_frequency_map[doc_id][token] = 1 + math.log(document_to_term_map[doc_id][token])
    return term_frequency_map

In [31]:
# Compute the document frequency for each term in the corpus

def compute_document_frequency_map(document_to_term_map, vocabulary_list, dataframe):
    document_frequency_map = dict()
    
    # Convert vocabulary_list to a set for O(1) lookup
    vocabulary_set = set(vocabulary_list)
    
    for doc_id, _ in tqdm(dataframe.iterrows()):
        for token in document_to_term_map[doc_id]:
            if token in vocabulary_set:
                document_frequency_map[token] = document_frequency_map.get(token, 0) + 1
    return document_frequency_map

In [32]:
# Vectorize each document

X_batch_vectorized = list()

# N represents the total number of documents in the corpus
def vectorize_documents(X_batch, term_frequency_map, document_frequency_map, vocabulary_list, token_to_index_map, N):
    for doc_id, row in tqdm(X_batch.iterrows()):
        document_vector = [0] * len(vocabulary_list)
        for token in document_to_term_map[doc_id]:
            if token in token_to_index_map:
                token_tf = term_frequency_map[doc_id][token]
                token_idf = math.log(N / document_frequency_map[token])
                token_tf_idf = token_tf * token_idf
                index = token_to_index_map[token]
                document_vector[index] = token_tf_idf
        # Append document_vector to X_batch_vectorized
        X_batch_vectorized.append(document_vector)
    return X_batch_vectorized 


In [33]:
# The below function vectorizes each row (document) in X_batch and returns the entire vectorized batch back to the caller

def compute_tf_idf(X_batch, document_to_term_map, vocabulary_list, token_to_index_map, document_frequency_map, N):
    term_frequency_map = compute_term_frequency_map(X_batch, document_to_term_map, vocabulary_list)
    X_batch_vectorized = vectorize_documents(X_batch, term_frequency_map, document_frequency_map, vocabulary_list, token_to_index_map, N) 
    return X_batch_vectorized


In [34]:
# Store the size of df_train in a variable called df_train_size
df_train_size = len(df_train)

# Set the batch size since the entire dataframe will NOT fit into memory all at once
# Hence, batch processing is used
batch_size = 50

# Store the gold labels in y_train
y_train = df_train["RealNews?"]

# Maps each document to the terms that appear in it along with their frequencies
document_to_term_map = compute_document_to_term_map(df_train)

# Store the vocabulary in a list called vocabulary_list
vocabulary_list = generate_vocabulary_list(document_to_term_map)

# Store the size of the vocabulary in a variable called vocabulary_size
vocabulary_size = len(vocabulary_list)

# Map each distinct token in the vocabulary to its corresponding index in vocabulary_list
token_to_index_map = {token : index for index, token in enumerate(vocabulary_list)}

# Maps each token in the vocabulary to the number of distinct documents it appears in 
document_frequency_map = compute_document_frequency_map(document_to_term_map, vocabulary_list, df_train)

# Initialize the Logistic Regression Classifier and store it in clf
clf = LogisticRegression(random_state=0)

for start in tqdm(range(0, df_train_size, batch_size)):
    end = min(start + batch_size, df_train_size)
    df_batch = df_train[start:end]
    X_batch_vectorized = compute_tf_idf(df_batch, document_to_term_map, vocabulary_list, token_to_index_map, document_frequency_map, df_train_size)
    if len(X_batch_vectorized) == len(y_train[start:end]):
        clf.fit(X_batch_vectorized, y_train[start:end])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/719 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:03, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:45, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [01:37, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [36]:
# Vectorize the test set (df_test)
df_test_size = len(df_test)

# Set the batch size since the entire dataframe will NOT fit into memory all at once
# Hence, batch processing is used 
batch_size = 500

# Store the vectorized dataframe in X_test
X_test = list()

# Maps each document to the terms that appear in it along with their frequencies
document_to_term_map = compute_document_to_term_map(df_test)

# Maps each token in the vocabulary to the number of distinct documents it appears in 
document_frequency_map = compute_document_frequency_map(document_to_term_map, vocabulary_list, df_test)

for start in tqdm(range(0, df_test_size, batch_size)):
    end = min(start + batch_size, df_test_size)
    df_batch = df_test[start:end]
    X_batch_vectorized = compute_tf_idf(df_batch, document_to_term_map, vocabulary_list, token_to_index_map, document_frequency_map, df_test_size)
    X_test.extend(X_batch_vectorized)


8980


0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:01, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [39]:
# Store the predictions in y_pred
y_pred = np.array([])

for start in tqdm(range(0, df_test_size, batch_size)):
    end = min(start + batch_size, df_test_size)
    X_test_batch = X_test[start:end]
    predictions = clf.predict(X_test_batch)
    y_pred = np.concatenate((y_pred, predictions), axis=None)

# Store the gold labels in y_true
y_true = df_test["RealNews?"]

  0%|          | 0/18 [00:00<?, ?it/s]

In [45]:
# Compute metrics (Precision, Recall, F1 Score, and Support)

precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred)

print(f'Precision : {precision}')
print(f'Recall : {recall}')
print(f'F1 Score : {f1_score}')
print(f'Support : {support}')

Precision : [0.53780069 0.45440882]
Recall : [0.7752322  0.21934704]
F1 Score : [0.63504945 0.29587343]
Support : [4845 4135]


# Part 3 Naive Bayes and Tf-idf using Scikit-learn

In [47]:
# Vectorize df_train and fit gnb classifier

# Set the batch size since the entire dataframe will NOT fit into memory all at once
# Hence, batch processing is used
batch_size = 50

# Initialize the HashingVectorizer object and store it in a variable called vectorizer
vectorizer = HashingVectorizer(n_features=2**20)

# Initialize the gaussian naive bayes object and store it in a variable called gnb
gnb = GaussianNB()

for start in tqdm(range(0, df_train_size, batch_size)):
    end = min(start + batch_size, df_train_size)
    df_batch = df_train[start:end]
    X_train_sparse = vectorizer.fit_transform(df_batch["document"])
    X_train_dense = X_train_sparse.toarray()
    gnb.fit(X_train_dense, y_train[start:end])

  0%|          | 0/719 [00:00<?, ?it/s]

In [48]:
# Make and store the predictions in y_pred based on gnb classifier

# y_pred will be used to store the predictions
y_pred = np.array([])

# Set the batch size since the entire dataframe will NOT fit into memory all at once
# Hence, batch processing is used
batch_size = 500

for start in tqdm(range(0, df_test_size, batch_size)):
    end = min(start + batch_size, df_test_size)
    df_batch = df_test[start:end]
    X_train_sparse = vectorizer.fit_transform(df_batch["document"])
    X_train_dense = X_train_sparse.toarray()
    predictions = gnb.predict(X_train_dense)
    y_pred = np.concatenate((y_pred, predictions), axis=None)

# Store the gold labels in y_true
y_true = df_test["RealNews?"]

  0%|          | 0/18 [00:00<?, ?it/s]

In [49]:
# Compute metrics (Precision, Recall, F1 Score, and Support) based on gnb classifier

precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred)

print(f'Precision : {precision}')
print(f'Recall : {recall}')
print(f'F1 Score : {f1_score}')
print(f'Support : {support}')

Precision : [0.85537655 0.59404215]
Recall : [0.47120743 0.90665054]
F1 Score : [0.60766569 0.71778671]
Support : [4845 4135]


In [66]:
# Tf-idf using Scikit-learn
# Vectorizing using TfidfVectorizer on df_train in batches and fitting clf (Logistic Regression Classifier) on each batch

# Set the batch size since the entire dataframe will NOT fit into memory all at once
# Hence, batch processing is used
batch_size = 50

# Initialize the TfidfVectorizer object and store it in a variable called vectorizer 
vectorizer = TfidfVectorizer()

# Use the vectorizer's fit method on df_train
vectorizer.fit(df_train["document"])

# Initialize the Logistic Regression object and store it in a variable called clf
clf = LogisticRegression(random_state=0)

for start in tqdm(range(0, df_train_size, batch_size)):
    end = min(start + batch_size, df_train_size)
    df_batch = df_train[start:end]
    X_train_sparse = vectorizer.transform(df_batch["document"])
    clf.fit(X_train_sparse, y_train[start:end])

  0%|          | 0/719 [00:00<?, ?it/s]

In [67]:
# Make and store the predictions in y_pred based on clf (Logistic Regression Classifier)

# y_pred will be used to store the predictions
y_pred = np.array([])

# Set the batch size since the entire dataframe will NOT fit into memory all at once
# Hence, batch processing is used
batch_size = 500

for start in tqdm(range(0, df_test_size, batch_size)):
    end = min(start + batch_size, df_test_size)
    df_batch = df_test[start:end]
    X_train_sparse = vectorizer.transform(df_batch["document"])
    predictions = clf.predict(X_train_sparse)
    y_pred = np.concatenate((y_pred, predictions), axis=None)

# Store the gold labels in y_true
y_true = df_test["RealNews?"]

  0%|          | 0/18 [00:00<?, ?it/s]

In [70]:
# Compute metrics (Precision, Recall, F1 Score, and Support) based on clf classifier

precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred)

print(f'Precision : {precision}')
print(f'Recall : {recall}')
print(f'F1 Score : {f1_score}')
print(f'Support : {support}')

Precision : [1.         0.46067291]
Recall : [8.25593395e-04 1.00000000e+00]
F1 Score : [0.00164982 0.63076806]
Support : [4845 4135]


# ngram_range parameter

The parameter ngram_range allows the user to specify the range of ngrams to be added to the feature space. For example, (1, 3) means include unigrams, bigrams,and trigrams to the feature space. The parameter ngram_range generates additional features for the model to learn from when .fit() is called. The feature space increases when additional ngrams such as bigrams and trigrams are included. This  may be beneficial because a sequence of tokens may provide additional nuanced context and meaning which may help our model perform classification better.

In [76]:
# Tf-idf using Scikit-learn
# Vectorizing using TfidfVectorizer on df_train in batches and fitting clf (Logistic Regression Classifier) on each batch
# Using the ngram_range parameter in the TfidfVectorizer

# Set the batch size since the entire dataframe will NOT fit into memory all at once
# Hence, batch processing is used
batch_size = 50

# Initialize the TfidfVectorizer object and store it in a variable called vectorizer 
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Use the vectorizer's fit method on df_train
vectorizer.fit(df_train["document"])

# Initialize the Logistic Regression object and store it in a variable called clf
clf = LogisticRegression(random_state=0)

for start in tqdm(range(0, df_train_size, batch_size)):
    end = min(start + batch_size, df_train_size)
    df_batch = df_train[start:end]
    X_train_sparse = vectorizer.transform(df_batch["document"])
    clf.fit(X_train_sparse, y_train[start:end])

  0%|          | 0/719 [00:00<?, ?it/s]

In [77]:
# Make and store the predictions in y_pred based on clf (Logistic Regression Classifier)
# The .fit() method was called on clf in batches based on the TfidfVectorizer using the ngram_range parameter

# y_pred will be used to store the predictions
y_pred = np.array([])

# Set the batch size since the entire dataframe will NOT fit into memory all at once
# Hence, batch processing is used
batch_size = 500

for start in tqdm(range(0, df_test_size, batch_size)):
    end = min(start + batch_size, df_test_size)
    df_batch = df_test[start:end]
    X_train_sparse = vectorizer.transform(df_batch["document"])
    predictions = clf.predict(X_train_sparse)
    y_pred = np.concatenate((y_pred, predictions), axis=None)

# Store the gold labels in y_true
y_true = df_test["RealNews?"]

  0%|          | 0/18 [00:00<?, ?it/s]

In [78]:
# Compute metrics (Precision, Recall, F1 Score, and Support) based on clf classifier
# The .fit() method was called on clf in batches based on the TfidfVectorizer using the ngram_range parameter

precision, recall, f1_score, support = precision_recall_fscore_support(y_true, y_pred)

print(f'Precision : {precision}')
print(f'Recall : {recall}')
print(f'F1 Score : {f1_score}')
print(f'Support : {support}')

Precision : [1.         0.46051899]
Recall : [2.06398349e-04 1.00000000e+00]
F1 Score : [4.12711515e-04 6.30623761e-01]
Support : [4845 4135]


# Difference in Performance (Using ngram_range parameter)

The model performance was worse after using the ngram_range parameter which included unigrams and bigrams in the feature space. The precision for the fake class slightly decreased while the recall and F1 score for the real class significantly decreased. In conclusion, in this particular case, adding bigrams did not improve model performance.