### Part 4

You should now evaluate your models on the FakeNews and the LIAR dataset. Arrange all these results in a table to facilitate a comparison between them. You should be evaluating the model on how well it classifies articles correctly using F-score. You may want to include a confusion matrix to visualize the types of classification errors made by your models.

### Task 1
Evaluate the performance of your Simple and Advanced Models on your FakeNewsCorpus test set. It should be possible to achieve > 80% accuracy but you will not fail the project if your model cannot reach this performance.



In [247]:
# Import the necessary libraries
# pip install dask-ml
import dask.dataframe as dd
import dask_ml.model_selection
import numpy as np

In [248]:
# Read the CSV file into a Dask DataFrame with specified data types
# These dtypes can be found by printing the dask dataframe
cleaned_data = dd.read_csv('news_cleaned_2018_02_13-results_200MB.csv', encoding="utf-8", dtype={
        'Unnamed: 0': 'object',
        'id': 'object',
        'domain': 'object',
        'type': 'object',
        'url': 'object',
        'content': 'object',
        'scraped_at': 'object',
        'inserted_at': 'object',
        'updated_at': 'object',
        'title': 'object',
        'authors': 'object',
        'keywords': 'float64',
        'meta_keywords': 'object',
        'meta_description': 'object',
        'tags': 'object',
        'summary': 'float64',
        'tokens': 'object',
        'filtered_tokens': 'object',
        'stemmed_tokens': 'object',
    },)

In [249]:
# Define a function to modify the 'type' column values
# This function is used to simplify the classification problem
# The 'reliable' and 'political' types are combined into a single 'reliable' type
def modify_type(x):
    if x == 'reliabl' or x == 'polit':
        return '1'
    else:
        return '0'

In [250]:
# Apply the modify_type function to the 'type' column using the map function
cleaned_data['type'] = cleaned_data['type'].map(modify_type, meta=('type', 'object'))

In [251]:
# Identify if there are any missing values in the 'type' column
# If so, they are replaced with an empty string
def nan_to_empty(x):
    if isinstance(x, float) and np.isnan(x):
        return ''
    else:
        return x

In [252]:
# Apply the nan_to_empty function to the 'type' column using the map function
cleaned_data['content'] = cleaned_data['content'].map(nan_to_empty, meta=('content', 'object'))

In [253]:
# Define X and y as the 'content' and 'type' columns
y = cleaned_data['type']
X = cleaned_data['content']

In [254]:
# Split the data into train and test sets
# In an 80/10/10 split

X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(X, y, test_size=0.2, random_state=0, shuffle=False)
X_train, X_val, y_train, y_val = dask_ml.model_selection.train_test_split(X_train, y_train, test_size=0.5, random_state=0, shuffle=False)

In [255]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object for use in feature extraction
vectorization = TfidfVectorizer()

# Fit the vectorizer to the training data and transform the training data into a vector
# As well as the validation data
xv_train = vectorization.fit_transform(X_train)
xv_test = vectorization.transform(X_test) 

In [256]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1500, class_weight = 'balanced', random_state = 0, C=100)

In [257]:
model.fit(xv_train, y_train)

LogisticRegression(C=100, class_weight='balanced', max_iter=1500,
                   random_state=0)

In [258]:
# Print the accuracy of the model on the validation data
model.score(xv_test,y_test)

0.8921992743511025

In [259]:
# Predict the class of the validation data
pred_model = model.predict(xv_test)

In [260]:
from sklearn.metrics import classification_report

# Print the classification report for the model
print(classification_report(y_test,pred_model))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93     10668
           1       0.77      0.82      0.80      3664

    accuracy                           0.89     14332
   macro avg       0.85      0.87      0.86     14332
weighted avg       0.90      0.89      0.89     14332



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
conf_mat = confusion_matrix(y_val, pred_model)

# Plot the confusion matrix using Seaborn
plt.figure(figsize=(10,7))
sns.set(font_scale=1.4) # for label size
sns.heatmap(conf_mat, annot=True, annot_kws={"size": 16}, fmt="d", cmap="Blues", xticklabels=['Fake', 'Reliable'], yticklabels=['Fake', 'Reliable'])

plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

Advanced

In [None]:
import pandas as pd
from scipy import sparse
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 
from imblearn.over_sampling import RandomOverSampler 

In [None]:
# Creates the features and labels for the model

features = pd.read_csv('features_cleaned_multi.csv') 
labels = pd.read_csv('labels_multi.csv') 

In [None]:
# Vectorizes the features and saves them as sparse matrices

title_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer='word')
sparse_matrix_for_title = title_vectorizer.fit_transform(features['title'])
sparse.save_npz("sparse_matrix_for_title.npz", sparse_matrix_for_title)
sparse_matrix_for_title_load = sparse.load_npz("sparse_matrix_for_title.npz")   

content_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer='word')
sparse_matrix_for_content = content_vectorizer.fit_transform(features['content'])
sparse.save_npz("sparse_matrix_for_content.npz", sparse_matrix_for_content)
sparse_matrix_for_content_load = sparse.load_npz("sparse_matrix_for_content.npz")

In [None]:
# Combines the sparse matrices into one sparse matrix
matrix = hstack([sparse_matrix_for_content_load,sparse_matrix_for_title_load]) 

In [None]:
# Creates our X and y variables
X = matrix 
y = labels 
y = np.ravel(y)

In [None]:
# Splits the data into training, validation, and test sets using a 80/10/10 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=0)

# Oversamples the training data to balance the classes
oversampler = RandomOverSampler(random_state=0)
X_train_oversampled, y_train_oversampled = oversampler.fit_resample(X_train, y_train)

# Creates the Naive Bayes model and prints the accuracy score
model = ComplementNB(alpha=0.1) 
model.fit(X_train_oversampled, y_train_oversampled)

print(model.score(X_train_oversampled, y_train_oversampled))

predictions_NB = model.predict(X_test)

print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, y_test)*100)

y_test_pred = model.predict(X_test) 

report = classification_report(y_test, y_test_pred)
print("classification report:")
print(report)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Create the confusion matrix
cm = confusion_matrix(y_test, y_test_pred)

# Set up the plot
plt.figure(figsize=(10, 10))
sns.set(font_scale=1.2)

# Create the heatmap
sns.heatmap(cm, annot=True, fmt='g', cmap='coolwarm', linewidths=0.5, square=True, cbar=False, xticklabels=True, yticklabels=True)

# Customize the plot
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')

# Show the plot
plt.show()

### Task 2
In order to allow you to play around cross-domain performance, try the same exercise on the LIAR dataset, where you know the labels, and can thus immediately calculate the performance. You are expected to directly evaluate the model you trained on the FakeNewsCorpus. In other words, you do not need to retrain the model on the LIAR dataset.

In [264]:
# Import the necessary libraries
# pip install dask-ml
import dask.dataframe as dd
import dask_ml.model_selection
import numpy as np

In [265]:
# Read the CSV file into a Dask DataFrame with specified data types
# These dtypes can be found by printing the dask dataframe
LIAR_features = dd.read_csv('features_cleaned_LIAR.csv', encoding="utf-8", dtype={'content' : 'object'},)

In [266]:
# Read the CSV file into a Dask DataFrame with specified data types
# These dtypes can be found by printing the dask dataframe
LIAR_labels = dd.read_csv('labels_LIAR.csv', encoding="utf-8", dtype={'label' : 'object'},)

In [267]:
# Define X and y as the 'content' and 'type' columns
y_test = LIAR_labels['label']
X_test = LIAR_features['content']

In [269]:
# Transform the training data into a vector
xv_test = vectorization.transform(X_test)

In [272]:
# Print the accuracy of the model on the validation data
model.score(xv_test,y_test)

0.4135753749013418

In [273]:
# Predict the class of the validation data
pred_model = model.predict(xv_test)

In [274]:
from sklearn.metrics import classification_report

# Print the classification report for the model
print(classification_report(y_test,pred_model))

              precision    recall  f1-score   support

           0       0.84      0.37      0.51      1059
           1       0.17      0.65      0.27       208

    accuracy                           0.41      1267
   macro avg       0.51      0.51      0.39      1267
weighted avg       0.73      0.41      0.47      1267



Advanced model LIAR

In [None]:
# Import the necessary libraries
# pip install dask-ml
import dask.dataframe as dd
import dask_ml.model_selection
import numpy as np

In [None]:
# Read the CSV file into a Dask DataFrame with specified data types
# These dtypes can be found by printing the dask dataframe
cleaned_data = dd.read_csv('news_cleaned_2018_02_13-results_200MB.csv', encoding="utf-8", dtype={
        'Unnamed: 0': 'object',
        'id': 'object',
        'domain': 'object',
        'type': 'object',
        'url': 'object',
        'content': 'object',
        'scraped_at': 'object',
        'inserted_at': 'object',
        'updated_at': 'object',
        'title': 'object',
        'authors': 'object',
        'keywords': 'float64',
        'meta_keywords': 'object',
        'meta_description': 'object',
        'tags': 'object',
        'summary': 'float64',
        'tokens': 'object',
        'filtered_tokens': 'object',
        'stemmed_tokens': 'object',
    },)

In [None]:
# Define a function to modify the 'type' column values
# This function is used to simplify the classification problem
# The 'reliable' and 'political' types are combined into a single 'reliable' type
def modify_type(x):
    if x == 'reliabl' or x == 'polit':
        return '1'
    else:
        return '0'

In [None]:
# Apply the modify_type function to the 'type' column using the map function
cleaned_data['type'] = cleaned_data['type'].map(modify_type, meta=('type', 'object'))

In [None]:
# Identify if there are any missing values in the 'type' column
# If so, they are replaced with an empty string
def nan_to_empty(x):
    if isinstance(x, float) and np.isnan(x):
        return ''
    else:
        return x

In [None]:
# Apply the nan_to_empty function to the 'type' column using the map function
cleaned_data['content'] = cleaned_data['content'].map(nan_to_empty, meta=('content', 'object'))

In [None]:
# Define X and y as the 'content' and 'type' columns
y = cleaned_data['type']
X = cleaned_data['content']

In [None]:
# Split the data into train and test sets
# In an 80/10/10 split

X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split(X, y, test_size=0.2, random_state=0, shuffle=False)
X_train, X_val, y_train, y_val = dask_ml.model_selection.train_test_split(X_train, y_train, test_size=0.5, random_state=0, shuffle=False)

In [None]:
# Import the necessary libraries
# pip install dask-ml
import dask.dataframe as dd
import dask_ml.model_selection
import numpy as np

In [None]:
# Read the CSV file into a Dask DataFrame with specified data types
# These dtypes can be found by printing the dask dataframe
LIAR_features = dd.read_csv('features_cleaned_LIAR_multi.csv', encoding="utf-8", dtype={'content' : 'object'},)

In [None]:
# Read the CSV file into a Dask DataFrame with specified data types
# These dtypes can be found by printing the dask dataframe
LIAR_labels = dd.read_csv('labels_LIAR_multi.csv', encoding="utf-8", dtype={'label' : 'object'},)

In [None]:
# Define X and y as the 'content' and 'type' columns
y_test = LIAR_labels['label']
X_test = LIAR_features['content']
y_test = np.ravel(y_test)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object for use in feature extraction
vectorization = TfidfVectorizer()

# Fit the vectorizer to the training data and transform the training data into a vector
xv_train = vectorization.fit_transform(X_train)
xv_test = vectorization.transform(X_test)
# y_train = vectorization.transform(y_train)

In [None]:
model = ComplementNB(alpha=0.1) 
model.fit(xv_train, y_train)

In [None]:
model.score(xv_test,y_test)

In [None]:
# Predict the class of the validation data
pred_model = model.predict(xv_test)

In [None]:
from sklearn.metrics import classification_report

# Print the classification report for the model
print(classification_report(y_test,pred_model))

### Task 3
Compare the results of this experiment to the results you obtained in question 3. Report your LIAR results as part of your report. Remember to test the performance of your Simple Model as well.