# Assignment 4 - News classification

**Team Group** 
- Ricardo Cavalcante: 377744
- Tales Araujo: 374953

## 1. Preprocessing

### 1.1 Importing the data

In [475]:
from google.colab import drive

# Here I have uploaded a sample from the news dataset
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [476]:
"""NOTEBOOK REPRODUCIBILITY

If you have a local sample of this dataset, just place it within this directory,
change the DIR_PATH variable in order to match this new location and then simply
ignore the gdrive mounting step by commenting it. You can access a local sample
by logging into your Google Drive account and then downloading it from this URL:

https://drive.google.com/file/d/1URGK1ShN4CgjbIUPtw6HMs9a28qSfFwb
"""
DIR_PATH = "drive/My Drive/datasets/news/noticias.csv"

In [477]:
# Importing base libraries
import numpy as np
import pandas as pd

In [478]:
# Reading dataset
df = pd.read_csv(DIR_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [479]:
# The first column is used as an id. Let's define it as how it was supposed to be
# Size of the dataset, size of unique ids
df['Unnamed: 0'].shape[0], df['Unnamed: 0'].unique().shape

(6335, (6335,))

In [480]:
# Rearranging the data
df = df.rename(columns={"Unnamed: 0": "id"})
df = df.sort_values(by="id", ascending=True)
df = df.set_index('id')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,title,text,label
0,Study: women had to drive 4 times farther afte...,Ever since Texas laws closed about half of the...,REAL
1,"Trump, Clinton clash in dueling DC speeches","Donald Trump and Hillary Clinton, now at the s...",REAL
2,"As Reproductive Rights Hang In The Balance, De...",WASHINGTON -- Forty-three years after the Supr...,REAL
3,"Despite Constant Debate, Americans' Abortion O...",It's been a big week for abortion news.\n\nCar...,REAL
4,Obama Argues Against Goverment Shutdown Over P...,President Barack Obama said Saturday night tha...,REAL


In [481]:
# Class distribution
df['label'].value_counts()

REAL    3171
FAKE    3164
Name: label, dtype: int64

In [482]:
# Importing necessary functions to perform analysis
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

### 1.2 Splitting data into training and testing sets

In [483]:
# Converting categorical classes into integer ones
label_mapping = {"REAL": 1, "FAKE": 0}
df['label'] = df['label'].map(label_mapping)

# USING ONLY TEXT FEATURE
#X, y = df['text'], df['label']
# USING ALL FEATURES
# Creating a new column with title and text grouped together
df['content'] = df['title'] + "\n\n" + df['text']
X, y = df['content'], df['label']

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## 2. Creating `CountVectorized` matrix

In [484]:
cv = CountVectorizer()

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

## 3. Creating `TfidVectorizer` matrix

In [485]:
tfidv = TfidfVectorizer(min_df=1, stop_words="english") # Test passing in english stopwords afterwards

X_train_tfidv = tfidv.fit_transform(X_train)
X_test_tfidv = tfidv.transform(X_test)

## 4. Creating and Training Multinomial Naive Bayes models

In [486]:
# Instantiate the class
mnb_cv, mnb_tfidv = MultinomialNB(), MultinomialNB()

In [487]:
# Fitting the cv model
mnb_cv.fit(X_train_cv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [488]:
# Fitting the tfidv model
mnb_tfidv.fit(X_train_tfidv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## 5. Evaluating the results

5.1 Evaluating the First MNB model (`CountVectorizer` based)

In [489]:
# Generate predicted values for cv test set
pred_mnb_cv = mnb_cv.predict(X_test_cv)

In [490]:
# Actual labels for test set
actual = np.array(y_test)

In [491]:
# Computing accuracy score over the CV MNB model
accuracy_score(pred_mnb_cv, actual)

0.8876135820181731

In [492]:
# Computing the confusion matrix over the CV MNB model
cf_matrix_1 = confusion_matrix(actual, pred_mnb_cv, normalize='all')
print(cf_matrix_1)

[[0.42515543 0.07938785]
 [0.03299857 0.46245815]]


5.2 Evaluating the Second MNB model (`TfidVectorizer` based)

In [493]:
# Generate predicted values for TfidV test set
pred_mnb_tfidv = mnb_tfidv.predict(X_test_tfidv)

In [494]:
# Actual labels for test set
actual = np.array(y_test)

In [495]:
# Computing accuracy score over the TfidV MNB model
accuracy_score(pred_mnb_tfidv, actual)

0.8244858919177427

In [496]:
# Computing the confusion matrix over the TfidV MNB model
cf_matrix_2 = confusion_matrix(actual, pred_mnb_tfidv, normalize='all')
print(cf_matrix_2)

[[0.33668101 0.16786227]
 [0.00765184 0.48780488]]
