### Fake and Real News Detection 

using N-Grams and exploring on different classification models.

**Classification models:** 
- Naive Bayes
- Random Forest Classifier
- Decision Tree Classifier
- KNN
- SVM

In [2]:
import pandas as pd
import numpy as np
import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
data = pd.read_csv("data/Fake_Real_Data.csv")
data.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB


In [5]:
data["label"].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [6]:
news_data = data.copy()

In [7]:
nlp = spacy.load("en_core_web_sm")

In [8]:
to_del_elements = {"no", "not", "n't"}
nlp.Defaults.stop_words = nlp.Defaults.stop_words - to_del_elements

In [9]:
nlp = spacy.load("en_core_web_sm")

In [10]:
def preprocess(text: str) -> str:
    doc = nlp(text)
    processed_text = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]

    return " ".join(processed_text)

In [11]:
preprocess("this is not a fake news.")

'not fake news'

In [12]:
news_data["processed_text"] = news_data["Text"].apply(preprocess)

In [13]:
news_data.head(5)

Unnamed: 0,Text,label,processed_text
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,Trump Surrogate BRUTALLY Stabs Pathetic vide...
1,U.S. conservative leader optimistic of common ...,Real,U.S. conservative leader optimistic common gro...
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,trump propose U.S. tax overhaul stir concern d...
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,Court Forces Ohio allow million illegally pu...
4,Democrats say Trump agrees to work on immigrat...,Real,Democrats Trump agree work immigration bill wa...


In [14]:
news_data["label_num"] = news_data["label"].apply(lambda x: 1 if x=="Fake" else 0)

In [15]:
news_data.head(5)

Unnamed: 0,Text,label,processed_text,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,Trump Surrogate BRUTALLY Stabs Pathetic vide...,1
1,U.S. conservative leader optimistic of common ...,Real,U.S. conservative leader optimistic common gro...,0
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,trump propose U.S. tax overhaul stir concern d...,0
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,Court Forces Ohio allow million illegally pu...,1
4,Democrats say Trump agrees to work on immigrat...,Real,Democrats Trump agree work immigration bill wa...,0


In [16]:
X_train, X_test, y_train, y_test = train_test_split(news_data["processed_text"], 
                                                    news_data["label_num"],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=news_data.label)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7920,), (1980,), (7920,), (1980,))

#### Naive Bayes

In [18]:
naive_bayes_pipeline = Pipeline([
    ("countvectorize", CountVectorizer()),
    ("classifier", MultinomialNB())
])

In [19]:
naive_bayes_pipeline.fit(X_train, y_train)

y_pred = naive_bayes_pipeline.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       974
           1       0.97      0.97      0.97      1006

    accuracy                           0.97      1980
   macro avg       0.97      0.97      0.97      1980
weighted avg       0.97      0.97      0.97      1980



#### Random Forest

In [20]:
randomforest_pipeline = Pipeline([
    ("countvectorize", CountVectorizer(ngram_range=(1, 3))),
    ("classifier", RandomForestClassifier())
])

In [21]:
randomforest_pipeline.fit(X_train, y_train)

In [22]:
y_pred = randomforest_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       980
           1       0.99      1.00      0.99      1000

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



#### Decision Tree

In [23]:
decisiontree_pipeline = Pipeline([
    ("countvectorize", CountVectorizer(ngram_range=(1, 3))),
    ("classifier", DecisionTreeClassifier())
])

In [24]:
decisiontree_pipeline.fit(X_train, y_train)

In [25]:
y_pred = decisiontree_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       980
           1       1.00      1.00      1.00      1000

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



#### KNN

In [26]:
knn_pipeline = Pipeline([
    ("countvectorize", CountVectorizer(ngram_range=(1, 3))),
    ("classifier", KNeighborsClassifier())
])

In [27]:
knn_pipeline.fit(X_train, y_train)

In [28]:
y_pred = knn_pipeline.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.99      0.51      0.67      1930
           1       0.04      0.90      0.09        50

    accuracy                           0.52      1980
   macro avg       0.52      0.70      0.38      1980
weighted avg       0.97      0.52      0.66      1980



#### SVM

In [29]:
svc_pipeline = Pipeline([
    ("countvectorize", CountVectorizer(ngram_range=(1, 3))),
    ("classifier", SVC())
])

In [30]:
svc_pipeline.fit(X_train, y_train)

In [31]:
y_pred = svc_pipeline.predict(X_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       979
           1       1.00      1.00      1.00      1001

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980

