In [25]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
# Load the labelled news dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="Fake_Real_Data.csv")

In [2]:
# Preview the first five rows to confirm the file loaded as expected.
df.head(n=5)

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [3]:
# Normalise the column names to lower case (e.g. "Text" -> "text") so later cells can use df["text"].
df.columns = df.columns.map(str.lower)

In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(9900, 2)

In [5]:
# Class-balance check: number of rows per distinct label value.
df.loc[:, "label"].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [6]:
# Encode the string labels as integers for the classifiers: Fake -> 0, Real -> 1.
label_to_num = {"Fake": 0, "Real": 1}
df["label_num"] = df["label"].map(label_to_num)

# Series.map yields NaN for any value that is missing from the mapping; fail here
# with a clear message instead of letting NaN targets surface later during training.
assert df["label_num"].notna().all(), "unexpected value in 'label' column: not in label_to_num"

In [7]:
# Preview the first five rows again to check the newly added label_num column.
df.head(n=5)

Unnamed: 0,text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0
1,U.S. conservative leader optimistic of common ...,Real,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0
4,Democrats say Trump agrees to work on immigrat...,Real,1


In [None]:
import spacy
# Load the spaCy pipeline named "en_core_web_lg"; the model package has to be
# installed in the kernel's environment for spacy.load to find it.
nlp = spacy.load("en_core_web_lg")
# Optional sanity check (left disabled): inspect the size of a document vector.
# doc = nlp("King harry jumps over the pond")
# doc.vector.shape

In [9]:
## Embed every article: store each document's spaCy vector in a new "vector" column.
## nlp.pipe streams the texts through the pipeline in batches and yields the Doc
## objects in input order, which is faster than calling nlp(text) once per row.
df["vector"] = [doc.vector for doc in nlp.pipe(df["text"])]

In [10]:
# Preview the first five rows to check the newly added vector column.
df.head(n=5)

Unnamed: 0,text,label,label_num,vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,0,"[-0.103623025, 0.17802684, -0.11873861, -0.034..."
1,U.S. conservative leader optimistic of common ...,Real,1,"[-0.0063406364, 0.16712041, -0.06661373, 0.017..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,1,"[-0.122753024, 0.17192385, -0.024732638, -0.06..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,0,"[-0.027337318, 0.12501417, -0.0073965387, -0.0..."
4,Democrats say Trump agrees to work on immigrat...,Real,1,"[-0.032708026, 0.093958504, -0.03287002, -0.00..."


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["vector"].to_numpy(),
    df["label_num"],
    test_size=0.2,
    random_state=42,
)

In [12]:
# 1-D: one entry per training document (each entry is itself a vector).
X_train.shape

(7920,)

In [13]:
# 1-D: one entry per held-out document (each entry is itself a vector).
X_test.shape

(1980,)

In [16]:
## Each element of X_train / X_test is itself a NumPy vector, so those arrays are 1-D.
## The classifiers need a single 2-D (n_samples, n_features) array instead;
## np.stack joins the per-document vectors along a new first axis.

X_train_2d, X_test_2d = (np.stack(split, axis=0) for split in (X_train, X_test))

In [20]:
## MultinomialNB won't work with negative values, and the raw embeddings contain them,
## so min-max scale the features first.
## The scaler is fitted on the training split only and then reused on the test split.
scaler = MinMaxScaler()
scaler_train_embed = scaler.fit_transform(X_train_2d)
scaler_test_embed = scaler.transform(X_test_2d)

## Train the Naive Bayes classifier on the scaled training embeddings.
clf = MultinomialNB()
clf.fit(scaler_train_embed, y_train)

In [24]:
# Evaluate the Naive Bayes model on the scaled held-out split.
y_pred = clf.predict(scaler_test_embed)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       973
           1       0.96      0.94      0.95      1007

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980



In [27]:
# k-nearest-neighbours on the raw (unscaled) embeddings: 5 neighbours, Euclidean distance.
# Note: this rebinds `clf` and `y_pred`, replacing the Naive Bayes model and its predictions.
clf = KNeighborsClassifier(metric="euclidean", n_neighbors=5).fit(X_train_2d, y_train)
y_pred = clf.predict(X_test_2d)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       973
           1       0.97      0.99      0.98      1007

    accuracy                           0.98      1980
   macro avg       0.98      0.98      0.98      1980
weighted avg       0.98      0.98      0.98      1980

