In [3]:
import spacy
import pandas as pd

In [2]:
# Load the spaCy pipeline once; its `.vector` attribute is used further down
# to turn text into fixed-length embeddings.
nlp = spacy.load('en_core_web_lg')

In [4]:
# Read the fake/real news CSV into a DataFrame and preview the first rows.
fake_real_csv = 'Fake_Real_Data.csv'
df = pd.read_csv(fake_real_csv)
df.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [5]:
# Sanity check: (number of rows, number of columns) of the loaded data.
df.shape

(9900, 2)

In [6]:
# Class balance: how many rows carry each label.
df['label'].value_counts()

label
Fake    5000
Real    4900
Name: count, dtype: int64

In [10]:
# Encode the string labels as integers for the classifiers: Real -> 0, Fake -> 1.
label_to_num = {'Real': 0, 'Fake': 1}
df['label_num'] = df['label'].map(label_to_num)

In [11]:
# Confirm the new `label_num` column lines up with the original `label`.
df.head()

Unnamed: 0,Text,label,label_num
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,1
1,U.S. conservative leader optimistic of common ...,Real,0
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,0
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,1
4,Democrats say Trump agrees to work on immigrat...,Real,0


In [15]:
# Embed a short sample sentence to see what a document vector looks like.
sample_doc = nlp('how are you')
vector = sample_doc.vector

In [17]:
# Embed every article as its spaCy document vector.
# nlp.pipe streams the texts through the pipeline in batches, which is
# considerably faster than calling nlp() once per row via Series.apply,
# while producing the same per-document vectors in the same row order.
df['text_vector'] = [doc.vector for doc in nlp.pipe(df['Text'])]

In [20]:
# Each row now carries its document embedding in `text_vector`.
df.head()

Unnamed: 0,Text,label,label_num,text_vector
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake,1,"[-0.6759837, 1.4263071, -2.318466, -0.451093, ..."
1,U.S. conservative leader optimistic of common ...,Real,0,"[-1.8355803, 1.3101058, -2.4919677, 1.0268308,..."
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real,0,"[-1.9851209, 0.14389805, -2.4221718, 0.9133005..."
3,Court Forces Ohio To Allow Millions Of Illega...,Fake,1,"[-2.7812982, -0.16120885, -1.609772, 1.3624227..."
4,Democrats say Trump agrees to work on immigrat...,Real,0,"[-2.2010763, 0.9961637, -2.4088492, 1.128273, ..."


In [None]:
# Inspect the embedding stored for the row labelled 0.
df.loc[0, 'text_vector']

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['text_vector'],
    df['label_num'],
    test_size=0.2,
    random_state=2023,
)

In [26]:
# xtrain is still a 1-D Series here: one embedding array per element.
xtrain.shape

(7920,)

In [27]:
# The training labels must have one entry per training row.
ytrain.shape

(7920,)

In [28]:
import numpy as np
# Each Series element is one embedding array; stacking them produces a single
# 2-D array (one row per sample) for the scaler and classifiers below.
xtrain_2d, xtest_2d = (np.stack(split) for split in (xtrain, xtest))

In [29]:
# Preview the stacked training matrix (NumPy abbreviates the display).
xtrain_2d

array([[-1.5668225 ,  0.7193005 , -1.3984917 , ..., -0.82796407,
        -1.8337607 ,  0.46694985],
       [-1.7744157 ,  0.23035839, -1.793828  , ..., -1.1968979 ,
        -2.4129193 ,  0.7513793 ],
       [-1.3348427 ,  0.32678372, -1.4630445 , ..., -0.5638395 ,
        -2.3076472 ,  1.0211147 ],
       ...,
       [-1.5874971 ,  0.76052487, -1.841387  , ..., -0.97500235,
        -2.171352  ,  1.0011598 ],
       [-1.5534602 ,  0.1719242 , -1.494133  , ..., -0.26167375,
        -1.6082795 ,  0.9961052 ],
       [-1.3423282 ,  1.3968428 , -2.8533638 , ..., -0.58578455,
        -2.8941212 ,  1.1501148 ]], dtype=float32)

In [31]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

In [33]:
# Learn the scaling from the training embeddings only, then apply that same
# transformation to both the training and the held-out embeddings.
scaler = MinMaxScaler()
scaler.fit(xtrain_2d)
scale_xtrain = scaler.transform(xtrain_2d)
scale_xtest = scaler.transform(xtest_2d)

In [36]:
# Baseline: multinomial Naive Bayes trained on the scaled embeddings,
# evaluated on the held-out split.
model = MultinomialNB().fit(scale_xtrain, ytrain)
ypred = model.predict(scale_xtest)
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95       976
           1       0.95      0.94      0.95      1004

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980



In [37]:
from sklearn.neighbors import KNeighborsClassifier

In [38]:
# k-nearest-neighbours with Euclidean distance on the same scaled embeddings.
knn = KNeighborsClassifier(metric='euclidean').fit(scale_xtrain, ytrain)
ypred = knn.predict(scale_xtest)
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       976
           1       0.99      0.99      0.99      1004

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



### EXERCISE

In [39]:
# Load the news-category dataset for the exercise and preview it.
news_json = 'news_dataset.json'
ndf = pd.read_json(news_json)
ndf.head()

Unnamed: 0,text,category
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS
3,This Richard Sherman Interception Literally Sh...,SPORTS
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS


In [40]:
# Class balance across the news categories.
ndf['category'].value_counts()

category
CRIME       2500
SPORTS      2500
BUSINESS    2500
Name: count, dtype: int64

In [41]:
# Encode the category names as integer class ids.
category_to_num = {'CRIME': 0, 'SPORTS': 1, 'BUSINESS': 2}
ndf['category_num'] = ndf['category'].map(category_to_num)


In [42]:
# Confirm the new `category_num` column matches the `category` names.
ndf.head()

Unnamed: 0,text,category,category_num
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,0
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,0
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1
3,This Richard Sherman Interception Literally Sh...,SPORTS,1
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,2


In [43]:
def preprocess(text):
    """Return `text` reduced to the lemmas of its content tokens.

    The text is run through the notebook-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped, and the lemmas of the
    remaining tokens are joined with single spaces.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    ]
    return ' '.join(lemmas)

In [44]:
# Clean every headline with `preprocess`; the function is passed directly
# to apply, so no wrapping lambda is needed.
ndf['clean_text'] = ndf['text'].apply(preprocess)
ndf.head()

Unnamed: 0,text,category,category_num,clean_text
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,0,Larry Nassar blame victim say victimize newly ...
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,0,woman Beats Cancer die fall horse
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1,vegas taxpayer spend Record $ 750 million New ...
3,This Richard Sherman Interception Literally Sh...,SPORTS,1,Richard Sherman Interception literally shake W...
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,2,7 thing totally kill Weed Legalization Buzz


In [45]:
# Embed every cleaned headline as its spaCy document vector.
# nlp.pipe processes the texts in batches, which is considerably faster than
# calling nlp() once per row via Series.apply, and yields the documents in
# the same order as the input rows.
ndf['text_vector'] = [doc.vector for doc in nlp.pipe(ndf['clean_text'])]
ndf.head()

Unnamed: 0,text,category,category_num,clean_text,text_vector
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,0,Larry Nassar blame victim say victimize newly ...,"[-0.5585511, -0.29323253, -0.9253956, 0.189389..."
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,0,woman Beats Cancer die fall horse,"[-0.73039824, -0.43196002, -1.2930516, -1.0628..."
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,1,vegas taxpayer spend Record $ 750 million New ...,"[-1.9413117, 0.121578515, -3.2996283, 1.511650..."
3,This Richard Sherman Interception Literally Sh...,SPORTS,1,Richard Sherman Interception literally shake W...,"[-1.4702771, -0.685319, 0.57398, -0.31135806, ..."
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,2,7 thing totally kill Weed Legalization Buzz,"[-1.037173, -1.9495698, -1.7179357, 1.2975286,..."


In [46]:
# Hold out 20% of the news rows for evaluation, with a fixed seed for a repeatable split.
xtrain, xtest, ytrain, ytest = train_test_split(
    ndf['text_vector'],
    ndf['category_num'],
    test_size=0.2,
    random_state=2023,
)

In [47]:
# xtrain is a 1-D Series of embedding arrays at this point.
xtrain.shape

(6000,)

In [52]:
# The training labels must have one entry per training row.
ytrain.shape

(6000,)

In [50]:
# Preview the training Series: object dtype, one embedding array per row.
xtrain

5959    [1.0792834, 1.1013633, -1.2769212, 1.1597689, ...
3802    [-0.8554278, 0.03432389, -1.168391, 0.6399532,...
1483    [-2.653284, 0.30374798, -0.78534, 1.5748398, 0...
4947    [-0.61642164, -0.03995015, 0.58807075, 0.99469...
6399    [-0.13916554, 0.39687, 1.2778944, -1.1733665, ...
                              ...                        
6049    [-0.38335246, 0.73301524, -0.894511, 0.6304382...
2743    [-2.0266767, 0.17573553, -0.24038441, 0.216955...
6598    [0.84328496, -0.3154, -1.4194999, 0.644755, 3....
5657    [-2.3299215, 0.86253166, 0.078980766, 0.930267...
4951    [-2.48038, 0.28217518, -0.4465548, 1.5530193, ...
Name: text_vector, Length: 6000, dtype: object

In [53]:
# Stack the per-row embedding arrays into one 2-D array per split.
xtrain_2d, xtest_2d = (np.stack(split) for split in (xtrain, xtest))

In [55]:
# Use a fresh scaler for the news-category data rather than refitting the
# object created for the fake/real experiment: this cell then no longer
# depends on that earlier cell having been run.
# The scaler is fitted on the training split only and reused for the test split.
scaler = MinMaxScaler()
scale_xtrain = scaler.fit_transform(xtrain_2d)
scale_xtest = scaler.transform(xtest_2d)


In [56]:
from sklearn.tree import DecisionTreeClassifier

In [57]:
# Decision tree on the scaled embeddings.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs build the same tree and print the same report.
dtmodel = DecisionTreeClassifier(random_state=2023)
dtmodel.fit(scale_xtrain, ytrain)
ypred = dtmodel.predict(scale_xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.79      0.72      0.75       512
           1       0.71      0.72      0.72       501
           2       0.72      0.78      0.75       487

    accuracy                           0.74      1500
   macro avg       0.74      0.74      0.74      1500
weighted avg       0.74      0.74      0.74      1500



In [58]:
# Multinomial Naive Bayes on the scaled news embeddings.
model1 = MultinomialNB().fit(scale_xtrain, ytrain)
ypred = model1.predict(scale_xtest)
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.87      0.88      0.87       512
           1       0.84      0.79      0.81       501
           2       0.79      0.83      0.80       487

    accuracy                           0.83      1500
   macro avg       0.83      0.83      0.83      1500
weighted avg       0.83      0.83      0.83      1500



In [59]:
# k-nearest-neighbours with Euclidean distance on the scaled news embeddings.
knn1 = KNeighborsClassifier(metric='euclidean').fit(scale_xtrain, ytrain)
ypred = knn1.predict(scale_xtest)
print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.82      0.92      0.87       512
           1       0.91      0.80      0.85       501
           2       0.86      0.86      0.86       487

    accuracy                           0.86      1500
   macro avg       0.86      0.86      0.86      1500
weighted avg       0.86      0.86      0.86      1500



In [60]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
# Random forest on the scaled embeddings.
# The forest draws bootstrap samples and feature subsets at random, so the
# seed is fixed to make the fitted model and the report repeatable.
rfmodel = RandomForestClassifier(random_state=2023)
rfmodel.fit(scale_xtrain, ytrain)
ypred = rfmodel.predict(scale_xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90       512
           1       0.90      0.85      0.88       501
           2       0.86      0.90      0.88       487

    accuracy                           0.89      1500
   macro avg       0.89      0.89      0.89      1500
weighted avg       0.89      0.89      0.89      1500



In [62]:
from sklearn.ensemble import GradientBoostingClassifier


In [63]:
# Gradient boosting on the scaled embeddings.
# random_state is fixed so that repeated runs yield the same fitted model
# and therefore the same report.
gbmodel = GradientBoostingClassifier(random_state=2023)
gbmodel.fit(scale_xtrain, ytrain)
ypred = gbmodel.predict(scale_xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91       512
           1       0.92      0.87      0.89       501
           2       0.87      0.92      0.89       487

    accuracy                           0.90      1500
   macro avg       0.90      0.90      0.90      1500
weighted avg       0.90      0.90      0.90      1500

