In [153]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [154]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [155]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [156]:
# loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/fake_news_dataset.csv', on_bad_lines='skip')

In [157]:
news_dataset.shape

(18876, 7)

In [158]:
news_dataset.head()

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [159]:
news_dataset.isnull().sum()

Unnamed: 0,0
title,0
text,0
date,1
source,944
author,964
category,1
label,2


In [160]:
news_dataset = news_dataset.fillna('')

In [161]:
display(news_dataset.head())

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [162]:
# merging the author name and news title
news_dataset['content'] = news_dataset['author']+' '+news_dataset['title']

In [163]:
print(news_dataset['content'])

0                     Paula George Foreign Democrat final.
1          Joseph Hill To offer down resource great point.
2              Julia Robinson Himself church myself carry.
3                Mr. David Foster DDS You unit its should.
4        Austin Walker Billion believe employee summer ...
                               ...                        
18871        Shaun Gallagher Civil them know itself offer.
18872    Kristin Walker Huge threat art believe fish no...
18873    Karen Keller Participant win necessary natural...
18874    Henry Maxwell Own executive available value ab...
18875                         Describe few none drop hard.
Name: content, Length: 18876, dtype: object


In [164]:
# separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [165]:
print(X)
print(Y)

                                                title  \
0                             Foreign Democrat final.   
1                 To offer down resource great point.   
2                        Himself church myself carry.   
3                                You unit its should.   
4                Billion believe employee summer how.   
...                                               ...   
18871                   Civil them know itself offer.   
18872            Huge threat art believe fish notice.   
18873  Participant win necessary natural doctor true.   
18874   Own executive available value above on enter.   
18875                    Describe few none drop hard.   

                                                    text        date  \
0      more tax development both store agreement lawy...  2023-03-10   
1      probably guess western behind likely next inve...  2022-05-25   
2      them identify forward present success risk sev...  2022-09-01   
3      phone which item yar

In [166]:
port_stem = PorterStemmer()

In [167]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [168]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [169]:
print(news_dataset['content'])

0                       paula georg foreign democrat final
1                    joseph hill offer resourc great point
2                              julia robinson church carri
3                                  mr david foster dd unit
4              austin walker billion believ employe summer
                               ...                        
18871                       shaun gallagh civil know offer
18872     kristin walker huge threat art believ fish notic
18873    karen keller particip win necessari natur doct...
18874                 henri maxwel execut avail valu enter
18875                               describ none drop hard
Name: content, Length: 18876, dtype: object


In [170]:
#separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [171]:
print(X)

['paula georg foreign democrat final'
 'joseph hill offer resourc great point' 'julia robinson church carri' ...
 'karen keller particip win necessari natur doctor true'
 'henri maxwel execut avail valu enter' 'describ none drop hard']


In [172]:
print(Y)

['real' 'fake' 'fake' ... 'real' 'real' '']


In [173]:
Y.shape

(18876,)

In [174]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [175]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 129126 stored elements and shape (18876, 2319)>
  Coords	Values
  (0, 553)	0.3942662260750253
  (0, 729)	0.3822906484378464
  (0, 760)	0.4307546992056687
  (0, 820)	0.4761026347003828
  (0, 1606)	0.5349623102958897
  (1, 855)	0.40682454711644667
  (1, 945)	0.4383695833430309
  (1, 1106)	0.3924762559459367
  (1, 1543)	0.40797756110247124
  (1, 1649)	0.4040260988672087
  (1, 1760)	0.3982530676000362
  (2, 340)	0.45963588513613296
  (2, 402)	0.4691277798019183
  (2, 1116)	0.5769356924984198
  (2, 1801)	0.4855915835706128
  (3, 529)	0.3859082093166977
  (3, 535)	0.4666618955982245
  (3, 765)	0.5421869865984128
  (3, 1473)	0.3571866674928831
  (3, 2167)	0.46016571504982057
  (4, 128)	0.4435721617521777
  (4, 182)	0.39460924776910467
  (4, 209)	0.3980239953113895
  (4, 655)	0.41162756444153137
  (4, 2053)	0.39405719998102956
  :	:
  (18872, 182)	0.3398931721114824
  (18872, 739)	0.343340058714357
  (18872, 988)	0.34593918243572175

In [176]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [177]:
model = LogisticRegression()

In [178]:
model.fit(X_train, Y_train)

In [179]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [180]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.6641059602649007


In [181]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [182]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.526218220338983


In [183]:
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

['fake']
The news is Fake


In [184]:
print(Y_test[3])

real
