<a href="https://colab.research.google.com/github/tafnaz-ayub/tafnaz-FakeNews-detect/blob/main/Fake_news_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

About the Dataset:
1. Title of the article
2. Text of the article
3. Date of the article
4. Source of the article
5. Author of the article
6. Category of article
7. Label, which indicates whether the news is real or fake

In [None]:
import numpy as np
import pandas as pd
import re #regular expressions: search and replace patterns in text
from nltk.corpus import stopwords #nltk--> natural language tool kit
from nltk.stem.porter import PorterStemmer #gives a root word of a word
from sklearn.feature_extraction.text import TfidfVectorizer #convert the text into feature vectors i.e, numbers
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Download the NLTK 'stopwords' corpus so that stopwords.words(...) can be used below.
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Display the English stop-word list provided by NLTK.
print(stopwords.words("english"))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [None]:
# Data preprocessing
# Step 1: read the CSV into a DataFrame and show its (rows, columns) shape.
news = pd.read_csv(filepath_or_buffer='/content/fake_news_dataset.csv')
news.shape

(20000, 7)

In [None]:
# Preview the first five rows of the raw dataset.
news.head(n=5)

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake


In [None]:
# Replace every missing value with an empty string, then count the
# remaining nulls per column to confirm none are left.
news = news.fillna(value='')
news.isna().sum()

Unnamed: 0,0
title,0
text,0
date,0
source,0
author,0
category,0
label,0


In [None]:
# Combine author and title (separated by a single space) into one text
# column; this is the text that is later stemmed and vectorized.
news['content'] = news['author'].str.cat(news['title'], sep=' ')
print(news['content'])

0                     Paula George Foreign Democrat final.
1          Joseph Hill To offer down resource great point.
2              Julia Robinson Himself church myself carry.
3                Mr. David Foster DDS You unit its should.
4        Austin Walker Billion believe employee summer ...
                               ...                        
19995                         Gary Miles House party born.
19996    Maria Mcbride Though nation people maybe price...
19997     Kristen Franklin Yet exist with experience unit.
19998                  David Wise School wide itself item.
19999        James Peterson Offer chair cover senior born.
Name: content, Length: 20000, dtype: object


In [None]:
# Encode the string labels as integers. The resulting mapping is
# fake -> 0 and real -> 1.
le = LabelEncoder()
le.fit(news['label'])
news['label'] = le.transform(news['label'])
print(news['label'].value_counts())
print(news.head())

label
0    10056
1     9944
Name: count, dtype: int64
                                  title  \
0               Foreign Democrat final.   
1   To offer down resource great point.   
2          Himself church myself carry.   
3                  You unit its should.   
4  Billion believe employee summer how.   

                                                text        date    source  \
0  more tax development both store agreement lawy...  2023-03-10  NY Times   
1  probably guess western behind likely next inve...  2022-05-25  Fox News   
2  them identify forward present success risk sev...  2022-09-01       CNN   
3  phone which item yard Republican safe where po...  2023-02-07   Reuters   
4  wonder myself fact difficult course forget exa...  2023-04-03       CNN   

                 author    category  label  \
0          Paula George    Politics      1   
1           Joseph Hill    Politics      0   
2        Julia Robinson    Business      0   
3  Mr. David Foster DDS     Scienc

In [None]:
# Separate the feature columns (everything except 'label') from the target.
X = news.drop(columns=['label'])
Y = news['label']
print(X)
print(Y)

                                       title  \
0                    Foreign Democrat final.   
1        To offer down resource great point.   
2               Himself church myself carry.   
3                       You unit its should.   
4       Billion believe employee summer how.   
...                                      ...   
19995                      House party born.   
19996  Though nation people maybe price box.   
19997        Yet exist with experience unit.   
19998               School wide itself item.   
19999         Offer chair cover senior born.   

                                                    text        date  \
0      more tax development both store agreement lawy...  2023-03-10   
1      probably guess western behind likely next inve...  2022-05-25   
2      them identify forward present success risk sev...  2022-09-01   
3      phone which item yard Republican safe where po...  2023-02-07   
4      wonder myself fact difficult course forget exa...  2023-

In [None]:
port_stem = PorterStemmer()
# Build the stop-word lookup once. stopwords.words() returns a list, so
# calling it per token and testing membership on it is a needless linear scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def stemming(content):
  """Normalize a piece of text into a string of space-separated word stems.

  Steps: replace every non-letter character with a space, lower-case the
  text, split it on whitespace, drop English stop words, and reduce each
  remaining word to its Porter stem.

  Args:
    content: The raw text to normalize (a str).

  Returns:
    A single string containing the stems joined by one space. Returns an
    empty string when no token survives the filtering.
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()
  stems = [port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
  return ' '.join(stems)

In [None]:
# Stem every row of the combined text column. The column is overwritten, so
# re-running this cell would stem text that has already been stemmed.
news['content'] = news['content'].map(stemming)

In [None]:
# Inspect the stemmed text column.
print(news.loc[:, 'content'])

0                      paula georg foreign democrat final
1                   joseph hill offer resourc great point
2                             julia robinson church carri
3                                 mr david foster dd unit
4             austin walker billion believ employe summer
                               ...                       
19995                           gari mile hous parti born
19996    maria mcbride though nation peopl mayb price box
19997              kristen franklin yet exist experi unit
19998                         david wise school wide item
19999         jame peterson offer chair cover senior born
Name: content, Length: 20000, dtype: object


In [None]:
# Pull the stemmed text (features) and encoded labels (target) out of the
# DataFrame as plain arrays.
X, Y = news['content'].values, news['label'].values
print(X)
print(Y)

['paula georg foreign democrat final'
 'joseph hill offer resourc great point' 'julia robinson church carri' ...
 'kristen franklin yet exist experi unit' 'david wise school wide item'
 'jame peterson offer chair cover senior born']
[1 0 0 ... 1 0 0]


In [None]:
# Turn each document into a sparse vector of TF-IDF weights (term frequency
# scaled by inverse document frequency, not a raw word count).
# NOTE(review): this fit sees every row, including rows that are later held
# out for testing; features used for evaluation should come from a
# vectorizer fitted on training rows only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(news['content'].values)
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 136889 stored elements and shape (20000, 2320)>
  Coords	Values
  (0, 553)	0.39378500362567437
  (0, 729)	0.38563113779991653
  (0, 760)	0.4303325215423282
  (0, 820)	0.4702667357326091
  (0, 1607)	0.5384098017728571
  (1, 855)	0.4075140492253222
  (1, 945)	0.4349807003583625
  (1, 1106)	0.39395595700897607
  (1, 1544)	0.4059231913258755
  (1, 1650)	0.40644929622403225
  (1, 1761)	0.3994347584771229
  (2, 340)	0.4619493340010572
  (2, 402)	0.46849498889312446
  (2, 1116)	0.5803950203356232
  (2, 1802)	0.4798508920150325
  (3, 529)	0.3893432059572742
  (3, 535)	0.46879786583185773
  (3, 765)	0.5331103685464007
  (3, 1474)	0.36176526198760245
  (3, 2168)	0.4621251552704844
  (4, 128)	0.4431492340687908
  (4, 182)	0.39333180511748317
  (4, 209)	0.39858312187055117
  (4, 655)	0.41121899358509134
  (4, 2054)	0.3943487167650092
  :	:
  (19996, 1316)	0.38290410625708515
  (19996, 1345)	0.3396980161532583
  (19996, 1349)	0.446572338

In [None]:
# Split the stemmed text itself (80/20, stratified on the label, fixed seed)
# and only then fit TF-IDF on the training portion. Fitting the vectorizer on
# all rows before splitting lets the held-out rows influence the vocabulary
# and IDF weights, which is data leakage.
text_train, text_test, Y_train, Y_test = train_test_split(
    news['content'].values, Y, test_size=0.2, stratify=Y, random_state=2
)
tfidf = TfidfVectorizer()  # rebound so it matches the features the model is trained on
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

In [None]:
# Train a logistic-regression classifier on the training split
# (fit() returns the fitted estimator itself).
model = LogisticRegression().fit(X_train, Y_train)
model

In [None]:
# Accuracy on the data the model was trained on.
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred); pass them by keyword so the
# order is explicit.
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
print(training_data_accuracy)

0.6430625


In [None]:
# Accuracy on the held-out test split. The predictions get their own name so
# they do not overwrite the training-set predictions.
X_test_prediction = model.predict(X_test)
# accuracy_score expects (y_true, y_pred); pass them by keyword so the
# order is explicit.
testing_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print(testing_data_accuracy)

0.5015


In [None]:
# Classify the first test sample and report the result in words.
# The labels were encoded as fake -> 0 and real -> 1, so a prediction of 0
# means the article is classified as fake.
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)
if (prediction[0]==0):
  print('The news is Fake')
else:
  print('The news is Real')

[1]
The news is Fake


In [None]:
# Show the true encoded label of the first test sample, for comparison with
# the model's prediction for that same sample.
print(Y_test[0])

1
