<a href="https://colab.research.google.com/github/vermacularis/Fake_news_prediction/blob/main/Fake_news_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# printing the stopwords in English
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
# loading the dataset to a pandas DataFrame
news_dataset = pd.read_csv('/content/train (1).csv')

In [5]:
news_dataset.shape

(10, 3)

In [6]:
# print the first 5 rows of the dataframe
news_dataset.head()

Unnamed: 0,title,text,label
0,Government announces new healthcare policy,The government has rolled out a new healthcare...,1
1,NASA discovers water on Mars,NASA's latest mission reveals signs of water o...,1
2,You won't believe what this celebrity did!,Shocking celebrity gossip that took the intern...,0
3,Miracle cure for cancer found in kitchen,"A home remedy claims to cure cancer instantly,...",0
4,"Elections rigged, claims anonymous source",An anonymous source claims the recent election...,0


In [7]:
# counting the number of missing values in the dataset
news_dataset.isnull().sum()

Unnamed: 0,0
title,0
text,0
label,0


In [8]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [9]:
# merging the author name and news title
news_dataset['content'] = news_dataset['text']+' '+news_dataset['title']

In [10]:
print(news_dataset['content'])

0    The government has rolled out a new healthcare...
1    NASA's latest mission reveals signs of water o...
2    Shocking celebrity gossip that took the intern...
3    A home remedy claims to cure cancer instantly,...
4    An anonymous source claims the recent election...
5    A man from a small town won the lottery twice ...
6    Climate scientists warn that global temperatur...
7    A team of biologists discovered a new bird spe...
8    A controversial study claims aliens built the ...
9    Scientists from an unknown lab claim the Earth...
Name: content, dtype: object


In [11]:
# separating the data & label
X = news_dataset.drop(columns='label', axis=1)
Y = news_dataset['label']

In [12]:
print(X)
print(Y)

                                            title  \
0      Government announces new healthcare policy   
1                    NASA discovers water on Mars   
2      You won't believe what this celebrity did!   
3        Miracle cure for cancer found in kitchen   
4       Elections rigged, claims anonymous source   
5          Local man wins lottery twice in a week   
6  Experts confirm climate change is accelerating   
7        New species of bird discovered in Amazon   
8   Aliens built the pyramids, new study suggests   
9          Flat Earth theory proven by scientists   

                                                text  \
0  The government has rolled out a new healthcare...   
1  NASA's latest mission reveals signs of water o...   
2  Shocking celebrity gossip that took the intern...   
3  A home remedy claims to cure cancer instantly,...   
4  An anonymous source claims the recent election...   
5  A man from a small town won the lottery twice ...   
6  Climate scientists wa

In [13]:
port_stem = PorterStemmer()

In [14]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [15]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [16]:
print(news_dataset['content'])

0    govern roll new healthcar polici aim reduc cos...
1    nasa latest mission reveal sign water surfac m...
2    shock celebr gossip took internet storm believ...
3    home remedi claim cure cancer instantli scient...
4    anonym sourc claim recent elect rig elect rig ...
5    man small town lotteri twice week local man wi...
6    climat scientist warn global temperatur rise f...
7    team biologist discov new bird speci deep amaz...
8    controversi studi claim alien built pyramid th...
9    scientist unknown lab claim earth actual flat ...
Name: content, dtype: object


In [17]:
#separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

In [18]:
print(X)

['govern roll new healthcar polici aim reduc cost govern announc new healthcar polici'
 'nasa latest mission reveal sign water surfac mar nasa discov water mar'
 'shock celebr gossip took internet storm believ celebr'
 'home remedi claim cure cancer instantli scientif proof miracl cure cancer found kitchen'
 'anonym sourc claim recent elect rig elect rig claim anonym sourc'
 'man small town lotteri twice week local man win lotteri twice week'
 'climat scientist warn global temperatur rise faster expect expert confirm climat chang acceler'
 'team biologist discov new bird speci deep amazon rainforest new speci bird discov amazon'
 'controversi studi claim alien built pyramid thousand year ago alien built pyramid new studi suggest'
 'scientist unknown lab claim earth actual flat flat earth theori proven scientist']


In [19]:
print(Y)

[1 1 0 0 0 1 1 1 0 0]


In [20]:
Y.shape

(10,)

In [21]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [22]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 91 stored elements and shape (10, 84)>
  Coords	Values
  (0, 3)	0.2281432758606343
  (0, 6)	0.2281432758606343
  (0, 19)	0.2281432758606343
  (0, 32)	0.4562865517212686
  (0, 33)	0.4562865517212686
  (0, 47)	0.3393537631334516
  (0, 48)	0.4562865517212686
  (0, 54)	0.2281432758606343
  (0, 59)	0.2281432758606343
  (1, 22)	0.20193016800222305
  (1, 39)	0.23753937224738556
  (1, 43)	0.4750787444947711
  (1, 45)	0.23753937224738556
  (1, 46)	0.4750787444947711
  (1, 56)	0.23753937224738556
  (1, 63)	0.23753937224738556
  (1, 70)	0.23753937224738556
  (1, 80)	0.4750787444947711
  (2, 8)	0.31622776601683794
  (2, 13)	0.6324555320336759
  (2, 31)	0.31622776601683794
  (2, 36)	0.31622776601683794
  (2, 62)	0.31622776601683794
  (2, 67)	0.31622776601683794
  (2, 75)	0.31622776601683794
  :	:
  (7, 22)	0.3701023975535447
  (7, 47)	0.32379586395918103
  (7, 52)	0.21768389550677114
  (7, 66)	0.4353677910135423
  (7, 71)	0.2176838955067

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [24]:
model = LogisticRegression()

In [25]:
model.fit(X_train, Y_train)

In [26]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [27]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [28]:
X_new = X_test[0]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[1]
The news is Fake


In [29]:
print(Y_test[0])

0
