<a href="https://colab.research.google.com/github/theshahidshaikh/Fake_news_prediction/blob/main/Fake_News_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Download the NLTK 'stopwords' corpus; stopwords.words('english') is used
# by the stemming step later in this notebook.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Read the WELFake CSV from the Colab working directory into a DataFrame.
news_dataset = pd.read_csv('/content/WELFake_Dataset.csv')

In [7]:
# Count missing values in each column.
news_dataset.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
title,0
text,0
label,0


In [8]:
# Replace missing values with empty strings so the regex-based stemming
# step always receives a str.
news_dataset = news_dataset.fillna('')

In [9]:
# Separate the feature columns from the target column.
# Both names are reassigned later, once the titles have been stemmed.
Y = news_dataset['label']
X = news_dataset.drop(columns=['label'])

In [10]:
#Stemming
port_stem = PorterStemmer()

# Build the English stop-word collection once. Calling
# stopwords.words('english') inside the comprehension rebuilt the list for
# every token and then scanned it linearly; a frozenset gives the same
# membership answers with constant-time lookups.
english_stopwords = frozenset(stopwords.words('english'))

def stemming(content):
  """Normalise a text string into space-separated Porter stems.

  Every character outside a-z/A-Z is replaced by a space, the text is
  lower-cased and split on whitespace, English stop words are removed,
  and each remaining token is stemmed with the Porter stemmer.

  Args:
    content: the raw text (str) to normalise.

  Returns:
    A single string with the stemmed tokens joined by one space. An input
    with no surviving tokens yields the empty string.
  """
  letters_only = re.sub('[^a-zA-Z]',' ',content)
  tokens = letters_only.lower().split()
  stems = [port_stem.stem(token) for token in tokens if token not in english_stopwords]
  return ' '.join(stems)

In [11]:
# Overwrite the 'title' column with its stemmed form.
# NOTE: this mutates the column in place, so re-running this cell applies
# stemming to titles that have already been stemmed.
news_dataset['title'] = news_dataset['title'].apply(stemming)

In [15]:
# Inspect the stemmed titles.
print(news_dataset['title'])

0        law enforc high alert follow threat cop white ...
1                                                         
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131         migrant refus leav train refuge camp hungari
72132    trump tussl give unpopular mexican leader much...
72133           goldman sach endors hillari clinton presid
Name: title, Length: 72134, dtype: object


In [17]:
# Only the stemmed title is used as the model input; 'label' is the target.
X = news_dataset['title'].values
Y = news_dataset['label'].values

In [18]:
# Convert the stemmed titles into a sparse TF-IDF feature matrix.
# fit_transform learns the vocabulary/IDF weights and transforms the same
# data in one call, replacing the separate fit + transform.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so IDF weights are influenced by test titles. Consider
# splitting first and fitting on the training titles only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [19]:
# Show the sparse TF-IDF matrix (shape and stored coordinates/values).
print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 629110 stored elements and shape (72134, 19639)>
  Coords	Values
  (0, 407)	0.3190180925014663
  (0, 1802)	0.33473541566384035
  (0, 3679)	0.24871262252022117
  (0, 5509)	0.31820565801047196
  (0, 6425)	0.28932771754845743
  (0, 6730)	0.48553136502134386
  (0, 7887)	0.26746434949988324
  (0, 9699)	0.22829788917209384
  (0, 17260)	0.24871262252022117
  (0, 17363)	0.2542650376115143
  (0, 18648)	0.1297506867782943
  (0, 19106)	0.19134939529376566
  (2, 1049)	0.28404017886581956
  (2, 2673)	0.30809679188606154
  (2, 2919)	0.3639616996972358
  (2, 6880)	0.2652283770602196
  (2, 8020)	0.2692285294185893
  (2, 11864)	0.2231406266784195
  (2, 12011)	0.16878852994653004
  (2, 12744)	0.27904818164471595
  (2, 13591)	0.22687620695463123
  (2, 14591)	0.3580030298678158
  (2, 15094)	0.1609967301122813
  (2, 16446)	0.1999703023632961
  (2, 18034)	0.35962437110547785
  :	:
  (72130, 17778)	0.13227219506940732
  (72130, 18936)	0.2530249939

In [21]:
# Hold out 20% of the rows for testing; stratify on Y so both splits keep
# the same label proportions, and fix random_state for a repeatable split.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=1)

In [22]:
# Logistic-regression classifier with default hyperparameters.
model = LogisticRegression()

In [23]:
# Train the classifier on the training split.
model.fit(X_train,Y_train)

In [24]:
# Accuracy on the data the model was trained on.
# accuracy_score expects (y_true, y_pred); the arguments were previously
# passed in the opposite order. Accuracy is symmetric, so the value is
# unchanged, but the documented order keeps the call correct if the metric
# is ever swapped for an asymmetric one (precision, recall, ...).
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train,X_train_prediction)

In [25]:
# Report the training-set accuracy.
print('accuracy score of training data: ',training_data_accuracy)

accuracy score of training data:  0.9202696379988563


In [26]:
# Accuracy on the held-out test split.
# accuracy_score expects (y_true, y_pred); the arguments were previously
# passed in the opposite order. Accuracy is symmetric, so the value is
# unchanged, but the documented order keeps the call correct if the metric
# is ever swapped for an asymmetric one (precision, recall, ...).
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test,X_test_prediction)

In [27]:
# Report the test-set accuracy.
print('accuracy score of test data:',test_data_accuracy)

accuracy score of test data: 0.8988008594995495


Testing the model on a single test sample

In [32]:
# Classify one held-out row (index 10) and compare it with its true label.
# NOTE(review): treats label 0 as real and 1 as fake — confirm against the
# dataset's documentation.
X_new = X_test[10]
prediction = model.predict(X_new)
print('News is real' if prediction[0] == 0 else 'News is Fake')

print('Predicted: ',prediction)
print('Actual prediction: ',Y_test[10])

News is real
Predicted:  [0]
Actual prediction:  0


Checking predictions for a sequence of test samples

In [33]:
# Repeat the single-sample check for test rows 1 through 9, printing the
# verdict, the predicted label, and the true label for each row.
for i in range(1,10):
  X_new = X_test[i]
  prediction = model.predict(X_new)
  print('News is real' if prediction[0] == 0 else 'News is Fake')

  print(i,') Predicted: ',prediction)
  print(i,') Actual prediction: ',Y_test[i])

News is Fake
1 ) Predicted:  [1]
1 ) Actual prediction:  1
News is real
2 ) Predicted:  [0]
2 ) Actual prediction:  0
News is Fake
3 ) Predicted:  [1]
3 ) Actual prediction:  1
News is real
4 ) Predicted:  [0]
4 ) Actual prediction:  1
News is real
5 ) Predicted:  [0]
5 ) Actual prediction:  0
News is real
6 ) Predicted:  [0]
6 ) Actual prediction:  0
News is Fake
7 ) Predicted:  [1]
7 ) Actual prediction:  1
News is real
8 ) Predicted:  [0]
8 ) Actual prediction:  0
News is Fake
9 ) Predicted:  [1]
9 ) Actual prediction:  1
