# **Fake News Detection**

## **Import the Dependencies**

In [7]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## **Data Collection and Processing**

In [8]:
# Loading datasets containing true and fake news
true_df = pd.read_csv('True.csv')
fake_df = pd.read_csv('Fake.csv')

In [9]:
# Adding a label column: 0 for True news, 1 for Fake news
true_df['label'] = 0
fake_df['label'] = 1

In [10]:
# Combining the true and fake datasets into one DataFrame
combined_train_df = pd.concat([true_df, fake_df], ignore_index=True)

In [11]:
# Saving the combined dataset to a CSV file for future reference
combined_train_df.to_csv('train_df.csv', index=False)

In [12]:
# Displaying the first few rows of the combined dataset
combined_train_df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [13]:
# Downloading stopwords from the NLTK library
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Printing the list of stopwords in English
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
# Checking the shape of the dataset
combined_train_df.shape

(44898, 5)

In [17]:
# Displaying the distribution of labels
combined_train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,23481
0,21417




*   True - 0
*   Fake - 1



In [18]:
# Checking for missing values
if (combined_train_df.isnull().sum().any()>0):
  print('Yes, there are missing values')
else:
  print('No, there are no missing values')

No, there are no missing values


In [20]:
# Combining 'subject' and 'title' columns to create a 'content' column
combined_train_df['content'] = combined_train_df['subject'] + ' ' + combined_train_df['title']

In [21]:
combined_train_df.head()

Unnamed: 0,title,text,subject,date,label,content
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0,"politicsNews As U.S. budget fight looms, Repub..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0,politicsNews U.S. military to accept transgend...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0,politicsNews Senior U.S. Republican senator: '...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0,politicsNews FBI Russia probe helped by Austra...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0,politicsNews Trump wants Postal Service to cha...


In [22]:
# Splitting the dataset into features (X) and labels (y)
X = combined_train_df.drop(columns='label', axis=1)
y = combined_train_df['label']

In [23]:
X.head()

Unnamed: 0,title,text,subject,date,content
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017","politicsNews As U.S. budget fight looms, Repub..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",politicsNews U.S. military to accept transgend...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",politicsNews Senior U.S. Republican senator: '...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",politicsNews FBI Russia probe helped by Austra...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",politicsNews Trump wants Postal Service to cha...


In [24]:
y.head()

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


## **Text Preprocessing with Stemming**

In [25]:
# Initializing the Porter Stemmer
port_stem = PorterStemmer()

In [26]:
# Function for text stemming
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [27]:
# Applying the stemming function to the 'content' column
combined_train_df['content'] = combined_train_df['content'].apply(stemming)

In [28]:
# Displaying the preprocessed content
print(combined_train_df['content'])

0        politicsnew u budget fight loom republican fli...
1        politicsnew u militari accept transgend recrui...
2        politicsnew senior u republican senat let mr m...
3        politicsnew fbi russia probe help australian d...
4        politicsnew trump want postal servic charg muc...
                               ...                        
44893    middl east mcpain john mccain furiou iran trea...
44894    middl east justic yahoo settl e mail privaci c...
44895    middl east sunnistan us alli safe zone plan ta...
44896    middl east blow million al jazeera america fin...
44897    middl east u navi sailor held iranian militari...
Name: content, Length: 44898, dtype: object


In [29]:
# Extracting the 'content' and 'label' as NumPy arrays
X = combined_train_df['content'].values
y = combined_train_df['label'].values

In [31]:
print(X)
print(y)

['politicsnew u budget fight loom republican flip fiscal script'
 'politicsnew u militari accept transgend recruit monday pentagon'
 'politicsnew senior u republican senat let mr mueller job' ...
 'middl east sunnistan us alli safe zone plan take territori booti northern syria'
 'middl east blow million al jazeera america final call quit'
 'middl east u navi sailor held iranian militari sign neocon polit stunt']
[0 0 0 ... 1 1 1]


## **Text Vectorization using TF-IDF**

In [32]:
# Initializing the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
# Fitting and transforming the text data
vectorizer.fit(X)
X = vectorizer.transform(X)

In [33]:
print(X)

  (0, 1562)	0.3196726650464833
  (0, 4278)	0.2927186074045738
  (0, 4336)	0.4345277390146697
  (0, 4400)	0.42019879032367147
  (0, 6846)	0.39179755589089704
  (0, 8856)	0.12551451331511962
  (0, 9655)	0.2211712298366435
  (0, 10257)	0.47808774715136015
  (1, 62)	0.4014736991072735
  (1, 7402)	0.32482563796755554
  (1, 7563)	0.442009905860988
  (1, 8598)	0.3967361252632901
  (1, 8856)	0.1375989646743916
  (1, 9471)	0.4620472617911705
  (1, 11922)	0.38485544905554037
  (2, 6175)	0.3472661916761841
  (2, 6681)	0.38069785649924365
  (2, 7665)	0.49428660922615436
  (2, 7676)	0.4191764777617795
  (2, 8856)	0.14331079041626407
  (2, 9655)	0.25253034830840093
  (2, 10341)	0.2593263567383867
  (2, 10346)	0.4035931662464548
  (3, 701)	0.39833549185955924
  (3, 3224)	0.3570552196038325
  :	:
  (44895, 11479)	0.19949325846572508
  (44895, 11645)	0.34624357110863535
  (44895, 12460)	0.1783599906334707
  (44895, 13197)	0.303124538335321
  (44896, 247)	0.32584095755787734
  (44896, 364)	0.25883527763

## **Splitting the data into Train and Test**

In [34]:
# Splitting the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,stratify=y,random_state=1)

In [35]:
# Displaying the shape of the training datasets
print(X.shape, X_train.shape, X_test.shape)

(44898, 13208) (31428, 13208) (13470, 13208)


In [36]:
# Displaying the shape of the testing  datasets
print(y.shape, y_train.shape, y_test.shape)

(44898,) (31428,) (13470,)


## **Model Training**

In [37]:
# Initializing the Logistic Regression model
model = LogisticRegression()

In [38]:
# Training the model on the training data
model.fit(X_train, y_train)

## **Model Evaluation**

In [40]:
# Predicting on the training data
y_train_pred = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train_pred, y_train)
print('Accuracy Score of Training Data:',training_data_accuracy)

Accuracy Score of Training Data: 0.9999681812396589


In [41]:
# Predicting on the test data
y_test_pred = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test_pred, y_test)
print('Accuracy Score of Test Data:', test_data_accuracy)

Accuracy Score of Test Data: 0.9998515219005196


## **Predictive System**

In [48]:
# Testing the model on a single example

X_new = X_test[10]

prediction = model.predict(X_new)

# Interpreting the prediction
if prediction[0] == 0:
  print('True News')
else:
  print('Fake News')

Fake News


In [49]:
# Verifying the actual label for the test case
print(y_test[10])

1
