In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read both source files; each becomes its own DataFrame.
trueNews, fakeNews = (
    pd.read_csv(csv_path) for csv_path in ('True.csv', 'Fake.csv')
)

In [None]:
# Preview the first rows of the frame loaded from True.csv.
trueNews.head()

In [None]:
# Preview the first rows of the frame loaded from Fake.csv.
fakeNews.head()

Labeling true and fake news

In [None]:
# Target column: 1 marks rows from the true-news frame, 0 rows from the fake-news frame.
trueNews['label'], fakeNews['label'] = 1, 0

Creating news dataframe with true and fake news

In [None]:
# Stack the two labelled frames row-wise into a single dataset.
news = pd.concat(objs=[trueNews, fakeNews], axis='index')

In [None]:
# Preview the first rows of the combined frame.
news.head()

In [None]:
# Count missing values per column (summing the boolean mask down the rows).
missing_mask = news.isnull()
missing_mask.sum(axis=0)

In [None]:
# Drop the columns that are not used further on; later cells only read 'text' and 'label'.
news = news.drop(columns=['title', 'subject', 'date'])
news.head()

Shuffling the news

In [None]:
# Shuffle all rows (frac=1 keeps every row). The seed makes the row order,
# and therefore everything computed after it, reproducible on a fresh
# Restart & Run All; 44 matches the seed used for the train/test split.
news = news.sample(frac=1, random_state=44)
news.head()

In [None]:
# Renumber the rows 0..n-1; the previous index is kept as a new 'index' column.
news = news.reset_index()
news.head()

In [None]:
# Remove the 'index' column that holds the pre-shuffle row numbers.
news = news.drop(columns='index')
news.head()

Removing unnecessary details from news

In [None]:
import re

def remove(text):
    """Normalise a raw news string before vectorisation.

    Steps, in order: lowercase, strip URLs, strip HTML tags, strip
    punctuation, strip digits, and turn newlines into spaces.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (pandas' marker for a missing
        value) are treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Missing values have no .lower(); map them to an empty document.
    # (text != text is True only for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags (non-greedy, so each tag is matched separately)
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation: anything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Remove digits
    text = re.sub(r'\d', '', text)

    # Replace newlines with a space. Deleting them outright would glue the
    # last word of one line to the first word of the next ("foo\nbar" -> "foobar").
    text = re.sub(r'\n', ' ', text)

    return text


In [None]:
# Run the cleaning function over every article and store the result back in 'text'.
cleaned_text = news['text'].apply(remove)
news['text'] = cleaned_text

In [None]:
# Display the cleaned text column to eyeball the result of the cleaning step.
news['text']

In [None]:
# Model input (cleaned text) and target (label column).
X, Y = news['text'], news['label']

Splitting the data into training and testing data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=44,
)

Vectorizing news text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser with default settings; it is reused later by `testing`.
vectorization = TfidfVectorizer()

# Fit on the training text only, then reuse the fitted vectoriser for the
# test text so both matrices share the same feature space.
XV_train = vectorization.fit_transform(X_train)
XV_test = vectorization.transform(X_test)

Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a default logistic-regression classifier on the TF-IDF features.
# fit() returns the estimator itself, so construction and fitting can be chained.
model_lr = LogisticRegression().fit(XV_train, Y_train)
model_lr

In [None]:
# Class predictions for the held-out split, kept for the classification report.
predications_lr = model_lr.predict(X=XV_test)
# Accuracy on the same split, shown as the cell output.
model_lr.score(X=XV_test, y=Y_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for logistic regression.
print(classification_report(y_true=Y_test, y_pred=predications_lr))

Decision tree classifier model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree and its reported scores can be reproduced
# on a re-run; 44 matches the seed used for the train/test split.
model_dtc = DecisionTreeClassifier(random_state=44)
model_dtc.fit(XV_train, Y_train)

In [None]:
# Class predictions for the held-out split, kept for the classification report.
predications_dtc = model_dtc.predict(X=XV_test)
# Accuracy on the same split, shown as the cell output.
model_dtc.score(X=XV_test, y=Y_test)

In [None]:
# Per-class precision / recall / F1 for the decision tree.
print(classification_report(y_true=Y_test, y_pred=predications_dtc))

Random forest classifier model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic; fix the seed so the fitted forest and its
# scores are reproducible. 44 matches the seed used for the train/test split.
model_rfc = RandomForestClassifier(random_state=44)
model_rfc.fit(XV_train, Y_train)

In [None]:
# Class predictions for the held-out split, kept for the classification report.
predictions_rfc = model_rfc.predict(X=XV_test)
# Accuracy on the same split, shown as the cell output.
model_rfc.score(X=XV_test, y=Y_test)

In [None]:
# Per-class precision / recall / F1 for the random forest.
print(classification_report(y_true=Y_test, y_pred=predictions_rfc))

Gradient boosting classifier model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fix the seed handed to the underlying tree learners so repeated runs give
# the same model; 44 matches the seed used for the train/test split.
model_gbc = GradientBoostingClassifier(random_state=44)
model_gbc.fit(XV_train, Y_train)

In [None]:
# Class predictions for the held-out split, kept for the classification report.
predictions_gbc = model_gbc.predict(X=XV_test)
# Accuracy on the same split, shown as the cell output.
model_gbc.score(X=XV_test, y=Y_test)

In [None]:
# Per-class precision / recall / F1 for gradient boosting.
print(classification_report(y_true=Y_test, y_pred=predictions_gbc))

In [None]:
def outputLabel(label):
    """Translate a numeric class label into a human-readable verdict.

    A label equal to 0 yields the fake-news message; any other value yields
    the genuine-news message.
    """
    verdict = 'It is a genuine news'
    if label == 0:
        verdict = 'It is a fake news'
    return verdict

In [None]:
def testing(news):
    """Classify one piece of news text with each of the four fitted models.

    The text is cleaned with ``remove``, vectorised with the already fitted
    ``vectorization`` object, and passed to ``model_lr``, ``model_rfc``,
    ``model_dtc`` and ``model_gbc``.

    Parameters
    ----------
    news : str
        Raw news text to check.

    Returns
    -------
    str
        A single line with each model's verdict, worded by ``outputLabel``.
    """
    # The vectoriser only needs an iterable of documents, so a one-element
    # list replaces the one-row DataFrame. Local names are kept distinct from
    # the notebook-level X_test / XV_test / prediction variables.
    features = vectorization.transform([remove(news)])

    verdict_lr = outputLabel(model_lr.predict(features)[0])
    verdict_rfc = outputLabel(model_rfc.predict(features)[0])
    verdict_dtc = outputLabel(model_dtc.predict(features)[0])
    verdict_gbc = outputLabel(model_gbc.predict(features)[0])

    # "prediction" was misspelled as "predcition" in the LR part of the message.
    return 'LR prediction = {}, RFC prediction = {}, DTC prediction = {}, GBC prediction = {}'.format(
        verdict_lr, verdict_rfc, verdict_dtc, verdict_gbc
    )


In [51]:
# Use a dedicated name for the sample text: rebinding `news` here would
# overwrite the `news` DataFrame built earlier and break any re-run of the
# cells that still read it.
sample_headline = "Government Confirms: Teleportation Devices Are Real"
testing(sample_headline)

'LR predcition = It is a fake news, RFC prediction = It is a fake news, DTC prediction = It is a fake news, GBC prediction = It is a fake news'

In [52]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the four classifiers defined above. Soft voting
# combines the class probabilities of the members rather than their hard votes.
ensemble_members = [
    ('lr', model_lr),
    ('rfc', model_rfc),
    ('dtc', model_dtc),
    ('gbc', model_gbc),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')

voting_clf.fit(XV_train, Y_train)

In [56]:
from sklearn.metrics import accuracy_score

# Score the ensemble on the held-out split.
predications_voting_clf = voting_clf.predict(XV_test)

accuracy = accuracy_score(Y_test, predications_voting_clf)
report = classification_report(Y_test, predications_voting_clf)

# NOTE(review): the recorded accuracy is ~0.997; worth confirming that the
# article text carries no source-specific tokens that give the label away.
print(accuracy, report, sep='\n')

0.9969933184855234
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4632
           1       1.00      1.00      1.00      4348

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980

