In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Download the CSV files

In [None]:
# Fetch the two dataset files from Google Drive by file ID.
# First ID -> True.csv
!gdown 1zgcfdcUXf-M61UYNLyFTFjllZf7nQo1i
# Second ID -> Fake.csv
!gdown 1xJYo0F-O8PqApkdsyRREzZDV4qmTomQY

Downloading...
From: https://drive.google.com/uc?id=1zgcfdcUXf-M61UYNLyFTFjllZf7nQo1i
To: /content/True.csv
100% 53.6M/53.6M [00:00<00:00, 69.2MB/s]
Downloading...
From: https://drive.google.com/uc?id=1xJYo0F-O8PqApkdsyRREzZDV4qmTomQY
To: /content/Fake.csv
100% 62.8M/62.8M [00:01<00:00, 61.7MB/s]


In [None]:
# Read the two article dumps and tag every row with its class:
# label 0 = true news, label 1 = fake news.
True_news = pd.read_csv('True.csv').assign(label=0)
Fake_news = pd.read_csv('Fake.csv').assign(label=1)
True_news.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [None]:
Fake_news.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [None]:
# Keep only the article body and its label from each source, then stack the
# true-news rows on top of the fake-news rows.
dataset1 = True_news.loc[:, ['text', 'label']]
dataset2 = Fake_news.loc[:, ['text', 'label']]
dataset = pd.concat((dataset1, dataset2))
dataset.shape

(44898, 2)

In [None]:
dataset.isnull().sum() # no null values

text     0
label    0
dtype: int64

In [None]:
dataset['label'].value_counts()

1    23481
0    21417
Name: label, dtype: int64

In [None]:
dataset1.shape # true news

(21417, 2)

In [None]:
dataset2.shape # fake news

(23481, 2)

In [None]:
# Shuffle all rows so true and fake articles are interleaved. A fixed
# random_state makes the shuffle -- and therefore the 35,000-row subset that is
# sliced off later -- identical on every Restart & Run All.
dataset = dataset.sample(frac=1, random_state=0)
dataset.head(20)

Unnamed: 0,text,label
8912,WASHINGTON (Reuters) - Seven U.S. senators urg...,0
934,WASHINGTON (Reuters) - Republicans in the U.S....,0
18605,TOKYO (Reuters) - The disapproval rating for J...,0
12730,The popular Christian author Eric Metaxas real...,1
2919,"BRIDGEWATER, N.J. (Reuters) - U.S. President D...",0
10024,"WASHINGTON (Reuters) - Donald Trump, under pre...",0
20436,We ve heard it all before He was such a good b...,1
19476,You may need a calculator to count how many as...,1
10942,"Martha, Martha, Martha You re 75-years old! Ti...",1
8002,WASHINGTON (Reuters) - The centerpiece of Pres...,0


In [None]:
# Lemmatizer used by cleaning_data.
ps = WordNetLemmatizer()

# Build the stop-word set through nltk.corpus instead of the bare name
# `stopwords`: this cell rebinds `stopwords` to a set, so the original
# `stopwords.words(...)` call raised AttributeError when the cell was run a
# second time. Going through the package keeps the cell re-runnable while
# leaving `stopwords` as the set that cleaning_data expects.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Download the WordNet data for the WordNetLemmatizer created above.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def cleaning_data(row):
    """Normalise one article for vectorisation.

    Lower-cases ``row``, replaces every non-letter character with a space,
    splits on whitespace, drops tokens found in the module-level
    ``stopwords`` collection and lemmatizes the rest with the module-level
    lemmatizer ``ps``.

    Parameters
    ----------
    row : str
        Raw article text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # Digits, punctuation and other symbols all become spaces, so they act as
    # token separators instead of staying attached to neighbouring words.
    letters_only = re.sub('[^a-zA-Z]', ' ', row.lower())

    kept = []
    for word in letters_only.split():
        if word in stopwords:
            continue
        kept.append(ps.lemmatize(word))

    return ' '.join(kept)

# Preprocessing data

In [None]:
# Run every article through cleaning_data, then confirm that no nulls appeared.
dataset['text'] = dataset['text'].apply(cleaning_data)
dataset.isna().sum()

text     0
label    0
dtype: int64

In [None]:
# TF-IDF over unigrams and bigrams, capped at 50,000 features. lowercase is
# switched off because cleaning_data has already lower-cased the text.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=50000,
    lowercase=False,
)
dataset.shape

(44898, 2)

In [None]:
# Use only the first 35,000 rows of the shuffled dataset: the cleaned text is
# the feature, the 0/1 label is the target.
X = dataset['text'].iloc[:35000]
y = dataset['label'].iloc[:35000]
X.head()

8912     washington reuters seven u senator urged secre...
934      washington reuters republican u house represen...
18605    tokyo reuters disapproval rating japanese prim...
12730    popular christian author eric metaxas really p...
2919     bridgewater n j reuters u president donald tru...
Name: text, dtype: object

In [None]:
y.head()

8912     0
934      0
18605    0
12730    1
2919     0
Name: label, dtype: int64

In [None]:
# Hold out 20% of the rows for testing; random_state pins the split.
train_data, test_data, train_label, test_label = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Fit the TF-IDF vocabulary on the training texts only, then densify the result.
vec_train_data = vectorizer.fit_transform(train_data).toarray()
train_data.shape, test_data.shape

((28000,), (7000,))

In [None]:
# Transform the held-out texts with the vocabulary fitted on the training
# split (transform, not fit_transform) and convert the result to a dense array.
vec_test_data = vectorizer.transform(test_data).toarray()
vec_train_data.shape , vec_test_data.shape

((28000, 50000), (7000, 50000))

# Save the vectorizer

In [None]:
# Persist the fitted vectorizer so inference code can reuse the exact vocabulary.
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']

In [None]:
train_label.value_counts() # balanced partition

1    14668
0    13332
Name: label, dtype: int64

In [None]:
test_label.value_counts() # balanced partition

1    3648
0    3352
Name: label, dtype: int64

In [None]:
# Wrap the dense TF-IDF matrices in DataFrames whose columns are the vocabulary
# terms, fit a multinomial Naive Bayes classifier, and predict the test split.
feature_names = vectorizer.get_feature_names_out()
training_data = pd.DataFrame(vec_train_data, columns=feature_names)
testing_data = pd.DataFrame(vec_test_data, columns=feature_names)
clf = MultinomialNB().fit(training_data, train_label)
y_pred = clf.predict(testing_data)

In [None]:
pd.Series(y_pred).value_counts()

1    3686
0    3314
dtype: int64

In [None]:
test_label.value_counts()

1    3648
0    3352
Name: label, dtype: int64

In [None]:
print(classification_report(test_label , y_pred))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      3352
           1       0.95      0.96      0.96      3648

    accuracy                           0.95      7000
   macro avg       0.95      0.95      0.95      7000
weighted avg       0.95      0.95      0.95      7000



In [None]:
# Score the model on its own training data as well; comparing this report with
# the test-set report shows how far the two diverge.
y_pred_train = clf.predict(training_data)
print(classification_report(train_label , y_pred_train))

              precision    recall  f1-score   support

           0       0.96      0.95      0.96     13332
           1       0.96      0.96      0.96     14668

    accuracy                           0.96     28000
   macro avg       0.96      0.96      0.96     28000
weighted avg       0.96      0.96      0.96     28000



In [None]:
accuracy_score(train_label , y_pred_train)

0.9592142857142857

In [None]:
accuracy_score(test_label , y_pred)

0.9528571428571428

In [None]:
# Smoke-test the classifier on one hand-written sentence, following the same
# clean -> vectorize -> predict path as the training data.
news = cleaning_data("Greece is a country without debt.")
news_features = vectorizer.transform([news]).toarray()
single_prediction = clf.predict(news_features)
single_prediction



array([0])

# Save the model

In [None]:
# Persist the trained classifier to model.pkl.
joblib.dump(clf , 'model.pkl')

['model.pkl']

In [None]:
# Load the classifier back from model.pkl.
model = joblib.load('model.pkl')

# Test model with own implementation

In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Initialization
# Lemmatizer used by cleaning_data below.
ps = WordNetLemmatizer()
# NOTE: this rebinds the name `stopwords` from the NLTK corpus reader imported
# above to a plain set of English stop words, which is what cleaning_data
# tests membership against.
stopwords = set(stopwords.words('english'))

# Define the cleaning function
def cleaning_data(row):
    """Turn raw article text into a space-separated string of lemmas.

    The text is lower-cased, every character that is not a letter is replaced
    by a space, and the result is split on whitespace. Tokens present in the
    module-level ``stopwords`` set are discarded; the remaining tokens are
    lemmatized with the module-level ``ps`` lemmatizer.

    Parameters
    ----------
    row : str
        Raw article text.

    Returns
    -------
    str
        Cleaned text, tokens joined by single spaces.
    """
    lowered = row.lower()

    # Anything that is not a letter (digits, punctuation, symbols) becomes a
    # separator.
    alphabetic = re.sub('[^a-zA-Z]', ' ', lowered)

    lemmas = (
        ps.lemmatize(tok)
        for tok in alphabetic.split()
        if tok not in stopwords
    )
    return ' '.join(lemmas)

# Load the trained classifier that was saved to model.pkl earlier in this notebook.
model = joblib.load('model.pkl')

# Load the fitted TF-IDF vectorizer that was saved to vectorizer.pkl, so new
# articles are mapped onto the same vocabulary the classifier was trained on.
# joblib files are pickles: only load files you created or trust.
vectorizer = joblib.load('vectorizer.pkl')


def predict_fake_news(article):
    """Classify one article as fake or true news.

    The article is cleaned with ``cleaning_data``, vectorized with the
    module-level ``vectorizer`` and scored by the module-level ``model``.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    str
        ``"Fake News"`` when the predicted label is 1, otherwise
        ``"True News"``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaning_data(article)]).toarray()
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "Fake News"
    return "True News"

# Try the end-to-end helper on a sample news paragraph and print the verdict.
article = "A total of 44 parties, coalitions of parties, and independent candidates, 6 fewer than in the May 21 elections total, have submitted their applications to the Supreme Court to run in the June 25 national elections, ANA reports. Among submissions is one by Ilias Kasidiaris, who is serving a jail sentence for his high-level participation in the neo-Nazi criminal organization Golden Dawn (Chryssi Avgi). Kasidiaris will be running with the 'Greeks for Country & Freedom' combination of independent candidates."
print(predict_fake_news(article))


True News




# Test scraping script

In [None]:
from bs4 import BeautifulSoup
import requests
import re


def extract_article(soup):
    """Return the text of every ``<p>`` element in ``soup``, one per line.

    Parameters
    ----------
    soup : object
        Parsed document exposing ``find_all``; each returned element must
        have a ``text`` attribute.

    Returns
    -------
    str
        The paragraph texts joined with newline characters (empty string when
        there are no paragraphs).
    """
    paragraphs = soup.find_all('p')
    return '\n'.join(paragraph.text for paragraph in paragraphs)


url = "https://www.theonion.com/new-peta-ad-seems-to-imply-that-throwing-horses-out-of-1850797405"

# Lines at or below this length are treated as recommendations, advertisements, etc.
MIN_LINE_LENGTH = 150

# requests has no default timeout, so without one an unresponsive server would
# hang this cell indefinitely.
response = requests.get(url, timeout=30)
print(response)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

article_text = extract_article(soup)

# Split the text into lines, remove short lines, and then recombine
lines = article_text.split('\n')
filtered_lines = [line for line in lines if len(line) > MIN_LINE_LENGTH]
filtered_text = '\n'.join(filtered_lines)

print(filtered_text)

<Response [200]>
NORFOLK, VA—In a new campaign featuring several graphic images of the animals falling from 30,000 feet in the sky,  People For Ethical Treatment Of Animals released an ad Thursday that seemed to imply that throwing horses out of planes was a common practice. “How do you think he feels about your in-flight entertainment?” said the ad, which heavily suggested that the ritual of hurling horses from commercial airlines and watching them fall to the ground and die occurred fairly regularly, if not daily. “Hey, Chris Pratt, would you do this to your dog? What if Seabiscuit pushed you out of a plane? It’s not just humans who deserve parachutes. When it comes to throwing horses out of airplanes, just say ‘neigh.’” At press time, PETA released a follow-up ad implying that horses being thrown from planes was also making humans obese and impotent.
