In [1]:
# importing the necessary packages
import requests
from bs4 import BeautifulSoup
import pickle
import numpy as np
import pandas as pd
import re
import tkinter as tk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

def getTokens(input):
	"""Split a URL string into a de-duplicated list of tokens.

	The text is cut on '/', each piece is cut on '-', and each of those pieces
	is cut again on '.'; both the dash-level and the dot-level pieces are kept.
	The token 'com' is dropped because it is too common to be a useful feature.

	NOTE: the input is UTF-8 encoded and then passed through str(), so in
	Python 3 the text being split is the bytes repr (b'...'). The first token
	therefore starts with "b'" and the last one ends with "'".
	NOTE(review): the pickled URL vectorizer was presumably fitted with this
	exact tokenizer -- confirm and re-train before changing the output.

	Returns a list of unique tokens in arbitrary (set) order.
	"""
	url_text = str(input.encode('utf-8'))
	collected = []
	for segment in url_text.split('/'):
		# pieces obtained by splitting on dash
		dash_parts = segment.split('-')
		# the same pieces split further on dot
		dot_parts = [piece for part in dash_parts for piece in part.split('.')]
		collected.extend(dash_parts)
		collected.extend(dot_parts)
	# remove redundant tokens, then filter out the uninformative 'com'
	unique_tokens = list(set(collected))
	return [token for token in unique_tokens if token != 'com']


    
#this function scrapes the article from the given link
def scrapper(url, timeout=10):
    """Download the page at `url` and extract its article text and headline.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up; forwarded to
        requests.get so an unresponsive host cannot block the caller forever.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns 'url', 'headline' and 'body'.
        'body' is the text of every <p> tag joined by spaces (empty string if
        the page has none). 'headline' is the first <h1>/<h2> text that is at
        least 30 characters long; if no heading is that long, the longest
        heading is used, or an empty string when the page has no headings.

    Raises
    ------
    requests.exceptions.RequestException
        If the page cannot be downloaded (including a timeout).
    """
    #send request to the url
    r1 = requests.get(url, timeout=timeout)
    soup1 = BeautifulSoup(r1.content, 'html5lib')

    #join all content that has "<p>" html tags into one single paragraph;
    #joining once after the loop also covers pages without any <p> tag
    final_article = " ".join(p.get_text() for p in soup1.find_all('p'))

    #now time to scrape the title
    #get the text of all "<h1>" or "<h2>" html tags, in document order
    headings = [str(t.get_text()) for t in soup1.find_all(['h1', 'h2'])]

    #a heading shorter than 30 characters is treated as an outlier (section
    #label, site name, ...), so pick the first one that is long enough
    title = next((h for h in headings if len(h) >= 30), None)
    if title is None:
        #no heading is long enough: fall back to the longest one instead of
        #failing on an unassigned variable
        title = max(headings, key=len, default="")

    #once the url, title and article are known, bundle them in a dataframe
    df = pd.DataFrame(
        {'url': url,
         'headline': title,
         'body': [final_article]})
    return df


#This function predicts whether the article is real or fake
def news_classifier(data):
    """Classify the scraped article in `data` as "REAL" or "FAKE".

    Each entry of data['body'] is lower-cased, tokenized, stripped of English
    stop words and non-alphabetic tokens, and lemmatized using its POS tag.
    The stringified word list is stored in data['text_final'], vectorized with
    the module-level `tokenizer_news` and classified with the module-level `nb`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'body' column of raw article strings. The 'body' column
        is left untouched, so the function can be called repeatedly on the
        same frame; a 'text_final' column is added (or overwritten).

    Returns
    -------
    str or None
        "FAKE" when the prediction for the first row is 0, "REAL" when it is 1,
        None for any other label or when `data` has no rows. Only the first
        row's prediction is reported.
    """
    # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Loop-invariant helpers: build them once instead of once per row / word.
    # A set makes the stop-word membership test cheap.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    text_final = []
    for body in data['body']:
        # lower-case first because "HAS" and "has" would otherwise be different tokens
        tokens = word_tokenize(body.lower())
        # pos_tag provides the tag of each word; only its first letter is used
        # to choose the WordNet category. Stop words and non-alphabetic tokens
        # are dropped.
        final_words = [
            lemmatizer.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(tokens)
            if word not in stop_words and word.isalpha()
        ]
        # the vectorizer is fed the string form of the list, e.g. "['word', 'other']"
        text_final.append(str(final_words))

    # assign the whole column at once so the result does not depend on the
    # frame having a default 0..n-1 index
    data['text_final'] = text_final

    # TF-IDF features for each row, then the model's label for each row
    X = tokenizer_news.transform(data['text_final'])
    predictions = nb.predict(X)
    if len(predictions) == 0:
        return None

    label = predictions[0]
    if label == 0:
        return "FAKE"
    if label == 1:
        return "REAL"
    return None



In [3]:
# Unpickle files
# NOTE: pickle.load can execute arbitrary code while loading; only use .sav
# files that come from a trusted source.
# NOTE(review): the URL tokenizer was presumably pickled with a reference to
# getTokens, which would then have to be defined before this cell runs -- confirm.

def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been loaded.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


#get pickled file for url classifier
lr = load_pickle("./LogReg_url.sav")

#get pickled file for url tokenizer
Tfidf_vect = load_pickle("tokenizer.sav")

#get pickled file for news classifier
nb = load_pickle("./naive_news.sav")

#get pickled file for news tokenizer
tokenizer_news = load_pickle("./tokenizer_news.sav")



# GUI


In [4]:
import tkinter as tk

#create window for url checker
root = tk.Tk()
root.title("URL Checker")

#create canvas
canvas1 = tk.Canvas(root, width=400, height=300)
canvas1.pack()

#create label
label3 = tk.Label(root, text="Enter URL:")
canvas1.create_window(200, 110, window=label3)

#create entry
entry1 = tk.Entry(root)
canvas1.create_window(200, 140, window=entry1)

#create ONE result label and update it in place; creating a new label on
#every click would stack widgets on top of each other on the canvas
label1 = tk.Label(root)
canvas1.create_window(200, 230, window=label1)

#default label colours, used to undo a previous green/red verdict
DEFAULT_BG = label1.cget("bg")
DEFAULT_FG = label1.cget("fg")


def show_result(label, text, bg=None, fg=None):
    """Show `text` on `label`; colours fall back to the defaults when omitted."""
    label.config(text=text, bg=bg or DEFAULT_BG, fg=fg or DEFAULT_FG)


def open_news_checker(data):
    """Open the "News Checker" window for the scraped article in `data`.

    `data` is the single-row frame returned by scrapper (columns 'url',
    'headline', 'body'). The window asks for the article name and, when it
    matches the scraped headline, shows the verdict of news_classifier.
    """
    article = data.iloc[0]
    #get url
    u = article['url']
    #get title
    t = article['headline']
    #get body
    b = article['body']

    print("URL: \n\n",u)
    print("\n\nTitle: \n\n",t)
    print("\n\nBody: \n\n",b)

    #create window for news checker; a Toplevel shares the event loop of the
    #main window, so no second Tk() root or nested mainloop() is needed
    root2 = tk.Toplevel(root)
    root2.title("News Checker")

    #create canvas
    canvas2 = tk.Canvas(root2, width=400, height=300)
    canvas2.pack()

    #create label
    label4 = tk.Label(root2, text="Enter Article Name:")
    canvas2.create_window(200, 110, window=label4)

    #create entry
    entry2 = tk.Entry(root2)
    canvas2.create_window(200, 140, window=entry2)

    #one reusable result label for this window
    label2 = tk.Label(root2)
    canvas2.create_window(200, 230, window=label2)

    #create function for news_checker button
    def newschecker():
        #get the text from user input; compare case-insensitively and ignore
        #surrounding whitespace, which scraped headings often carry
        title = entry2.get().strip().lower()

        #check the title the user entered against the scraped headline
        if title != str(t).strip().lower():
            show_result(label2, "No article found of this name")
            return

        #classify a copy so repeated clicks always start from the raw text,
        #even if the classifier modifies the frame it is given
        news_prediction = news_classifier(data.copy())
        if news_prediction == "REAL":
            show_result(label2, "The news is " + str(news_prediction) + "!!!!", bg="#90ee90", fg="green")
        elif news_prediction == "FAKE":
            show_result(label2, "The news is " + str(news_prediction) + "!!!!", bg="#ffcccb", fg="red")
        else:
            #no verdict: clear any message left over from an earlier click
            show_result(label2, "")

    #button for checking news article
    button2 = tk.Button(master=root2, text='Check article', command=newschecker)
    canvas2.create_window(200, 180, window=button2)


#create function for url checker button
def urlchecker():
    """Classify the URL typed into `entry1` and, if it is safe, scrape it.

    The lower-cased URL is vectorized with `Tfidf_vect` and classified with
    `lr`. Label 1 shows a "malicious" warning. Label 0 shows "Not malicious",
    scrapes the page and opens the News Checker window.
    """
    #get the text from user input
    url = entry1.get().strip()
    if not url:
        show_result(label1, "Please enter a URL")
        return

    #convert it to lower form and vectorize the input
    X = Tfidf_vect.transform([url.lower()])
    #predict returns one label per input row; there is exactly one row here
    predictions_LR = lr.predict(X)[0]

    if predictions_LR == 1:
        show_result(label1, "The URL is malicious. Please do not proceed", bg="#ffcccb", fg="red")
        return
    if predictions_LR != 0:
        #unknown label: nothing to show
        return

    show_result(label1, "Not malicious", bg="#90ee90", fg="green")
    #paint the verdict now; the download below blocks the event loop
    root.update_idletasks()

    #scrape the data
    try:
        data = scrapper(url)
    except requests.exceptions.RequestException:
        show_result(label1, "Not malicious, but the article could not be downloaded", bg="#ffcccb", fg="red")
        return

    open_news_checker(data)


#button for checking url
button1 = tk.Button(root, text='Check URL', command=urlchecker)
canvas1.create_window(200, 180, window=button1)

root.mainloop()

URL: 

 https://www.bbc.com/news/health-53665008


Title: 

 Coronavirus: Asymptomatic cases 'carry same amount of virus'


Body: 

  Share this with Email Facebook Messenger Messenger Twitter Pinterest WhatsApp LinkedIn Copy this link These are external links and will open in a new window People with symptomless Covid-19 can carry as much of the virus as those with symptoms, a South Korean study has suggested. South Korea was able to identify and isolate asymptomatic cases through mass testing as early as the start of March.  There is mounting evidence these cases represent a considerable proportion of coronavirus infections. But the researchers weren't able to say how much these people actually passed the virus on. People with a positive coronavirus test were monitored in a community treatment centre, allowing scientists to look at how much of the virus was detectable in their nose and throat swabs. They were given regular tests, and only released once they were negative.  Results of

# Debug