# <center><font color="Blue">Text Analytics Project</font></center>

## <center>Annie Nguyen</center>

## <center>Fake News Website Scraping</center>

## <center>Spring 2021</center>

### **Scraping**

In [1]:
!pip install feedparser



In [2]:
!pip install newspaper3k



In [3]:
import json

In [8]:
# Registry of news sites to scrape: one entry per site, keyed by a short name,
# each holding the "link" of the page to crawl.
dictionary = {"breitbart": {"link": "http://www.breitbart.com/politics"}}

In [9]:
# Render the site registry as pretty-printed (4-space indented) JSON text.
json_object = json.dumps(dictionary, indent=4)

In [10]:
# Persist the JSON text as NewsPapers.json in the working directory.
with open("NewsPapers.json", "w") as config_file:
    config_file.write(json_object)

In [7]:
import feedparser as fp
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime

# Scrape articles for every site listed in NewsPapers.json and store them in
# scraped_articles.json as:
#   {"newspapers": {<site>: {"link": ..., ["rss": ...,] "articles": [...]}}}
# Each article is a dict with "title", "text", "link" and "published" (ISO-8601 string).

# Upper bound for the per-site counter `count`; a site's loop stops once `count` exceeds it.
LIMIT = 14500

# Number of consecutive articles without a publish date tolerated before a
# site is abandoned (fallback crawl only).
MAX_CONSECUTIVE_UNDATED = 100

data = {}
data['newspapers'] = {}

# Load the JSON file that lists the news sites.
with open('NewsPapers.json') as data_file:
    companies = json.load(data_file)

count = 1

# Iterate through each news company.
for company, value in companies.items():
    # If an RSS link is provided in the JSON file, it is the first choice,
    # because RSS feeds often give more consistent and correct data.
    # To skip RSS scraping for a site, omit the "rss" key from its JSON entry.
    if 'rss' in value:
        d = fp.parse(value['rss'])
        print("Downloading articles from ", company)
        newsPaper = {
            "rss": value['rss'],
            "link": value['link'],
            "articles": []
        }
        for entry in d.entries:
            # Entries without a publish date are skipped, to keep the data
            # consistent and to keep the script from crashing.
            if not hasattr(entry, 'published'):
                continue
            if count > LIMIT:
                break
            # `published_parsed` is the parsed form of `published`; it can be
            # missing when the date string could not be parsed.
            date = getattr(entry, 'published_parsed', None)
            if date is None:
                continue
            article = {}
            article['link'] = entry.link
            # Store the entry's own publish time as an ISO-8601 string so that
            # json.dump can serialise it. (Previously this line called
            # `datetime.date(2020, 2, 1)` on the datetime *class*, which raises
            # TypeError and ignored the parsed date entirely.)
            article['published'] = datetime.fromtimestamp(mktime(date)).isoformat()
            try:
                content = Article(entry.link)
                content.download()
                content.parse()
            except Exception as e:
                # If the download fails for some reason (e.g. 404), carry on
                # with the next article.
                print(e)
                print("continuing...")
                continue
            article['title'] = content.title
            article['text'] = content.text
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, ", url: ", entry.link)
            count = count + 1
    else:
        # Fallback when no RSS feed link is provided: use the newspaper
        # library to discover and extract articles from the site.
        print("Building site for ", company)
        paper = newspaper.build(value['link'], memoize_articles=False)
        newsPaper = {
            "link": value['link'],
            "articles": []
        }
        noneTypeCount = 0
        for content in paper.articles:
            if count > LIMIT:
                break
            try:
                content.download()
                content.parse()
            except Exception as e:
                print(e)
                print("continuing...")
                continue
            # Again, for consistency, articles without a publish date are skipped.
            # After more than MAX_CONSECUTIVE_UNDATED such articles in a row,
            # the rest of the site is skipped.
            if content.publish_date is None:
                print(count, " Article has date of type None...")
                noneTypeCount = noneTypeCount + 1
                if noneTypeCount > MAX_CONSECUTIVE_UNDATED:
                    print("Too many noneType dates, aborting...")
                    noneTypeCount = 0
                    break
                # NOTE: skipped (undated) articles still advance `count`, so
                # they count toward LIMIT and the printed running number.
                count = count + 1
                continue
            article = {}
            article['title'] = content.title
            article['text'] = content.text
            article['link'] = content.url
            article['published'] = content.publish_date.isoformat()
            newsPaper['articles'].append(article)
            print(count, "articles downloaded from", company, " using newspaper, url: ", content.url)
            count = count + 1
            # A dated article ends the current run of undated ones.
            noneTypeCount = 0
    # The counter restarts for every site, so LIMIT applies per site.
    count = 1
    data['newspapers'][company] = newsPaper

# Finally, save the articles as a JSON file. This is best-effort: a failure is
# printed rather than raised, so the scraped `data` stays available in memory.
try:
    with open('scraped_articles.json', 'w') as outfile:
        json.dump(data, outfile)
except Exception as e:
    print(e)

Building site for  breitbart
1 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/crime/2021/04/20/derek-chauvin-guilty-on-all-three-charges-2nd-3rd-degree-murder-2nd-degree-manslaughter/?utm_source=breaking_news&utm_medium=banner
2 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/jury-reaches-verdict-in-derek-chauvin-murder-trial/
3 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/crime/2021/04/20/derek-chauvin-guilty-on-all-three-charges-2nd-3rd-degree-murder-2nd-degree-manslaughter/
4 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/nancy-pelosi-thank-you-george-floyd-for-sacrificing-your-life-for-justice/
5 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/law-and-order/2021/04/20/watch-minneapolis-protesters-accost-truck-driver-following-conviction-of-derek-chauvin/
6 arti

45 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/facebook-ceo-mark-zuckerberg-thinks-hes-making-the-world-a-better-place/#disqus_thread
46 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/vanessa-bryant-dumps-partnership-with-nike-on-behalf-of-kobe-bryant-estate/
47 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/vanessa-bryant-dumps-partnership-with-nike-on-behalf-of-kobe-bryant-estate/#disqus_thread
48 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/joe-biden-praying-for-the-right-verdict-in-derek-chauvin-trial/
49 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/joe-biden-praying-for-the-right-verdict-in-derek-chauvin-trial/#disqus_thread
50 articles downloaded from breitbart  using newspaper, url:  http://w

87 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/health/2021/04/20/columbia-u-mandates-students-get-coronavirus-vaccine-to-return-to-campus/#disqus_thread
88 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/local/2021/04/20/marijuana-activist-offers-free-joints-vaccinated-new-yorkers-mark/
89 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/local/2021/04/20/marijuana-activist-offers-free-joints-vaccinated-new-yorkers-mark/#disqus_thread
90 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/europe/2021/04/20/400k-fewer-young-people-employed-in-britain-since-the-lockdown/
91 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/europe/2021/04/20/400k-fewer-young-people-employed-in-britain-since-the-lockdown/#disqus_thread
92 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/national-sec

131 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/exclusive-missing-action-republicans-rip-democrat-mark-kelly-disappearing-public-view-after-getting-elected/#disqus_thread
132 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/politico-john-cornyn-willing-find-common-ground-gun-control/
133 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/politico-john-cornyn-willing-find-common-ground-gun-control/#disqus_thread
134 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/watch-florida-sheriff-warns-new-arrivals-not-to-vote-the-stupid-way-you-did-up-north/
135 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/watch-florida-sheriff-warns-new-arrivals-not-to-vote-the-stupid-way-you-did-up-north/#disqus_thread
136 articles do

173 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/sanders-warren-call-on-biden-to-restrict-aid-to-israel-sparking-outrage/#disqus_thread
174 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/netanyahu-on-way-to-opposition-leading-mk-declares/
175 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/netanyahu-on-way-to-opposition-leading-mk-declares/#disqus_thread
176 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/18/report-biden-admin-demands-israel-stop-bragging-about-disabling-irans-nuclear-facility/
177 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/18/report-biden-admin-demands-israel-stop-bragging-about-disabling-irans-nuclear-facility/#disqus_thread
178 articles downloaded from breitbart  using newspaper, url:  ht

215 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/entertainment/2021/04/19/morrissey-manager-rips-the-simpsons-after-show-lampoons-the-rocker-in-hurtful-and-racist-episode/#disqus_thread
216 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/europe/2021/04/19/netflix-faces-another-protest-petition-this-time-over-hot-instagrammers-show-byron-baes/
217 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/europe/2021/04/19/netflix-faces-another-protest-petition-this-time-over-hot-instagrammers-show-byron-baes/#disqus_thread
218 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/sports/2021/04/18/panic-time-nba-fast-approaching-historic-ratings-lows/
219 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/crime/2021/04/18/photos-vandals-smear-pig-blood-on-home-once-occupied-by-derek-chauvin-defense-witness/
220 articles downl

260 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/idaho-state-legislature-hears-testimony-oregon-counties-want-join-idaho/
261 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/law-and-order/2021/04/20/protesters-take-to-streets-following-guilty-verdict-in-chauvin-trial/
262 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/former-inmate-activist-calls-out-biden-for-failure-to-act-during-his-proclaimed-second-chances-months/
263 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/20/washington-post-gives-cindy-axne-four-pinocchios-false-response-democrat-court-packing-bill/
264 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/sports/2021/04/20/anthem-kneeler-eric-reid-puts-his-mansion-in-gated-community-on-the-market/
265 articles downloaded from breitbart  us

307 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2021/04/20/daily-mail-files-antitrust-lawsuit-against-google-over-search-engine-censorship/
308 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2021/04/20/facebook-ceo-mark-zuckerberg-thinks-hes-making-the-world-a-better-place/
309 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/sports/2021/04/20/vanessa-bryant-dumps-partnership-with-nike-on-behalf-of-kobe-bryant-estate/
310 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2021/04/20/bokhari-internet-freedom-advocates-defend-social-media-censorship/
311 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/economy/2021/04/20/coke-procter-gamble-announce-plans-to-hike-prices/
312 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2021/04/20/union-appeals-amazon-el

354 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2021/04/19/watch-border-patrol-marine-agents-rescue-two-small-children-on-banks-of-rio-grande/
355 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2021/04/18/biden-admin-opens-another-migrant-child-detention-center-in-texas/
356 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2021/04/16/texas-congressman-unveils-bill-to-designate-certain-mexican-cartels-as-terrorist-organizations/
357 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2021/04/18/inmates-using-cats-to-smuggle-drugs-into-prisons-in-panama/
358 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/border/2021/04/15/biden-doj-wins-lawsuit-to-seize-260-year-old-texas-ranch-along-border/
359 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/bord

399 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/middle-east/2021/04/20/sanders-warren-call-on-biden-to-restrict-aid-to-israel-sparking-outrage/
400 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/middle-east/2021/04/18/report-biden-admin-demands-israel-stop-bragging-about-disabling-irans-nuclear-facility/
401 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/the-media/2021/04/18/exclusive-explosive-new-book-reveals-how-nyt-is-godfather-of-fake-news/
402 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/middle-east/2021/04/18/israel-ditches-outdoor-masks-reopens-schools-as-country-returns-to-normal/
403 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/middle-east/2021/04/18/pakistan-pm-imran-khan-equates-abusing-mohammed-to-holocaust-denial/
404 articles downloaded from breitbart  using newspaper, url:  http://

445 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/16/blm-activist-calls-rioting-looting-legitimate-response-state-violence/
446 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2021/04/16/facebook-censors-new-york-post-over-article-exposing-blm-co-founder/
447 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2021/04/16/jeff-bezos-claims-amazon-needs-to-do-a-better-job-for-employees/
448 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/tech/2021/04/16/arizona-community-college-district-pays-155k-to-prof-attacked-over-islamic-terrorism-quiz-questions/
449 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/crime/2021/04/20/derek-chauvin-guilty-on-all-three-charges-2nd-3rd-degree-murder-2nd-degree-manslaughter/?utm_source=breaking_news&utm_medium=banner
450 articles downloaded from breitba

489 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/joe-biden-praying-for-the-right-verdict-in-derek-chauvin-trial/#disqus_thread
490 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/report-house-democrats-feeling-pressure-censure-rep-maxine-waters/
491 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/report-house-democrats-feeling-pressure-censure-rep-maxine-waters/#disqus_thread
492 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/cruz-democrats-want-to-see-violence-looting-police-cars-firebombed-it-serves-their-political-end/
493 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/cruz-democrats-want-to-see-violence-looting-police-cars-firebombed-it-serves-their-political-end/#disqus_thread
494 articles downlo

531 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/europe/2021/04/20/400k-fewer-young-people-employed-in-britain-since-the-lockdown/#disqus_thread
532 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/national-security/2021/04/20/xi-jinping-china-will-use-belt-road-create-universal-rules-standards/
533 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/national-security/2021/04/20/xi-jinping-china-will-use-belt-road-create-universal-rules-standards/#disqus_thread
534 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/doctor-says-russia-keeping-her-examining-political-prisoner-alexei-navalny/
535 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/doctor-says-russia-keeping-her-examining-political-prisoner-alexei-navalny/#disqus_thread
536 articles downloaded from breitbart  u

575 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/watch-florida-sheriff-warns-new-arrivals-not-to-vote-the-stupid-way-you-did-up-north/#disqus_thread
576 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/19/mccarthy-maxine-waters-believes-theres-value-in-violence-censure-could-result-in-loss-of-financial-services-chairmanship/
577 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/19/mccarthy-maxine-waters-believes-theres-value-in-violence-censure-could-result-in-loss-of-financial-services-chairmanship/#disqus_thread
578 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/19/detroit-police-chief-tlaibs-waters-remarks-shameful-for-non-white-communities-that-rely-on-police/
579 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/1

617 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/18/report-biden-admin-demands-israel-stop-bragging-about-disabling-irans-nuclear-facility/#disqus_thread
618 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/19/gretchen-whitmer-traveled-florida-month-ago-calling-residents-not/
619 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/19/gretchen-whitmer-traveled-florida-month-ago-calling-residents-not/#disqus_thread
620 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/19/exclusive-desantis-corporate-media-they-are-lockdowners-but-majority-of-the-public-see-through-the-narrative/
621 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/19/exclusive-desantis-corporate-media-they-are-lockdowners-but-majority-of-the-public-see-through-

658 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/crime/2021/04/18/photos-vandals-smear-pig-blood-on-home-once-occupied-by-derek-chauvin-defense-witness/
659 articles downloaded from breitbart  using newspaper, url:  https://www.breitbart.com/politics/2021/04/20/pete-buttigieg-infrastructure-plan-important-racial-justice-perspective/
660  Article has date of type None...
661  Article has date of type None...
662  Article has date of type None...
663  Article has date of type None...
664  Article has date of type None...
665  Article has date of type None...
666  Article has date of type None...
667  Article has date of type None...
668  Article has date of type None...
669  Article has date of type None...
670  Article has date of type None...
671  Article has date of type None...
672  Article has date of type None...
673  Article has date of type None...
674  Article has date of type None...
675  Article has date of type None...
676  Article has 

739 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/national-security/2021/04/16/report-mozambique-jihadists-beheading-skinning-cutting-off-limbs/
740 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/national-security/2021/04/16/report-mozambique-jihadists-beheading-skinning-cutting-off-limbs/#disqus_thread
741 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/africa/2021/04/14/100-chibok-girls-still-missing-7-years-after-boko-haram-kidnapping/
742 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/national-security/2021/04/14/rights-group-decries-indescribable-horror-of-mozambique-islamist-insurgency/
743 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/africa/2021/04/12/egypt-refuses-release-suez-canal-blocking-ship-compensation/
744 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.c

784 articles downloaded from breitbart  using newspaper, url:  http://www.breitbart.com/politics/2021/04/16/watch-white-house-defends-un-ambassador-most-people-recognize-history-of-systemic-racism-in-america/


In [11]:
# Read the scraped articles back from disk into `d`.
with open('scraped_articles.json') as articles_file:
    d = json.load(articles_file)

In [12]:
# Print each scraped site name next to its position in the loaded data.
for i, site in enumerate(d['newspapers']):
    print(i, site)

0 breitbart


In [13]:
import pandas as pd

# Combine the articles of every site into a single DataFrame `df`, adding a
# "site" column that records which site each row came from.
# One frame is built per site and all of them are concatenated in a single
# call, instead of re-concatenating the growing frame on every iteration.
site_frames = []
for site, site_data in d['newspapers'].items():
    site_df = pd.DataFrame(site_data['articles'])
    site_df["site"] = site
    site_frames.append(site_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no sites were scraped; this also guarantees `df` is always defined.
df = pd.concat(site_frames, ignore_index=True) if site_frames else pd.DataFrame()

In [14]:
# Dimensions of the combined article table as (rows, columns).
df.shape

(738, 5)

In [15]:
# Display the combined article table.
df

Unnamed: 0,title,text,link,published,site
0,Derek Chauvin Guilty on All Three Charges: 2nd...,"Former Minneapolis, Minnesota, police officer ...",http://www.breitbart.com/crime/2021/04/20/dere...,2021-04-20T00:00:00,breitbart
1,***Live Wire*** Derek Chauvin Found Guilty on ...,Former Minneapolis police officer Derek Chauvi...,http://www.breitbart.com/politics/2021/04/20/j...,2021-04-20T00:00:00,breitbart
2,Derek Chauvin Guilty on All Three Charges: 2nd...,"Former Minneapolis, Minnesota, police officer ...",http://www.breitbart.com/crime/2021/04/20/dere...,2021-04-20T00:00:00,breitbart
3,Pelosi: 'Thank You George Floyd for Sacrificin...,House Speaker Nancy Pelosi (D-CA) reacted to t...,http://www.breitbart.com/politics/2021/04/20/n...,2021-04-20T00:00:00,breitbart
4,WATCH: Minneapolis Protesters Accost Truck Dri...,Black Lives Matter demonstrators blocking the ...,http://www.breitbart.com/law-and-order/2021/04...,2021-04-20T00:00:00,breitbart
...,...,...,...,...,...
733,"WaPo Accuses GOP of Boosting Book Sales, Ignor...",The Washington Post has chosen to report on Re...,http://www.breitbart.com/the-media/2021/04/17/...,2021-04-17T00:00:00,breitbart
734,Carlson: ‘We Are Moving Toward Some Kind of La...,Nearly a year after a New York Times report su...,http://www.breitbart.com/clips/2021/04/17/carl...,2021-04-17T00:00:00,breitbart
735,CNN’s Cuomo: If Cops Were Killing White Kids I...,CNN anchor Chris Cuomo said Friday that if pol...,http://www.breitbart.com/clips/2021/04/16/cnns...,2021-04-16T00:00:00,breitbart
736,CNN’s Lemon: ‘You Don’t See Racists’ Flourishi...,"During the handover with anchor Chris Cuomo, C...",http://www.breitbart.com/clips/2021/04/16/cnns...,2021-04-16T00:00:00,breitbart


In [None]:
# Write the combined article DataFrame to scraped_1.csv.
# With to_csv's default settings the row index is written too, as an unnamed first column.
df.to_csv('scraped_1.csv')