### Scrape article info from Politifact
Articles are listed in multiple pages on Politifact.

In [9]:
## install libraries
# ! pip install beautifulsoup4
# ! pip install requests
# ! pip install urllib

In [12]:
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

#### Set up a new file & headers

In [220]:
filename = "Politifact.csv"
f = open(filename,"w", encoding = 'utf-8')
headers = "Date, Link, Statement, Tags, Source, Article_Body, Label, References\n"
f.write(headers)

upperframe = [] 

#### Scrape info of all articles listed in each page & write to file

In [None]:
# pages needed: 55-153

In [234]:
for page in range(92, 154):
    print('processing page :', page)
    url = 'https://www.politifact.com/factchecks/list/?page='+str(page)
    print(url)
    
    #an exception might be thrown, so the code should be in a try-except block
    try:
        #use the browser to get the url. This is suspicious command that might blow up.
        page=requests.get(url)                             # this might throw an exception if something goes wrong.
    
    except Exception as e:                                   # this describes what to do if an exception is thrown
        error_type, error_obj, error_info = sys.exc_info()      # get the exception information
        print ('ERROR FOR LINK:',url)                          #print the link that cause the problem
        print (error_type, 'Line:', error_info.tb_lineno)     #print error info and line that threw the exception
        continue                                              #ignore this page. Abandon this and go back.
    
    time.sleep(2)   
    soup = BeautifulSoup(page.text,'html.parser')
    
    frame = []
    
    links = soup.find_all('li',attrs={'class':'o-listicle__item'})
    
    for j in links:
        
        Statement = j.find("div",attrs={'class':'m-statement__quote'}).text.strip()
        
        Link = "https://www.politifact.com"
        Link += j.find("div",attrs={'class':'m-statement__quote'}).find('a')['href'].strip()
        
        Source = j.find('div', attrs={'class':'m-statement__meta'}).find('a').text.strip()
        Label = j.find('div', attrs ={'class':'m-statement__content'}).find('img',attrs={'class':'c-image__original'}).get('alt').strip()
        
        # get the article
        article = requests.get(Link) 
        article_soup = BeautifulSoup(article.text, "html.parser")
        
        # article's date         
        # extract from the article
        Date = article_soup.find('span', attrs ={'class':'m-author__date'}).text.strip()
        
        ## in case of error, extract from the list - NOTE: inspect the TEXT INDEXES before proceeding
#         Date = j.find('div',attrs={'class':'m-statement__body'}).find('footer').text[-18:-1].strip()

        # tags
        tag_links = article_soup.find_all('li', attrs={'class':'m-list__item'})
        tags = ''
        for j in tag_links:
            tag = j.text.strip()
            tags += (tag + ', ')
        Tags = tags[:-2]
        
        # article body
        Article_Body = article_soup.find('article', attrs={'class':'m-textblock'}).text.strip()
        
        #references
        ref_links = article_soup.find_all('div',attrs={'class':'t-row__center'})
        References = ref_links[-1].find("article",attrs={'class':'m-superbox__content'}).text.strip()
        
        # add all elements in a table
        frame.append((Date, Link, Statement, Tags, Source, Article_Body, Label, References))
        f = open(filename,"w", encoding = 'utf-8')
        f.write(Date.replace(",","^") + "," + Link + "," + Statement.replace(",","^") + "," + Tags \
                + Source.replace(",","^") + "," + Article_Body + Label.replace(",","^") + References + "\n")
        f.close()
        
    upperframe.extend(frame)

processing page : 92
https://www.politifact.com/factchecks/list/?page=92
processing page : 93
https://www.politifact.com/factchecks/list/?page=93
processing page : 94
https://www.politifact.com/factchecks/list/?page=94
processing page : 95
https://www.politifact.com/factchecks/list/?page=95
processing page : 96
https://www.politifact.com/factchecks/list/?page=96
processing page : 97
https://www.politifact.com/factchecks/list/?page=97
processing page : 98
https://www.politifact.com/factchecks/list/?page=98
processing page : 99
https://www.politifact.com/factchecks/list/?page=99
processing page : 100
https://www.politifact.com/factchecks/list/?page=100
processing page : 101
https://www.politifact.com/factchecks/list/?page=101
processing page : 102
https://www.politifact.com/factchecks/list/?page=102
processing page : 103
https://www.politifact.com/factchecks/list/?page=103
processing page : 104
https://www.politifact.com/factchecks/list/?page=104
processing page : 105
https://www.politif

In [235]:
data = pd.DataFrame(upperframe, columns=['Date','Link','Statement','Tags','Source',
                                         'Article_Body','Label','References'])

In [236]:
data.shape

(2970, 8)

In [237]:
data.tail()

Unnamed: 0,Date,Link,Statement,Tags,Source,Article_Body,Label,References
2965,"June 25, 2019",https://www.politifact.com/factchecks/2019/jun...,"Says Henry Kissinger said that ""once the herd ...","Fake news, Public Health, Facebook Fact-checks...",Facebook posts,Former Secretary of State and National Securit...,pants-fire,"Facebook post, June 20, 2019\nBritannica, Henr..."
2966,"June 25, 2019",https://www.politifact.com/factchecks/2019/jun...,Says police are warning that a new technique b...,"Fake news, Facebook Fact-checks, Facebook posts",Facebook posts,Fake warnings on social media are usually vagu...,false,"Facebook post, June 20, 2019\nNexis search, Ju..."
2967,"June 25, 2019",https://www.politifact.com/factchecks/2019/jun...,"""Wisconsin is a Top 10 state for health care c...","Health Care, Wisconsin, Scott Walker",Scott Walker,Health care remains a political flashpoint in ...,true,"Scott Walker, tweet, May 30, 2019\nKaiser Fami..."
2968,"June 25, 2019",https://www.politifact.com/factchecks/2019/jun...,A Florida millionaire was arrested after admit...,"Crime, Facebook Fact-checks, Viral image",Viral image,"An Area Florida Man is at it again, according ...",pants-fire,"Facebook post, June 4, 2019\nEmpire News, ""Flo..."
2969,"June 25, 2019",https://www.politifact.com/factchecks/2019/jun...,"The Trump administration has denied ""4.2 milli...","Jobs, Workers, Florida, Florida Democratic Party",Florida Democratic Party,As President Donald Trump praises his economic...,half-true,"Florida Democratic Party, Press release, June ..."


***Insert here:*** Check the data accuracy, missing values, duplicates, etc.

#### Write to an excel file for future analysis

In [252]:
# data.to_excel('Politifact_page_55_to_153.xlsx')