## Dataset Extractor

In [1]:
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib

In [2]:
ignore_html_tags = ['script']
file_output = 'dataset.csv'

MAX_URLS_SITE = 10
MIN_TEXT_LEVEL = 5

def get_soup(url, timeout=10):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    try:
        request = urllib.request.urlopen(urllib.request.Request(url, data=None, headers={'User-Agent': user_agent}), timeout=timeout)
    except:
        return 'timeout'
    request.status
    page = request.read()
    return BeautifulSoup(page, 'lxml')

def treat_text(text):
    if type(text) == list:
        text_ = []
        for t in text:
            text_.append(t.replace('\n', '').replace('\t', '').replace('\r','').strip())
        return text_
    return text.replace('\n', '').replace('\t', '').replace('\r','').strip()

def validate_text(text, level):
    if level >= MIN_TEXT_LEVEL:
        return treat_text(text)[:2000]
    return 'None Text'

def validate_text_dict(dictionary):
    for k in dictionary:
        dictionary[k] = treat_text(dictionary[k])
    return dictionary

def validate_attrs(attrs):
    new_attrs = []
    for d in attrs:
        new_attrs.append(validate_text_dict(d))
    return new_attrs

### Snopes

In [3]:
def snopes():
    data = []
    urls__ = []
    pages = 100
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
        url_ = "https://www.snopes.com/fact-check/page/" + str(page_number) + "/"
        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.findAll('a', {"class": "article-link"}, href=True)
        for anchor in links:
            url = anchor['href']
            if url in urls__:
                continue
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            snopes_walk_html(soup, data_, 'snope', url, 0, dic_=dic_)
            if dic_['claim'] and dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data
            
def snopes_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'
    
    # Title
    if (features['tag'] == 'title' or (features['tag'] == 'h1' and
         'class' in features['attrs'] and
         'article-title' in features['attrs']['class'])):
        features['label'] = 'Title'
        dic_['title'] += 1
    
    # Date
    elif ((features['tag'] == 'meta' and
          'itemprop' in features['attrs'] and
          'datePublished' in features['attrs']['itemprop']) or
          (features['tag'] == 'span' and
          'class' in features['attrs'] and
          'date-wrapper' in features['attrs']['class']) or
          (features['tag'] == 'span' and
          'itemprop' in features['attrs'] and
          'itemReviewed' in features['itemprop'])):
        features['label'] = 'Date'
        dic_['date'] += 1
    
    # Claim
    elif (features['tag'] == 'p' and 
          len(brothers) > 0 and
          'class' in brothers[-1]['attrs'] and
          'claim' in brothers[-1]['attrs']['class'] and
          'section-break' in brothers[-1]['attrs']['class']):
        features['label'] = 'Claim'
        dic_['claim'] += 1
    
    # Credibility
    elif (features['tag'] == 'a' and 
          parent is not None and
          parent['tag'] == 'div' and
          'class' in parent['attrs'] and
          'rating-wrapper' in parent['attrs']['class']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1
    
    # None
    else:
        features['label'] = 'None'
    
    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(snopes_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### Full Fact

In [4]:
def fullfact():
    alfab = "bcdefghijklmnopqrstuvxyz"
    #alfab = "b"
    data = []
    urls__ = []
    pages = 20
    count = 0
    count_url = 0
    for l in alfab:
        for page_number in range(1, pages+1):
            print('')
            count += 1
            print(count, '/', (pages)*len(alfab), '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
            url_ = "http://fullfact.org/search/?q=" + l + "&page=" + str(page_number)
            try:
                soup_ = get_soup(url_)
                if type(soup_) == str:
                    print('timeout', end='')
                    continue
            except:
                print("Error urlopen.", url_)
                continue
            soup_.prettify('utf-8')
            links = soup_.findAll('a', {"rel": "bookmark"}, href=True)
            for anchor in links:
                url = "http://fullfact.org" + anchor['href']
                if url in urls__:
                    continue
                urls__.append(url)
                soup = get_soup(url, timeout=5)
                if type(soup) == type(''):
                    print('timeout', end='')
                    continue
                count_url += 1
                if count_url > MAX_URLS_SITE:
                    return data
                soup.prettify("utf-8")
                data_ = []
                dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
                fullfact_walk_html(soup, data_, 'fullfact', url, 0, dic_=dic_)
                if dic_['claim'] and dic_['credibility']:
                    print('.', end='')
                    data += data_
                else:
                    print(',', end='')
                    count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data


def fullfact_walk_html(element, data, site, url, level, parent=None, brothers=None, dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'
    
    # Title
    if (features['tag'] == 'title' or (features['tag'] == 'h1' and
         'class' not in features['attrs'])):
        features['label'] = 'Title'
        dic_['title'] += 1
    
    # Date
    elif (features['tag'] == 'p' and
          'class' in features['attrs'] and
          'date' in features['attrs']['class']):
        features['label'] = 'Date'
        dic_['date'] += 1
    
    # Claim
    elif (features['tag'] == 'p' and 
          len(brothers) > 0 and
          'Claim' == brothers[-1]['text']):
        features['label'] = 'Claim'
        dic_['claim'] += 1
    
    # Credibility
    elif (features['tag'] == 'p' and 
          len(brothers) > 0 and
          'Conclusion' == brothers[-1]['text']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1
    
    # None
    else:
        features['label'] = 'None'
    
    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(fullfact_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### Politifact 

In [21]:
def politifact():
    data = []
    urls__ = []
    pages = 100
    count = 0
    types=["true","mostly-true","half-true","barely-true","false","pants-fire","no-flip","half-flip","full-flop"]
    count_url = 0
    for type_ in types:
        for page_number in range (1, pages+1):
            count += 1
            print('')
            print(count, '/', (pages)*len(types), '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
            url_ = "http://www.politifact.com/truth-o-meter/rulings/" + str(type_) + "/?page=" + str(page_number)
            try:
                soup_ = get_soup(url_)
            except:
                print("Error urlopen.", url_)
                continue
            soup_.prettify('utf-8')
            links = soup_.findAll("p", {"class": "statement__text"})
            for anchor in links:
                anchor = anchor.find('a', {"class": "link"}, href=True)
                url = "http://www.politifact.com" + str(anchor['href'])
                if url in urls__:
                    continue
                urls__.append(url)
                soup = get_soup(url, timeout=5)
                if type(soup) == type(''):
                    print('timeout', end='')
                    continue
                count_url += 1
                if count_url > MAX_URLS_SITE:
                    return data
                soup.prettify("utf-8")
                data_ = []
                dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
                politifact_walk_html(soup, data_, 'politifact', url, 0, dic_=dic_)
                if dic_['claim']:
                    print('.', end='')
                    data += data_
                else:
                    print(',', end='')
                    count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data


def politifact_walk_html(element, data, site, url, level, parent=None, brothers=None, dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'
    
    # Title
    if (features['tag'] == 'title' or (features['tag'] == 'h1' and
         'class' in features['attrs'] and
         'article__title' in features['attrs']['class'])):
        features['label'] = 'Title'
        dic_['title'] += 1
    
    # Date
    elif ((features['tag'] == 'p' and
          'class' in parent['attrs'] and
          'widget__content-xs' in parent['attrs']['class'] and
          'Published' in features['text']) or
          (features['tag'] == 'p' and
          'class' in parent['attrs'] and
          'widget__content' in parent['attrs']['class'] and
          'Published' in features['text'])):
        features['label'] = 'Date'
        dic_['date'] += 1
    
    # Claim
    elif ((features['tag'] == 'div' and 
          'class' in features['attrs'] and
          'statement__text' in features['attrs']['class']) or
          (features['tag'] == 'div' and 
          'class' in features['attrs'] and
          'sharethefacts-statement' in features['attrs']['class'])):
        features['label'] = 'Claim'
        dic_['claim'] += 1
    
    # Credibility
    elif (features['tag'] == 'img' and 
          'class' in features['attrs'] and
          'statement-detail' in features['attrs']['class']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1
    
    # None
    else:
        features['label'] = 'None'
    
    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(politifact_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### TruthorFiction

In [30]:
def truthorfiction():
    data = []
    urls__ = []
    pages = 100
    count = 0
    types=["a"]
    count_url = 0
    for type_ in types:
        for page_number in range (1, pages+1):
            count += 1
            print('')
            print(count, '/', (pages)*len(types), '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
            url_ = "https://www.truthorfiction.com/page/" + str(page_number) + "/?s=" + str(type_)
            try:
                soup_ = get_soup(url_)
            except:
                print("Error urlopen.", url_)
                continue
            soup_.prettify('utf-8')
            links = soup_.findAll("h2", {"class": "grid-title"})
            for anchor in links:
                anchor = anchor.find('a', href=True)
                url = str(anchor['href'])
                if url in urls__:
                    continue
                urls__.append(url)
                soup = get_soup(url, timeout=5)
                if type(soup) == type(''):
                    print('timeout', end='')
                    continue
                count_url += 1
                if count_url > MAX_URLS_SITE:
                    return data
                soup.prettify("utf-8")
                data_ = []
                dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
                truthorfiction_walk_html(soup, data_, 'truthorfiction', url, 0, dic_=dic_)
                if dic_['claim'] and dic_['credibility']:
                    print('.', end='')
                    data += data_
                else:
                    print(',', end='')
                    count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data


def truthorfiction_walk_html(element, data, site, url, level, parent=None, brothers=None, dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'
    
    # Title
    if (features['tag'] == 'title' or features['tag'] == 'h1'):
        features['label'] = 'Title'
        dic_['title'] += 1
    
    # Date
    elif ((features['tag'] == 'span' and
           parent['tag'] == 'div' and
          'class' in parent['attrs'] and
          'post-box-meta-single' in parent['attrs']['class']) or
          (features['tag'] == 'div' and
           'class' in features['attrs'] and
           'post-box-meta-single' in features['attrs']['class'])):
        features['label'] = 'Date'
        dic_['date'] += 1
    
    # Claim
    elif (features['tag'] == 'p' and
          len(brothers) > 0 and 
          'Summary of eRumor' in brothers[-1]['text']):
        features['label'] = 'Claim'
        dic_['claim'] += 1
    
    # Credibility
    elif (features['tag'] == 'p' and
          len(brothers) > 0 and 
          'The Truth' in brothers[-1]['text']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1
    
    # None
    else:
        features['label'] = 'None'
    
    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(truthorfiction_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### Africa Check

In [7]:
def africacheck():
    data = []
    urls__ = []
    pages = 30
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
        url_ = 'https://africacheck.org/latest-reports/page/' + str(page_number)
        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.findAll('div', {"class": "article-content"})
        for anchor in links:
            url = anchor.find('a', href=True)['href']
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            print('.', end='')
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            africacheck_walk_html(soup, data_, 'africacheck', url, 0, dic_=dic_)
            if dic_['claim'] and dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data
            
def africacheck_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'
    
    # Title
    if (features['tag'] == 'title' or (features['tag'] == 'h1')):
        features['label'] = 'Title'
        dic_['title'] += 1
    
    # Date
    elif (features['tag'] == 'time' and 'Published' in features['text']):
        features['label'] = 'Date'
        dic_['date'] += 1
    
    # Claim
    elif ('class' in features['attrs'] and 
          ('claim-content' in features['attrs']['class'] or 
           'the-content' in features['attrs']['class'] or
           'report-claim' in features['attrs']['class'])):
        features['label'] = 'Claim'
        dic_['claim'] += 1
    
    # Credibility
    elif ('class' in features['attrs'] and 
          ('report-verdict' in features['attrs']['class'] or
           'indicator' in features['attrs']['class'])):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1
    
    # None
    else:
        features['label'] = 'None'
    
    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(africacheck_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### CheckYourFact

In [22]:
def checkyourfact():
    data = []
    urls__ = []
    pages = 20
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
        
        url_ = 'http://checkyourfact.com/page/' + str(page_number)
        
        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.find('articles').findAll('a', href=True)
        for anchor in links:
            url = 'http://checkyourfact.com' + anchor['href']
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            checkyourfact_walk_html(soup, data_, 'checkyourfact', url, 0, dic_=dic_)
            if dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data
            
def checkyourfact_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'
    
    # Title
    if (features['tag'] == 'title' or 
        (features['tag'] == 'h1' and 
         parent['tag'] == 'article')):
        features['label'] = 'Title'
        dic_['title'] += 1
    
    # Date
    elif ((features['tag'] == 'time' and
          parent['tag'] == 'article')):
        features['label'] = 'Date'
        dic_['date'] += 1
    
    # Credibility 
    elif (features['tag'] == 'p' and
          'Verdict: ' in features['text']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1
    
    # None
    else:
        features['label'] = 'None'
    
    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(checkyourfact_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### TheFerret

In [29]:
def theferret():
    data = []
    urls__ = []
    pages = 9
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
        url_ = 'https://theferret.scot/category/fact-check/page/' + str(page_number)
        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.findAll('article')
        for anchor in links:
            url = anchor.find('a', href=True)['href'] 
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            theferret_walk_html(soup, data_, 'theferret', url, 0, dic_=dic_)
            if dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data
            
def theferret_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'
    
    # Title
    if (features['tag'] == 'title' or 
        (features['tag'] == 'h1' and
        'class' in features['attrs'] and
        ('cover-title' in features['attrs']['class'] or
         'title' in features['attrs']['class']))):
        features['label'] = 'Title'
        dic_['title'] += 1
    
    # Date
    elif (features['tag'] == 'time'):
        features['label'] = 'Date'
        dic_['date'] += 1
    
    # Credibility
    elif (features['tag'] == 'h3' and
          'Ferret Fact Service verdict:' in features['text']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1
    
    # None
    else:
        features['label'] = 'None'
    
    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(theferret_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### The Conversation

In [37]:
def theconversation():
    data = []
    urls__ = []
    pages = 10
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
        
        url_ = "https://theconversation.com/us/topics/factcheck-6544?page=" + str(page_number)
        
        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.findAll('a', {'class': 'article-link'}, href=True)
        for anchor in links:
            url = anchor['href']
            url = "https://theconversation.com" + url
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            theconversation_walk_html(soup, data_, 'theconversation', url, 0, dic_=dic_)
            if dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data
            
def theconversation_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'
    
    # Title
    if (features['tag'] == 'title' or 
        (features['tag'] == 'h1' and
         'class' in features['attrs'])):
        features['label'] = 'Title'
        dic_['title'] += 1
    
    # Date
    elif (features['tag'] == 'time'):
        features['label'] = 'Date'
        dic_['date'] += 1
    
    # Credibility
    elif (features['tag'] == 'p' and
          len(brothers) > 0 and
          brothers[-1]['tag'] == 'h2' and
          brothers[-1]['text'] == 'Verdict'):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1
    
    # None
    else:
        features['label'] = 'None'
    
    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(theconversation_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### Washingtonpost

In [11]:
def washingtonpost():
    data = []
    urls__ = []
    pages = 100 
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')

        url_ = "https://www.washingtonpost.com/news/fact-checker/page/" + str(page_number)

        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.findAll("div",{"class":"story-headline"})
        for anchor in links:
            url = str(anchor.find("a", href=True)['href'])
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            washingtonpost_walk_html(soup, data_, 'washingtonpost', url, 0, dic_=dic_)
            if True:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data

def washingtonpost_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    # Title
    if (features['tag'] == 'title' or (features['tag'] == 'h1')):
        features['label'] = 'Title'
        dic_['title'] += 1

    # Date
    elif (features['tag'] == 'span' and
          'class' in features['attrs'] and
          'author-timestamp' in features['attrs']['class']):
        features['label'] = 'Date'
        dic_['date'] += 1

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(washingtonpost_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features


### Rappler

In [12]:
def rappler():
    data = []
    urls__ = []
    pages = 16
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')

        url_ = "https://www.rappler.com/newsbreak/fact-check?start=" + str(page_number*10 - 10)
        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.findAll('h4')
        for anchor in links:
            url = 'https://www.rappler.com' + anchor.find('a', href=True)['href']
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            rappler_walk_html(soup, data_, 'rappler', url, 0, dic_=dic_)
            if dic_['claim'] and dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data

def rappler_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    # Title
    if (features['tag'] == 'title' 
        or (features['tag'] == 'h1')):
        features['label'] = 'Title'
        dic_['title'] += 1

    # Date
    elif (features['tag'] == 'div' and
          'class' in features['attrs'] and
          'published' in features['attrs']['class']):
        features['label'] = 'Date'
        dic_['date'] += 1

    # Claim
    elif (features['tag'] == 'p' and 
          'Claim:' in features['text']):
        features['label'] = 'Claim'
        dic_['claim'] += 1

    # Credibility
    elif ('Rating:' in features['text']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(rappler_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features


### Verafiles

In [13]:
def verafiles():
    data = []
    urls__ = []
    pages = 1
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')

        url_ = "http://verafiles.org/rundown?ccm_paging_p=" + str(page_number)

        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.findAll('div', {'class': 'page-list-article__title'})
        for anchor in links:
            url = anchor.find('a', href=True)['href']
            print(url)
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            verafiles_walk_html(soup, data_, 'verafiles', url, 0, dic_=dic_)
            if dic_['claim'] and dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data

def verafiles_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    # Title
    if (features['tag'] == 'title' or (features['tag'] == 'h1')):
        features['label'] = 'Title'
        dic_['title'] += 1

    # Date
    elif (features['tag'] == 'p' and
          len(brothers) > 0 and
          brothers[-1]['tag'] == 'p' and
          'DATE' in brothers[-1]['text']):
        features['label'] = 'Date'
        dic_['date'] += 1

    # Claim
    elif (parent and parent['tag'] == 'main'):
        features['label'] = 'Claim'
        dic_['claim'] += 1
        print(features['tag'] , '**********')

    # Credibility
    elif (features['tag'] == 'div' and
          'class' in features['attrs'] and
          'sharethefacts-rating' in features['attrs']['class']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(verafiles_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features


### Factcheckni

In [14]:
def factcheckni():
    data = []
    urls__ = []
    pages = 11
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')

        url_ = "https://factcheckni.org/page/" + str(page_number)

        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.findAll('article')
        for anchor in links:
            url = anchor.find('a', href=True)['href'] 
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            factcheckni_walk_html(soup, data_, 'factcheckni', url, 0, dic_=dic_)
            if dic_['claim'] and dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data

def factcheckni_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    # Title
    if (features['tag'] == 'title' or 
        features['tag'] == 'h1'):
        features['label'] = 'Title'
        dic_['title'] += 1

    # Date
    elif (features['tag'] == 'span' and
          'class' in features['attrs'] and
          'posted-on' in features['attrs']['class']):
        features['label'] = 'Date'
        dic_['date'] += 1

    # Claim
    elif (features['tag'] == 'p' and
          'CLAIM: ' in features['text']):
        features['label'] = 'Claim'
        dic_['claim'] += 1

    # Credibility
    elif (features['tag'] == 'p' and
          'CONCLUSION: ' in features['text']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(factcheckni_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_)) # EDIT - SITENAME
    return features


### Abc

In [15]:
def abc():
    data = []
    urls__ = []
    pages = 10
    count_url = 0
    for page_number in range(1, pages+1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')

        url_ = "https://www.abc.net.au/news/factcheck/factchecks/?page=" + str(page_number)

        try:
            soup_ = get_soup(url_)
            if type(soup_) == str:
                print('timeout', end='')
                continue
        except:
            print("Error urlopen.", url_)
            continue
        soup_.prettify('utf-8')
        links = soup_.find("ul", {"class":"article-index"}).findAll('a', href=True)
        for anchor in links:
            url = "https://www.abc.net.au" + anchor['href']
            if url in urls__:
                continue
            urls__.append(url)
            soup = get_soup(url, timeout=5)
            if type(soup) == type(''):
                print('timeout', end='')
                continue
            count_url += 1
            if count_url > MAX_URLS_SITE:
                return data
            soup.prettify("utf-8")
            data_ = []
            dic_ = {'claim': 0, 'credibility': 0, 'date': 0, 'title': 0}
            abc_walk_html(soup, data_, 'abc', url, 0, dic_=dic_)
            if dic_['claim'] and dic_['credibility']:
                print('.', end='')
                data += data_
            else:
                print(',', end='')
                count_url -= 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data

def abc_walk_html(element, data, site, url, level, parent=None, brothers=[], dic_=None):
    features = {}
    if element is None:
        return
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    if brothers:
        features['brother_tag'] = brothers[-1]['tag']
        features['brother_attrs'] = brothers[-1]['attrs']
        features['brother_text'] = brothers[-1]['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    # Title
    if (features['tag'] == 'title' or features['tag'] == 'h1'):
        features['label'] = 'Title'
        dic_['title'] += 1

    # Date
    elif (features['tag'] == 'time'):
        features['label'] = 'Date'
        dic_['date'] += 1

    # Claim
    elif (features['tag'] == 'p' and
          brothers and
          brothers[-1]['tag'] == 'h2' and
          'The claim' in brothers[-1]['text']):
        features['label'] = 'Claim'
        dic_['claim'] += 1

    # Credibility
    elif (features['tag'] == 'p' and
          brothers and
          brothers[-1]['tag'] == 'h2' and
          'The verdict' in brothers[-1]['text']):
        features['label'] = 'Credibility'
        dic_['credibility'] += 1

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        if (type(e_child) == bs4.element.NavigableString or
            type(e_child) == bs4.element.Comment or
            type(e_child) == bs4.element.Doctype or
            type(e_child) == str or e_child.name in ignore_html_tags):
                continue
        brothers_.append(abc_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features


### Run and Save data

In [16]:
data = []

#### Snopes

In [17]:
data_ = snopes()


1 / 100   [ 0 / 10 ]..........

In [18]:
data += data_

#### FullFact

In [19]:
data_ = fullfact()


1 / 480   [ 0 / 10 ],,,,,,,,,,,,,,,,,,,,
2 / 480   [ 0 / 10 ],,,.,,,,,,,,,,,,,,,,
3 / 480   [ 1 / 10 ],,,,,.,,,,,,,,,,,,,,
4 / 480   [ 2 / 10 ],,,.,,,,,,,,,,,,,.,,
5 / 480   [ 4 / 10 ].,,,,,,,,,,.,,,,,,,.
6 / 480   [ 7 / 10 ],.,,,,.,,,,,,,,,,,,,
7 / 480   [ 9 / 10 ],,.

In [23]:
data += data_

#### PolitiFact

<font color="orange">Doesn't have conclusion</font>

In [24]:
data_ = politifact()


1 / 900   [ 0 / 10 ]..........

In [25]:
data += data_

#### TruthorFiction

In [26]:
data_ = truthorfiction()


1 / 100   [ 0 / 10 ]....................
2 / 100   [ 10 / 10 ]

In [27]:
data += data_

#### AfricaCheck

In [28]:
data_ = africacheck()


1 / 30   [ 0 / 10 ]....................
2 / 30   [ 10 / 10 ]

In [31]:
data += data_

#### CheckYourFact

<font color="orange">Doesn't have claim</font>

In [32]:
data_ = checkyourfact()


1 / 20   [ 0 / 10 ]..,..,......

In [33]:
data += data_

#### TheFerret

<font color="orange">Doesn't have claim</font>

In [34]:
data_ = theferret()


1 / 9   [ 0 / 10 ]..timeouttimeout.....
2 / 9   [ 7 / 10 ].,,timeouttimeout.timeouttimeout,
3 / 9   [ 9 / 10 ]timeouttimeouttimeouttimeout,timeouttimeouttimeouttimeout
4 / 9   [ 9 / 10 ]timeout,timeout.timeout

In [35]:
data += data_

#### TheConversation

<font color="orange">Doesn't have claim</font>

In [38]:
data_ = theconversation()


1 / 10   [ 0 / 10 ],..,.......,,.

In [39]:
data += data_

#### Washingtonpost 

<font color='red'>The code is not complete</font>

In [None]:
data_ = washingtonpost()

In [None]:
data += data_

#### Rappler

In [40]:
data_ = rappler()


1 / 16   [ 0 / 10 ].,........
2 / 16   [ 9 / 10 ].

In [41]:
data += data_

#### Verafiles

<font color='red'>Current, the extractor is not working for claim and credibility</font>

In [None]:
data_ = verafiles()

In [None]:
data += data_

#### Factcheckni

In [42]:
data_ = factcheckni()


1 / 11   [ 0 / 10 ].....,
2 / 11   [ 5 / 10 ]...,,.
3 / 11   [ 9 / 10 ].

In [43]:
data += data_

#### Abc

In [44]:
data_ = abc()


1 / 10   [ 0 / 10 ]timeout
2 / 10   [ 0 / 10 ],,,,,,,,,,,,.,,,,,,,,,,.,timeout,,,.,.,.,,,,.,.,,.,,,.,,,,.

In [45]:
data += data_

#### Save

In [46]:
df = pd.DataFrame(data)
df.to_csv(file_output, encoding='utf-8')