## Dataset Extractor

In [None]:
import urllib
import urllib.request

import bs4
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Tag names whose subtrees are skipped while walking a page's HTML tree.
ignore_html_tags = ['script']
# Path of the CSV file the collected rows are written to.
file_output = 'dataset.csv'

# Upper bound on the number of accepted article URLs per site.
MAX_URLS_SITE = 10
# Minimum tree depth at which an element's text is recorded; shallower
# elements get the 'None Text' placeholder (see validate_text).
MIN_TEXT_LEVEL = 5

def get_soup(url, timeout=10):
    """Download *url* and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Socket timeout in seconds handed to ``urlopen``.

    Returns:
        The parsed ``BeautifulSoup`` document (``lxml`` parser), or the
        sentinel string ``'timeout'`` when the page could not be downloaded
        for any reason.  Callers detect failure by checking for a ``str``.
    """
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    try:
        request = urllib.request.Request(url, data=None, headers={'User-Agent': user_agent})
        # The context manager closes the connection even if read() fails;
        # read() is inside the try because it can time out as well.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            page = response.read()
    except Exception:
        # Deliberately broad (bad URL, HTTP error, network error, timeout all
        # map to the same sentinel) but not bare, so KeyboardInterrupt can
        # still stop a long crawl.
        return 'timeout'
    return BeautifulSoup(page, 'lxml')

def treat_text(text):
    """Remove newlines, tabs and carriage returns, then trim whitespace.

    Args:
        text: A string, or a list of strings (any ``list`` subclass is
            accepted, e.g. the multi-valued attribute lists produced by
            BeautifulSoup).

    Returns:
        The cleaned string, or a new plain ``list`` of cleaned strings when
        a list was given.
    """
    def _clean(value):
        # The control characters are deleted outright, not replaced by spaces.
        return value.replace('\n', '').replace('\t', '').replace('\r', '').strip()

    # isinstance (not an exact type comparison) so list subclasses are
    # treated as lists instead of falling through to the string branch.
    if isinstance(text, list):
        return [_clean(item) for item in text]
    return _clean(text)

def validate_text(text, level):
    """Return the cleaned text of an element, or a placeholder if too shallow.

    Elements whose depth ``level`` is below ``MIN_TEXT_LEVEL`` yield the
    literal ``'None Text'``; otherwise the text is cleaned with
    ``treat_text`` and cut to at most 2000 characters.
    """
    if level < MIN_TEXT_LEVEL:
        return 'None Text'
    cleaned = treat_text(text)
    return cleaned[:2000]

def validate_text_dict(dictionary):
    """Clean every value of *dictionary* in place with ``treat_text``.

    The same dict object is returned, so callers may use either the
    argument or the return value.
    """
    # Only existing keys are reassigned, so iterating items() is safe here.
    for key, value in dictionary.items():
        dictionary[key] = treat_text(value)
    return dictionary

def validate_attrs(attrs):
    """Apply ``validate_text_dict`` to each dict in *attrs*.

    Returns a new list holding the (in-place cleaned) dicts in their
    original order.
    """
    return [validate_text_dict(entry) for entry in attrs]

### Snopes

In [3]:
def snopes():
    """Crawl Snopes fact-check articles and return their labelled HTML rows.

    Walks listing pages 1-100 of ``snopes.com/fact-check``, downloads every
    ``a.article-link`` target that has not been attempted before and feeds
    it to ``snopes_walk_html``.  An article's rows are kept only when a
    claim, a credibility and a body element were all found in it.  Crawling
    stops as soon as ``MAX_URLS_SITE`` articles have been kept.

    Returns:
        list: One feature dict per visited HTML element of each kept article.
    """
    data = []
    seen_urls = set()  # article URLs already attempted, to skip repeats
    pages = 100
    count_url = 0      # number of articles kept so far
    for page_number in range(1, pages + 1):
        print('')
        print(page_number, '/', pages, '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
        url_ = "https://www.snopes.com/fact-check/page/" + str(page_number) + "/"
        try:
            soup_ = get_soup(url_)
        except Exception:
            print("Error urlopen.", url_)
            continue
        if isinstance(soup_, str):
            # get_soup signals a failed download with a sentinel string.
            print('timeout', end='')
            continue
        for anchor in soup_.find_all('a', {"class": "article-link"}, href=True):
            if count_url >= MAX_URLS_SITE:
                # Quota reached: stop before downloading another article.
                return data
            url = anchor['href']
            if url in seen_urls:
                continue
            seen_urls.add(url)
            soup = get_soup(url, timeout=5)
            if isinstance(soup, str):
                print('timeout', end='')
                continue
            print('.', end='')
            data_ = []
            dic_ = {'claim': False, 'credibility': False, 'body': False, 'date': False, 'title': False}
            snopes_walk_html(soup, data_, 'snope', url, 0, dic_=dic_)
            # Only articles exposing all three key parts count toward the quota.
            if dic_['claim'] and dic_['credibility'] and dic_['body']:
                data += data_
                count_url += 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data
            
def snopes_walk_html(element, data, site, url, level, parent=None, brothers=None, dic_=None):
    """Recursively turn a Snopes page's HTML tree into labelled feature rows.

    For *element* a feature dict is built (site, url, tag, attrs, level,
    text, the previous sibling's tag/attrs/text and a label), appended to
    *data*, and the same is done depth-first for every child tag.

    Args:
        element: BeautifulSoup tag (or document) to process; ``None`` is
            ignored.  Its ``attrs`` dict is cleaned in place.
        data: List that receives one feature dict per visited element.
        site: Site identifier stored in every row.
        url: Page URL stored in every row.
        level: Depth of *element* in the tree (root is 0).
        parent: Feature dict of the parent element, if any.
        brothers: Feature dicts of the preceding siblings, in document order.
        dic_: Dict whose 'title', 'date', 'body', 'claim' and 'credibility'
            entries are set to ``True`` when such an element is found.

    Returns:
        The feature dict of *element*, or ``None`` when *element* is ``None``.
    """
    if element is None:
        return None
    if dic_ is None:
        dic_ = {}

    features = {}
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    previous = brothers[-1] if brothers else None
    if previous is not None:
        features['brother_tag'] = previous['tag']
        features['brother_attrs'] = previous['attrs']
        features['brother_text'] = previous['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    tag = features['tag']
    attrs = features['attrs']
    # A missing attribute behaves like an empty collection in the `in` tests.
    classes = attrs.get('class', ())
    itemprop = attrs.get('itemprop', ())

    # Title
    if tag == 'title' or (tag == 'h1' and 'article-title' in classes):
        features['label'] = 'Title'
        dic_['title'] = True

    # Date
    elif ((tag == 'meta' and 'datePublished' in itemprop) or
          (tag == 'span' and 'date-wrapper' in classes) or
          # The itemprop value lives under attrs; reading it from the top
          # level of `features` raised KeyError.
          (tag == 'span' and 'itemReviewed' in itemprop)):
        features['label'] = 'Date'
        dic_['date'] = True

    # Body
    elif tag == 'div' and 'article-text-inner' in classes:
        features['label'] = 'Body'
        dic_['body'] = True

    # Claim: the paragraph right after the "claim section-break" element.
    elif (tag == 'p' and
          previous is not None and
          'claim' in previous['attrs'].get('class', ()) and
          'section-break' in previous['attrs'].get('class', ())):
        features['label'] = 'Claim'
        dic_['claim'] = True

    # Credibility: a span directly inside an <a class="claim ...">.
    elif (tag == 'span' and
          parent is not None and
          parent['tag'] == 'a' and
          'claim' in parent['attrs'].get('class', ())):
        features['label'] = 'Credibility'
        dic_['credibility'] = True

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        # Only real tags are walked; every kind of string node (text,
        # comments, doctype, CDATA, ...) and ignored tags are skipped.
        if not isinstance(e_child, bs4.element.Tag) or e_child.name in ignore_html_tags:
            continue
        brothers_.append(snopes_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### Full Fact

In [4]:
def fullfact():
    """Crawl Full Fact articles and return their labelled HTML rows.

    For every letter in the search alphabet, walks result pages 1-100 of the
    fullfact.org search, downloads every ``a[rel=bookmark]`` target that has
    not been attempted before and feeds it to ``fullfact_walk_html``.  An
    article's rows are kept only when a claim, a credibility and a body
    element were all found in it.  Crawling stops as soon as
    ``MAX_URLS_SITE`` articles have been kept.

    Returns:
        list: One feature dict per visited HTML element of each kept article.
    """
    alfab = "bcdefghijklmnopqrstuvxyz"
    data = []
    seen_urls = set()  # article URLs already attempted, to skip repeats
    pages = 100
    count = 0          # listing pages requested so far (progress display)
    count_url = 0      # number of articles kept so far
    for l in alfab:
        for page_number in range(1, pages + 1):
            print('')
            count += 1
            print(count, '/', (pages)*len(alfab), '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
            url_ = "http://fullfact.org/search/?q=" + l + "&page=" + str(page_number)
            try:
                soup_ = get_soup(url_)
            except Exception:
                print("Error urlopen.", url_)
                continue
            if isinstance(soup_, str):
                # get_soup signals a failed download with a sentinel string.
                print('timeout', end='')
                continue
            for anchor in soup_.find_all('a', {"rel": "bookmark"}, href=True):
                if count_url >= MAX_URLS_SITE:
                    # Quota reached: stop before downloading another article.
                    return data
                url = "http://fullfact.org" + anchor['href']
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                soup = get_soup(url, timeout=5)
                if isinstance(soup, str):
                    print('timeout', end='')
                    continue
                print('.', end='')
                data_ = []
                dic_ = {'claim': False, 'credibility': False, 'body': False, 'date': False, 'title': False}
                fullfact_walk_html(soup, data_, 'fullfact', url, 0, dic_=dic_)
                # Only articles exposing all three key parts count toward the quota.
                if dic_['claim'] and dic_['credibility'] and dic_['body']:
                    data += data_
                    count_url += 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data


def fullfact_walk_html(element, data, site, url, level, parent=None, brothers=None, dic_=None):
    """Recursively turn a Full Fact page's HTML tree into labelled feature rows.

    For *element* a feature dict is built (site, url, tag, attrs, level,
    text, the previous sibling's tag/attrs/text and a label), appended to
    *data*, and the same is done depth-first for every child tag.

    Args:
        element: BeautifulSoup tag (or document) to process; ``None`` is
            ignored.  Its ``attrs`` dict is cleaned in place.
        data: List that receives one feature dict per visited element.
        site: Site identifier stored in every row.
        url: Page URL stored in every row.
        level: Depth of *element* in the tree (root is 0).
        parent: Feature dict of the parent element, if any.
        brothers: Feature dicts of the preceding siblings, in document order.
        dic_: Dict whose 'title', 'date', 'body', 'claim' and 'credibility'
            entries are set to ``True`` when such an element is found.

    Returns:
        The feature dict of *element*, or ``None`` when *element* is ``None``.
    """
    if element is None:
        return None
    if dic_ is None:
        dic_ = {}

    features = {}
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    previous = brothers[-1] if brothers else None
    if previous is not None:
        features['brother_tag'] = previous['tag']
        features['brother_attrs'] = previous['attrs']
        features['brother_text'] = previous['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    tag = features['tag']
    attrs = features['attrs']
    # A missing class attribute behaves like an empty collection below.
    classes = attrs.get('class', ())

    def has_classes(*names):
        return all(name in classes for name in names)

    # Title
    if tag == 'title' or (tag == 'h1' and 'class' not in attrs):
        features['label'] = 'Title'
        dic_['title'] = True

    # Date
    elif tag == 'p' and has_classes('hidden-xs', 'hidden-sm', 'date', 'updated'):
        features['label'] = 'Date'
        dic_['date'] = True

    # Body
    elif tag == 'div' and has_classes('article-post-content'):
        features['label'] = 'Body'
        dic_['body'] = True

    # Claim: left column of the two-column summary box.
    elif tag == 'div' and has_classes('col-xs-12', 'col-sm-6', 'col-left'):
        features['label'] = 'Claim'
        dic_['claim'] = True

    # Credibility: right column of the two-column summary box.
    elif tag == 'div' and has_classes('col-xs-12', 'col-sm-6', 'col-right'):
        features['label'] = 'Credibility'
        dic_['credibility'] = True

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        # Only real tags are walked; every kind of string node (text,
        # comments, doctype, CDATA, ...) and ignored tags are skipped.
        if not isinstance(e_child, bs4.element.Tag) or e_child.name in ignore_html_tags:
            continue
        brothers_.append(fullfact_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### Politifact 

In [5]:
def politifact():
    """Crawl PolitiFact rulings and return their labelled HTML rows.

    For every ruling type, walks listing pages 1-100, follows the ``a.link``
    inside each ``p.statement__text`` that has not been attempted before and
    feeds the article to ``politifact_walk_html``.  An article's rows are
    kept only when a claim, a credibility and a body element were all found
    in it.  Crawling stops as soon as ``MAX_URLS_SITE`` articles have been
    kept.

    Returns:
        list: One feature dict per visited HTML element of each kept article.
    """
    data = []
    seen_urls = set()  # article URLs already attempted, to skip repeats
    pages = 100
    count = 0          # listing pages requested so far (progress display)
    types = ["true", "mostly-true", "half-true", "barely-true", "false", "pants-fire", "no-flip", "half-flip", "full-flop"]
    count_url = 0      # number of articles kept so far
    for type_ in types:
        for page_number in range(1, pages + 1):
            count += 1
            print('')
            print(count, '/', (pages)*len(types), '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
            url_ = "http://www.politifact.com/truth-o-meter/rulings/" + str(type_) + "/?page=" + str(page_number)
            try:
                soup_ = get_soup(url_)
            except Exception:
                print("Error urlopen.", url_)
                continue
            if isinstance(soup_, str):
                # get_soup signals a failed download with a sentinel string;
                # using it as a document would raise AttributeError.
                print('timeout', end='')
                continue
            for statement in soup_.find_all("p", {"class": "statement__text"}):
                if count_url >= MAX_URLS_SITE:
                    # Quota reached: stop before downloading another article.
                    return data
                anchor = statement.find('a', {"class": "link"}, href=True)
                if anchor is None:
                    # Statement without a usable link; nothing to follow.
                    continue
                url = "http://www.politifact.com" + str(anchor['href'])
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                soup = get_soup(url, timeout=5)
                if isinstance(soup, str):
                    print('timeout', end='')
                    continue
                print('.', end='')
                data_ = []
                dic_ = {'claim': 0, 'credibility': 0, 'body': 0, 'date': 0, 'title': 0}
                politifact_walk_html(soup, data_, 'politifact', url, 0, dic_=dic_)
                # Only articles exposing all three key parts count toward the quota.
                if dic_['claim'] and dic_['credibility'] and dic_['body']:
                    data += data_
                    count_url += 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data


def politifact_walk_html(element, data, site, url, level, parent=None, brothers=None, dic_=None):
    """Recursively turn a PolitiFact page's HTML tree into labelled feature rows.

    For *element* a feature dict is built (site, url, tag, attrs, level,
    text, the previous sibling's tag/attrs/text and a label), appended to
    *data*, and the same is done depth-first for every child tag.

    Args:
        element: BeautifulSoup tag (or document) to process; ``None`` is
            ignored.  Its ``attrs`` dict is cleaned in place.
        data: List that receives one feature dict per visited element.
        site: Site identifier stored in every row.
        url: Page URL stored in every row.
        level: Depth of *element* in the tree (root is 0).
        parent: Feature dict of the parent element, if any.
        brothers: Feature dicts of the preceding siblings, in document order.
        dic_: Dict whose 'title', 'date', 'body', 'claim' and 'credibility'
            counters are incremented each time such an element is found.

    Returns:
        The feature dict of *element*, or ``None`` when *element* is ``None``.
    """
    if element is None:
        return None
    if dic_ is None:
        dic_ = {}

    features = {}
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    previous = brothers[-1] if brothers else None
    if previous is not None:
        features['brother_tag'] = previous['tag']
        features['brother_attrs'] = previous['attrs']
        features['brother_text'] = previous['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    tag = features['tag']
    # A missing class attribute behaves like an empty collection below.
    classes = features['attrs'].get('class', ())
    parent_classes = parent['attrs'].get('class', ()) if parent is not None else ()

    def found(key):
        # Tolerates a counter dict that lacks the key.
        dic_[key] = dic_.get(key, 0) + 1

    # Title
    if tag == 'title' or (tag == 'h1' and 'article__title' in classes):
        features['label'] = 'Title'
        found('title')

    # Date
    elif ((tag == 'p' and
           'widget__content-xs' in parent_classes and
           'Published' in features['text']) or
          (tag == 'span' and 'article__meta' in classes)):
        features['label'] = 'Date'
        found('date')

    # Body
    elif tag == 'div' and 'article__text' in classes:
        features['label'] = 'Body'
        found('body')

    # Claim
    elif tag == 'div' and 'statement__text' in classes:
        features['label'] = 'Claim'
        found('claim')

    # Credibility
    elif tag == 'img' and 'statement-detail' in classes:
        features['label'] = 'Credibility'
        found('credibility')

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        # Only real tags are walked; every kind of string node (text,
        # comments, doctype, CDATA, ...) and ignored tags are skipped.
        if not isinstance(e_child, bs4.element.Tag) or e_child.name in ignore_html_tags:
            continue
        brothers_.append(politifact_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### TruthorFiction

In [6]:
def truthorfiction():
    """Crawl TruthOrFiction articles and return their labelled HTML rows.

    For every search term, walks result pages 1-100, follows the link inside
    each ``h2.grid-title`` that has not been attempted before and feeds the
    article to ``truthorfiction_walk_html``.  An article's rows are kept
    only when a claim, a credibility and a body element were all found in
    it.  Crawling stops as soon as ``MAX_URLS_SITE`` articles have been kept.

    Returns:
        list: One feature dict per visited HTML element of each kept article.
    """
    data = []
    seen_urls = set()  # article URLs already attempted, to skip repeats
    pages = 100
    count = 0          # listing pages requested so far (progress display)
    types = ["a"]
    count_url = 0      # number of articles kept so far
    for type_ in types:
        for page_number in range(1, pages + 1):
            count += 1
            print('')
            print(count, '/', (pages)*len(types), '  [', count_url, '/', MAX_URLS_SITE, ']', end='')
            url_ = "https://www.truthorfiction.com/page/" + str(page_number) + "/?s=" + str(type_)
            try:
                soup_ = get_soup(url_)
            except Exception:
                print("Error urlopen.", url_)
                continue
            if isinstance(soup_, str):
                # get_soup signals a failed download with a sentinel string;
                # using it as a document would raise AttributeError.
                print('timeout', end='')
                continue
            for heading in soup_.find_all("h2", {"class": "grid-title"}):
                if count_url >= MAX_URLS_SITE:
                    # Quota reached: stop before downloading another article.
                    return data
                anchor = heading.find('a', href=True)
                if anchor is None:
                    # Heading without a usable link; nothing to follow.
                    continue
                url = str(anchor['href'])
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                soup = get_soup(url, timeout=5)
                if isinstance(soup, str):
                    print('timeout', end='')
                    continue
                print('.', end='')
                data_ = []
                dic_ = {'claim': 0, 'credibility': 0, 'body': 0, 'date': 0, 'title': 0}
                truthorfiction_walk_html(soup, data_, 'truthorfiction', url, 0, dic_=dic_)
                # Only articles exposing all three key parts count toward the quota.
                if dic_['claim'] and dic_['credibility'] and dic_['body']:
                    data += data_
                    count_url += 1
    if count_url < MAX_URLS_SITE:
        print('Warning:', count_url, 'URLS was include')
    return data


def truthorfiction_walk_html(element, data, site, url, level, parent=None, brothers=None, dic_=None):
    """Recursively turn a TruthOrFiction page's HTML tree into labelled feature rows.

    For *element* a feature dict is built (site, url, tag, attrs, level,
    text, the previous sibling's tag/attrs/text and a label), appended to
    *data*, and the same is done depth-first for every child tag.

    Args:
        element: BeautifulSoup tag (or document) to process; ``None`` is
            ignored.  Its ``attrs`` dict is cleaned in place.
        data: List that receives one feature dict per visited element.
        site: Site identifier stored in every row.
        url: Page URL stored in every row.
        level: Depth of *element* in the tree (root is 0).
        parent: Feature dict of the parent element, if any.
        brothers: Feature dicts of the preceding siblings, in document order.
        dic_: Dict whose 'title', 'date', 'body', 'claim' and 'credibility'
            counters are incremented each time such an element is found.

    Returns:
        The feature dict of *element*, or ``None`` when *element* is ``None``.
    """
    if element is None:
        return None
    if dic_ is None:
        dic_ = {}

    features = {}
    features['site'] = site
    features['url'] = url
    features['tag'] = element.name
    features['attrs'] = validate_text_dict(element.attrs)
    features['level'] = level
    features['text'] = validate_text(element.text, level)

    previous = brothers[-1] if brothers else None
    if previous is not None:
        features['brother_tag'] = previous['tag']
        features['brother_attrs'] = previous['attrs']
        features['brother_text'] = previous['text']
    else:
        features['brother_tag'] = ''
        features['brother_attrs'] = {}
        features['brother_text'] = 'NONE'

    tag = features['tag']
    attrs = features['attrs']
    # A missing class attribute behaves like an empty collection below.
    classes = attrs.get('class', ())

    def found(key):
        # Tolerates a counter dict that lacks the key.
        dic_[key] = dic_.get(key, 0) + 1

    # Title
    if tag == 'title' or (tag == 'h1' and 'class' not in attrs):
        features['label'] = 'Title'
        found('title')

    # Date: a span directly inside the post meta box.
    elif (tag == 'span' and
          parent is not None and
          parent['tag'] == 'div' and
          'post-box-meta-single' in parent['attrs'].get('class', ())):
        features['label'] = 'Date'
        found('date')

    # Body
    elif tag == 'div' and 'inner-post-entry' in classes:
        features['label'] = 'Body'
        found('body')

    # Claim: the paragraph following the "Summary of eRumor" heading.
    elif (tag == 'p' and
          previous is not None and
          'Summary of eRumor' in previous['text']):
        features['label'] = 'Claim'
        found('claim')

    # Credibility: the paragraph following the "The Truth" heading.
    elif (tag == 'p' and
          previous is not None and
          'The Truth' in previous['text']):
        features['label'] = 'Credibility'
        found('credibility')

    # None
    else:
        features['label'] = 'None'

    data.append(features)
    brothers_ = []
    for e_child in element:
        # Only real tags are walked; every kind of string node (text,
        # comments, doctype, CDATA, ...) and ignored tags are skipped.
        if not isinstance(e_child, bs4.element.Tag) or e_child.name in ignore_html_tags:
            continue
        brothers_.append(truthorfiction_walk_html(e_child, data, site, url, level+1, parent=features, brothers=brothers_, dic_=dic_))
    return features

### Run and Save data

In [7]:
# Combined rows from all sites; each crawler's result is appended to it.
data = []

#### Snopes

In [8]:
# Crawl Snopes; data_ holds the rows returned for this one site.
data_ = snopes()


1 / 100   [ 0 / 10 ]..........

In [9]:
# Append the Snopes rows to the combined dataset.
data += data_

#### FullFact

In [10]:
# Crawl Full Fact; data_ holds the rows returned for this one site.
data_ = fullfact()


1 / 2400   [ 0 / 10 ]....................
2 / 2400   [ 0 / 10 ]....................
3 / 2400   [ 1 / 10 ]....................
4 / 2400   [ 2 / 10 ]....................
5 / 2400   [ 4 / 10 ]....................
6 / 2400   [ 7 / 10 ]....................
7 / 2400   [ 9 / 10 ]...

In [12]:
# Append the Full Fact rows to the combined dataset.
data += data_

#### PolitiFact

In [13]:
# Crawl PolitiFact; data_ holds the rows returned for this one site.
data_ = politifact()


1 / 900   [ 0 / 10 ]..........

In [14]:
# Append the PolitiFact rows to the combined dataset.
data += data_

#### TruthorFiction

In [15]:
# Crawl TruthOrFiction; data_ holds the rows returned for this one site.
data_ = truthorfiction()


1 / 100   [ 0 / 10 ]..........
2 / 100   [ 10 / 10 ]

In [16]:
# Append the TruthOrFiction rows to the combined dataset.
data += data_

#### Save

In [18]:
# Build a DataFrame with one row per collected feature dict and write it to
# file_output as UTF-8 encoded CSV.
df = pd.DataFrame(data)
df.to_csv(file_output, encoding='utf-8')