# Loading dependencies, defining functions 

In [None]:
from bs4 import BeautifulSoup
import requests
import pickle
import os.path
from newspaper import Article
import re

def get_stopfake_links(url, start_page=1, end_page=2):
    '''Collect article links from paginated listing pages of https://www.stopfake.org.

    For every page number in ``range(start_page, end_page)`` the page at
    ``url + str(page_number)`` is fetched, and the first ``<a href=...>``
    inside each ``<div class="cont-img">`` container is collected.

    Parameters:
        url: listing URL prefix; the page number is appended to it.
        start_page: first page number to fetch (inclusive).
        end_page: page number to stop at (exclusive, as in ``range``).

    Returns:
        List of href strings in page order, excluding any link that
        contains 'stopfakenews'.
    '''
    links = []
    for p_number in range(start_page, end_page):
        page = requests.get(url + str(p_number))
        soup = BeautifulSoup(page.text, 'lxml')
        for container in soup.find_all("div", "cont-img"):
            anchor = container.find('a', href=True)
            if anchor is None:
                # A container without a link would otherwise raise
                # AttributeError and abort the whole crawl.
                continue
            link = anchor.get('href')
            if 'stopfakenews' not in link:
                links.append(link)
        print('page processed: ' + str(p_number))
    return links

def get_fake_links(urls):
    '''Extract the first outgoing link from each article in *urls*.

    Each URL is fetched and only the first ``<div class="post-content">``
    on the page is inspected; the first ``<a href=...>`` inside it, if any,
    is collected. Pages without such a div or link contribute nothing.

    Parameters:
        urls: sequence of article URLs to fetch.

    Returns:
        List of collected hrefs, with junk links removed (any href
        containing 'stopfake', 'facebook', 'youtube' or 'google').
    '''
    fake_links = []
    total_length = len(urls)
    # Count from 1 so the progress reflects links already processed and
    # reaches 100% on the last one.
    for position, article_url in enumerate(urls, start=1):
        page = requests.get(article_url)
        soup = BeautifulSoup(page.text, 'lxml')
        first_div = soup.find("div", "post-content")
        if first_div is not None:
            f_link = first_div.find('a', href=True)
            if f_link is not None:
                fake_links.append(f_link.get('href'))
        print('Links processed: ' + str(round((position / total_length) * 100, 2)) + '%')
    junk_markers = ('stopfake', 'facebook', 'youtube', 'google')
    return [
        link for link in fake_links
        if not any(marker in link for marker in junk_markers)
    ]


def _fetch_article_text(link):
    '''Return the parsed article text for *link*, or None if it cannot be obtained.'''
    try:
        response = requests.get(link)
    except requests.exceptions.RequestException as error:
        # Covers connection/SSL failures as well as malformed URLs, so one
        # bad link cannot abort the run or leave a stale response behind.
        print(error)
        return None
    if response.status_code != 200:
        return None
    try:
        article = Article(link, language='ru')
        article.download()
        article.parse()
        return article.text
    except Exception as error:
        # Best effort: report and skip. Exception (not BaseException) is
        # caught so that Ctrl-C can still interrupt a long download.
        print(error)
        return None


def download_articles(url):
    '''Download the article text for every link in *url*.

    Each link is first requested with ``requests``; only links that answer
    with HTTP 200 are then downloaded and parsed with ``newspaper.Article``
    (language 'ru'). Links that fail at any step are reported with
    ``print`` and skipped. Progress is printed after every link.

    Parameters:
        url: sequence of article URLs (despite the singular name).

    Returns:
        Dictionary mapping each successfully parsed URL to its text string.
    '''
    text_parsed = {}
    total_length = len(url)
    for position, link in enumerate(url, start=1):
        text = _fetch_article_text(link)
        if text is not None:
            text_parsed[link] = text
        print('Links processed: ' + str(round((position / total_length) * 100, 2)) + '%')
    return text_parsed

def text_serializer(filename, text):
    '''Pickle the object *text* into the file *filename*, overwriting it if present.'''
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(text)
        
def text_deserializer(filename):
    '''Load and return the pickled object stored in the file *filename*.

    Unpickling can execute arbitrary code, so only load files you trust.
    '''
    with open(filename, 'rb') as source:
        return pickle.Unpickler(source).load()


# Data scraping from StopFake.org

In [None]:
# Collect links to StopFake rebuttal articles from the paginated news listing
# on https://stopfake.org. The end page is exclusive, so pages 1-1599 are fetched.
stopfake_links = get_stopfake_links('http://www.stopfake.org/category/novosti/page/', 1,1600)
# Number of article links collected.
print(len(stopfake_links))


In [None]:
# Collect links to the original fake stories. Per the author's estimate, in
# about 90% of StopFake articles the first URL in the post is the original fake.
fakenews_links = get_fake_links(stopfake_links)
# Number of links left after junk filtering.
print(len(fakenews_links))
# Optional checkpoint: uncomment the first line to save the collected links to
# disk, or the second to reload them instead of scraping again.
#text_serializer(os.path.join('raw_data', 'fakenews_links_#898_aug_11_2017'), fakenews_links)
#fakenews_links = text_deserializer(os.path.join('raw_data', 'fakenews_links_#898_aug_11_2017'))


In [None]:
# Download the original fake articles listed in fakenews_links.
# The result is a dictionary mapping each URL to its article text.
fake_news_text = download_articles(fakenews_links)


In [None]:
# Persist the downloaded texts. Create the output directory first so that a
# missing 'raw_data' folder does not fail the write with FileNotFoundError
# after the long download has finished.
os.makedirs('raw_data', exist_ok=True)
text_serializer(os.path.join('raw_data', 'faketext_#807_aug_11_2017'), fake_news_text)