# Scrape News Articles from Websites 

Code to scrape online news articles using the Newspaper library with multithreading.
This notebook collects articles from websites that publish fake news.
List of fake news sources - https://github.com/BigMcLargeHuge/opensources/tree/master/sources

Reference:
News Scraping - https://github.com/ankkur13/Big-Data-Systems-and-Intelligence-Analytics/blob/master/News%20Data%20Scraping%20.ipynb
Real News Sources - https://github.com/N2ITN/are-you-fake-news

In [None]:

import numpy as np
import json
import newspaper
from newspaper import Article
from time import mktime
from datetime import datetime
import pandas as pd
import requests
import numpy as np
import csv
from newspaper.article import ArticleException, ArticleDownloadState
from time import sleep
from newspaper import news_pool
from multiprocessing.dummy import Pool as ThreadPool

In [None]:
# Maximum number of articles kept per source; get_articles stops once it has
# collected this many.
LIMIT = 15


In [None]:
# Load the table of news sources. The pre-processing step reads its
# 'Unnamed: 0', 'type', '2nd type', '3rd type' and notes columns.
companies = pd.read_csv('sources_fake1.csv', encoding='utf-8')

# Data Pre-Processing

In [None]:

# Normalise the raw source table before scraping:
#  - the unnamed first column is renamed to 'Source' (it is what the scraping
#    step later uses as the site address);
#  - the secondary labels and the free-text notes column are dropped;
#  - missing primary labels default to 'fake';
#  - stray whitespace is trimmed from the label and the source name.
companies.rename(columns={'Unnamed: 0': 'Source'}, inplace=True)
companies.drop(['2nd type', '3rd type', 'Source Notes (things to know?)'], axis=1, inplace=True)
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which newer pandas
# versions warn about and which does not update the frame under Copy-on-Write.
companies['type'] = companies['type'].fillna('fake')
companies['type'] = companies['type'].str.strip()
companies['Source'] = companies['Source'].str.strip()


# Scraping with Multithreading

In [None]:
# Thread-based pool (multiprocessing.dummy) used to run get_articles for
# several sources at the same time.
pool = ThreadPool(10)   # no. of threads to use
          

def get_articles(company):
    """Scrape up to ``LIMIT`` articles from a single news source.

    Args:
        company: Address of the source site, with or without an
            ``http://`` / ``https://`` scheme. Surrounding whitespace is
            stripped before use.

    Returns:
        A pandas DataFrame with the columns 'Source', 'SourceHTTP', 'Title',
        'Authors', 'Text', 'URL' and 'PublishedDate', holding one row per
        kept article. Articles that fail to download or parse, have no
        title, or have fewer than 150 characters of text are skipped. The
        frame is empty (with the same columns) when nothing was kept.
    """
    columns = ['Source', 'SourceHTTP', 'Title', 'Authors', 'Text', 'URL', 'PublishedDate']

    url = company.strip()
    # Only prepend a scheme when one is missing. An `'http://' or ... not in url`
    # test is always truthy and would prefix addresses that already have one.
    if url.lower().startswith(('http://', 'https://')):
        _url = url
    else:
        _url = 'http://' + url

    paper = newspaper.build(_url, memoize_articles=False, language='en')

    # Collect plain dicts and build the DataFrame once at the end; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x.
    rows = []
    for content in paper.articles:
        if len(rows) >= LIMIT:
            break
        try:
            content.download()
            # Wait (at most ~20 s) for the download to be marked successful.
            # NOTE(review): download() appears to be blocking in newspaper3k,
            # so this loop mostly delays articles whose download failed --
            # confirm before simplifying further.
            slept = 0
            while content.download_state != ArticleDownloadState.SUCCESS:
                if slept > 20:
                    raise ArticleException('Download never started')
                sleep(1)
                slept += 1
            content.parse()
        except Exception as e:
            # Best effort: one bad article must not abort the whole source.
            print(e)
            print("Continuing...")
            continue

        # Skip pages without a title and pages too short to be real articles.
        if not content.title or len(content.text) < 150:
            continue

        published = content.publish_date
        rows.append({
            'Source': url,
            'SourceHTTP': _url,
            'Title': content.title,
            'Authors': content.authors,
            'Text': content.text,
            'URL': content.url,
            # The publish date may be missing or unparseable; keep only the
            # calendar date when it is available.
            'PublishedDate': None if published is None else pd.Timestamp(published.date()),
        })

    scraped_articles = pd.DataFrame(rows, columns=columns)
    print("Done ", company)
    return scraped_articles

# Empty frame that fixes the output columns; it also keeps the concat below
# valid when there are no sources at all.
scraped_articles = pd.DataFrame(columns=['Source','SourceHTTP', 'Title', 'Authors', 'Text', 'URL','PublishedDate'])

# Scrape every source on the thread pool; each call returns one DataFrame.
try:
    per_source_frames = pool.map(get_articles, companies['Source'])
finally:
    # Release the worker threads even if scraping a source raised.
    pool.close()
    pool.join()

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the per-source frames into a single table.
scraped_articles = pd.concat([scraped_articles, *per_source_frames], ignore_index=True, sort=True)

In [None]:
# Write the combined articles to a UTF-8 CSV file, without the DataFrame index.
scraped_articles.to_csv('Fake_News_Articles.csv', index=False, encoding='utf-8')