# Script to Scrape News Text from GKG URL

In [None]:
# !pip install -q git+https://github.com/codelucas/newspaper.git
# !pip install -q readability-lxml

# !pip install -q urllib

In [None]:
# !pip install -q pandarallel

In [None]:
# import gdelt
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as  np
import time
from random import randint
import urllib
import re 
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import math
from pandarallel import pandarallel
import multiprocessing

import gc
import warnings
warnings.simplefilter('ignore', FutureWarning)

# SCRAPE TEXT

In [None]:
# Load the GKG extract: the first CSV column becomes the index and only
# '\n' is treated as a record separator.
country_news_alt = pd.read_csv(
    '../../kenya_aug2016_nov2017.csv',
    index_col=0,
    lineterminator='\n',
)

In [None]:
country_news_alt

In [None]:
country_news_alt.info()

In [None]:
# Histogram of the DATE column: how many rows fall on each date value

plt.figure(figsize=(22, 8))
sns.histplot(country_news_alt['DATE'])
plt.xticks(rotation=90)
plt.title('Volume of related text in selected period')
plt.show()

In [None]:
# Base scraper

# Modified from...
# Author: Linwood Creekmore
# Email: valinvescap@gmail.com
# Description:  Python script to pull content from a website (works on news stories).

###################################
# Standard Library imports
###################################

import re
import pytz
import datetime
import platform


###################################
# Third party imports
###################################

import requests
import nltk
nltk.download('punkt')
from newspaper import Article
from bs4 import BeautifulSoup
from readability.readability import Document as Paper
from requests.packages.urllib3.exceptions import InsecureRequestWarning


requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


# Cache of URLs that already produced a failure or a last-resort result.
# Values are plain strings (NOT answer dicts): either the message
# "Unable to reach website." or the text recovered by the final fallback.
done = {}


def _blank_answer(url, base, site, text, title=None):
    """Build an answer dict whose only populated content is `text` (and `title`)."""
    return {
        'author': None,
        'base': base,
        'provider': site,
        'published_date': None,
        'text': text,
        'title': title,
        'top_image': None,
        'url': url,
        'keywords': None,
        'summary': None,
    }


def textgetter(url, timeout=1):
    """Scrapes web news and yields the content

    This is a generator that yields exactly one item, so it is normally
    consumed with ``next(textgetter(url))``.

    Parameters
    ----------
    url : str
        web address to news report
    timeout : int or float, optional
        seconds to wait for the HTTP response (default 1)

    Yields
    ------
    answer : dict or str
        If `url` is already in the module-level `done` cache, the cached
        string is yielded as-is. Otherwise a Python dictionary with
        key/value pairs for:
            text (str) - Full text of article, or one of the messages
                'Unable to reach website.', 'No text returned',
                'This is not a proper url'
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
            keywords (list) - keywords extracted by newspaper
            summary (str) - summary extracted by newspaper

    Notes
    -----
    Extraction is attempted in this order:
      1. newspaper ``Article`` (accepted when the text has >= 200 chars)
      2. readability summary, text of heading/paragraph/list tags
         (accepted when longer than 200 chars)
      3. 'field-item even' divs of the readability summary (always
         accepted; the text is also stored in `done`)
    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regex for url check (raw strings so the backslashes reach the regex
    # engine without triggering invalid-escape warnings)
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile(r"(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")

    base_match = s.search(url)

    # not a url: there is no base or provider to report
    if base_match is None:
        yield _blank_answer(url, None, None, 'This is not a proper url')
        return

    base = base_match.group()
    site_match = u.search(base)
    site = site_match.group(3) if site_match is not None else None

    # already handled earlier in this process: hand back the cached string
    if url in done:
        yield done[url]
        return

    try:
        # make a request to the url
        # NOTE: verify=False disables TLS certificate verification.
        r = requests.get(url, verify=False, timeout=timeout)
    except Exception:
        # if the url does not return data, set to empty values
        done[url] = "Unable to reach website."
        yield _blank_answer(url, base, site, "Unable to reach website.")
        return

    # if url does not return successfully, report it as unreachable rather
    # than parsing the error page as if it were an article
    if r.status_code != 200:
        done[url] = "Unable to reach website."
        yield _blank_answer(url, base, site, "Unable to reach website.")
        return

    # too little content to contain an article
    if len(r.content) <= 500:
        yield _blank_answer(url, base, site, 'No text returned')
        return

    # first choice: newspaper, fed with the html that was already downloaded
    article = Article(url)
    article.download(input_html=r.content)
    article.parse()
    article.nlp()

    if len(article.text) >= 200:
        published = article.publish_date
        # convert the date to isoformat; exception for naive date
        if isinstance(published, datetime.datetime):
            try:
                published = published.astimezone(pytz.utc).isoformat()
            except Exception:
                published = published.isoformat()
        yield {
            'author': ", ".join(article.authors),
            'base': base,
            'provider': site,
            'published_date': published,
            'keywords': article.keywords,
            'summary': article.summary,
            'text': article.text,
            'title': article.title,
            'top_image': article.top_image,
            'url': url,
        }
        return

    # if previous didn't work, try another library
    doc = Paper(r.content)
    data = doc.summary()
    title = doc.title()
    soup = BeautifulSoup(data, 'lxml')
    newstext = " ".join([l.text for l in soup.find_all(TAGS)])

    # as we did above, pull text if it's greater than 200 length
    if len(newstext) > 200:
        yield _blank_answer(url, base, site, newstext, title=title)
        return

    # if nothing works above, use beautiful soup on specific divs
    newstext = " ".join([
        l.text
        for l in soup.find_all('div', class_='field-item even')
    ])
    # the raw text (possibly empty) is cached, so a later call for the
    # same url yields this string instead of a dict
    done[url] = newstext
    yield _blank_answer(url, base, site, newstext, title=title)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/williamsimpson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Outer scraper function 

def _wayback_snapshot_url(url, timeout=30):
    """Return the closest Way Back Machine snapshot url for `url`, or None."""
    try:
        # pass the target through `params` so that '?' and '&' inside it are
        # percent-encoded instead of being read as extra API parameters
        r = requests.get('http://archive.org/wayback/available',
                         params={'url': url}, timeout=timeout)
        snapshots = r.json().get('archived_snapshots') or {}
        closest = snapshots.get('closest') or {}
        return closest.get('url')
    except Exception:
        # lookup failed (network error, non-JSON reply, ...): treat it as
        # "no snapshot" so the caller falls back to the live url
        return None


def _scrape_live(url):
    """Scrape the live url and mark the result as not from the Way Back Machine."""
    # The scrape is run twice and the second result is kept: textgetter
    # records failures in its `done` cache on the first pass and get_text
    # reports them from that cache on the second pass.
    get_text(url)
    text, flag, _ = get_text(url)
    return text, flag, 0


def scrape_news_v2(row):
    '''
    Provides logic to check success of scrapes and rescrape if necessary
    First tries to scrape from the Way Back Machine
    If that is not successful (no snapshot, lookup failure, or the snapshot
    is unreachable), it scrapes from the live url
    Provides flags to indicate provenance of articles

    Parameters:
    ———————————
    row: str
        a URL, or several URLs separated by '<UDIV>' (only the first is used)

    Outputs:
    ————————
    scrape_news_art: str
        the text of the article
    flag_art: int
        indicates if the text was obtained successfully and identifies source of failure
        (-1 means an unexpected error occurred in this function)
    wbm_tag: int
        indicates if the article was scraped from the Way Back Machine
    '''
    url = row
    try:
        # get first url if multiple
        if '<UDIV>' in url:
            url = url.split('<UDIV>', 1)[0]

        # find Way Back Machine version of url if available
        snapshot_url = _wayback_snapshot_url(url)

        if snapshot_url is None:
            # no url returned from WBM: use original url
            result = _scrape_live(url)
        else:
            scrape_news_art, flag_art, wbm_tag = get_text(snapshot_url)
            # if WBM did not work try plain url
            if 'Unable to reach website' in scrape_news_art:
                result = _scrape_live(url)
            else:
                result = (scrape_news_art, flag_art, wbm_tag)

        # sleep a random 2-7 seconds between requests
        time.sleep(randint(2, 7))

        return result

    except Exception:
        # e.g. `row` is not a string, or get_text is unavailable
        return 'ALL FAIL', -1, 0

In [None]:
# Helper function for scrape_news_v2 function

def get_text(row):
    '''
    Wraps the base scraper function and
    provides logic to track possible points of failure in the scrape

    Parameters:
    ———————————
    row: str
        a URL

    Outputs:
    ————————
    text: str
        the text of the article, or a short message describing the failure
    flag: int
        indicates if the text was obtained successfully and identifies source of failure
         1  text obtained
        -2  input does not look like a URL
        -3  scraper returned nothing (empty dict or empty cached string)
        -4  website could not be reached
        -5  scraper returned an empty text
        -6  scraper reported 'No text returned'
        -7  unexpected error
    tag: int
        always 1 here; the caller decides whether the text really came
        from the Way Back Machine
    '''
    url = row
    try:
        # check if resembles url
        if not url.startswith('http'):
            return 'NOT URL', -2, 1

        # scrape url; the scraper yields a dict for a fresh scrape and a
        # plain string when the url was already in its cache
        scraped = next(textgetter(url))

        # check if url worked
        if len(scraped) == 0:
            return 'No dict from scraper', -3, 1

        text = scraped['text'] if isinstance(scraped, dict) else scraped
        marker = text.strip()

        # website unreachable, whether reported fresh or from the cache
        if marker == 'Unable to reach website.':
            return text, -4, 1
        # the scraper's own message for a malformed url
        if marker == 'This is not a proper url':
            return 'NOT URL', -2, 1
        # if text returned is empty
        if len(text) < 1:
            return 'No text returned', -5, 1
        # if scraper returns 'no text returned'
        if marker == 'No text returned':
            return 'No text returned', -6, 1

        # got text as expected
        return text, 1, 1

    except Exception:
        # e.g. `row` is not a string, or the scraper raised while parsing
        return 'ALL FAIL', -7, 1

In [None]:
# basic clean news text function

def clean_txt(txt):
    '''
    Collapses every run of newline characters into a single space

    Parameters:
    ———————————
    txt: str
        original text

    Outputs:
    ————————
    str
        the text with each newline run replaced by one space
    '''
    newline_run = re.compile('\\n+')
    return newline_run.sub(' ', txt)

In [None]:
# for parallelization: CPU count reported by the OS, used below as the
# number of pandarallel workers

num_cores = multiprocessing.cpu_count()
print(num_cores)

8

In [None]:
# initialize parallel computation: one worker per core, with a progress bar

pandarallel.initialize(nb_workers=num_cores, progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
# SUPER-BATCH FOR THIS MACHINE

# Change the slice below once each super-batch is complete.
# If the overall dataset is small enough, the slice can cover all of it;
# otherwise the per-super-batch outputs have to be recombined afterwards.
# Move the saved files to another folder after each super-batch, because
# the scraping script reuses the same file names and will overwrite them.
# Once a super-batch has finished successfully, only its last output file
# needs to be kept: each batch file contains all previous batches as well.

country_news_alt = country_news_alt.iloc[:10000].reset_index(drop=True)
country_news_alt.shape

In [None]:
# Scrape in Batches

# The only variable you should need to change is the batch size
# Change file paths as needed
# Each output file holds the current batch appended to all previous batches

# define batch variables
idx = 0
batch_size = 500
total_articles = country_news_alt.shape[0]
num_batches = math.ceil(total_articles / batch_size)

for batch in range(num_batches):
    print('Batch:', batch+1,'of',num_batches)
    # load previous batch
    if batch == 0:
        prev_batch_df = pd.DataFrame()
    else:
        prev_batch_df = pd.read_csv(f'../../data_large/scrape_news_df_batch_{batch-1}.csv', lineterminator='\n', index_col=0)

    # instantiate vals
    curr_batch_df = country_news_alt[idx:idx+batch_size].copy()

    # scrape news per batch; each result is a (text, flag, wbm_tag) tuple.
    # The results are read back positionally so that the loop does not
    # depend on the frame's index labels.
    scrape_news_art_flags_tags = curr_batch_df['SOURCEURLS'].parallel_apply(scrape_news_v2).tolist()
    scrape_news_art = [result[0] for result in scrape_news_art_flags_tags]
    flags_art = [result[1] for result in scrape_news_art_flags_tags]
    tags_art = [result[2] for result in scrape_news_art_flags_tags]

    # clean batch
    news_per_batch_clean = [clean_txt(str(tx)) for tx in scrape_news_art]

    # store batch
    curr_batch_df['news_text'] = news_per_batch_clean
    curr_batch_df['scraping_flag'] = flags_art
    curr_batch_df['wbm_tag'] = tags_art
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # replacement. The empty first-batch frame is left out of the concat.
    if prev_batch_df.empty:
        output_batch_df = curr_batch_df
    else:
        output_batch_df = pd.concat([prev_batch_df, curr_batch_df])

    # write batch to disk
    output_batch_df.to_csv(f'../../data_large/scrape_news_df_batch_{batch}.csv')

    # update indices for next batch
    idx += batch_size

    # pause between batches (nothing to wait for after the last one)
    if batch < num_batches - 1:
        time.sleep(randint(10,30))

Batch: 1 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 2 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…



Batch: 3 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 4 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 5 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 6 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 7 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 8 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…



Batch: 9 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 10 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…



Batch: 11 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 12 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 13 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 14 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 15 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 16 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 17 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 18 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…



Batch: 19 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 20 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 21 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 22 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…



Batch: 23 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 24 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 25 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 26 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 27 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Batch: 28 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…



Batch: 29 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 30 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 31 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

Batch: 32 of 60


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=63), Label(value='0 / 63'))), HBox…

# View Example Scrape

In [None]:
# Read back one of the cumulative batch outputs written by the scraping
# loop so the scraped text can be inspected

country_news_latest = pd.read_csv(
    '../../data_large/scrape_news_df_batch_49.csv',
    index_col=0,
    lineterminator='\n',
)

In [None]:
country_news_latest

Unnamed: 0,DATE,NUMARTS,COUNTS,THEMES,LOCATIONS,PERSONS,ORGANIZATIONS,TONE,CAMEOEVENTIDS,SOURCES,SOURCEURLS,news_text,scraping_flag,wbm_tag
0,2017-07-11,1,,TAX_FNCACT;TAX_FNCACT_ACTRESS;TAX_ETHNICITY;TA...,"4#Uhuru, Nairobi Area, Kenya#KE#KE05#-1.28333#...",john okafor;kenneth okwonko;tonto dikeh,,"2.83687943262411,4.25531914893617,1.4184397163...",,247nigerianewsupdate.co,http://www.247nigerianewsupdate.co/2017/07/see...,Unable to reach website.,-4,0
1,2017-07-11,1,,TAX_ECON_PRICE;EPU_ECONOMY_HISTORIC;TAX_ETHNIC...,1#Burundi#BY#BY#-3.5#30#BY;1#Rwanda#RW#RW#-2#3...,,netherlands africa business council;africa eve...,"2.04081632653061,2.33236151603499,0.2915451895...",,cta.int,http://brussels.cta.int/index.php?option=com_k...,Unable to reach website.,-4,0
2,2017-07-11,2,,TAX_FNCACT;TAX_FNCACT_WOMEN;USPEC_POLITICS_GEN...,"4#Nairobi, Nairobi Area, Kenya#KE#KE05#-1.2833...",millie odhiambo mbita;johanna ngeno emurua dik...,orange democratic movement;nairobi county at k...,"-0.164473684210526,3.61842105263158,3.78289473...","672052169,672052170,672052205,672053130,672032...",sde.co.ke;sde.co.ke,https://www.sde.co.ke/article/2001246965/how-k...,"Ida Odinga, Margaret Kenyatta and Rachel Ruto ...",1,1
3,2017-07-11,1,,WB_135_TRANSPORT;TAX_FNCACT;TAX_FNCACT_DRIVERS...,1#Uganda#UG#UG#1#32#UG;1#Kenya#KE#KE#1#38#KE,fernando wangila,public service vehicles;technology management,"-2.28571428571429,1.07142857142857,3.357142857...",672171132672074694,techweez.com,http://www.techweez.com/2017/07/11/ntsa-smart-...,Kenya’s National Transport and Safety Board (N...,1,1
4,2017-07-11,1,,AGRICULTURE;TAX_FNCACT;TAX_FNCACT_FARMERS;UNGP...,"4#Kericho, Central, Kenya#KE#KE01#-0.410736#37...",esther ruto;africa allafrica;rusi cheruiyot,comart foundation;coady international institut...,"2.76710222905457,3.38201383551115,0.6149116064...",,bizcommunity.com,http://www.bizcommunity.com/Article/196/356/16...,By adopting agroforestry and improved agricult...,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,2017-08-04,1,,TAX_ETHNICITY;TAX_ETHNICITY_KENYANS;EDUCATION;...,"4#Nyamira, Nyanza, Kenya#KE#KE07#-0.083333#34....",musa mustapha;agnes mutiota;thomas nyakundi;os...,boundaries commission,"-2.45022970903522,1.07197549770291,3.522205206...",678753308678754466678754467678754469,standardmedia.co.ke,https://www.standardmedia.co.ke/article/200125...,Unable to reach website.,-4,0
24996,2017-08-04,1,KILL#200##1#Kenya#KE#KE#1#38#KE;CRISISLEX_CRIS...,LEADER;TAX_FNCACT;TAX_FNCACT_PRESIDENT;USPEC_P...,1#Tanzania#TZ#TZ#-6#35#TZ;1#Kenya#KE#KE#1#38#K...,natasha stott despoja;gideon kayinamura;mwai k...,party leader australia professor pierre moukok...,"0.574712643678161,3.06513409961686,2.490421455...","678599092,678599093,678542527,678599437,678599...",myjoyonline.com,https://www.myjoyonline.com/politics/2017/Augu...,No dict from scraper,-3,0
24997,2017-08-04,1,,TAX_FNCACT;TAX_FNCACT_CHIEF;GENERAL_GOVERNMENT...,"1#Rwanda#RW#RW#-2#30#RW;5#Gauteng, Gauteng, So...",addis abeba;carlo ladicicco;sol campbell,wide area networks;gauteng high court on,"0,3.07692307692308,3.07692307692308,6.15384615...",,direct.news,https://africa.direct.news/news=933349,No text returned,-5,1
24998,2017-08-04,1,,UNGP_FORESTS_RIVERS_OCEANS;TAX_FNCACT;TAX_FNCA...,"1#Mexico#MX#MX#23#-102#MX;2#New York, United S...",trudy coxe;jonathan soroff;jonathan kaye;iris ...,preservation society;preservation society of n...,"6.04395604395604,7.14285714285714,1.0989010989...",,improper.com,http://www.improper.com/photos-parties/newport...,Unable to reach website.,-4,0


In [None]:
# view success of scraper (1==success)

country_news_latest.scraping_flag.value_counts()

 1    13486
-4     6215
-3     4695
-5      322
-2      194
-6       81
-1        7
Name: scraping_flag, dtype: int64