# Feedly Data Extraction Demo

Python 3

In [29]:
from feedly.client import *
from feedly import *
from newspaper import Article, ArticleException # http://newspaper.readthedocs.io/en/latest/
from time import sleep
import numpy as np
import pandas as pd
import datetime
import math
import pickle
%matplotlib inline
pd.set_option('max_rows',300)

In [30]:
## IAP Crds
TOKEN = "A2zjasgZJawkY8etL3a9w1QP_BFLH7YcnaW_s-7kR7oU8Nkrz-ZY8spKj_rGuqYtyAJ4vYItikat_WS35cBCKA9jqYrbg_frpzLL_987_THA8BB4cXYfVGReSQMoScif6g7HI72_aKHYcheyqFVjObZX6QYiCZbrDAyzE1XvvvORiy8MjTSwRXQoX3in0_ywGYgFsfxJRA5M073PVSJJDv0Tv67JxC-GlvFRV3xiLqthS3Ed_8Qzztk:feedlydev"
FEEDLY_REDIRECT_URI = "http://fabreadly.com/auth_callback"
FEEDLY_CLIENT_ID="d8f62d80-bd91-4b23-bdc3-c219d0489a26"

# Load the Feed

Reference: [Feedly Documentation](https://developer.feedly.com/cloud/)

In [31]:
import json
import requests

# Feedly
feedaccess = TOKEN

## Use url below to get the feed ids. 
myurl = 'https://cloud.feedly.com/v3/subscriptions'
headers = {'Authorization': 'OAuth ' + feedaccess}
res = requests.get(url=myurl, headers=headers)
con = res.json()
output = json.dumps(con , indent=4)

# See all IAP Feeds and their IDs

From the API you can pull specific feeds by ID, or you can pull everything. One issue is that IAP is also a content generator and actually pushes things to Feedly, so you also end up pulling that stuff. Anything marked EWS, or ewsdata.rightsindevelopment.org or a link to that site is an IAP generated entry, we need to filter these out. We should be able to do this faitly easily with the metadata that is available for each item. Other option is pull from a bunch of different feeds, but my guess is that filtering will actually be easier. 

Code below shows all the different feeds - the ALL feed is not listed but is coded as an option in the `pull_feed` function.

In [32]:
def see_feeds(feedaccess=feedaccess):
    """
        Get the list of IAP feeds and the feed id - we need this when pulling the feed data. 
    """
    myurl = 'https://cloud.feedly.com/v3/subscriptions'
    headers = {'Authorization': 'OAuth ' + feedaccess}
    res = requests.get(url=myurl, headers=headers)
    con = res.json()
    output = json.dumps(con , indent=4)
    df = pd.DataFrame([(c['title'] , c['categories'][0]['id']) for c in con])
    df.columns = ['Title','id']
    return df, con

In [33]:
df, raw  = see_feeds(feedaccess)

In [34]:
df.head()

Unnamed: 0,Title,id
0,All - EWS,user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...
1,EWS SA,user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...
2,ADB,user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...
3,WB,user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...
4,"Title: World Bank, Text: Loan",user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...


In [35]:
def pull_feed(feed_id, feedcount, all_feeds=False,  feedaccess=feedaccess):
    """
    Pull the feed information from the Feedly API and returns a list of pulled JSON objects. 
    Returns a list in case we are pulling more then 1000 items, then we have multiple JSON objects. 
    
    feed_id: Id of the feed we want to pull from. (str)
    feedcount: Target number of items to pull from the feed. (int)
    all_feeds: If true then pulls all items in the IAP feed - value of feed_id will be ignored (Bool)
    feedaccess: Token Information (str)
    """
    
    feedcount = str(feedcount)
    current_count = 0
    continuation_rounds = math.ceil(int(feedcount) / 1000.0)
    json_data = []
    continuation_id = None
    if all_feeds:
        feed_id = 'user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/category/global.all'

    for i in range(continuation_rounds):
        print('Pulling Data - Round %s' % str(i+1))
        myurl = "http://cloud.feedly.com/v3/streams/contents?streamId=" + feed_id + "&count=" + feedcount
        
        if continuation_id:
            myurl += "&continuation={}".format(continuation_id)
        headers = {'Authorization': 'OAuth ' + feedaccess}
        res = requests.get(url=myurl, headers=headers)
        con = res.json()
        json_data.append(con)
        
        if int(feedcount) > 1000:
            print(con.keys())
            continuation_id = con['continuation']
    
    print('Complete')
    return json_data

In [53]:
pulled_json = pull_feed('',45000,all_feeds=True)

Pulling Data - Round 1
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 2
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 3
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 4
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 5
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 6
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 7
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 8
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 9
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 10
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 11
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 12
dict_keys(['items', 'id', 'continuation', 'updated'])
Pulling Data - Round 13
dict_keys(['items', 'id', 'continuation', 'update

---------------

# Process the Feed 

Convert to a dataframe

#TODO - Figure out what tags we need to preserve here - like from which news feed were they pulled - should be valuable for identifying the bank being mentioned.  Might have all of them but might be some others that could be useful. 

items reference: https://developer.feedly.com/v3/entries/

In [54]:
def process_pulled_data(json_data):
    df_data = []
    
    for grp in range(len(json_data)):
        data = json_data[grp]
        for i in range(len(data['items'])):

            vals = data['items'][i]
            article_data = []
            article_data += [vals['fingerprint'], vals['published'], vals['title'],vals['alternate'][0]['href'],vals['categories'][0]['label']]
            try:
                article_data.append(vals['content']['content'])
            except:
                article_data.append(None)

            try:
                article_data.append(vals['summary']['content'])
            except:
                article_data.append(None)
            df_data.append(article_data)
        
        
    df = pd.DataFrame(df_data, columns=None)
    df.columns = ['article_id','published','title','url','feed_label','content','summary']
    df.published = [datetime.datetime.fromtimestamp(i/1000.0) for i in df.published]
    return df

In [55]:
json2df = process_pulled_data(pulled_json)

In [56]:
pulled_json[0]['items'][0]

{'alternate': [{'href': 'https://news.mongabay.com/2018/06/not-all-doom-and-gloom-qa-with-conservation-job-market-researchers/',
   'type': 'text/html'}],
 'author': 'Jeremy Hance',
 'categories': [{'id': 'user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/category/NEWS - Mongabay',
   'label': 'NEWS - Mongabay'}],
 'content': {'content': 'You grew up watching David Attenborough documentaries and reading Gerald Durrell memoirs. You volunteered banding marmosets in Brazil. You have a bachelor’s in biology and a master’s in conservation biology. You spent a year interning at an international NGO. You’ve got the passion, the education, the experience — but now you just can’t find a job. And you’re not the only one: young conservationists are reporting a tough, rough time out there, with intense competition, a flood of unpaid internships, a prevalence of short-term work, and high student-loan debt. A recent study in Conservation Biology\xa0attempts to uncover some concrete data on the hard-to-quant

---------------------

## Filter the Items 

**Remove the EWS Posts - these are from IAP we don't need to process them **

In [40]:
json2df['keep'] = [False if 'ews.rightsindevelopment.org' in i else True for i in json2df.url]

In [41]:
json2df.keep.value_counts()

True     18893
False    16107
Name: keep, dtype: int64

In [42]:
json2df = json2df[json2df.keep]

**Filter out File Uploads and other Non-Articles**

It appears that if the `summary` field is empty the item is not an article. 

In [43]:
temp_keep = json2df.copy(deep=True)

In [44]:
json2df = json2df[json2df['summary'].notnull()]
print(json2df.shape)

(17038, 8)


---------------

## De-Dupe a Bit 

This is just a basic group by - does not look for articles that are duplicated in content but maybe from a different url, or differnt source. 

In [45]:
grp_df = json2df.groupby(['article_id','title','url','keep']).agg({
    'content':'min',
    'summary':'min',
    'published':'max',
    'feed_label': lambda x: ','.join(set(x))}
    ).reset_index()

In [46]:
grp_df.head()

Unnamed: 0,article_id,title,url,keep,published,summary,feed_label,content
0,100904a0,AfDB approves third sovereign lending instrument,https://newtelegraphonline.com/2018/01/afdb-ap...,True,2018-01-04 20:19:26,"<table border=""0"" cellspacing=""3"" cellpadding=...",NEWS AFDB- All Streams,
1,100da81a,Nigerian banks to benefit from AfDB's $50m tra...,http://punchng.com/nigerian-banks-to-benefit-f...,True,2018-04-05 19:06:37,"<table border=""0"" cellspacing=""3"" cellpadding=...",NEWS AFDB- All Streams,
2,10117f34,EBRD invests over EUR 0.5 bln in Romania in 2017,https://www.romania-insider.com/ebrd-invests-r...,True,2018-01-30 05:30:26,"<table border=""0"" cellspacing=""3"" cellpadding=...",NEWS EBRD - All streams,
3,1012c3f6,Britain pledges to easy Zim's cash shortages,http://www.thezimbabwean.co/2017/11/britain-pl...,True,2017-11-30 03:01:00,"<table border=""0"" cellspacing=""3"" cellpadding=...",NEWS AFDB- All Streams,
4,10146756,First real test for Jokowi on haze as annual f...,https://news.mongabay.com/2017/08/indonesian-p...,True,2017-08-07 20:00:00,"<img alt="""" src=""https://imgs.mongabay.com/wp-...",NEWS - Mongabay,JAKARTA — Fire season has returned to Indonesi...


In [48]:
grp_df.shape

(14237, 8)

--------------------

## Export - Pre Scrape

This is just to create a file the IAP can use to create labeled data. 

In [296]:
# grp_df = grp_df.sample(frac=1)

# print(grp_df.shape)

# grp_df[['article_id','published','title','url','feed_label']].to_csv('../Temp_Output/article20k_pull4labeling.csv',index=False)

----------------------------

## Export the Article Dataset 

**Dump**

In [53]:
# with open('../Temp_Output/api_article_extract.pkl', 'wb') as file:
#     pickle.dump(grp_df, file)


## Load the Api article extract 

In [8]:
with open('../Temp_Output/api_article_extract.pkl', 'rb') as file:
    grp_df = pickle.load(file)


In [9]:
grp_df.shape

(11753, 8)

-----------

## Scrape the articles

**NOTE** - This is Slow - so may need to run in batches or overnight, or both. 

Doing some scraping for article content --- I've pull around 20K articles which when deduped and filtered is around 11K actual news articles. This is a lot of content to scrape at once - hence the use of the file cache. Code is all a little hacky - just wanted to get some stuff pulled quickly. We don't want to be pulling data at the event . 

**Cache**
I'm saving the "scraped" article content in a dictionary and then writing it to a file. If we change the information we are pulling using the newspaper library we will need to recreate this cache. 

**Shuffle **

In [7]:
grp_df = grp_df.sample(frac=1)

In [5]:
try:
    with open('../Temp_Output/article_cache.pkl', 'rb') as file:
        cache = pickle.load(file)
except:
    print('error')
    
error_count = 0

In [34]:
def get_text_via_Article(url, article_id, try_hard=False):
    """
    Returns scraped article content and the keywords- using the newspaper3k module (http://newspaper.readthedocs.io/en/latest/)
    """
    
    global cache  ## Just writing to the global cahce object this way we can interrupt the run without losing the data
    global error_count
    if article_id not in cache:  ## Check to see if we have already scraped this article (maybe in a previous run of this)
        article = Article(url)   ## Newspaper Article Object
        article.download()
        try:
            article.parse()  ## Sometimes this step fails because the download doesn't complete
        except ArticleException:  ## In that case we give the download an additional 10 seconds to complete. 
            if try_hard: ## If we want to actually try to download the ones that failed
                print('Encountered Exception',url)
                article.download()
                print('\nGoing to try a longer download period. ')
                sleep(10) #Sometimes it take a lil bit to download the article - longer is better but then it takes longer .... 
                try:  
                    article.parse()  ## Try again 
                except ArticleException:  #Otherwise lets just keep going
                    print('Failed - Article Not Downloaded\n')
                    error_count += 1
                    cache[article_id] = (-1,-1)
                    return None 
            else:
                error_count += 1
                cache[article_id] = (-1,-1)
                return None
        ## Now Process Article 
        try:
            article.nlp() ## This came up for one article - parse passed but not nlp
        except ArticleException:
            cache[article_id] = (-1,-1)
            return None
        cache[article_id] = (article.text, article.keywords)
        return None
    else:
        return None
    

**Loop over the dataframe and extract the article content (if it hasn't been scraped yet ) **

In [35]:
for cnt, idx in enumerate(grp_df.index):
    if cnt%100 == 0:
        print('** Iteration Count', cnt,' **')
        print('** Error Count', error_count, ' **')
    if cnt%500 == 0:
        print('### Saving Cache ###')
        with open('../Temp_Output/article_cache.pkl', 'wb') as file:
            pickle.dump( cache, file)
    row = grp_df.loc[idx]
    get_text_via_Article(row['url'],row['article_id'])

** Iteration Count 0  **
** Error Count 78  **
### Saving Cache ###
** Iteration Count 100  **
** Error Count 78  **
** Iteration Count 200  **
** Error Count 78  **
** Iteration Count 300  **
** Error Count 78  **
** Iteration Count 400  **
** Error Count 78  **
** Iteration Count 500  **
** Error Count 78  **
### Saving Cache ###
** Iteration Count 600  **
** Error Count 78  **
** Iteration Count 700  **
** Error Count 78  **
** Iteration Count 800  **
** Error Count 78  **
** Iteration Count 900  **
** Error Count 78  **
** Iteration Count 1000  **
** Error Count 78  **
### Saving Cache ###
** Iteration Count 1100  **
** Error Count 78  **
** Iteration Count 1200  **
** Error Count 78  **
** Iteration Count 1300  **
** Error Count 78  **
** Iteration Count 1400  **
** Error Count 78  **
** Iteration Count 1500  **
** Error Count 78  **
### Saving Cache ###
** Iteration Count 1600  **
** Error Count 78  **
** Iteration Count 1700  **
** Error Count 78  **
** Iteration Count 1800  **




You must `download()` an article first!
You must `download()` an article first!
You must `download()` an article first!
You must `download()` an article first!
** Iteration Count 2000  **
** Error Count 89  **
### Saving Cache ###
You must `download()` an article first!
Article `download()` failed with 404 Client Error: Not Found for url: http://www.businessdayonline.com/cros-agro-secures-afdbgrow-africa-funding-support/ on URL http://www.businessdayonline.com/cros-agro-secures-afdbgrow-africa-funding-support/
You must `download()` an article first!
Article `download()` failed with 404 Client Error: Not Found for url: http://www.fijitimes.com/story.aspx?id=427964 on URL http://www.fijitimes.com/story.aspx?id=427964
** Iteration Count 2100  **
** Error Count 93  **
Article `download()` failed with 503 Server Error: Service Temporarily Unavailable for url: https://www.moneylife.in/article/indian-economy-to-advance-73-percentage-in-2018-19-fastest-growing-world-bank/54242.html on URL http

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Article `download()` failed with 404 Client Error: Not Found for url: https://www.journalducameroun.com/en/tunisia-gets-e50m-ebrd-loan-to-finance-small-businesses/ on URL https://www.journalducameroun.com/en/tunisia-gets-e50m-ebrd-loan-to-finance-small-businesses/
Article `download()` failed with 404 Client Error: Not Found for url: https://reliefweb.int/report/mozambique/african-development-bank-and-mozambique-sign-us-29-million-grant-agreements on URL https://reliefweb.int/report/mozambique/african-development-bank-and-mozambique-sign-us-29-million-grant-agreements
** Iteration Count 2300  **
** Error Count 113  **
You must `download()` an article first!
You must `download()` an article first!
You must `download()` an article first!
Article `download()` failed with 503 Server Error: Service Unavailable: Back-end server is at capacity for url: http://www.downtoearth.org.in/news/private-finance-gets-a-boost-at-climate-summit-in-france-but-states-don-t-commit-much-59329 on URL http://ww

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Article `download()` failed with 404 Client Error: Not Found for url: http://www.fijitimes.com/story.aspx?id=438014 on URL http://www.fijitimes.com/story.aspx?id=438014
You must `download()` an article first!
Article `download()` failed with 521 Server Error: Origin Down for url: https://nbherard.com/world/the-european-investment-bank-provides-eur-10-million-loan-to-flexenclosure/36556 on URL https://nbherard.com/world/the-european-investment-bank-provides-eur-10-million-loan-to-flexenclosure/36556
** Iteration Count 3700  **
** Error Count 209  **
Article `download()` failed with 404 Client Error: Unknown site! for url: http://waterwastemanagement.cleantechnology-business-review.com/news/eu-partners-with-ebrd-to-provide-eur53m-to-improve-wastewater-services-in-jordan-110518-6148272 on URL http://waterwastemanagement.cleantechnology-business-review.com/news/eu-partners-with-ebrd-to-provide-eur53m-to-improve-wastewater-services-in-jordan-110518-6148272
Article `download()` failed with 4

Building prefix dict from /Users/michaeldowd/anaconda/envs/my_py_3/lib/python3.5/site-packages/jieba/dict.txt ...
Dumping model to file cache /var/folders/dv/85fh_3dj591fh0bk3w5sw4lr0000gn/T/jieba.cache
Loading model cost 2.8712501525878906 seconds.
Prefix dict has been built succesfully.


You must `download()` an article first!
You must `download()` an article first!
Article `download()` failed with 503 Server Error: Service Temporarily Unavailable for url: https://fp.brecorder.com/2017/12/20171229331069/ on URL https://fp.brecorder.com/2017/12/20171229331069/
You must `download()` an article first!
You must `download()` an article first!
Article `download()` failed with 503 Server Error: Service Temporarily Unavailable for url: https://mothership.sg/2018/01/we-watched-a-63-minute-davos-forum-to-see-if-chan-chun-sing-did-spore-proud/ on URL https://mothership.sg/2018/01/we-watched-a-63-minute-davos-forum-to-see-if-chan-chun-sing-did-spore-proud/
Article `download()` failed with 404 Client Error: Not Found for url: http://the-japan-news.com/news/article/0004163603 on URL http://the-japan-news.com/news/article/0004163603
You must `download()` an article first!
Article `download()` failed with 503 Server Error: Service Temporarily Unavailable for url: https://www.brecorder

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Article `download()` failed with 404 Client Error: Not Found for url: http://www.ledger-enquirer.com/news/business/article210841149.html on URL http://www.ledger-enquirer.com/news/business/article210841149.html
Article `download()` failed with 503 Server Error: Service Temporarily Unavailable for url: https://www.brecorder.com/2018/01/03/390508/adb-shows-interest-in-sewerage-water-supply-solid-waste-projects/ on URL https://www.brecorder.com/2018/01/03/390508/adb-shows-interest-in-sewerage-water-supply-solid-waste-projects/
Article `download()` failed with 404 Client Error: Not Found for url: http://www.guardian.co.tt/business-lead/2018-03-10/a-tourism-catalyst on URL http://www.guardian.co.tt/business-lead/2018-03-10/a-tourism-catalyst
You must `download()` an article first!
** Iteration Count 9700  **
** Error Count 538  **
Article `download()` failed with 403 Client Error: Forbidden for url: https://www.desmogblog.com/2017/12/07/how-american-cities-and-states-are-fighting-climate-ch

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


** Iteration Count 9900  **
** Error Count 548  **
Article `download()` failed with 403 Client Error: 2 for url: https://www.globalresearch.ca/european-development-bank-invests-billions-in-fossil-fuels-as-paris-prepares-for-one-planet-climate-summit/5622386 on URL https://www.globalresearch.ca/european-development-bank-invests-billions-in-fossil-fuels-as-paris-prepares-for-one-planet-climate-summit/5622386
Article `download()` failed with 404 Client Error: Not Found for url: http://www.ntv.co.ug/node/21072 on URL http://www.ntv.co.ug/node/21072
Article `download()` failed with 404 Client Error: Not Found for url: http://www.businessdayonline.com/fg-gets-486m-world-bank-support-improve-power-transmission-infrastructure/ on URL http://www.businessdayonline.com/fg-gets-486m-world-bank-support-improve-power-transmission-infrastructure/
You must `download()` an article first!
** Iteration Count 10000  **
** Error Count 552  **
### Saving Cache ###
Article `download()` failed with 503 Server

**Save Cache**

In [36]:
with open('../Temp_Output/article_cache.pkl', 'wb') as file:
    pickle.dump( cache, file)


## Attach Scraped Content 

In [37]:
with open('../Temp_Output/article_cache.pkl', 'rb') as file:
    cache = pickle.load(file)

In [6]:
grp_df['scraped_content'] = grp_df['article_id'].map(cache)
grp_df['article_text'] = [i[0] if pd.notnull(i) else np.nan for i in grp_df.scraped_content]
grp_df['article_keywords'] = [i[1] if pd.notnull(i) else np.nan for i in grp_df.scraped_content]

In [39]:
grp_df[grp_df.scraped_content.notnull()].shape[0]/grp_df.shape[0]

1.0

In [44]:
100 * grp_df[grp_df.scraped_content == (-1,-1)].shape[0]/len(grp_df)

5.573045179954054

In [45]:
grp_df.to_pickle('../Temp_Output/test_data.pkl')

In [47]:
grp_df.head()

Unnamed: 0,article_id,title,url,keep,feed_label,content,published,summary,scraped_content,article_text,article_keywords
10900,eebb9702,"India, World Bank sign financing agreement for...",http://www.abplive.in/business/india-world-ban...,True,NEWS WB- All Streams,,2017-12-21 09:22:12,"<table border=""0"" cellspacing=""3"" cellpadding=...","(New Delhi [India], Dec 20 (ANI): A financing ...","New Delhi [India], Dec 20 (ANI): A financing a...","[institutes, india, skill, financing, training..."
4268,6832ce57,Rs 40000-crore development projects in limbo i...,http://www.moneycontrol.com/news/business/econ...,True,NEWS AIIB - All Streams,,2017-12-10 09:40:00,"<table border=""0"" cellspacing=""3"" cellpadding=...","(Development projects worth more than Rs 40,00...","Development projects worth more than Rs 40,000...","[development, crore, andhra, eaps, state, proj..."
1663,30f8f65e,https://www.the-american-interest.com/2018/01/...,https://www.the-american-interest.com/2018/01/...,True,NEWS AFDB- All Streams,,2018-01-03 12:21:54,"<table border=""0"" cellspacing=""3"" cellpadding=...",(Ten Lessons\n\nDevelopment with Chinese Chara...,Ten Lessons\n\nDevelopment with Chinese Charac...,"[transitions, university, chinese, united, dev..."
3789,5ec16472,$300 Million to Expand and Upgrade Infrastruct...,https://jis.gov.jm/300-million-to-expand-and-u...,True,NEWS IDB - All Streams,,2018-04-27 16:28:43,"<table border=""0"" cellspacing=""3"" cellpadding=...","(Minister of Industry, Commerce, Agriculture a...","Minister of Industry, Commerce, Agriculture an...","[research, development, agriculture, fisheries..."
6657,989c9942,ADB Provides $346 Million to Upgrade State Hig...,http://www.business-standard.com/article/news-...,True,NEWS ADB - All Streams,,2017-12-10 01:00:00,"<table border=""0"" cellspacing=""3"" cellpadding=...",(The Asian Development Bank's (ADB) Board of D...,The Asian Development Bank's (ADB) Board of Di...,"[highways, improvement, india, state, upgrade,..."


## Review Results and add in Some Language Detection

IAP only has content in English currently so tagging articles in other languages is likely too complicated at this time as it wold also involve a translation step. Therefore we may want to filter out non English language articles. 

Newspaper 3k has this functionality - but it is slow - this work pretty fast. 

In [48]:
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException

In [49]:
detect_langs("""
'Nuku’alofa, February 22, 2018 – Following the severe impact of Tropical Cyclone Gita, the World Bank has now begun work to support the government of Tonga, which is leading a Rapid Damage Assessment to assist with recovery and reconstruction planning in the coming months.\n\n“Our work in mapping the damage wreaked by Cyclone Gita will be crucial to helping the government of Tonga to determine priority areas for recovery and reconstruction,” said World Bank Country Director for Papua New Guinea and the Pacific Islands, Michel Kerf. “In the immediate aftermath of recent natural disasters in the Pacific, including cyclones Winston (Fiji, 2016) and Pam (Vanuatu, 2015)_, the World Bank has been called upon to lead the immediate damage assessment process.”_\n\nThe World Bank, together with partners including the governments of Australia and New Zealand, the Asian Development Bank, Japan International Cooperation Agency, European Union and United Nations Development Programme, is now working alongside Tongan authorities to identify priority sectors for the rapid damage assessment, which include housing, agriculture and energy.\n\nAs part of this assessment work, a fleet of Unmanned Aerial Vehicles (UAVs, or drones) have been transported to Tonga with the support of the Australian government, to provide a comprehensive visual assessment of the damage caused by Cyclone Gita.\n\nTonga has received a payout of more than US$3.5 million from the Pacific Catastrophe Risk Insurance Company (PCRIC) – the first payout made by the region’s first catastrophe risk insurance platform established in 2016. PCRIC was formed as part of the World Bank’s regional project PCRAFI: Furthering Disaster Risk Finance in the Pacific, which provides technical assistance to 14 Pacific Island countries, with financial support from Germany, Japan, the United Kingdom and the United States of America.\n\n“Despite the tragic circumstances, it has been good to see the Pacific Catastrophe Risk Insurance Company delivering much-needed relief through its disaster insurance system,” _said Mr. Kerf. “This is the first payout of its kind, and is a testament to the hard work of many governments and development partners, who have worked hard over many years to create this critical support system for the Pacific Islands, home to many of the world’s most disaster at-risk countries.”_\n\nThe World Bank continues to stand as a dedicated partner in resilient development in the Pacific Islands.'""")

[en:0.9999970469600842]

In [50]:
def detect_lang(x):
    try:
        return detect_langs(x)
    except:
        return np.nan

In [51]:
grp_df['lang'] = grp_df.article_text.apply(detect_lang)

In [96]:
grp_df.lang[1][0].lang

'en'

In [116]:
grp_df[grp_df.lang.notnull()].lang.head(11)

10900     [en:0.999998227451202]
4268      [en:0.999997415757821]
1663     [en:0.9999980316883927]
3789       [en:0.99999784352335]
6657     [en:0.9999970965149094]
4243     [en:0.9999987195562964]
1326     [en:0.9999966798354436]
7033     [en:0.9999975767907536]
3240     [en:0.9999971268992086]
2191     [en:0.9999972939659458]
4574     [en:0.9999965224627279]
Name: lang, dtype: object

In [123]:
grp_df[grp_df.lang.notnull()].shape

(10831, 13)

In [122]:
grp_df[grp_df.lang.notnull()].lang.apply(lambda x: x[0].lang).shape

(10831,)

In [128]:
grp_df['top_lang'] = None
grp_df.loc[grp_df.lang.notnull(), 'top_lang'] = grp_df[grp_df.lang.notnull()].lang.apply(lambda x: x[0].lang)

## Quick Len MEtric 

In [68]:
grp_df['article_text_len'] = grp_df.article_text.apply(lambda x: len(str(x)))

**NOTE** - Article below is actually like an entire long news feed ... 

In [86]:
print(grp_df[grp_df.article_text_len == grp_df.article_text_len.max()].url.values)

['http://english.vietnamnet.vn/fms/society/192475/social-news-in-brief-21-12.html']


# Export Dataset 

In [133]:
grp_df.to_pickle('../Data/Feedly_Processed_DF.pkl')

In [4]:
grp_df = pd.read_pickle('../Data/Feedly_Processed_DF.pkl')

# TODO 

1. Detect Language
2. Error Test on larger set
3. Manually verify extact is generally correct 
4. Extract article content for all articles in DataDive dataset (Could take a long time)
5. Method for Identifying articles previously scraped (we need a unique identifier so we can only scrape new articles in the future) -- This is just something to keep in mind. `article_id` might be fine (based on the json items `fingerprint` field.

-------------------

# End