# Feedly Data Extraction Demo

Python 3

In [292]:
from feedly.client import *
from feedly import *
from newspaper import Article, ArticleException # http://newspaper.readthedocs.io/en/latest/
from time import sleep
import numpy as np
import pandas as pd
import datetime
import math
# Use the fully-qualified option name. pandas treats the key as a pattern, and the
# short form 'max_rows' is ambiguous (OptionError) on versions that also define
# 'styler.render.max_rows'.
pd.set_option('display.max_rows', 300)

In [36]:
## IAP credentials
# The access token is a secret and must not be stored in the notebook. It is read
# from the FEEDLY_TOKEN environment variable instead. Any token that was previously
# committed with this file should be treated as leaked and revoked.
import os

TOKEN = os.environ.get("FEEDLY_TOKEN", "")
if not TOKEN:
    print("FEEDLY_TOKEN is not set - Feedly API calls will be rejected")
FEEDLY_REDIRECT_URI = "http://fabreadly.com/auth_callback"
FEEDLY_CLIENT_ID="d8f62d80-bd91-4b23-bdc3-c219d0489a26"



# Load the Feed

Reference: [Feedly Documentation](https://developer.feedly.com/cloud/)

In [37]:
import json
import requests

# Feedly
# Module-level alias of the token; the functions defined further down use it as
# the default value of their `feedaccess` parameter.
feedaccess = TOKEN

## Use the url below to get the feed ids.
myurl = 'https://cloud.feedly.com/v3/subscriptions'
# The token is sent as an OAuth Authorization header.
headers = {'Authorization': 'OAuth ' + feedaccess}
res = requests.get(url=myurl, headers=headers)
# Decoded JSON body of the subscriptions response.
con = res.json()
# Pretty-printed copy of the response, kept for manual inspection.
# NOTE(review): the HTTP status is not checked here, so an error response is decoded like a normal one.
output = json.dumps(con , indent=4)

# See all IAP Feeds and their IDs

From the API you can pull specific feeds by ID, or you can pull everything. One issue is that IAP is also a content generator and actually pushes things to Feedly, so you also end up pulling that stuff. Anything marked EWS, or ewsdata.rightsindevelopment.org, or a link to that site is an IAP-generated entry; we need to filter these out. We should be able to do this fairly easily with the metadata that is available for each item. The other option is to pull from a bunch of different feeds, but my guess is that filtering will actually be easier. 

Code below shows all the different feeds - the ALL feed is not listed but is coded as an option in the `pull_feed` function.

In [38]:
def see_feeds(feedaccess=feedaccess):
    """
    List the subscriptions visible to the token, with the id needed to pull each one.

    feedaccess: Feedly access token, sent in the OAuth Authorization header. (str)

    Returns a tuple (df, con):
        df  - DataFrame with columns 'Title' and 'id', one row per subscription.
              'id' is the id of the subscription's first category, or None when
              the subscription has no category.
        con - the decoded JSON response, unchanged.

    Raises requests.HTTPError when the API answers with an error status.
    """
    myurl = 'https://cloud.feedly.com/v3/subscriptions'
    headers = {'Authorization': 'OAuth ' + feedaccess}
    res = requests.get(url=myurl, headers=headers)
    # Fail here with a clear HTTP error (e.g. expired token) instead of later with
    # a confusing TypeError while indexing into an error payload.
    res.raise_for_status()
    con = res.json()

    rows = []
    for sub in con:
        # A subscription may have no category; record None rather than crash.
        categories = sub.get('categories') or []
        category_id = categories[0]['id'] if categories else None
        rows.append((sub.get('title'), category_id))

    # Passing the column names here keeps an empty subscription list working.
    df = pd.DataFrame(rows, columns=['Title', 'id'])
    return df, con

In [39]:
df, raw  = see_feeds(feedaccess)

In [40]:
df.head()

Unnamed: 0,Title,id
0,All - EWS,user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...
1,EWS SA,user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...
2,ADB,user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...
3,WB,user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...
4,"Title: World Bank, Text: Loan",user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/cate...


In [41]:
def pull_feed(feed_id, feedcount, all_feeds=False,  feedaccess=feedaccess):
    """
    Pull the feed information from the Feedly API and return a list of pulled JSON objects.

    A list is returned because a pull of more than 1000 items is split over several
    requests, each one continuing from the 'continuation' value of the previous page.

    feed_id: Id of the feed we want to pull from. (str)
    feedcount: Target number of items to pull from the feed. (int)
    all_feeds: If true then pulls all items in the IAP feed - value of feed_id will be ignored (Bool)
    feedaccess: Token Information (str)

    Returns a list with one decoded JSON page (dict) per request made. Fewer items
    than feedcount are returned when the stream runs out first.

    Raises requests.HTTPError when the API answers with an error status.
    """
    page_size = 1000  # items requested per call; larger pulls are paged
    remaining = int(feedcount)
    if all_feeds:
        feed_id = 'user/d8f62d80-bd91-4b23-bdc3-c219d0489a26/category/global.all'

    # https, so the OAuth token is not sent in clear text.
    myurl = 'https://cloud.feedly.com/v3/streams/contents'
    headers = {'Authorization': 'OAuth ' + feedaccess}
    json_data = []
    continuation_id = None
    round_number = 0

    while remaining > 0:
        round_number += 1
        print('Pulling Data - Round %s' % str(round_number))

        # Passing the query through `params` URL-encodes the stream id, which
        # contains '/' characters.
        params = {'streamId': feed_id, 'count': min(page_size, remaining)}
        if continuation_id:
            params['continuation'] = continuation_id

        res = requests.get(url=myurl, headers=headers, params=params)
        res.raise_for_status()
        con = res.json()
        json_data.append(con)

        if int(feedcount) > 1000:
            print(con.keys())

        items = con.get('items', [])
        remaining -= len(items)
        # The last page of a stream carries no 'continuation' key; stop there
        # instead of raising KeyError.
        continuation_id = con.get('continuation')
        if not items or not continuation_id:
            break

    print('Complete')
    return json_data

In [237]:
pulled_json = pull_feed('',20000,all_feeds=True)

Pulling Data - Round 1
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 2
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 3
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 4
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 5
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 6
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 7
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 8
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 9
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 10
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 11
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 12
dict_keys(['updated', 'continuation', 'items', 'id'])
Pulling Data - Round 13
dict_keys(['updated', 'continuation', 'items', 'i

# Process the Feed 

Convert to a dataframe

#TODO - Figure out what tags we need to preserve here - like from which news feed were they pulled - should be valuable for identifying the bank being mentioned.  

In [238]:
def _entry_text(entry, field):
    """Return entry[field]['content'], or None when it is missing or not a mapping."""
    try:
        return entry[field]['content']
    except (KeyError, TypeError):
        return None


def process_pulled_data(json_data):
    """
    Flatten the pulled JSON pages into one DataFrame with a row per feed item.

    json_data: list of page dicts, each holding an 'items' list (as returned by pull_feed).

    Returns a DataFrame with columns article_id, published, title, url, feed_label,
    content and summary. 'published' is converted from epoch milliseconds to a naive
    datetime in the local timezone. 'content' and 'summary' are None when the item
    does not carry them.

    Raises KeyError/IndexError when an item lacks one of the mandatory fields
    (fingerprint, published, title, alternate, categories).
    """
    columns = ['article_id','published','title','url','feed_label','content','summary']
    df_data = []

    for page in json_data:
        for vals in page['items']:
            df_data.append([
                vals['fingerprint'],
                vals['published'],
                vals['title'],
                vals['alternate'][0]['href'],
                vals['categories'][0]['label'],
                # content and summary are optional on an item
                _entry_text(vals, 'content'),
                _entry_text(vals, 'summary'),
            ])

    # Naming the columns at construction keeps an empty pull from failing with a
    # column-length mismatch.
    df = pd.DataFrame(df_data, columns=columns)
    df['published'] = [datetime.datetime.fromtimestamp(ms/1000.0) for ms in df['published']]
    return df

In [239]:
json2df = process_pulled_data(pulled_json)

## Filter the Items 

**Remove the EWS Posts - these are from IAP, so we don't need to process them**

In [240]:
# Flag every row whose link does not point at the IAP-generated EWS site.
json2df['keep'] = ['ews.rightsindevelopment.org' not in link for link in json2df.url]

In [241]:
json2df.keep.value_counts()

True     12098
False     7902
Name: keep, dtype: int64

In [242]:
json2df = json2df[json2df.keep]

**Filter out File Uploads and other Non-Articles**

It appears that if the `summary` field is empty the item is not an article. 

In [243]:
json2df = json2df[json2df['summary'].notnull()]
print(json2df.shape)

(11036, 8)


## De-Dupe a Bit 

In [277]:
# Collapse rows that share the same article id, title and url into one row,
# keeping the latest publication time and the list of feeds the article came from.
grp_df = json2df.groupby(['article_id','title','url','keep']).agg({
    'content':'min',
    'summary':'min',
    'published':'max',
    # sorted() makes the joined label string deterministic; the iteration order of
    # a bare set of strings can differ from one interpreter run to the next.
    'feed_label': lambda x: ','.join(sorted(set(x)))}
    ).reset_index()

In [278]:
grp_df.shape

(9194, 8)

## Export - Pre Scrape

In [280]:
grp_df = grp_df.sample(frac=1)

In [281]:
grp_df.shape

(9194, 8)

In [282]:
grp_df.head()

Unnamed: 0,article_id,title,url,keep,summary,content,published,feed_label
7280,cd88676c,Tonga: World Bank Drone-Led Damage Assessments...,https://reliefweb.int/report/tonga/tonga-world...,True,"<table border=""0"" cellspacing=""3"" cellpadding=...",,2018-02-22 00:03:16,NEWS WB- All Streams
8244,e72cf1f7,China's unstoppable momentum,https://gooruf.com/uk/news/2018/01/17/china-un...,True,"<table border=""0"" cellspacing=""3"" cellpadding=...",,2018-01-17 06:56:52,NEWS AIIB - All Streams
2058,44fcbdb7,ADB agrees $375m loan for Madhya Pradesh irrig...,https://www.txfnews.com/Ticker/Redirect/84616f...,True,"<table border=""0"" cellspacing=""3"" cellpadding=...",,2018-06-01 03:32:41,NEWS ADB - All Streams
7052,c77bc305,Ethiopia: European Investment Bank Injects €3m...,http://allafrica.com/stories/201802060591.html,True,"<table border=""0"" cellspacing=""3"" cellpadding=...",,2018-02-06 06:46:17,NEWS EIB - All streams
6403,b6e91d1e,"AIIB approves two new applicants, expands memb...",http://www.xinhuanet.com/english/2018-05/02/c_...,True,"<table border=""0"" cellspacing=""3"" cellpadding=...",,2018-05-02 01:58:00,NEWS AIIB - All Streams


In [296]:
grp_df[['article_id','published','title','url','feed_label']].to_csv('../Temp_Output/article20k_pull4labeling.csv',index=False)

## Scrape the articles

**NOTE** - This is Slow - so may need to run in batches or overnight, or both. 

In [284]:
# pickle is otherwise only imported further down the file. Without this import the
# load below raised NameError, which the old bare `except:` swallowed, so the
# cache was silently reset.
import pickle

try:
    # NOTE: unpickling can execute arbitrary code - only load a cache file that this
    # notebook wrote itself.
    with open('../Temp_Output/article_cache.pkl', 'rb') as file:
        cache = pickle.load(file)
except FileNotFoundError:
    # Only a missing file starts a fresh cache. A corrupt or unreadable cache now
    # raises, so it is not silently replaced and later overwritten.
    cache = {}
    print('Creating New Cache.. Is this Correct')

In [340]:
def get_text_via_Article(url, article_id, retries=1, retry_wait=10):
    """
    Scrape one article with the newspaper3k module (http://newspaper.readthedocs.io/en/latest/)
    and store the result in the global `cache`.

    The scraped content is not returned. On success, cache[article_id] is set to the
    tuple (article.text, article.keywords). Articles already in the cache are skipped.

    url: address of the article to download. (str)
    article_id: key under which the result is cached.
    retries: number of extra download attempts after the first one fails. (int)
    retry_wait: seconds to wait before each retry. (int or float)

    Returns np.nan when the article could not be downloaded/parsed, None otherwise.
    """

    global cache

    if article_id in cache:
        return None

    article = Article(url)
    for attempt in range(retries + 1):
        if attempt:
            # Wait BEFORE downloading again, so the retry actually gives the remote
            # site time to recover.
            print('sleeping')
            sleep(retry_wait)
        article.download()
        try:
            article.parse()
        except ArticleException:
            print('Encountered Exception',url)
        else:
            break
    else:
        # Every attempt failed. Nothing is cached, so a later run tries again.
        print('Article Not Downloaded')
        return np.nan

    ## Now Process Article
    article.nlp()
    cache[article_id] = (article.text, article.keywords)
    return None
    

In [None]:
# Scrape every article in grp_df; results are collected in the global `cache`
# by get_text_via_Article, so its return value is ignored here.
for cnt, idx in enumerate(grp_df.index):
    # Progress marker every 25 articles.
    if cnt%25 == 0:
        print(cnt)
    row = grp_df.loc[idx]
    get_text_via_Article(row['url'],row['article_id'])

0
Article `download()` failed with 404 Client Error: Not Found for url: https://www.adb.org/node/415051?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+adb_news+%28ADB.org+News+Releases+RSS%29 on URL http://feedproxy.google.com/~r/adb_news/~3/-LPJT1oP4-w/415051
Encountered Exception http://feedproxy.google.com/~r/adb_news/~3/-LPJT1oP4-w/415051
sleeping
Article `download()` failed with 404 Client Error: Not Found for url: https://www.adb.org/node/415051?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+adb_news+%28ADB.org+News+Releases+RSS%29 on URL http://feedproxy.google.com/~r/adb_news/~3/-LPJT1oP4-w/415051
Article Not Downloaded
25


In [337]:
import pickle

# Persist the scrape cache so already-downloaded articles are skipped on the next run.
with open('../Temp_Output/article_cache.pkl', 'wb') as file:
    pickle.dump( cache, file)
# Attach the cached value to each row by article id.
# NOTE(review): test_df is not defined in the visible part of this notebook, and
# get_text_via_Article stores (text, keywords) tuples in the cache - confirm that
# 'scraped_content' is meant to hold the tuple rather than the text alone.
test_df['scraped_content'] = test_df['article_id'].map(cache)

## Review Results and add in Some Language Detection

IAP only has content in English currently, so tagging articles in other languages is likely too complicated at this time, as it would also involve a translation step. Therefore we may want to filter out non-English-language articles. 

Newspaper3k has this functionality, but it is slow; `langdetect` (used below) works pretty fast. 

In [45]:
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException

In [48]:
# Preview language detection on the first rows of test_df: print the detected
# languages with their probabilities, followed by the first 300 characters of the text.
for i in test_df.head().scraped_content:
    try:
        print(detect_langs(i))
    except LangDetectException:  
        # Skip entries for which langdetect raises; nothing is printed for them.
        continue
    print(i[0:300],
         '\n')
    print ('*******')

[en:0.9999952972346535]
European Union institution the European Investment Bank (EIB) has renewed its partnership with French port Marseille Fos under a €50 million funding agreement to support five key development projects.



The projects, which require a total investment of €136 million, include connecting the two exist 

*******
[en:0.9999991131684797]
India signed a USD 500 million (Rs 3,371 crore) loan pact with World Bank today to provide additional financing for PMGSY rural road projects.

The loan has a 3-year grace period, and a maturity of 10 years, the finance Ministry said in a release.

It will provide additional financing for the Pradha 

*******
[en:0.9999952599411848]
9700 Jamaican families enrolled in study

Data on fathers’ impact on child development collected for the first time

The UWI JA KIDS Birth Cohort Study Research Team will host a conference at the University of the West Indies from May 31 to June 1 to share ground-breaking findings from their seven-y 

***

# TODO 

1. Detect Language
2. Error Test on larger set
3. Manually verify the extracted text is generally correct 
4. Extract article content for all articles in DataDive dataset (Could take a long time)
5. Method for Identifying articles previously scraped (we need a unique identifier so we can only scrape new articles in the future) -- This is just something to keep in mind. 

-------------------

# End