# import the necessary package

In [1]:
import pandas as pd
import pickle
import random
pd.set_option('display.max_columns',60)
%matplotlib inline
import datetime

In [2]:
newsData_path = '../Data/Feedly_Processed_DF_cleaned.pkl'
countrylabel_path = '../Data/Labeled_Data/countries.csv'
banklabel_path = '../Data/Labeled_Data/banks.csv'

# load the dataset

**News Articles ** 

In [3]:
newsData = pd.read_pickle(newsData_path)

# Country Tags 

**Simple demo **

country code names : http://www.geonames.org/countries/

In [4]:
from geotext import GeoText

In [5]:
countryLabels = pd.read_csv(countrylabel_path)

In [6]:
newsData.head()

Unnamed: 0,article_id,title,url,feed_label,content,published,summary,article_text,article_keywords,article_text_len,top_lang
10900,eebb9702,"India, World Bank sign financing agreement for...",http://www.abplive.in/business/india-world-ban...,NEWS WB- All Streams,,2017-12-21 09:22:12,"<table border=""0"" cellspacing=""3"" cellpadding=...","New Delhi [India], Dec 20 (ANI): A financing a...","[institutes, india, skill, financing, training...",1031,en
4268,6832ce57,Rs 40000-crore development projects in limbo i...,http://www.moneycontrol.com/news/business/econ...,NEWS AIIB - All Streams,,2017-12-10 09:40:00,"<table border=""0"" cellspacing=""3"" cellpadding=...","Development projects worth more than Rs 40,000...","[development, crore, andhra, eaps, state, proj...",4390,en
1663,30f8f65e,https://www.the-american-interest.com/2018/01/...,https://www.the-american-interest.com/2018/01/...,NEWS AFDB- All Streams,,2018-01-03 12:21:54,"<table border=""0"" cellspacing=""3"" cellpadding=...",Ten Lessons\n\nDevelopment with Chinese Charac...,"[transitions, university, chinese, united, dev...",575,en
3789,5ec16472,$300 Million to Expand and Upgrade Infrastruct...,https://jis.gov.jm/300-million-to-expand-and-u...,NEWS IDB - All Streams,,2018-04-27 16:28:43,"<table border=""0"" cellspacing=""3"" cellpadding=...","Minister of Industry, Commerce, Agriculture an...","[research, development, agriculture, fisheries...",2621,en
6657,989c9942,ADB Provides $346 Million to Upgrade State Hig...,http://www.business-standard.com/article/news-...,NEWS ADB - All Streams,,2017-12-10 01:00:00,"<table border=""0"" cellspacing=""3"" cellpadding=...",The Asian Development Bank's (ADB) Board of Di...,"[highways, improvement, india, state, upgrade,...",2532,en


In [7]:
newsData.reset_index(inplace = True)

In [8]:
from collections import Counter

In [9]:
#iterate through title to look for country, then iterate though text

def get_top_countries(x):
    """
    given the text and title of an article
    parse out the names of the two most frequently mentioned
    countries in the order of frequency
    """
    geo = GeoText(x)
    counts = Counter(geo.countries)
    country_list =  [country.lower() for (country, counts) in counts.most_common(2)]
    return ", ".join([x for x in country_list]) 


In [10]:
#create combined column of article title and text
newsData['title_and_text'] = newsData['title'] + "\n" + newsData['article_text']

In [11]:
newsData['top_two_countries'] = newsData['title_and_text'].apply(lambda x : get_top_countries(x))


In [12]:
newsData['all_detected_countries'] = newsData['title_and_text'].apply(lambda x : get_top_countries(x))

In [13]:
dfshort = newsData[['article_id', 'top_two_countries', 'all_detected_countries']]
dfshort.to_csv('article_country_tags.csv')

In [14]:
dfshort.head()

Unnamed: 0,article_id,top_two_countries,all_detected_countries
0,eebb9702,india,india
1,6832ce57,india,india
2,30f8f65e,china,china
3,5ec16472,,
4,989c9942,india,india


In [15]:
#compare with labeled data and look at our accuracy


In [16]:
joined = countryLabels.merge(dfshort, how = 'inner', on = 'article_id')

In [17]:
df_matches = joined[['article_id', 'Country1','top_two_countries', 'all_detected_countries']]

In [18]:
df_matches.head()

Unnamed: 0,article_id,Country1,top_two_countries,all_detected_countries
0,10f9ed2,cambodia,"cambodia, japan","cambodia, japan"
1,c0eece9b,afghanistan,"afghanistan, afghanistan","afghanistan, afghanistan"
2,d1d79dd8,bangladesh,"philippines, bangladesh","philippines, bangladesh"
3,f0d65e5,thailand,thailand,thailand
4,4a557358,thailand,"thailand, cambodia","thailand, cambodia"


In [19]:
def get_list(x):
    return x.split(',')

In [20]:
df_matches['country_list'] = df_matches['Country1'].apply(lambda x: get_list(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
df_matches['top_two_list'] = df_matches['top_two_countries'].apply(lambda x: get_list(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
predictions = list(df_matches['top_two_list'])

In [23]:
actual= list(df_matches['country_list'])

In [24]:
# not handling if there are ties of two contries having the same counts...
# check against labelled data

In [25]:
import numpy as np
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score / min(len(actual), k)
def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [26]:
mapk(actual, predictions, k = 4)

0.7597619047619049

In [27]:
newsData.head()

Unnamed: 0,index,article_id,title,url,feed_label,content,published,summary,article_text,article_keywords,article_text_len,top_lang,title_and_text,top_two_countries,all_detected_countries
0,10900,eebb9702,"India, World Bank sign financing agreement for...",http://www.abplive.in/business/india-world-ban...,NEWS WB- All Streams,,2017-12-21 09:22:12,"<table border=""0"" cellspacing=""3"" cellpadding=...","New Delhi [India], Dec 20 (ANI): A financing a...","[institutes, india, skill, financing, training...",1031,en,"India, World Bank sign financing agreement for...",india,india
1,4268,6832ce57,Rs 40000-crore development projects in limbo i...,http://www.moneycontrol.com/news/business/econ...,NEWS AIIB - All Streams,,2017-12-10 09:40:00,"<table border=""0"" cellspacing=""3"" cellpadding=...","Development projects worth more than Rs 40,000...","[development, crore, andhra, eaps, state, proj...",4390,en,Rs 40000-crore development projects in limbo i...,india,india
2,1663,30f8f65e,https://www.the-american-interest.com/2018/01/...,https://www.the-american-interest.com/2018/01/...,NEWS AFDB- All Streams,,2018-01-03 12:21:54,"<table border=""0"" cellspacing=""3"" cellpadding=...",Ten Lessons\n\nDevelopment with Chinese Charac...,"[transitions, university, chinese, united, dev...",575,en,https://www.the-american-interest.com/2018/01/...,china,china
3,3789,5ec16472,$300 Million to Expand and Upgrade Infrastruct...,https://jis.gov.jm/300-million-to-expand-and-u...,NEWS IDB - All Streams,,2018-04-27 16:28:43,"<table border=""0"" cellspacing=""3"" cellpadding=...","Minister of Industry, Commerce, Agriculture an...","[research, development, agriculture, fisheries...",2621,en,$300 Million to Expand and Upgrade Infrastruct...,,
4,6657,989c9942,ADB Provides $346 Million to Upgrade State Hig...,http://www.business-standard.com/article/news-...,NEWS ADB - All Streams,,2017-12-10 01:00:00,"<table border=""0"" cellspacing=""3"" cellpadding=...",The Asian Development Bank's (ADB) Board of Di...,"[highways, improvement, india, state, upgrade,...",2532,en,ADB Provides $346 Million to Upgrade State Hig...,india,india


## parsing out dates in the article text

In [28]:
import datefinder
import datetime

In [57]:
def extract_dates(text:str, base_date:datetime, min_date=datetime.date(2000,1,1),num_dates=5):
    """
    Given a text, parse all dates found in the content
    
    text: article content
    min_date: minimum date for date to be considered valid
    """
    matches = datefinder.find_dates(text,base_date=base_date)
    valid_found_dates = set()
    for match_date in matches:
        match_date = match_date.date()
        if match_date > min_date:
            valid_found_dates.add(match_date)
    valid_found_dates = list(valid_found_dates)[:num_dates]
    valid_found_dates = [d.strftime('%Y-%m-%d') for d in valid_found_dates]
    
    return list(valid_found_dates)


In [58]:
newsData.head().apply(lambda x : extract_dates (x['article_text'], x['published']), axis=1)

0                 [2022-11-30, 2017-12-20, 2017-12-26]
1    [2016-09-10, 2017-07-31, 2017-05-10, 2016-10-2...
2                                                   []
3                                         [2018-04-25]
4                             [2017-03-10, 2023-12-10]
dtype: object

In [59]:
newsData['dates']=newsData.apply(lambda x : extract_dates (x['article_text'], x['published']), axis=1)

In [60]:
#newsData[['article_id','dates']].to_csv('article_date.csv')

## parsing out bank information

In [61]:
bankLabel = pd.read_csv(banklabel_path)

In [62]:
bankLabel.head()

Unnamed: 0,article_id,published,title,url,feed_label,Bank1,Bank2
0,10f9ed2,2018-01-11,ADB Provides Support for Three Infrastructure ...,http://moderndiplomacy.eu/2018/01/11/adb-provi...,NEWS ADB - All Streams,adb,
1,c0eece9b,2018-05-13,ADB Helps Inaugurate New Power Distribution Ne...,http://feedproxy.google.com/~r/adb_news/~3/2My...,NEWS ADB - All Streams,adb,
2,d1d79dd8,2018-02-20,ADB Provides $360 Million for Rolling Stock to...,http://feedproxy.google.com/~r/adb_news/~3/v9s...,NEWS ADB - All Streams,adb,
3,f0d65e5,2018-02-25,ADB provides financing to Thailand's B.Grimm P...,https://www.dealstreetasia.com/stories/adb-b-g...,NEWS ADB - All Streams,adb,
4,4a557358,2018-02-26,ADB's $235m loan to support B.Grimm Power expa...,https://www.power-technology.com/news/adbs-235...,NEWS ADB - All Streams,adb,


In [63]:
bankNewsData = pd.merge(bankLabel,newsData[['article_id','article_text','article_keywords']], on = 'article_id')

In [64]:
bankNames = bankNewsData['Bank1'].unique()
bankNames

array(['adb', 'afdb', 'aiib', 'ebrd', 'eib', 'idb', 'wb'], dtype=object)

In [65]:
def bank_tagging(data, bank_names):
    """
    Search bank names through 'feed_label'. 
    There are 13 banks that are within IAP's interest which 
    is stored in 'bankNames' list.
    Bank name is then appended to the unique 'article_id'.
    """
    results =[]
    for i in data.index:
        banks =[]
        for bank in data.loc[i, 'feed_label'].replace('-', '').lower().split():
            if bank in bank_names:
                banks.append(bank)
            else: pass
        results.append([data.loc[i, 'article_id'], ",".join(banks)])
    results_df = pd.DataFrame(results,columns=['article_id', 'bank'])
    return results_df

In [66]:
results_df1 = bank_tagging(newsData, bankNames)

In [70]:
newsData = newsData.merge(results_df1, how ='inner', on = 'article_id')

In [71]:
newsData.head()

Unnamed: 0,index,article_id,title,url,feed_label,content,published,summary,article_text,article_keywords,article_text_len,top_lang,title_and_text,top_two_countries,all_detected_countries,dates,wb,bank_x,bank_y,bank
0,10900,eebb9702,"India, World Bank sign financing agreement for...",http://www.abplive.in/business/india-world-ban...,NEWS WB- All Streams,,2017-12-21 09:22:12,"<table border=""0"" cellspacing=""3"" cellpadding=...","New Delhi [India], Dec 20 (ANI): A financing a...","[institutes, india, skill, financing, training...",1031,en,"India, World Bank sign financing agreement for...",india,india,"[2022-11-30, 2017-12-20, 2017-12-26]",wb,wb,wb,wb
1,4268,6832ce57,Rs 40000-crore development projects in limbo i...,http://www.moneycontrol.com/news/business/econ...,NEWS AIIB - All Streams,,2017-12-10 09:40:00,"<table border=""0"" cellspacing=""3"" cellpadding=...","Development projects worth more than Rs 40,000...","[development, crore, andhra, eaps, state, proj...",4390,en,Rs 40000-crore development projects in limbo i...,india,india,"[2016-09-10, 2017-07-31, 2017-05-10, 2016-10-2...",aiib,aiib,aiib,aiib
2,1663,30f8f65e,https://www.the-american-interest.com/2018/01/...,https://www.the-american-interest.com/2018/01/...,NEWS AFDB- All Streams,,2018-01-03 12:21:54,"<table border=""0"" cellspacing=""3"" cellpadding=...",Ten Lessons\n\nDevelopment with Chinese Charac...,"[transitions, university, chinese, united, dev...",575,en,https://www.the-american-interest.com/2018/01/...,china,china,[],afdb,afdb,afdb,afdb
3,3789,5ec16472,$300 Million to Expand and Upgrade Infrastruct...,https://jis.gov.jm/300-million-to-expand-and-u...,NEWS IDB - All Streams,,2018-04-27 16:28:43,"<table border=""0"" cellspacing=""3"" cellpadding=...","Minister of Industry, Commerce, Agriculture an...","[research, development, agriculture, fisheries...",2621,en,$300 Million to Expand and Upgrade Infrastruct...,,,[2018-04-25],idb,idb,idb,idb
4,6657,989c9942,ADB Provides $346 Million to Upgrade State Hig...,http://www.business-standard.com/article/news-...,NEWS ADB - All Streams,,2017-12-10 01:00:00,"<table border=""0"" cellspacing=""3"" cellpadding=...",The Asian Development Bank's (ADB) Board of Di...,"[highways, improvement, india, state, upgrade,...",2532,en,ADB Provides $346 Million to Upgrade State Hig...,india,india,"[2017-03-10, 2023-12-10]",adb,adb,adb,adb


In [72]:
tagging_output = newsData[['article_id', 'top_two_countries', 'bank', 'dates']].to_dict(orient = 'index')