In [1]:
%%capture

# get_corpus_path
# get_txt_orig_path

%run ./path_manager.ipynb

In [35]:
import re
import pandas as pd
import os
import glob
from joblib import Parallel, delayed

In [3]:
class CountryDetector:
    countries_noise_pattern = re.compile('[^a-zÀ-ÖØ-öø-ÿ ]+')
    
    def __init__(
        self, countries_map_file,
    ):
        self.countries_map_file = countries_map_file
        self.build_countries_map()

    def build_countries_map(self):
        '''
        Generates a map between a variant of a country name into the appropriate standard code from a whitelist.
        Assume that the whitelist `countries_map_file` is an excel file without header and the first column is the country code to use.
        The `countries_map_file` must be in UTF-8 encoding.
        '''
        countries_map_df = pd.read_csv(self.countries_map_file, header=None, index_col=0)
        self.countries_map = {}
        for ckey, clist in countries_map_df.iterrows():
            for c in clist.dropna():
                c = self.countries_noise_pattern.sub('', c.lower().strip())
                ckey = ckey.strip()
                self.countries_map[c.lower()] = {'code': ckey}
            
    def preprocess_text(self, text):
        text = text.lower()
        
        # Reduce false-positive because the $ symbol is removed and the remaining us gets detected
        text = text.replace('us$', ' ')

        text = self.countries_noise_pattern.sub('', text)
        text = text.replace('us dollar', ' ')
        
        return text
    
    def detect_countries(self, text):
        text = self.preprocess_text(text)

        country_freq = {}
        
        for c, val in sorted(self.countries_map.items(), key=lambda x: -len(x[0])):
            data = text

            code = val['code']
            freq = data.count(f' {c} ')
            
            country_freq[code] = country_freq.get(code, 0) + freq
            text = text.replace(c, '')
        
        country_freq = {i: j for i, j in country_freq.items() if j > 0}
        return country_freq

    def detect_countries_from_file(self, fname, return_df=True):
        try:
            with open(fname) as fl:
                text = fl.read()
        except UnicodeDecodeError:
            with open(fname, 'rb') as fl:
                text = fl.read()
                text = str(text, 'utf-8', 'ignore')

        doc_id = os.path.basename(fname).split('.')[0]
            
        return_val = {doc_id: self.detect_countries(text)}
        
        if return_df:
            return_val = pd.DataFrame(return_val)
            
        return return_val

In [4]:
country_detector = CountryDetector('./whitelists/whitelist_countries_multilingual.csv')

In [5]:
N_JOBS = 256  # number of simultaneous jobs to deploy

In [6]:
corpus_ids = ['IMF', 'WB']

In [None]:
%%time

for corpus_id in corpus_ids:
    print(f'Processing corpus {corpus_id}...')
    
    TXT_ORIG_DIR = get_txt_orig_path(corpus_id)
    fname_generator = glob.iglob(os.path.join(get_txt_orig_path(corpus_id), f'{corpus_id.lower()}_*.txt'))

    corpus_countries = Parallel(n_jobs=N_JOBS)(delayed(country_detector.detect_countries_from_file)(fname) for fname in fname_generator)

    corpus_countries = pd.concat(corpus_countries, axis=1).T
    corpus_countries.index.name = 'id'
    
    corpus_countries['OCC_CTRY'] = corpus_countries.sum(axis=1)
    
    country_metadata_fname = os.path.join(os.path.dirname(TXT_ORIG_DIR), f'{corpus_id.lower()}_country_counts.csv')
    corpus_countries.to_csv(country_metadata_fname)

Processing corpus IMF...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


Processing corpus WB...


In [11]:
corpus_countries.shape

(249352, 245)

In [12]:
corpus_countries.head()

Unnamed: 0_level_0,ABW,AFG,AGO,AIA,ALB,AND,ARE,ARG,ARM,ASM,...,WBG,WLF,WSM,XKX,YEM,YUG,ZAF,ZMB,ZWE,OCC_CTRY
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
wb_10003825,,,,,,,,,,,...,,,,,,,47.0,,,139.0
wb_10000577,,,,,,,,30.0,,,...,,,,,,,13.0,,,137.0
wb_1000385,,,,,,,,,,,...,,,,,,,,,,4.0
wb_10000660,,,,,,,,2.0,,,...,,,,,,,,,,65.0
wb_1000384,,,,,,,,,,,...,,,,,,,,,,57.0


In [13]:
country_metadata_fname

'/home/wb536061/wbes2474/NLP/CORPUS/WB/wb_country_counts.csv'

# Dump data to database

In [66]:
import pymongo
import pandas as pd

# Tunnel from wbes2474 to server07  -> ssh -NL 27018:localhost:27017 wbes2474
client = pymongo.MongoClient(port=27017)  # host=wbes2474')

In [67]:
import os

In [68]:
db = client['nlp']
collection = db['countries']

In [4]:
%%time
wb_corpus_countries = pd.read_csv('/home/wb536061/wbes2474/NLP/CORPUS/WB/wb_country_counts.csv', index_col=0)
imf_corpus_countries = pd.read_csv('/home/wb536061/wbes2474/NLP/CORPUS/IMF/imf_country_counts.csv', index_col=0)

In [7]:
%%time
corpus_countries = []

for corpus_id in ['IMF', 'WB']:
    country_filepath = os.path.join(get_corpus_path(corpus_id), f'{corpus_id.lower()}_country_counts.csv')
    cc = pd.read_csv(country_filepath, index_col=0)
    
    corpus_countries.extend(cc.fillna(0).astype(int).reset_index().rename(columns={'id': '_id'}).to_dict('records'))

CPU times: user 1min 4s, sys: 4.23 s, total: 1min 8s
Wall time: 1min 8s


In [69]:
len(corpus_countries)

263794

In [9]:
%%time
# https://stackoverflow.com/a/17533368
now = pd.datetime.utcnow().isoformat()

for document in corpus_countries:
    collection.update_one(
        {"_id": document["_id"]},
        {
            "$setOnInsert": {"insertion_date": now},
            "$set": dict(list(document.items()) + [('last_update_date', now)])
        },
        upsert=True,
    )

CPU times: user 2min 44s, sys: 17.5 s, total: 3min 1s
Wall time: 17min 21s


In [36]:
len(corpus_countries)

263794

In [34]:
%%time
v = pd.DataFrame(corpus_countries[0:10000])
# v[v == 0] = None

v = v.dropna()
v
# .reset_index().to_dict('records')

CPU times: user 1.25 s, sys: 206 ms, total: 1.46 s
Wall time: 1.47 s


Unnamed: 0,_id,ABW,AFG,AGO,AIA,ALB,AND,ARE,ARG,ARM,...,WBG,WLF,WSM,XKX,YEM,YUG,ZAF,ZMB,ZWE,OCC_CTRY
0,imf_006452310f310c9630ce02a1f17884300c079b9d,0,0,0,0,0,0,0,0,0,...,0,0,6,0,0,0,0,0,0,136
1,imf_000169111a90a98ac66bb6d07e4682d6be8b6ac3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,2,0,0,270
2,imf_0065cb304805ed9acbfde8f6ef0596af2132c188,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,67
3,imf_0004fdded18df40f3680c5a2f343f773d8f18d6f,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,105
4,imf_00679871c9baeb1d4828a0978942fa567f9d27ae,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,imf_b53760fc9a900e7fd617efc4fdf924eb8b25c1d2,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,112
9996,imf_b538bbbf0c80523abc92ba0f85c6c0fd4d74ee76,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,3,0,0,425
9997,imf_b53a1fe3e821c5bdf8ee0be8a45797298badb7f4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,20
9998,imf_b53a61bd3cb7750a30e11632fbf230053699f07b,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,96


In [12]:
collection.count_documents({})

263794

In [70]:
meta_collection = db['metadata']

In [71]:
meta_collection.count_documents({})

236174

In [72]:
ids = [i['_id'] for i in corpus_countries]

In [73]:
%%time
res = meta_collection.find({'_id': {'$in': ids}}, projection=['_id', 'year'])

CPU times: user 67 µs, sys: 0 ns, total: 67 µs
Wall time: 74.9 µs


In [74]:
%%time
year_df = pd.DataFrame(list(res))
common_ids = set(year_df['_id'])
year_df.shape

CPU times: user 836 ms, sys: 69.4 ms, total: 906 ms
Wall time: 3.56 s


(236156, 2)

In [75]:
year_df.head()

Unnamed: 0,_id,year
0,imf_000169111a90a98ac66bb6d07e4682d6be8b6ac3,2012
1,imf_0004fdded18df40f3680c5a2f343f773d8f18d6f,2000
2,imf_0005ab3ce73fd116db64b77bb58fb05377daae61,2009
3,imf_000866bc21c58285ec2521640173b4c512337f48,1998
4,imf_000b9ff9931f38d47290baba821c3cdf096de9ce,2010


In [35]:
%%time
corpus_countries_df = pd.DataFrame(corpus_countries)

CPU times: user 25.7 s, sys: 3.58 s, total: 29.3 s
Wall time: 29.4 s


In [37]:
corpus_countries_df = corpus_countries_df.set_index('_id').drop('OCC_CTRY', axis=1)

In [79]:
%%time
# https://stackoverflow.com/a/17533368
# now = pd.datetime.utcnow().isoformat()
excluded_ids = []

for _id, countries in corpus_countries_df.iterrows():
    if _id not in common_ids:
        excluded_ids.append(_id)
        # print(f'{_id} not in metadata...')
        
    countries = countries[countries > 0].astype(int).to_dict()
    meta_collection.update_one(
        {"_id": _id},
        {
            "$setOnInsert": {"insertion_date": now},
            "$set": {'last_update_date': now, 'countries': countries}
        },
        upsert=True,
    )

CPU times: user 7min 8s, sys: 21.6 s, total: 7min 29s
Wall time: 14min 2s


In [57]:
row[row > 100].to_dict()

{}

In [53]:
corpus_countries_df.fillna(0, inplace=True)

In [116]:
%%time
joined_df = corpus_countries_df.merge(year_df, how='inner', on='_id')  # .drop('OCC_CTRY', axis=1)

CPU times: user 298 ms, sys: 194 ms, total: 492 ms
Wall time: 491 ms


In [118]:
%%time

panels = []
for year, group in joined_df.set_index('_id').groupby('year'):
    try:
        # laziness
        int(year)
    except:
        continue
    print(year)
    panel = (group.drop('year', axis=1).T / group.drop('year', axis=1).sum(axis=1)).sum(axis=1).reset_index().rename(columns={'index': 'iso_alpha', 0: 'popularity'})
    panel['year'] = int(year)
    panels.append(panel)
    
    
pp = pd.concat(panels, axis=0)
pp.reset_index(drop='index').to_csv('../APP/frontend/country_popularity.csv', index=False)

1946.0
1947.0
1948.0
1949.0
1950.0
1951.0
1952.0
1953.0
1954.0
1955.0
1956.0
1957.0
1958.0
1959.0
1960.0
1961.0
1962.0
1963.0
1964.0
1965.0
1966.0
1967.0
1968.0
1969.0
1970.0
1971.0
1972.0
1973.0
1974.0
1975.0
1976.0
1977.0
1978.0
1979.0
1980.0
1981.0
1982.0
1983.0
1984.0
1985.0
1986.0
1987
1988.0
1989.0
1990
1991
1992.0
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019.0
CPU times: user 1min 1s, sys: 1.82 s, total: 1min 3s
Wall time: 1min 3s


In [82]:
import pandas as pd 
import xml.etree.ElementTree as et 

In [80]:
with open('/usr/share/xml/iso-codes/iso_3166.xml') as fl:
    iso_3166 = fl.read()

In [90]:
xtree = et.parse("/usr/share/xml/iso-codes/iso_3166.xml")
xroot = xtree.getroot() 

In [91]:
df_cols = ["name", "iso3", "iso2"]
rows = []

for node in xroot: 
    s_name = node.attrib.get("name")
    alpha_3_code = node.attrib.get("alpha_3_code") if node is not None else None
    alpha_2_code = node.attrib.get("alpha_2_code") if node is not None else None
    
    rows.append({"name": s_name, "iso3": alpha_3_code, 
                 "iso2": alpha_2_code})

out_df = pd.DataFrame(rows, columns = df_cols)

In [101]:
countries_map = pd.read_csv('./whitelists/whitelist_countries_multilingual.csv', header=None)

In [111]:
pd.Series(countries_map[[0, 1]].rename(columns={0: 'iso3', 1: 'name'}).set_index('iso3').to_dict()['name']).to_json('./whitelists/country_iso3_map.json')