In [1]:
%%capture

# Helpers expected to be defined by running path_manager.ipynb below:
#   get_corpus_path
#   get_txt_orig_path

%run ./path_manager.ipynb

In [2]:
import json
import os
import re

import dateutil.parser
import pandas as pd

In [3]:
def generic_imf_normalizer(x):
    """Normalize whitespace and `;` separators in a single metadata value.

    Lists are joined with a single space and bytes are decoded as UTF-8
    (undecodable bytes are ignored). In the resulting string, every run of
    whitespace is collapsed to one space, leading/trailing whitespace is
    stripped, and each `;` (together with any whitespace around it) is
    replaced by `,`.

    Parameters
    ----------
    x : list of str, str, bytes, or any other object
        The value to normalize.

    Returns
    -------
    str
        The normalized text when `x` is a list, str or bytes.
    object
        `x` itself, unchanged, for every other type (e.g. None, NaN, numbers).

    Raises
    ------
    TypeError
        If `x` is a list containing non-string items (raised by `str.join`).
    """
    if isinstance(x, list):
        x = ' '.join(x)
    elif isinstance(x, bytes):
        x = x.decode('utf-8', 'ignore')
    elif not isinstance(x, str):
        return x

    # Raw strings: `\s` inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    collapsed = re.sub(r'\s+', ' ', x).strip()
    return re.sub(r'\s*;\s*', ',', collapsed)

In [4]:
# Scraped header label -> column name used in the metadata table.
non_standard_headers = {
    'Author/Editor': 'authors',
    'Electronic Access': 'electronic_access',
    'Format': 'format',
    'ISBN/ISSN': 'isbn',
    'Pages': 'pages',
    'Price': 'price',
    'Publication Date': 'pub_date',
    'Stock No': 'stock',
    'Summary': 'summary',
}

# Scraped fields that are not carried over into the metadata table.
drop_fields = set(['Series'])

# Directories used to build the per-document text file paths.
path_original_dir = '/NLP/CORPUS/IMF/TXT_ORIG'
path_clean_dir = '/NLP/CORPUS/IMF/TXT_CLEAN'

# NOTE(review): `imf_all` is not referenced by any other cell visible in this notebook.
imf_all = pd.read_json(os.path.join(get_corpus_path('IMF'), 'imf-all.json'))


def _flatten_imf_record(record):
    """Turn one scraped record into a flat dict with renamed keys.

    Keys listed in `drop_fields` are skipped, keys found in
    `non_standard_headers` are renamed, and the first entry of `files`
    (if any) is merged in and used to derive `id`, `path_original` and
    `path_clean`. Records with an empty `files` list get empty file fields
    and `id` set to None.
    """
    flat = {'corpus': 'imf'}

    for key, value in record.items():
        if key in drop_fields:
            continue

        if key != 'files':
            flat[non_standard_headers.get(key, key)] = value
            continue

        if len(value) > 0:
            # Only the first file entry is used.
            flat.update(value[0])
            flat['id'] = 'imf_{}'.format(os.path.basename(flat['path']).split('.')[0])
            flat['path_original'] = os.path.join(path_original_dir, flat['id'] + '.txt')
            flat['path_clean'] = os.path.join(path_clean_dir, flat['id'] + '.txt')
        else:
            flat['checksum'] = ''
            flat['path'] = ''
            flat['url'] = ''
            flat['id'] = None
            flat['path_original'] = ''
            flat['path_clean'] = ''

    return flat


# imf-detail.json is malformed, so the raw text is patched before parsing:
# the two replacements below splice stray `][` / `}[` boundaries that precede
# a `{"title"` object into plain `,` separators.
with open(os.path.join(get_corpus_path('IMF'), 'imf-detail.json')) as fl:
    raw_detail = fl.read()

raw_detail = raw_detail.replace('][\n{"title"', ',{"title"')
raw_detail = raw_detail.replace('}[\n{"title"', '},{"title"')

data = [_flatten_imf_record(record) for record in json.loads(raw_detail)]
imf_detail = pd.DataFrame(data)

# Normalize whitespace and `;` separators in every column.
for c in imf_detail.columns:
    imf_detail[c] = imf_detail[c].map(generic_imf_normalizer)

In [5]:
# Discard rows whose `id` is missing, then keep a single row per `id`.
# When an `id` occurs more than once, the last occurrence is assumed to be
# the better version.
imf_detail = (
    imf_detail
    .dropna(subset=['id'])
    .drop_duplicates(subset='id', keep='last')
)

In [6]:
# (rows, columns) of the detail table after dropping missing and duplicate ids.
imf_detail.shape

(14150, 20)

In [7]:
# Columns of the metadata CSV, in output order.
METADATA_COLS = [
    # Identification and file locations
    'corpus',
    'id',
    'path_original',
    'path_clean',
    'filename_original',
    'year',

    # Document description
    'major_doc_type',
    'doc_type',
    'author',
    'collection',
    'title',
    'journal',
    'volume',

    # Publication details and source links
    'date_published',
    'digital_identifier',
    'topics_src',
    'url_pdf',
    'url_txt',
    'language_src',

    # Geography
    'adm_region',
    'geo_region',
    'country',

    # Not yet available at this stage:
    #   'language_detected', 'language_score', 'tokens'

    # WB specific fields ('wb_environmental_category' is currently left out)
    'wb_lending_instrument',
    'wb_product_line',
    'wb_major_theme',
    'wb_theme',
    'wb_sector',
    'wb_subtopic_src',
    'wb_project_id',
]

In [8]:
# List the columns currently present in the detail table.
imf_detail.columns

Index(['authors', 'checksum', 'corpus', 'download_url', 'electronic_access',
       'file_urls', 'format', 'id', 'isbn', 'language', 'pages', 'path',
       'path_clean', 'path_original', 'price', 'pub_date', 'stock', 'summary',
       'title', 'url'],
      dtype='object')

In [9]:
# Preview the first two records of the detail table.
imf_detail.head(2)

Unnamed: 0,authors,checksum,corpus,download_url,electronic_access,file_urls,format,id,isbn,language,pages,path,path_clean,path_original,price,pub_date,stock,summary,title,url
3088,"Nigel A Chalk,Michael Keen,Victoria J Perry",6b4942ae1ded3aa38da9b596d4e75fa1,imf,/~/media/Files/Publications/WP/2018/wp18185.ashx,Free Full Text . Use the free Adobe Acrobat Re...,http://www.imf.org/~/media/Files/Publications/...,Paper,imf_6cba1c84a4fb1d3dbf4dfbbd011f115eb03d1d68,9781484372548/1018-5941,English,48,full/6cba1c84a4fb1d3dbf4dfbbd011f115eb03d1d68....,/NLP/CORPUS/IMF/TXT_CLEAN/imf_6cba1c84a4fb1d3d...,/NLP/CORPUS/IMF/TXT_ORIG/imf_6cba1c84a4fb1d3db...,$18.00 (Academic Rate:$18.00),"August 7, 2018",WPIEA2018185,This paper assesses the landmark Tax Cuts and ...,The Tax Cuts and Jobs Act: An Appraisal,http://www.imf.org/~/media/Files/Publications/...
3089,International Monetary Fund. Asia and Pacific ...,9e2190d86a05a390f9e1663506ecaae7,imf,/~/media/Files/Publications/CR/2018/cr18255.ashx,Free Full Text . Use the free Adobe Acrobat Re...,http://www.imf.org/~/media/Files/Publications/...,Paper,imf_b5b7de057454edbaad791b1eb067d6a3cc88cbfd,9781484373200/1934-7685,English,45,full/b5b7de057454edbaad791b1eb067d6a3cc88cbfd....,/NLP/CORPUS/IMF/TXT_CLEAN/imf_b5b7de057454edba...,/NLP/CORPUS/IMF/TXT_ORIG/imf_b5b7de057454edbaa...,$18.00 (Academic Rate:$18.00),"August 6, 2018",1INDEA2018004,Selected Issues,India : Selected Issues,http://www.imf.org/~/media/Files/Publications/...


# Cleaning

In [10]:
# Build a separate table in which every column of `imf_detail` has been passed
# through `generic_imf_normalizer`; `imf_detail` itself is left untouched.
clean_imf_detail = pd.DataFrame({
    column: imf_detail[column].map(generic_imf_normalizer)
    for column in imf_detail.columns
})

In [11]:
# Inspect the first few normalized `price` values.
clean_imf_detail['price'].head(5)

3088    $18.00 (Academic Rate:$18.00)
3089    $18.00 (Academic Rate:$18.00)
3090    $18.00 (Academic Rate:$18.00)
3091                             Free
3092                             Free
Name: price, dtype: object

In [12]:
# Show `pub_date` values longer than 30 characters. In the recorded output
# these hold non-date text, i.e. values that landed in the wrong column.
imf_detail['pub_date'][imf_detail['pub_date'].str.len() > 30].head(5)

3330    Corporate income taxes Developed countries Eme...
3356    Caribbean Fiscal consolidation Fiscal policy G...
3433    This paper considers two options on when and h...
3448    Albania Dollarization Euro Area Interest rates...
3490    Financial data Gross domestic product National...
Name: pub_date, dtype: object

In [13]:
# For some reason, the data is again malformed: in some rows the values sit in
# the wrong columns. For that known shift ("v1"), each key below is the column
# that currently holds a value and each value is the column it belongs to.
v1_invalid_to_valid_map = {
    'format': 'price',
    'isbn': 'pub_date',
    'pages': 'format',
    'price': 'stock',
    'stock': 'isbn',
    'pub_date': 'summary'
}

v1_valid_to_invalid_map = {j: i for i, j in v1_invalid_to_valid_map.items()}

# Maximum number of rows allowed to match neither the expected layout nor the
# v1 shift before the notebook refuses to continue.
MAX_UNEXPECTED_MALFORMED_ROWS = 30


def _is_parseable_date(value):
    """Return True if `dateutil` can parse `value` as a date, else False."""
    try:
        dateutil.parser.parse(value)
    except (ValueError, OverflowError, TypeError):
        # ValueError covers dateutil's ParserError (unknown string format);
        # TypeError covers non-string values such as None/NaN.
        return False
    return True


# ids of rows whose date is found in the v1-shifted column instead of `pub_date`.
v1_invalid_reformat_ids = []

# ids of rows where neither `pub_date` nor the v1-shifted column holds a date.
v2_invalid_reformat_ids = []

for _, row in imf_detail.iterrows():
    if _is_parseable_date(row['pub_date']):
        continue

    if _is_parseable_date(row[v1_valid_to_invalid_map['pub_date']]):
        v1_invalid_reformat_ids.append(row['id'])
    else:
        v2_invalid_reformat_ids.append(row['id'])


if len(v2_invalid_reformat_ids) > MAX_UNEXPECTED_MALFORMED_ROWS:
    raise ValueError('Too many unexpected malformed data!!!')


# Move the v1 rows' values back into the columns they belong to.
v1_temp_df = imf_detail[imf_detail['id'].isin(v1_invalid_reformat_ids)].copy()
v1_temp_df = v1_temp_df.drop('summary', axis=1)  # summary will be mapped from the malformed pub_date
v1_temp_df = v1_temp_df.rename(columns=v1_invalid_to_valid_map)

valid_imf_detail = imf_detail[~imf_detail['id'].isin(v1_invalid_reformat_ids)]

# reset_index() without drop=True keeps the previous index as an `index` column.
imf_detail = pd.concat([valid_imf_detail, v1_temp_df], axis=0).reset_index()

# Drop the rows that could not be repaired. Without this, a short but
# unparseable `pub_date` would pass the length filter below and make the date
# parsing in the next step fail.
imf_detail = imf_detail[~imf_detail['id'].isin(v2_invalid_reformat_ids)]

# Some rows have the summary in the pub_date. The explicit copy lets later
# cells add columns without pandas' SettingWithCopyWarning.
imf_detail = imf_detail[imf_detail['pub_date'].str.len() < 50].copy()

# Build other core fields

In [14]:
# Parse the publication dates once and reuse the result for every
# date-derived column.
parsed_pub_dates = pd.to_datetime(imf_detail['pub_date'].map(dateutil.parser.parse))
imf_detail['pub_date'] = parsed_pub_dates

# (column name, values) pairs for the remaining core fields, assigned in this
# order so the resulting column order is unchanged.
derived_columns = [
    ('year', parsed_pub_dates.dt.year),
    ('filename_original', imf_detail['download_url'].map(os.path.basename)),
    ('author', imf_detail['authors']),
    ('language_src', imf_detail['language']),
    ('date_published', parsed_pub_dates.dt.strftime('%m/%d/%Y')),
    ('digital_identifier', imf_detail['isbn']),
    ('url_pdf', imf_detail['url']),
]

for column_name, column_values in derived_columns:
    imf_detail[column_name] = column_values

In [15]:
# Metadata columns that the detail table does not provide, in METADATA_COLS order.
not_available_fields = [
    field for field in METADATA_COLS
    if field not in imf_detail.columns
]

# Print one missing column name per line.
for field in not_available_fields:
    print(field)

major_doc_type
doc_type
collection
journal
volume
topics_src
url_txt
adm_region
geo_region
country
wb_lending_instrument
wb_product_line
wb_major_theme
wb_theme
wb_sector
wb_subtopic_src
wb_project_id


In [16]:
# Add every missing metadata column to the table, filled with None.
for missing_field in not_available_fields:
    imf_detail[missing_field] = None

In [17]:
# Write the metadata columns, in METADATA_COLS order, to
# `<corpus path>/<corpus id, lower-cased>_metadata.csv` without the index.
corpus_id = 'IMF'
fname = f"{corpus_id.lower()}_metadata.csv"
metadata_csv_path = os.path.join(get_corpus_path(corpus_id), fname)

metadata_table = imf_detail.reset_index()[METADATA_COLS]
metadata_table.to_csv(metadata_csv_path, index=False)