In [9]:
"""Module for parsing headlines from the source"""
import re
from collections import defaultdict

import pandas
import unicodedata
import geonamescache


In [10]:
# Load the geonamescache city and country tables once; they are plain dicts
# that the following cells index by record id / country code.
gc = geonamescache.GeonamesCache()
all_cities = gc.get_cities()
all_countries = gc.get_countries()

# Map every city record id to a compiled pattern that finds the city's
# ASCII-folded name as a whole word inside a headline.
prepared_dict = {}
for record_id, city_data in all_cities.items():
    # NFKD decomposes accented characters into base letter + combining mark;
    # encoding to ASCII with 'ignore' then drops the marks.
    prepared_city_name = (
        unicodedata.normalize('NFKD', city_data['name'])
        .encode('ascii', 'ignore')
        .decode('utf-8')
    )
    # The name is literal text, not a pattern: escape it so characters such as
    # '.', '(' or ')' are matched verbatim and cannot break compilation.
    # (?<!\w) / (?!\w) behave like \b for names that start and end with a word
    # character, and still work when a name starts or ends with punctuation.
    prepared_dict[record_id] = re.compile(
        r'(?<!\w){}(?!\w)'.format(re.escape(prepared_city_name))
    )


In [14]:
# Best city match found so far for each headline, keyed by the stripped headline text.
# 'score' is the number of whitespace-separated words in the matched city name, so a
# longer name ('Miami Beach', score 2) replaces a shorter one found in the same
# headline ('Miami', score 1). Example:
# {
#   'Mystery Virus Spreads in Recife, Brazil': {'score': 1, 'city_name': 'Recife', 'county_name': 'Brazil', ...},
#   'Zika Outbreak in Wichita Falls': {'score': 2, 'city_name': 'Wichita Falls', 'county_name': 'United States', ...}
# }
# NOTE: the key is spelled 'county_name' although it stores the country name; the
# spelling is kept because the cell that builds the DataFrame reads this exact key.
header_match_score = defaultdict(lambda: {
    'score': 0, 'city_name': '', 'county_name': '', 'country_code': '', 'longitude': '', 'latitude': ''
})

with open('./headlines.txt', 'r') as f_handler:
    for headline in f_handler:
        headline = headline.strip()
        # Fold the headline to ASCII once, the same way the city names were folded
        # before their patterns were compiled, instead of once per city.
        ascii_headline = unicodedata.normalize('NFKD', headline).encode('ascii', 'ignore').decode('utf-8')

        for record_id, reg_exp in prepared_dict.items():
            res = reg_exp.search(ascii_headline)
            # Skip misses and empty matches (an empty city name would match nothing useful).
            if not (res and res.group(0)):
                continue

            city_record = all_cities[record_id]
            city_name = city_record.get('name')
            city_score = len(city_name.split())

            # Only a strictly longer city name may replace the current best match.
            if header_match_score[headline]['score'] >= city_score:
                continue

            country_code = city_record['countrycode']
            country_record = all_countries.get(country_code)
            # Cities whose country code is missing from the country table are ignored.
            if not country_record:
                continue

            best_match = header_match_score[headline]
            best_match['score'] = city_score
            best_match['city_name'] = city_name
            best_match['county_name'] = country_record.get('name')
            best_match['latitude'] = city_record['latitude']
            best_match['longitude'] = city_record['longitude']
            best_match['country_code'] = country_code

In [15]:
# Flatten every headline that received a city match (non-zero score) into one row tuple.
prepared_list_of_headers_and_cities = [
    (
        title,
        match['city_name'],
        match['county_name'],
        match['country_code'],
        match['latitude'],
        match['longitude'],
    )
    for title, match in header_match_score.items()
    if match['score'] != 0
]

# Tabulate the rows and write the table out as JSON.
data_frame = pandas.DataFrame(
    data=prepared_list_of_headers_and_cities,
    columns=('headline', 'city', 'country', 'country_code', 'latitude', 'longitude'),
)
data_frame.to_json('./prepared_countries.json')

In [16]:
# Show the assembled headline/city/country table as this cell's output.
data_frame


Unnamed: 0,headline,city,country,country_code,latitude,longitude
0,Zika Outbreak Hits Miami,Miami,United States,US,25.77427,-80.19366
1,Could Zika Reach New York City?,New York City,United States,US,40.71427,-74.00597
2,First Case of Zika in Miami Beach,Miami Beach,United States,US,25.79065,-80.13005
3,"Mystery Virus Spreads in Recife, Brazil",Recife,Brazil,BR,-8.05389,-34.88111
4,Dallas man comes down with case of Zika,Dallas,United States,US,32.78306,-96.80667
...,...,...,...,...,...,...
601,Rumors about Rabies spreading in Jerusalem hav...,Jerusalem,Israel,IL,31.76904,35.21633
602,More Zika patients reported in Indang,Indang,Philippines,PH,14.19528,120.87694
603,Suva authorities confirmed the spread of Rotav...,Suva,Fiji,FJ,-18.14161,178.44149
604,More Zika patients reported in Bella Vista,Bella Vista,Argentina,AR,-27.03424,-65.30196
