In [1]:
"""Module for parsing headlines from the source"""
import re
from collections import defaultdict

import pandas
import unicodedata
import geonamescache


In [2]:
# Load the geonamescache lookup tables; countries are later looked up by each city's 'countrycode'.
gc = geonamescache.GeonamesCache()
all_cities = gc.get_cities()
all_countries = gc.get_countries()

# Map every city record id to a compiled whole-word pattern for its ASCII-folded name.
prepared_dict = {}
for record_id, city_data in all_cities.items():
    # NFKD decomposition followed by dropping non-ASCII bytes strips accents from the name.
    prepared_city_name = unicodedata.normalize('NFKD', city_data['name']).encode('ascii', 'ignore').decode('utf-8')
    # re.escape keeps punctuation inside a name (e.g. '.', '(', ')') literal instead of
    # letting it act as regex syntax, which could mis-match or fail to compile.
    prepared_dict[record_id] = re.compile(r'\b{}\b'.format(re.escape(prepared_city_name)))


In [4]:
# Best city match per headline. 'score' is the number of words in the matched city name, so a
# longer name wins over a shorter one that also matches (e.g. 'Miami Beach' over 'Miami').
# NOTE: the 'county_name' key holds the *country* name; the spelling is kept as-is because the
# cell that builds the DataFrame reads this exact key.
# {
#   'Mystery Virus Spreads in Recife, Brazil': {'score': 1, 'city_name': 'Recife', 'county_name': 'Brazil'},
#   'Zika Outbreak in Wichita Falls': {'score': 2, 'city_name': 'Wichita Falls', 'county_name': 'United States'}
# }
header_match_score = defaultdict(lambda: {'score': 0, 'city_name': '', 'county_name': ''})

with open('./headlines.txt', 'r') as f_handler:
    for headline in f_handler:
        headline = headline.strip()
        # Fold the headline to ASCII once, instead of once per city pattern.
        ascii_headline = unicodedata.normalize('NFKD', headline).encode('ascii', 'ignore').decode('utf-8')

        for record_id, reg_exp in prepared_dict.items():
            res = reg_exp.search(ascii_headline)
            # A zero-length match (possible when a city name folds to an empty string) is not a hit.
            if not res or not res.group(0):
                continue

            city = all_cities[record_id]
            city_name = city.get('name')
            city_score = len(city_name.split())

            # Entries are only created for headlines with at least one match.
            best_match = header_match_score[headline]
            if best_match['score'] >= city_score:
                continue

            best_match['score'] = city_score
            best_match['city_name'] = city_name

            # Always overwrite the country together with the city: if the new city's country
            # code is unknown, reset to '' rather than keeping the previous match's country.
            country = all_countries.get(city['countrycode'])
            best_match['county_name'] = country.get('name') if country else ''

In [5]:
# Flatten the per-headline matches into (headline, city, country) rows.
prepared_list_of_headers_and_cities = [
    (title, match['city_name'], match['county_name'])
    for title, match in header_match_score.items()
]

# One row per matched headline, in the order the matches were recorded.
data_frame = pandas.DataFrame(
    data=prepared_list_of_headers_and_cities,
    columns=('headline', 'city', 'country'),
)

In [6]:
# Print the plain-text rendering of the headline / city / country frame.
print(data_frame)


                                              headline           city  \
0                             Zika Outbreak Hits Miami          Miami   
1                      Could Zika Reach New York City?  New York City   
2                    First Case of Zika in Miami Beach    Miami Beach   
3              Mystery Virus Spreads in Recife, Brazil         Recife   
4              Dallas man comes down with case of Zika         Dallas   
..                                                 ...            ...   
601  Rumors about Rabies spreading in Jerusalem hav...      Jerusalem   
602              More Zika patients reported in Indang         Indang   
603  Suva authorities confirmed the spread of Rotav...           Suva   
604         More Zika patients reported in Bella Vista    Bella Vista   
605                     Zika Outbreak in Wichita Falls  Wichita Falls   

           country  
0    United States  
1    United States  
2    United States  
3           Brazil  
4    United States