In [8]:
# Load the headlines file into a list: one entry per line, trailing newline removed
with open("data/headlines.txt") as f:
    headlines = [line.rstrip('\n') for line in f]

In [9]:

import geonamescache
from unidecode import unidecode
import pandas as pd
import re
from collections import defaultdict


In [10]:
# Load the country and city tables from geonamescache.
# Both results are consumed further down via .values(), reading each record's 'name' field.
# NOTE(review): the name `gc` is also the name of a standard-library module; it is not
# imported in the visible cells, but keep the clash in mind if that ever changes.
gc = geonamescache.GeonamesCache()
countries = gc.get_countries()
cities = gc.get_cities()

In [11]:
# Pass every country and city name through unidecode; these lists are what the headline matching works from
country_list = [unidecode(record['name']) for record in countries.values()]
city_list = [unidecode(record['name']) for record in cities.values()]

In [12]:
def build_word_dict(source_list):
    '''Group the entries of source_list by their first word.

    The first word is everything before the first space character (the whole
    entry when it contains no space).

    Returns a defaultdict(list) mapping each distinct first word to the list
    of entries that start with it, in their original order.
    '''
    grouped = defaultdict(list)

    for entry in source_list:
        first_word, _, _ = entry.partition(' ')
        grouped[first_word].append(entry)

    return grouped

In [13]:

#Index each name list by first word: key = first word of a name,
#value = list of every name in the source list that starts with that word
country_dict = build_word_dict(country_list)
city_dict = build_word_dict(city_list)

In [14]:
def match_item(headline, item_dict):
    '''Search the headline for occurrences of the items in item_dict.

    Parameters:
        headline:  the headline text to search.
        item_dict: mapping from a first word to the list of candidate items
                   that start with that word (the structure produced by
                   build_word_dict). It is only read, never modified.

    Returns a tuple (best_match, all_matches):
        best_match  - the longest match by character count, as it is written
                      in the headline; on a tie the first one found wins.
        all_matches - every match found, in discovery order.
    Returns (None, None) when nothing matches.
    '''
    # Split the headline into words so only items whose first word actually
    # appears in the headline are tried, which limits the search space.
    words = headline.split(" ")

    # Drop a trailing possessive 's. Only the suffix is removed, so an 's
    # elsewhere in the word (e.g. "'s-Hertogenbosch") is left intact.
    words = [word[:-2] if word.endswith("'s") else word for word in words]
    # Drop a single trailing punctuation mark.
    words = [word[:-1] if word[-1:] in (",", "?", "!") else word for word in words]

    match_list = []
    for word in words:
        # .get() rather than indexing: indexing a defaultdict would insert an
        # empty list for every headline word, and a plain dict would raise.
        for item in item_dict.get(word, ()):
            # Escape the item so characters such as "." or "(" are matched
            # literally, and require that the match is not followed by a word
            # character so "New York" does not match inside "New Yorkers".
            pattern = r"\b(" + re.escape(item) + r")(?!\w)"
            result = re.search(pattern, headline, flags=re.IGNORECASE)
            if result is not None:
                match_list.append(result.group())

    if not match_list:
        return (None, None)

    # max() returns the first element among equally long matches.
    return (max(match_list, key=len), match_list)

In [15]:
#Build one record per headline with its best city and country match,
#then load all records into the output DataFrame
data = []
for headline in headlines:
    city, _ = match_item(headline, city_dict)
    country, _ = match_item(headline, country_dict)

    row = {'Headline': headline, 'City': city, 'Country': country}
    data.append(row)

df = pd.DataFrame(data)

In [16]:
#Show full cell text instead of truncating long headlines.
#None means "no limit"; the old -1 spelling is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
#Display 10 randomly chosen rows (no seed is set, so the rows differ between runs)
df.sample(10)

Unnamed: 0,Headline,City,Country
395,Herpes Vaccine is now Required in Mattoon,Mattoon,
549,Zika Reported in Ciudad Acuna,Ciudad Acuna,
195,Chikungunya Hits Denver,Denver,
380,Case of Measles Reported in Springdale,Springdale,
271,Rotavirus Vaccine is now Required in Starkville,Starkville,
309,Spanish flu spreading in Madrid,Madrid,
487,Spanish Flu Spreading through Madrid,Madrid,
251,Zika Strikes St. Petersburg,St. Petersburg,
319,Authorities are Worried about the Spread of Chickenpox in Richmond,Richmond,
467,Authorities are Worried about the Spread of Syphilis in Pyongyang,Pyongyang,


In [26]:
import pickle

In [32]:
#Open the output file in binary write mode ('wb' creates it, or truncates it if it exists).
#The handle stays open after this cell; it must be closed after writing so the
#data is flushed to disk.
picklefile = open('data/headline_city_country', 'wb')

In [33]:
#Pickle the dataframe, then close the write handle straight away (also on error)
#so the bytes are flushed to disk before the file is read back.
with picklefile:
    pickle.dump(df, picklefile)

In [34]:
#Read the pickle file back into a new dataframe.
#The with-block closes the read handle as soon as loading finishes, even if it raises.
#NOTE: pickle.load can execute arbitrary code from the file; only load files
#that this notebook (or another trusted source) produced.
with open('data/headline_city_country', 'rb') as picklefile:
    df2 = pickle.load(picklefile)

In [35]:
#Show full cell text instead of truncating long headlines.
#None means "no limit"; the old -1 spelling is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
#Display 10 randomly chosen rows of the reloaded dataframe (unseeded, so rows differ between runs)
df2.sample(10)

Unnamed: 0,Headline,City,Country
572,Varicella Exposure in Cambridge,Cambridge,
102,Zika afflicts patient in Calamba,Calamba,
478,Vineland authorities confirmed the spread of Chlamydia,Vineland,
333,Zika case confirmed in Lorain,Lorain,
432,Schools in Bridgeton Closed Due to Mumps Outbreak,Bridgeton,
183,New medicine wipes out Chikungunya in Tucson,Tucson,
142,Authorities are Worried about the Spread of Tuberculosis in Abuja,Abuja,
350,Spike of Dengue Cases in Stockholm,Stockholm,
51,Zika Virus Transmission Detected in Havana,Havana,
192,Zika Arrives in North Miami,North Miami,


In [36]:
#Close the handle currently bound to picklefile.
#Calling close() on a file that is already closed is a harmless no-op.
picklefile.close()