### Imports

In [10]:
import pandas as pd
import numpy as np

# Let pandas print up to 1000 rows before truncating displayed frames
pd.set_option('display.max_rows', 1000)

### Read Data

In [2]:
# Load the combined US records from the data folder
df_us = pd.read_csv(filepath_or_buffer = './Data/combined_us.csv')

### Automated State Inferences

In [5]:
# Create a Series mapping each city name to its most common *specified* state code
# Aggregation method from:
# https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
# Records whose city or state is 'unspecified' do not get a vote: otherwise
# 'unspecified' itself could win as the "most common state" for a city, and the
# pseudo-city 'unspecified' could be mapped onto a real state.
mask_known = (df_us['city'] != 'unspecified') & (df_us['state'] != 'unspecified')
df_inf_state = df_us[mask_known].groupby('city')['state'].agg(lambda x: x.value_counts().index[0])

# Keep an entry for every city present in the data so lookups by city never miss;
# cities without any usable vote map to 'unspecified'
df_inf_state = df_inf_state.reindex(df_us['city'].dropna().unique(), fill_value = 'unspecified')

In [6]:
# Define function assigning most common states above to unspecified states in dataframe
def infer_state(row, state_by_city = None):
    """Return the state for a record, inferring it from the city when unspecified.

    Parameters
    ----------
    row : mapping or pandas.Series
        One record; must provide the 'state' and 'city' keys.
    state_by_city : pandas.Series or dict, optional
        Lookup from city name to state code. Defaults to the notebook-level
        ``df_inf_state``.

    Returns
    -------
    The record's own state when it is not 'unspecified'; otherwise the state
    stored for the record's city, or 'unspecified' when the city has no entry
    in the lookup.
    """
    if state_by_city is None:
        state_by_city = df_inf_state

    if row['state'] != 'unspecified':
        return row['state']

    # .get() instead of [] so a city missing from the lookup (e.g. NaN) does not raise KeyError
    return state_by_city.get(row['city'], 'unspecified')

In [7]:
# Run the state inference over every record and store the result as the state column
inferred_states = df_us.apply(infer_state, axis = 'columns')
df_us['state'] = inferred_states

### Manual Inferences

In [8]:
# Columns carried into the manual-assessment table
list_cols = ['place_type','place_full_name','city','state']

# Records of interest: no city, but a known state, and not a state-level ('admin') location
mask_city = df_us['city'].eq('unspecified')
mask_state = df_us['state'].ne('unspecified')
mask_type = df_us['place_type'].ne('admin')

# One row per distinct combination of the assessment columns
df_man = df_us.loc[mask_city & mask_state & mask_type, list_cols]
df_man = df_man.drop_duplicates().reset_index(drop = True)

In [12]:
# Read in the current manual adjustment CSV. If none exists, skip this step
try:
    df_man_old = pd.read_csv('./Data/manual_inferences.csv', index_col = 0)

except FileNotFoundError:
    # Only a missing file means "first run"; any other error should surface
    # instead of silently discarding previously entered inferences.
    # Create column to track corrected records
    df_man['corrected'] = 0
    print('No existing manual inferences file.')

else:
    # Put the stored records first so that, for a repeated place name, the
    # already-corrected record wins over the newer one without an inferred city.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_man = (pd.concat([df_man_old, df_man], ignore_index = True)
                .drop_duplicates(subset = ['place_full_name'], keep = 'first')
                .reset_index(drop = True))

    # Newly added records have no 'corrected' value yet; mark them explicitly as uncorrected
    df_man['corrected'] = df_man['corrected'].fillna(0)

In [13]:
def input_inference(row):
    """Return the city for a record, asking the user when it is not yet corrected.

    Records whose 'corrected' value equals 1 keep their stored 'city'. For any
    other record the user is prompted with the record's 'place_full_name' and
    the typed answer is returned.
    """
    if row['corrected'] != 1:
        place = row['place_full_name']
        return input(f'Set {place} "city" as :')

    return row['city']

In [14]:
# Walk every record in the manual table, prompting for a city where one has not been corrected yet
city_answers = df_man.apply(input_inference, axis = 'columns')
df_man['city'] = city_answers

In [15]:
# Correct Washington, DC, neighborhoods to DC state code:
# records listed under state 'WA' whose city is 'Washington' are reassigned to 'DC'.
# A boolean mask with .loc replaces the row-wise apply; it also leaves an empty
# table untouched, where the row-wise apply would return a whole frame to assign.
mask_dc = (df_man['state'] == 'WA') & (df_man['city'] == 'Washington')
df_man.loc[mask_dc, 'state'] = 'DC'

In [16]:
# Flag every record in the manual table as corrected
df_man = df_man.assign(corrected = 1)

# Persist the corrections (index included) to the data folder
df_man.to_csv(path_or_buf = './Data/manual_inferences.csv')

In [17]:
# Define function to update dataframe if the full place name is in the manual inference table
def update_manuals(row, manual = None):
    """Overwrite a record's state and city with its manual inference, if one exists.

    Parameters
    ----------
    row : pandas.Series
        One record; must provide 'place_full_name', 'state' and 'city'.
    manual : pandas.DataFrame, optional
        Manual inference table with 'place_full_name', 'state' and 'city'
        columns. Defaults to the notebook-level ``df_man``.

    Returns
    -------
    pandas.Series
        The record itself when its place name is not in the table; otherwise a
        copy whose 'state' and 'city' come from the first matching table row.
    """
    if manual is None:
        manual = df_man

    # Compare against the column's values: `name in series` would test the
    # Series' index labels, which never match a place name.
    matches = manual.loc[manual['place_full_name'] == row['place_full_name']]
    if matches.empty:
        return row

    # Take scalars from the first match; .loc[mask, col] alone yields a Series
    first_match = matches.iloc[0]
    updated = row.copy()
    updated['state'] = first_match['state']
    updated['city'] = first_match['city']
    return updated

In [18]:
# Pass every US record through the manual-inference update
df_us = df_us.apply(func = update_manuals, axis = 'columns')

### Final Cleaning

In [24]:
# Keep only the records whose state is no longer 'unspecified'
has_state = df_us['state'].ne('unspecified')
df_us = df_us[has_state]

### Output Updated US Data

In [25]:
# Write the updated records back over the combined US data file, without the index
df_us.to_csv(path_or_buf = './Data/combined_us.csv', index = False)