# Extract

In [44]:
# Build the input path: <parent of the working directory>/data/complete.csv
import os
import pandas as pd

project_root = os.path.dirname(os.getcwd())
path_input = os.path.join(project_root, 'data', 'complete.csv')

# Load the CSV, reading the zipcode column as strings
df = pd.read_csv(path_input, dtype={'zipcode': str})

# Clean

#### General Cleaning

In [12]:
# Change Certain Values
df.loc[df['name'] == "Siphon Coffee", 'serves_alcohol'] = True

# Change values across column
# Word boundaries keep this idempotent: a plain substring replace would turn an
# already-expanded "West University" into "West Universityniversity" when the
# cell is run a second time.
df['neighborhood'] = df['neighborhood'].str.replace(r"\bWest U\b", "West University", regex=True)

# Handle NA Values
# The three statements below are alternative strategies. Run in sequence, only
# the first one has an effect, because it leaves no missing values behind.
df.loc[df['zipcode'].isna(), 'zipcode'] = "Unknown"
df['zipcode'] = df['zipcode'].fillna(value=0)
# Series.ffill() replaces fillna(method='ffill'), which pandas 2.x deprecates
df['zipcode'] = df['zipcode'].ffill()

#### Dates

In [30]:
# General dates
# Import datetime dependencies
from datetime import date, datetime
from dateutil.relativedelta import *
import numpy as np

# Convert string to numpy.datetime64 format
# Stored under their own names so the datetime.datetime conversion below does
# not silently discard them.
dateStartNp64 = np.datetime64("2022-04-01")
dateEndNp64 = np.datetime64("2023-03-31")

# Convert string to datetime.datetime format
dateStart = datetime.strptime("4/1/2022", '%m/%d/%Y')
dateEnd = datetime.strptime("3/31/2023", '%m/%d/%Y')

# Find today's date
today = date.today()

In [40]:
# Date columns
# Convert from string format to datetime (unparseable values become NaT)
df['dob_datetime'] = pd.to_datetime(df['dob_str'], errors='coerce')

# Convert numerical or string column to numpy.datetime64
# NOTE: this writes to the same column as the statement above and replaces its result
df['dob_datetime'] = pd.to_datetime(df['dob_mmddyy'], errors='coerce')

# Convert from Excel format (integer) to datetime
import xlrd
# Classify every cell once instead of rescanning the column for each use
excel_is_int = [isinstance(x, int) for x in df['dob_excel']]
excel_is_str = [isinstance(x, str) for x in df['dob_excel']]
# Excel serial numbers -> datetime (datemode 0 is the 1900-based date system)
if any(excel_is_int):
    df.loc[excel_is_int, 'dob_excel'] = [
        xlrd.xldate_as_datetime(xl_date, 0)
        for xl_date in df.loc[excel_is_int, 'dob_excel']
    ]
# Date strings -> datetime; coerce so that one bad string becomes NaT instead of
# aborting the cell, matching the errors='coerce' policy of the final conversion
if any(excel_is_str):
    df.loc[excel_is_str, 'dob_excel'] = pd.to_datetime(df.loc[excel_is_str, 'dob_excel'], errors='coerce')
# Convert the whole column to a datetime dtype; anything left over becomes NaT
df['dob_excel'] = pd.to_datetime(df['dob_excel'], errors='coerce')

In [42]:
# Ages
# Import datetime dependencies
from datetime import date, datetime
# Import only the name that is used; a star import hides where relativedelta comes from
from dateutil.relativedelta import relativedelta

### Convert DOB to Age
# Create function to calculate ages based on DoB
def findAge(row, as_of=None):
    """Return the age in whole years for one row, or NaN when the DOB is missing.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a "dob_datetime" entry.
    as_of : datetime.date or datetime.datetime, optional
        Date on which the age is measured (for example a date of service).
        Defaults to today's date.

    Returns
    -------
    int or float
        Completed years between the DOB and ``as_of``; ``np.nan`` when the
        DOB is null.
    """
    now = date.today() if as_of is None else as_of
    dob = row["dob_datetime"]
    # No DOB collected: mark the age as missing
    if pd.isnull(dob):
        return np.nan
    # relativedelta counts completed calendar years, so a birthday that has
    # not yet occurred in the current year is not counted
    return relativedelta(now, dob).years
# Run findAge on each row (axis="columns" passes one row at a time) and keep the result as the "age" column
df["age"] = df.apply(findAge, axis="columns")

#### Incomes

In [None]:
# Monthly incomes
# A "monthly" income above $10,000 is most likely an annual figure entered by
# mistake, so divide those outliers by 12 to recover the monthly amount
looksAnnual = df['incomeMonthly'] > 10000
df.loc[looksAnnual, 'incomeMonthly'] = df.loc[looksAnnual, 'incomeMonthly'] / 12
# Derive the annual income from the (corrected) monthly income
df['incomeAnnual'] = df['incomeMonthly'] * 12

# Annual incomes
# An "annual" income below $2,500 is most likely a monthly figure entered by
# mistake, so multiply those outliers by 12 to recover the annual amount
looksMonthly = df['incomeAnnual'] < 2500
df.loc[looksMonthly, 'incomeAnnual'] = df.loc[looksMonthly, 'incomeAnnual'] * 12

# Income Bin and Cut
# Bracket edges in dollars; the last bracket is open-ended
incomeBinList = [0, 15000, 30000, 55000, np.inf]
incomeBrackets = pd.cut(df['incomeAnnual'], bins=incomeBinList, include_lowest=True)
# Store the interval labels as plain strings
df['incomeBracket'] = incomeBrackets.astype('str')

#### Zip Codes

In [None]:
# Select only first five digits of a zip already in string type, and
# fill in zeros for 00--- zip codes.
# Only values that start with a digit are normalised, so a text placeholder
# such as "Unknown" is left intact instead of being cut down to "Unkno".
zipStartsWithDigit = df['zipcode'].str.match(r'\d', na=False)
df.loc[zipStartsWithDigit, 'zipcode'] = df.loc[zipStartsWithDigit, 'zipcode'].str[:5].str.zfill(5)

# Put in five digit string format from full address
df['address_full'] = df['address'] + " " + df['city'] + " " + df['zipcode']
# Search only the last ten characters of the address and keep the first run of
# five digits found there; rows without one get NaN
df['homeZip'] = df['address_full'].str[-10:].str.extract(r'(\d{5})', expand=False)

#### Geocoding

In [32]:
# Find lat/long of an address using GoogleMaps (requires free-ish API Key from Google Developers site)
import googlemaps
from tqdm import tqdm

# The key is read from the GOOGLEMAPS_API environment variable (None when unset)
APIKEY_GOOGLE = os.getenv('GOOGLEMAPS_API')
# Enable DataFrame.progress_apply, then create the Google Maps client
tqdm.pandas()
gmaps = googlemaps.Client(key=APIKEY_GOOGLE)

def getgooglegeo(row):
    """Geocode ``row['address_full']`` with the module-level ``gmaps`` client.

    Returns a two-element pandas Series ``[latitude, longitude]`` taken from
    the first geocoding result. Both elements are the string ``"None"`` when
    the lookup fails or the result carries no location.
    """
    lati = longi = "None"
    try:
        loca = gmaps.geocode(row['address_full'])[0]['geometry']['location']
        if loca:
            lati, longi = loca['lat'], loca['lng']
    except Exception:
        # Best-effort lookup: any ordinary failure (no match, malformed
        # response, API error) marks the row as missing. Catching Exception
        # rather than using a bare except lets Ctrl-C and SystemExit stop a
        # long geocoding run.
        lati = longi = "None"
    return pd.Series([lati, longi])
    
# Geocode every row with a progress bar and unpack the two results into their own columns
googleGeoCols = ['latitude', 'longitude']
df[googleGeoCols] = df.progress_apply(getgooglegeo, axis=1)

In [None]:
# Find lat/long from a zip code using pgeocode (less reliable than Google)
# NOTE(review): the lookup class is named Nominatim, but pgeocode documents its
# data source as the GeoNames postal-code tables rather than OpenStreetMap —
# confirm before citing the source.
import pgeocode
# Postal-code lookup object for the 'us' country code
nomi = pgeocode.Nominatim('us')
# Called before the progress_apply below, as in the Google Maps cell
tqdm.pandas()

def getPGEO(row):
    """Look up the coordinates of ``row['zipcode']`` with the module-level ``nomi`` object.

    Returns a two-element pandas Series ``[latitude, longitude]``. Both
    elements are the string ``"None"`` when the lookup has no latitude,
    the same missing marker that ``getgooglegeo`` uses.
    """
    loca = nomi.query_postal_code(row['zipcode'])
    # Test for a missing value explicitly: NaN is truthy, so a plain
    # `if loca.latitude` never reaches the fallback for a NaN result and
    # would also reject a valid latitude of exactly 0.
    if pd.notnull(loca.latitude):
        lati = loca.latitude
        longi = loca.longitude
    else:
        lati = "None"
        longi = "None"
    return pd.Series([lati, longi])

# Geocode every row by zip code with a progress bar; this writes to the
# latitude/longitude columns, replacing any values already stored there
zipGeoCols = ['latitude', 'longitude']
df[zipGeoCols] = df.progress_apply(getPGEO, axis=1)

In [None]:
# Assign to census tract based on address
import censusgeocode as cg

def assignTract(row):
    """Return the census-tract GEOID for a row's latitude/longitude, or NaN.

    Reads ``row['latitude']`` and ``row['longitude']`` and queries the
    module-level ``cg`` geocoder, taking the GEOID of the first entry under
    'Census Tracts'. Returns ``np.nan`` when the coordinates are missing or
    the lookup fails.
    """
    try:
        coords = (row['latitude'], row['longitude'])
        # Skip the web request when geocoding left no usable coordinates
        # (null, or the "None" marker written by the geocoding helpers)
        if any(pd.isnull(c) or (isinstance(c, str) and c == "None") for c in coords):
            return np.nan
        lati, longi = coords
        return cg.coordinates(x=longi, y=lati)['Census Tracts'][0]['GEOID']
    except Exception:
        # Best-effort lookup: ordinary failures mark the row as missing.
        # Catching Exception rather than using a bare except lets Ctrl-C and
        # SystemExit stop a long run.
        return np.nan

# Look up a census tract for every row (with a progress bar) and store it in the "tract" column
tractByRow = df.progress_apply(assignTract, axis=1)
df['tract'] = tractByRow

In [None]:
# Find distances
# Template snippet: zipcodeA/zipcodeB and latitudeA/longitudeA/latitudeB/longitudeB
# are not defined anywhere in the visible notebook and must be set before running.
# Find distances between zip codes
import pgeocode
dist = pgeocode.GeoDistance('us')
# Find the distance from one zip code to another
# NOTE(review): pgeocode documents this result as kilometres — confirm
distance = dist.query_postal_code(zipcodeA, zipcodeB)

# Find distances between lat/longs
import geopy.distance
# Each point is a (latitude, longitude) tuple
coordsA = (latitudeA, longitudeA)
coordsB = (latitudeB, longitudeB)
# Rebinds `distance` with the geodesic distance in miles, replacing the zip-code result above
distance = geopy.distance.geodesic(coordsA, coordsB).miles

# Filter

### Pandas.loc

In [None]:
# Set conditions
# Each condition is a boolean mask with one entry per row of df
cond_equalsTrue = df['serves_coffee']==True
# Uses the cleaned label: the cleaning step rewrites "West U" to "West University",
# so comparing against "West U" would match no rows
cond_equalsValue = df['neighborhood']=='West University'
cond_isin = df['neighborhood'].isin(['Montrose', 'Midtown'])
cond_notin = ~df['neighborhood'].isin(['Montrose', 'Midtown'])
cond_isna = df['serves_coffee'].isna()
cond_notna = df['serves_coffee'].notna()
cond_isnull = pd.isnull(df['serves_coffee'])
cond_notnull = pd.notnull(df['serves_coffee'])
# pandas has no top-level isnumeric(); the check lives on the .str accessor.
# .eq(True) turns the NaN that .str returns for missing values into False, so
# the mask is purely boolean and can be negated with ~
cond_isnumer = df['zipcode'].str.isnumeric().eq(True)
cond_isstring = ~cond_isnumer

# Loc function
# Combine masks with | (or) and & (and); parentheses control precedence
df.loc[cond_equalsTrue]
df.loc[cond_equalsTrue | cond_isin] # or
df.loc[(cond_equalsTrue & cond_equalsValue) | cond_isin] # and or

### Slicing

In [None]:
# Slice-notation reference for a sequence `a`.
# `a`, `start`, `stop` and `step` are placeholders: none of them is defined in
# the visible notebook, so this cell needs them set before it can run.
a[start:stop:step] # start through not past stop, by step
a[-1]    # last item in the array
a[-2:]   # last two items in the array
a[:-2]   # everything except the last two items
a[::-1]    # all items in the array, reversed
a[1::-1]   # the first two items, reversed
a[:-3:-1]  # the last two items, reversed
a[-3::-1]  # everything except the last two items, reversed

# Sort

In [None]:
# Sorting
# Order the rows by ColA (ascending is the default), then renumber the index from 0
df = df.sort_values(by='ColA').reset_index(drop=True)