In [69]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

## Import and clean data

In [70]:
# Variable metadata: read the CSV, discard the row labelled 0, keep only the
# first five columns, and remove every row that still has a missing value.
metadata = (
    pd.read_csv('qol-data/csvFiles/metadata.csv')
    .drop(0)
    .iloc[:, 0:5]
    .dropna()
)


In [71]:
# Selected NPAs: load the list and copy the default row index into an explicit
# 'order' column so the row order can be carried through later merges.
selected = pd.read_csv("./selectedNPA.csv").assign(
    order=lambda frame: frame.index
)

In [72]:
# Character data: header=1 takes the column names from the CSV's second row.
# Keep rows with at least 10 non-missing values and a non-null NPA, then cast
# NPA to int so it can be matched against the selected NPAs.
characterRaw = (
    pd.read_csv('qol-data/csvFiles/character.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join: every row of `selected` is kept, matched or not.
characterMerged = selected.merge(characterRaw, on="NPA", how="left")
characterMerged.to_pickle('./qol-data/pickles/character.pkl')

In [73]:
# Economy data: header=1 takes the column names from the CSV's second row.
# Keep rows with at least 10 non-missing values and a non-null NPA, then cast
# NPA to int so it can be matched against the selected NPAs.
economyRaw = (
    pd.read_csv('qol-data/csvFiles/economy.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join onto the selected NPAs, then force the 2018 household income
# columns (value and margin of error) to float.
economyMerged = selected.merge(economyRaw, on="NPA", how="left").astype({
    'Household_Income_2018': float,
    'Household_Income_moe_2018': float,
})
economyMerged.to_pickle('./qol-data/pickles/economy.pkl')

In [74]:
# Education data: header=1 takes the column names from the CSV's second row.
# Keep rows with at least 10 non-missing values and a non-null NPA, then cast
# NPA to int so it can be matched against the selected NPAs.
educationRaw = (
    pd.read_csv('qol-data/csvFiles/education.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join onto the selected NPAs, then force the 2018 degree/diploma columns
# (values and margins of error) to float.
educationMerged = selected.merge(educationRaw, on="NPA", how="left").astype({
    'Bachelors_Degree_2018': float,
    'Bachelors_Degree_moe_2018': float,
    'High_School_Diploma_2018': float,
    'High_School_Diploma_moe_2018': float,
})
educationMerged.to_pickle('./qol-data/pickles/education.pkl')

In [75]:
# Engagement data: header=1 takes the column names from the CSV's second row.
# Keep rows with at least 10 non-missing values and a non-null NPA, then cast
# NPA to int so it can be matched against the selected NPAs.
engagementRaw = (
    pd.read_csv('qol-data/csvFiles/engagement.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join: every row of `selected` is kept, matched or not.
engagementMerged = selected.merge(engagementRaw, on="NPA", how="left")
engagementMerged.to_pickle('./qol-data/pickles/engagement.pkl')

In [76]:
# Environment data: header=1 takes the column names from the CSV's second row.
# Keep rows with at least 10 non-missing values and a non-null NPA, then cast
# NPA to int so it can be matched against the selected NPAs.
environmentRaw = (
    pd.read_csv('qol-data/csvFiles/environment.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join: every row of `selected` is kept, matched or not.
environmentMerged = selected.merge(environmentRaw, on="NPA", how="left")
environmentMerged.to_pickle('./qol-data/pickles/environment.pkl')

In [77]:
# Health data: header=1 takes the column names from the CSV's second row.
# Keep rows with at least 10 non-missing values and a non-null NPA, then cast
# NPA to int so it can be matched against the selected NPAs.
healthRaw = (
    pd.read_csv('qol-data/csvFiles/health.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join: every row of `selected` is kept, matched or not.
healthMerged = selected.merge(healthRaw, on="NPA", how="left")
healthMerged.to_pickle('./qol-data/pickles/health.pkl')

In [78]:
# Housing data: header=1 takes the column names from the CSV's second row.
# Keep rows with at least 10 non-missing values and a non-null NPA, then cast
# NPA to int so it can be matched against the selected NPAs.
housingRaw = (
    pd.read_csv('qol-data/csvFiles/housing.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join onto the selected NPAs, then cast every column that came through
# with object dtype to float.
housingMerged = (
    selected
    .merge(housingRaw, on="NPA", how="left")
    .pipe(lambda frame: frame.astype({
        column: float
        for column in frame.select_dtypes(include=['object']).columns
    }))
)
housingMerged.to_pickle('./qol-data/pickles/housing.pkl')

In [79]:
# Safety data: header=1 takes the column names from the CSV's second row.
# Keep rows with at least 10 non-missing values and a non-null NPA, then cast
# NPA to int so it can be matched against the selected NPAs.
safetyRaw = (
    pd.read_csv('qol-data/csvFiles/safety.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join onto the selected NPAs, then force the 2011 property crime rate
# column to float.
safetyMerged = selected.merge(safetyRaw, on="NPA", how="left").astype({
    'Property_Crime_Rate_2011': float,
})
safetyMerged.to_pickle('./qol-data/pickles/safety.pkl')

In [80]:
# Transportation data: header=1 takes the column names from the CSV's second
# row. Keep rows with at least 10 non-missing values and a non-null NPA, then
# cast NPA to int so it can be matched against the selected NPAs.
transportationRaw = (
    pd.read_csv('qol-data/csvFiles/transportation.csv', header=1)
    .dropna(thresh=10)
    .loc[lambda frame: frame.NPA.notnull()]
    .assign(NPA=lambda frame: frame.NPA.astype(int))
)

# Left join onto the selected NPAs, then force the 2018 long-commute columns
# (value and margin of error) to float.
transportationMerged = selected.merge(transportationRaw, on="NPA", how="left").astype({
    'Long_Commute_2018': float,
    'Long_Commute_moe_2018': float,
})
transportationMerged.to_pickle('./qol-data/pickles/transportation.pkl')

## Combine filtered data into one file

In [81]:
import os
from functools import reduce

# List the per-category pickles to combine. os.listdir returns entries in
# arbitrary order and reports every entry in the directory, so:
#   * keep only names ending in '.pkl' -- any other file in the folder would
#     otherwise be handed to pd.read_pickle and fail;
#   * sort the names so the join order, and therefore the column order of the
#     combined frame, is the same on every run and every machine.
dataFiles = sorted(
    name
    for name in os.listdir('./qol-data/pickles/')
    if name.endswith('.pkl')
)
dataFiles

['character.pkl',
 'economy.pkl',
 'education.pkl',
 'engagement.pkl',
 'environment.pkl',
 'health.pkl',
 'housing.pkl',
 'safety.pkl',
 'transportation.pkl']

In [82]:
def getPickle(file):
    """Unpickle and return the object stored under ./qol-data/pickles/.

    Parameters
    ----------
    file : str
        File name (not a full path) inside the ./qol-data/pickles/ directory.

    Returns
    -------
    Whatever pd.read_pickle returns for that path.
    """
    picklePath = './qol-data/pickles/{name}'.format(name=file)
    return pd.read_pickle(picklePath)

# Load every listed pickle, preserving the order of dataFiles
dataFrames = list(map(getPickle, dataFiles))


In [83]:
def _leftJoinOnNpa(accumulated, incoming):
    """Left-join `incoming` onto `accumulated`, keyed on the NPA and order columns."""
    return accumulated.merge(incoming, on=["NPA", "order"], how="left")


# Fold the list of frames, left to right, into one master frame
master = reduce(_leftJoinOnNpa, dataFrames)

In [84]:
# Sanity check: print a summary of the combined frame (index range, number of
# columns, dtype counts, memory usage).
master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 0 to 16
Columns: 585 entries, NPA to Transit_Ridership_Total_2013
dtypes: float64(563), int64(2), object(20)
memory usage: 77.8+ KB


In [85]:
# Save the combined frame. It is written to ./qol-data/ rather than
# ./qol-data/pickles/, so it is not among the files listed from that folder.
master.to_pickle('./qol-data/master.pkl')