In [238]:
version = !python -V
print('Python version:', version[0])
print('Pandas version:', pd.__version__)

Python version: Python 3.6.10 :: Anaconda, Inc.
Pandas version: 0.25.3


In [237]:
import pandas as pd
import json
import csv

# Load the capital-projects CSV into the DataFrame used by the cells below.
df = pd.read_csv('../data/pgh_capital_projects.csv')
# Path to the JSON file that holds the query specification values.
specs = './specifications/specifications.json'

## Create Frequency Table of Null Values

In [231]:
# Per-column count of missing values: flag each missing cell, then sum the flags.
null_table = df.isna().sum()
null_table

id                         0
name                       2
task_description           7
area                       2
budgeted_amount            3
status                     0
asset_id                 478
asset_type                 0
fiscal_year                0
start_date                 0
inactive                 394
neighborhood             378
council_district         376
ward                     378
tract                    374
public_works_division    378
pli_division             378
police_zone              379
fire_zone                378
latitude                   0
longitude                  0
dtype: int64

In [234]:
# Export the per-column null counts to a JSON file.
# NOTE(review): assumes the ./output directory already exists — confirm.
null_table.to_json('./output/null_table_notebook.json')

## Query Dataset Based on Specified Fields

In [236]:
def lowercase_list(l):
    """Return a new list holding the lowercase string form of every item.

    Each item is first converted with ``str()`` so that non-string values
    (for example integers) can be lowercased as well.

    Args:
        l: List of specification values

    Returns:
        List of lowercase strings, in the same order as the input.
    """
    lowered = []
    for item in l:
        lowered.append(str(item).lower())
    return lowered

def compare_lists(spec, l):
    """Build a boolean mask of the rows of ``df`` whose ``spec`` value is listed in ``l``.

    The match is case-insensitive: the column values and the specification
    values are both lowercased before they are compared.

    Args:
        spec: Specification column name (String)
        l: List of specification values

    Returns:
        Boolean Series with one entry per row of ``df``.
    """
    # The .str accessor is used here, so the column must hold string values.
    column_lowered = df[spec].str.lower()
    wanted = lowercase_list(l)
    return column_lowered.isin(wanted)

# Set up step: compare_lists() goes through the .str accessor, so fiscal_year
# is converted to strings before it is matched against the specification.
df['fiscal_year'] = df['fiscal_year'].astype(str)

# Read the specification; the file is closed again before any querying starts.
with open(specs) as s:
    specification = json.load(s)

status = specification['status']
neighborhood = specification['neighborhood']
fiscal_year = specification['fiscal_year']
area = lowercase_list(specification['area'])

# A row is selected only when all four specified fields match.
matches = (
    compare_lists('status', status)
    & compare_lists('neighborhood', neighborhood)
    & compare_lists('fiscal_year', fiscal_year)
    & compare_lists('area', area)
)
result = df.loc[matches][['id', 'status', 'neighborhood', 'fiscal_year', 'area']]

# NOTE(review): this path has no '.json' extension, unlike the null-table
# export written earlier in the notebook — confirm that this is intended.
result.to_json('./output/query_output_notebook')