# Data Preprocessing Based on Saved Files from NB 01 and 01b

Import necessary packages

In [71]:
import glob
import json
import os
import pandas as pd
import warnings
import sqlite3
import numerize


## Cleaning Zillow home data

### Import data and combine into df

In [36]:

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas chained-assignment warnings; consider narrowing the filter.
warnings.filterwarnings("ignore")
DATA_FOLDER = '../data/raw'
zillow_dir = os.path.join(DATA_FOLDER, 'zillow')
# Build the list of Zillow JSON files. The listing is sorted because
# os.listdir returns names in arbitrary order, and the row order decides
# which record survives when duplicates are dropped later on.
zillow_files = [os.path.join(zillow_dir, file)
                for file in sorted(os.listdir(zillow_dir))
                if file.endswith('.json')]

combined_data = []
for file in zillow_files:
  # Each file is parsed line by line: one JSON object (dict) per line.
  with open(file, 'r', encoding='utf-8') as f:
    for line in f:
      # Skip blank lines (e.g. a trailing newline) instead of crashing json.loads.
      if line.strip():
        combined_data.append(json.loads(line))

# Convert the list of dictionaries to a pandas dataframe.
combined_df = pd.DataFrame.from_dict(combined_data)



### Drop unnecessary columns, convert to proper dtypes

In [37]:
# Listing fields that are kept; every other column is discarded.
zillow_columns = [
    'id', 'price', 'address', 'addressZipcode', 'addressState',
    'beds', 'baths', 'area', 'zestimate', 'brokerName',
]
combined_cleaned = combined_df[zillow_columns]
# The raw frame is not used again; delete it to release memory.
del combined_df


I only want houses which are real (have at least 1 bedroom)

In [38]:
# Keep only listings with at least one bedroom (rows with missing beds fail
# the comparison and are dropped too). The explicit copy makes combined_real
# an independent frame, so assigning to its columns afterwards is not a
# chained assignment on a slice of combined_cleaned.
combined_real = combined_cleaned[combined_cleaned['beds'] > 0].copy()

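Now every remaining listing has a non-null, positive bedroom count (other fields, such as baths, may still contain missing values).
Next, strip the `$`, `,` and `+` characters from price and convert the columns to proper data types.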

In [39]:
#convert the price to float and remove the $ and , and + signs.
#some houses had a + sign at the end of the price, but I took the price without the sign
# Convert price to float after removing the $ , and + characters.
# Some prices carry a trailing + sign; the number without the sign is used.
# regex=False makes the literal match explicit: '$' and '+' are regex
# metacharacters, and the default of `regex` differs between pandas versions.
price_text = combined_real['price']
for symbol in ('$', ',', '+'):
  price_text = price_text.str.replace(symbol, '', regex=False)
combined_real['price'] = price_text.astype(float)

Bedroom can only be an integer, so will cast it as an int16. ID is also an int. 

In [40]:
# Cast beds and id to integer dtypes, then keep the first row seen for each
# address. The result is rebound instead of mutated with inplace=True, so no
# derived frame is modified in place.
combined_real = (
    combined_real
    .astype({'beds': 'int16', 'id': 'int64'})
    .drop_duplicates(subset='address')
)



## Cleaning Zip Data

### Load in the data

In [41]:
# Read the zip-code records saved as JSON. The context manager closes the
# file handle, which the bare open() call never did.
with open(os.path.join(DATA_FOLDER, 'zip_info.json'), 'r', encoding='utf-8') as zip_file:
  zip_data = json.load(zip_file)

Define a function that extracts only the 'value' of each metric in a zip record.

In [42]:
def get_value(x):
  """
  Flatten one zip-code record into a list of its metric values.

  x should be a dictionary with:
    - 'highlights': a list of dictionaries, each holding a 'value' entry
    - 'selectedProfile': a dictionary holding a 'label' entry

  Returns a list with every highlight 'value' in order, followed by the
  'selectedProfile' label. Returns None when the record does not have this
  structure (missing key, None, or a value of the wrong type).
  """
  try:
    data_list = [item['value'] for item in x['highlights']]
    data_list.append(x['selectedProfile']['label'])
    return data_list
  # Only structural problems are treated as "no data"; a bare except would
  # also swallow KeyboardInterrupt and SystemExit.
  except (LookupError, TypeError):
    return None



In [43]:
#extract the values using list comprehension
# Flatten every raw record into one row of values, then build the dataframe.
zip_info = list(map(get_value, zip_data))
zip_df = pd.DataFrame(data=zip_info)



Get the column names by iterating over the "label" values in one row of the zip_data

In [44]:

# Metric labels come from the first record; the highlights carry no label for
# the zip code itself, so 'zcta' is appended by hand.
raw_labels = [highlight['label'] for highlight in zip_data[0]['highlights']] + ['zcta']
# Normalize the labels: spaces become underscores, apostrophes are removed
# (as in bachelor's degree) and everything is lowercased.
cols = [label.replace(' ', '_').replace('\'', '').lower() for label in raw_labels]




In [45]:
# Label the columns with the cleaned metric names.
zip_df = zip_df.set_axis(cols, axis=1)

Clean up the ZCTA to proper zip

In [48]:

# Drop the first six characters of every zcta label and keep the remainder.
# The slice is positional, so running this cell a second time on already
# cleaned values would strip them again.
zcta_text = zip_df['zcta'].astype(str)
zip_df['zcta'] = zcta_text.str[6:]


### Convert zip df to proper type

Now, I want to remove the zip codes with 0 households (which also removes '-' from other columns in same row)

In [49]:
# Drop rows with no household count, then keep only zip codes with at least
# one household. The frame is rebound instead of mutated with inplace=True,
# and the filtered result is copied so that later column assignments act on
# an independent frame rather than on a slice.
zip_df = zip_df.dropna(subset=['total_households'])
zip_df = zip_df[zip_df['total_households'].astype(int) > 0].copy()

Assign proper types to zip_df

In [61]:

# Target dtype for each metric column: counts are integers, rates are floats,
# identifiers are strings.
zip_dtypes = {
    'total_population': int,
    'median_household_income': int,
    'bachelors_degree_or_higher': float,
    'employment_rate': float,
    'total_housing_units': int,
    'without_health_care_coverage': float,
    'total_employer_establishments': int,
    'total_households': int,
    'hispanic_or_latino_(of_any_race)': int,
    'zcta': str,
}
# One astype call with a mapping replaces the per-column assignments.
zip_df = zip_df.astype(zip_dtypes)

# NOTE(review): no visible cell creates 'zip_code' (execution counts skip
# 46-47), so the original cast raised KeyError on a fresh kernel. It is
# presumably the cleaned zcta value; confirm against the removed cell.
if 'zip_code' not in zip_df.columns:
  zip_df['zip_code'] = zip_df['zcta']
zip_df['zip_code'] = zip_df['zip_code'].astype(str)

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 0 to 17
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   total_population                  15 non-null     object
 1   median_household_income           15 non-null     object
 2   bachelors_degree_or_higher        15 non-null     object
 3   employment_rate                   15 non-null     object
 4   total_housing_units               15 non-null     object
 5   without_health_care_coverage      15 non-null     object
 6   total_employer_establishments     15 non-null     object
 7   total_households                  15 non-null     object
 8   hispanic_or_latino_(of_any_race)  15 non-null     object
 9   zcta                              15 non-null     object
 10  zip_code                          15 non-null     object
dtypes: object(11)
memory usage: 1.4+ KB


None

Convert median household income to a proper numeric type

In [74]:
#AI helped with the following line
# AI helped with the following line.
# pd.to_numeric is applied to the whole column at once instead of being
# called once per element through .apply.
zip_df['median_household_income'] = pd.to_numeric(zip_df['median_household_income'])


In [76]:
# Folder that holds the processed outputs (the SQLite database is created here).
DATA_FOLDER_P = '../data'

Connect to db

In [77]:
# Open the SQLite database file inside the processed-data folder
# (sqlite3 creates the file if it does not exist yet).
db_path = os.path.join(DATA_FOLDER_P, './home_prices.db')
conn = sqlite3.connect(db_path)

Add the two dfs as tables in db

In [78]:
# Write each dataframe to its own table, replacing any existing table of the
# same name and leaving the dataframe index out of the table.
combined_real.to_sql(name='zillow', con=conn, index=False, if_exists='replace')
zip_df.to_sql(name='zip_info', con=conn, index=False, if_exists='replace')

15

In [79]:

# Close the database connection now that both tables have been written.
conn.close()