# Data Preprocessing Based on Saved Files from NB 01 and 01b

Import necessary packages

In [701]:
import glob
import json
import os
import pandas as pd
import warnings
import sqlite3


## Cleaning Zillow home data

### Import data and combine into df

In [702]:

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas' SettingWithCopyWarning and FutureWarning; consider narrowing it.
warnings.filterwarnings("ignore")
DATA_FOLDER = '../data/raw'
ZILLOW_FOLDER = os.path.join(DATA_FOLDER, 'zillow')
# Sort the file names: os.listdir returns them in arbitrary order, and the
# later drop_duplicates keeps the first occurrence of each address, so an
# unsorted list would make the result depend on the file system.
zillow_files = sorted(
    os.path.join(ZILLOW_FOLDER, file)
    for file in os.listdir(ZILLOW_FOLDER)
    if file.endswith('.json')
)

combined_data = []
for file in zillow_files:
  # each file is read line by line, one JSON object (a listing) per line
  with open(file, 'r', encoding='utf-8') as f:
    for line in f:
      # skip blank lines (e.g. a trailing newline) instead of crashing in json.loads
      if line.strip():
        combined_data.append(json.loads(line))

# converting the list of dictionaries to a pandas dataframe
combined_df = pd.DataFrame(combined_data)
display(combined_df.head())



Unnamed: 0,zpid,id,rawHomeStatusCd,marketingStatusSimplifiedCd,imgSrc,hasImage,detailUrl,statusType,statusText,countryCurrency,...,brokerName,carouselPhotos,providerListingId,builderName,hasOpenHouse,openHouseStartDate,openHouseEndDate,openHouseDescription,isPropertyResultCDP,lotAreaString
0,84767046,84767046,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/1f2b4fa68f1...,True,https://www.zillow.com/homedetails/100-Station...,FOR_SALE,Condo for sale,$,...,Compass,[{'url': 'https://photos.zillowstatic.com/fp/1...,,,,,,,,
1,350214275,350214275,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/24f272219e6...,True,https://www.zillow.com/homedetails/66-Harvard-...,FOR_SALE,Condo for sale,$,...,Keller Williams Realty,[{'url': 'https://photos.zillowstatic.com/fp/2...,,,,,,,,
2,295325070,295325070,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/69f69381eff...,True,https://www.zillow.com/homedetails/100-Lovejoy...,FOR_SALE,Condo for sale,$,...,Advisors Living - Boston,[{'url': 'https://photos.zillowstatic.com/fp/6...,,,,,,,,
3,56462837,56462837,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/c7c01d3067e...,True,https://www.zillow.com/homedetails/425-Ferry-S...,FOR_SALE,Multi-family home for sale,$,...,Keller Williams Realty Boston Northwest,[{'url': 'https://photos.zillowstatic.com/fp/c...,,,,,,,,
4,352060843,352060843,ForSale,For Sale by Agent,https://photos.zillowstatic.com/fp/6891291fe45...,True,https://www.zillow.com/homedetails/131-Spring-...,FOR_SALE,Condo for sale,$,...,RE/MAX Destiny,[{'url': 'https://photos.zillowstatic.com/fp/6...,,,,,,,,


### Drop unnecessary columns, convert to proper dtypes

In [703]:
# Columns carried forward into the cleaned table; every other column is left behind.
KEEP_COLUMNS = ['id', 'price', 'address', 'addressZipcode', 'addressState',
                'beds', 'baths', 'area', 'zestimate', 'brokerName']
combined_cleaned = combined_df[KEEP_COLUMNS]
# release the wide raw frame (re-running this cell then needs the load cell first)
del combined_df


I only want real homes, which here means listings with at least one bedroom.

In [704]:
# NaN > 0 is False, so rows with a missing bed count are dropped as well.
# .copy() makes combined_real an independent frame: the following cells assign
# columns to it and drop duplicates in place, which on a bare slice of
# combined_cleaned triggers pandas' SettingWithCopyWarning.
combined_real = combined_cleaned[combined_cleaned['beds'] > 0].copy()

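Now every remaining listing has a non-null, positive bed count (the `beds > 0` filter also drops rows where `beds` is missing). This filter only guarantees `beds`; other columns such as baths are not checked here.
Next: strip the $ sign from price, add a town column, and convert the columns to proper data types.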

In [705]:
# Strip "$", "," and "+" from the price strings, then convert to float.
# Some listings carry a "+" after the price; the number without the sign is kept.
# All three characters are removed with one explicit character-class pattern:
# the original relied on the default `regex` value of str.replace, which differs
# between pandas versions, while "$" and "+" are regex metacharacters.
combined_real['price'] = (
    combined_real['price']
    .str.replace(r'[$,+]', '', regex=True)
    .astype(float)
)

The number of bedrooms can only be a whole number, so `beds` is cast to int16. The ID is also an integer. A town column is added for possible extra insights down the line, and duplicate addresses are dropped.

In [706]:
# beds was filtered to > 0 earlier, so it holds no NaN and can take an integer dtype
combined_real['beds'] = combined_real['beds'].astype('int16')
combined_real['id'] = combined_real['id'].astype('int64')
# The second comma-separated field of the address is taken as the town
# (presumably "street, town, state zip" -- TODO confirm). It is stripped because
# the field after a ", " separator otherwise keeps its leading space (e.g. " Boston").
combined_real['town'] = combined_real['address'].str.split(',').str[1].str.strip()
# keep the first listing per address; reassign instead of mutating with inplace=True
combined_real = combined_real.drop_duplicates(subset='address')
# info() prints its report itself and returns None, so wrapping it in display()
# only added a stray "None" to the output
combined_real.info()


## Cleaning Zip Data

### Load in the data

In [707]:
# Read the zip-code records; the context manager closes the file handle,
# which the bare json.load(open(...)) left open.
with open('../data/raw/zip_info.json', encoding='utf-8') as zip_file:
  zip_data = json.load(zip_file)

Make a function which extracts only the 'value' of each metric from a zip-code record, followed by the record's zip code label.

In [708]:
def get_value(x):
  """
  Flatten one zip-code record into a list of its metric values.

  Parameters
  ----------
  x : dict
    A record with a 'highlights' entry (an iterable of dicts, each holding a
    'value') and a 'selectedProfile' dict holding a 'label'.

  Returns
  -------
  list or None
    The 'value' of every highlight, in order, followed by the
    selectedProfile label; None when the record lacks any of these keys
    or is not shaped like a record at all (e.g. None).
  """
  try:
    data_list = [item['value'] for item in x['highlights']]
    # list.append returns None, so it has to be its own statement
    # rather than the return expression
    data_list.append(x['selectedProfile']['label'])
    return data_list
  except (KeyError, TypeError):
    # Malformed record: report it as None instead of aborting the whole batch.
    # Only the errors a badly shaped record produces are caught, so
    # KeyboardInterrupt and genuine bugs still surface.
    return None



In [709]:
# extract the values of every record using a list comprehension
zip_info = [get_value(x) for x in zip_data]
# get_value returns None for records it cannot parse; a None among the row
# lists breaks the row-wise DataFrame construction, so only parsed records are used
zip_df = pd.DataFrame([row for row in zip_info if row is not None])
# display(zip_df.head())


Get the column names by iterating over the "label" values in one row of the zip_data

In [710]:

# Labels of the first record's highlights, in order. The highlights carry no
# label for the zip code itself, so 'zcta' is added by hand as the last name.
raw_labels = [highlight['label'] for highlight in zip_data[0]['highlights']] + ['zcta']
# lowercase, drop the apostrophe (as in "bachelor's degree"), spaces become underscores
cols = [label.lower().replace('\'', '').replace(' ', '_') for label in raw_labels]
print(type(cols))



<class 'list'>


In [711]:

# Attach the cleaned column names. The assignment mutates zip_df in place, and
# pandas raises a ValueError if len(cols) differs from the number of columns.
zip_df.columns = cols

# NOTE(review): this renders the whole frame; zip_df.head() would keep the output short.
display(zip_df)


Unnamed: 0,total_population,median_household_income,bachelors_degree_or_higher,employment_rate,total_housing_units,without_health_care_coverage,total_employer_establishments,total_households,hispanic_or_latino_(of_any_race),zcta
0,4228,170379,75.8,51.7,2321.0,1.9,1087.0,2002.0,256.0,ZCTA5 02108
1,3921,162179,81.5,77.5,2639.0,1.5,1340.0,1993.0,205.0,ZCTA5 02109
2,2921,126157,85.0,55.2,1754.0,0.7,1917.0,1312.0,132.0,ZCTA5 02110
3,9716,72321,61.5,55.9,4827.0,0.9,963.0,3746.0,745.0,ZCTA5 02111
4,29073,74155,54.8,57.2,15080.0,2.8,894.0,14133.0,5191.0,ZCTA5 02118
5,37939,145357,70.0,78.0,19147.0,1.8,700.0,17406.0,3893.0,ZCTA5 02127
6,7126,106625,81.0,83.7,4598.0,2.6,180.0,4042.0,341.0,ZCTA5 02113
7,14469,121418,83.9,75.9,9805.0,0.7,744.0,7719.0,1038.0,ZCTA5 02114
8,31604,51454,67.1,49.7,11448.0,1.7,714.0,9845.0,3216.0,ZCTA5 02115
9,23408,131648,77.3,63.7,13392.0,1.9,1979.0,11915.0,1657.0,ZCTA5 02116


### Convert zip df to proper type

Now, I want to remove the zip codes with 0 households (their rows are also the ones that contain '-' in other columns, so this removes those placeholders too)

In [712]:

# Keep only the zip codes that report a household count at all ...
households = zip_df['total_households']
zip_df = zip_df[households.notna()]
# ... and whose count is positive (cast to int before comparing)
zip_df = zip_df[zip_df['total_households'].astype(int) > 0]

# display(zip_df)

In [713]:
# Load the `sql` IPython extension and turn on the SqlMagic autocommit option.
%load_ext sql
%config SqlMagic.autocommit=True # for engines that do not support autocommit

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [714]:
# Folder that holds the SQLite database file (one level above the raw data folder).
DATA_FOLDER_P = '../data'

In [715]:
# Open the SQLite database file inside DATA_FOLDER_P (sqlite3 creates it when it does not exist).
db_path = os.path.join(DATA_FOLDER_P, './home_prices.db')
conn = sqlite3.connect(db_path)
# # conn.commit()
# # conn.close()

In [716]:
# display(combined_real.head())
# Persist both cleaned frames, replacing any table that already exists;
# the DataFrame index is not written as a column.
combined_real.to_sql(name='zillow', con=conn, if_exists='replace', index=False)
zip_df.to_sql(name='zip_info', con=conn, if_exists='replace', index=False)

15

Make the schema for the zillow data

In [718]:
# Release the SQLite connection opened earlier in the notebook.
conn.close()