# Unit 5 Data Wrangling

In [14]:
import numpy as np
import pandas as pd
from datetime import datetime

import matplotlib
import matplotlib.pyplot as pp

%matplotlib inline

## 1. Sale Price by County

#### Frequency: Monthly
#### Time range: 2008/03 - 2019/05
#### No Data: NA

In [15]:
# Clean Sales Price
# Read the cleaned Zillow sale-price workbook for California counties.
# No `encoding` argument is passed: an .xlsx workbook is not a text file, and
# pd.read_excel in current pandas releases does not accept that keyword.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.

CA_sales_prices = pd.read_excel(r'C:\Users\yulmee\Documents\Yul-Mee\Springboard\Capstone 1\dataset\cleaned\Zillow_Sale_Prices_County_2008_2015_cleaned.xlsx')

In [21]:
# Convert year-month to date instead: 2008-10 to 2008-10-01 and then to datetime type. 
# For my notes - see http://jonathansoma.com/lede/foundations/classes/pandas%20columns%20and%20functions/apply-a-function-to-every-row-in-a-pandas-dataframe/ 
# section: Use .apply with axis=1 to send every single row to a function
def set_first_of_month(row):
    """Turn a row's 'year-month' label into a datetime on the 1st of that month.

    `row` is any mapping-like object (for example a DataFrame row passed by
    `.apply(..., axis=1)`) whose 'year-month' entry is a 'YYYY-MM' string.
    For instance, '2008-10' becomes datetime(2008, 10, 1).
    """
    day_suffix = '-01'
    date_format = '%Y-%m-%d'
    return datetime.strptime(row['year-month'] + day_suffix, date_format)

# select county data

def get_cleaned_county_sales_price(CA_sales_prices, county_name):
    """Return one county's sale prices in long format, one row per month.

    Parameters
    ----------
    CA_sales_prices : pandas.DataFrame
        Wide table with 'RegionID' and 'RegionName' columns plus one column
        per month. Assumes the month column labels are 'YYYY-MM' strings
        (e.g. '2008-10') — TODO confirm against the workbook.
    county_name : str
        Value of 'RegionName' to select, e.g. 'Alameda County'.

    Returns
    -------
    pandas.DataFrame
        Columns 'RegionName', 'SalePrice' and 'Date', where 'Date' is the
        first day of each month as a datetime. If no row matches
        `county_name`, the result is an empty frame with the same columns.
        The input frame is not modified.
    """
    county_rows = CA_sales_prices.loc[CA_sales_prices['RegionName'] == county_name]

    # Wide -> long: every month column becomes a ('year-month', 'SalePrice') row.
    melted = county_rows.melt(
        id_vars=['RegionID', 'RegionName'],
        var_name='year-month',
        value_name='SalePrice',
    )

    # Vectorized conversion of 'YYYY-MM' -> first-of-month datetime; unlike a
    # row-wise apply this also works when the selection is empty.
    melted['Date'] = pd.to_datetime(melted['year-month'] + '-01', format='%Y-%m-%d')

    return melted.drop(columns=['year-month', 'RegionID'])


In [5]:
# Alameda County: long-format sale prices, one row per month
alameda_county_name = 'Alameda County'
alameda_sales_price_final = get_cleaned_county_sales_price(
    CA_sales_prices=CA_sales_prices,
    county_name=alameda_county_name,
)
 

In [6]:
# Sacramento County: long-format sale prices, one row per month
sacramento_county_name = 'Sacramento County'
sacramento_sales_price_final = get_cleaned_county_sales_price(
    CA_sales_prices=CA_sales_prices,
    county_name=sacramento_county_name,
)


In [7]:
# Los Angeles County: long-format sale prices, one row per month
LA_county_name = 'Los Angeles County'
LA_sales_price_final = get_cleaned_county_sales_price(
    CA_sales_prices=CA_sales_prices,
    county_name=LA_county_name,
)

# Stack the per-county frames into the master sale-price table.
# NOTE(review): only the Alameda frame is concatenated; the three-county
# variant is left disabled below — confirm this is intentional.

#frames = [alameda_sales_price_final, sacramento_sales_price_final, LA_sales_price_final]
frames = [alameda_sales_price_final]
master_df = pd.concat(objs=frames)


## 5. Population data - annual by county
   - Removed unnecessary columns.
   - Updated column title to date format
    


In [8]:
# Load the cleaned population table; the file is read as latin-1 text.
pop_data = pd.read_csv(
    filepath_or_buffer=r'C:\Users\yulmee\Documents\Yul-Mee\Springboard\Capstone 1\dataset\cleaned\Population_PEP_2018_PEPANNRES_with_ann_cleaned_1.csv',
    encoding='latin-1',
)

# NOTE(review): if more code follows in the same cell, this preview is not rendered.
pop_data.head()

def get_cleaned_pop(pop_data, county_name):
    """Return one county's population as a monthly, long-format table.

    Parameters
    ----------
    pop_data : pandas.DataFrame
        Wide table with a 'Geography' column; every other column is treated
        as a date label holding that period's population. Assumes those
        labels are parseable by `pd.to_datetime` — TODO confirm against the CSV.
    county_name : str
        Value of 'Geography' to select, e.g. 'Alameda County'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'RegionName' and 'Population', one row per month.
        'Date' falls on the first day of each month, and months between two
        source dates carry the most recent earlier population (forward fill).
        The input frame is not modified.
    """
    # Get county specific data
    county_rows = pop_data.loc[pop_data['Geography'] == county_name]

    # Wide -> long, and use the same county column name as the sales table.
    population_long = (
        county_rows
        .melt(id_vars=['Geography'], var_name='Date', value_name='Population')
        .rename(columns={'Geography': 'RegionName'})
    )
    population_long['Date'] = pd.to_datetime(population_long['Date'])

    # Upsample to monthly. 'MS' labels each row with the month START so the
    # dates line up with first-of-month dates when merging on 'Date'; a
    # month-end frequency would produce dates such as 2010-01-31 instead.
    population_monthly = (
        population_long
        .set_index('Date')
        .resample('MS')
        .ffill()
        .reset_index()
    )

    return population_monthly

# Build the monthly population table for each county of interest.
county_name = 'Alameda County'
alameda_pop = get_cleaned_pop(pop_data=pop_data, county_name=county_name)

county_name = 'Sacramento County'
sacramento_pop = get_cleaned_pop(pop_data=pop_data, county_name=county_name)

county_name = 'Los Angeles County'
LA_pop = get_cleaned_pop(pop_data=pop_data, county_name=county_name)

# Stack the per-county frames into the master population table.
# NOTE(review): only the Alameda frame is concatenated; the three-county
# variant is left disabled below — confirm this is intentional.
#frames = [alameda_pop, sacramento_pop, LA_pop]
frames = [alameda_pop]
master_pop = pd.concat(objs=frames)

# Only the final expression of a cell is rendered, so the dtypes are what gets shown.
master_pop.head()
master_pop.dtypes

Date          datetime64[ns]
RegionName            object
Population             int64
dtype: object

In [19]:
# Merge sales prices and population - monthly tables
# Take explicit copies of the first 10 rows of each table so that the Date
# assignments below do not write into a slice of the master frames
# (which triggers pandas' SettingWithCopyWarning).
master_df1 = master_df.iloc[:10].copy()
master_pop1 = master_pop.iloc[:10].copy()

# Make sure both join keys are datetimes before merging.
master_df1['Date'] = pd.to_datetime(master_df1['Date'])
master_pop1['Date'] = pd.to_datetime(master_pop1['Date'])

# Outer join: keeps rows from both tables, leaving missing values where a
# (RegionName, Date) pair exists in only one of them.
master_df1 = pd.merge(master_df1, master_pop1, on=['RegionName', 'Date'], how='outer')

In [20]:
# Display the merged sample of sale prices and population.
master_df1

Unnamed: 0,RegionName,SalePrice,Date,Population
0,Alameda County,525400.0,2008-03-01,
1,Alameda County,502000.0,2008-04-01,
2,Alameda County,469500.0,2008-05-01,
3,Alameda County,445600.0,2008-06-01,
4,Alameda County,437500.0,2008-07-01,
5,Alameda County,427000.0,2008-08-01,
6,Alameda County,412500.0,2008-09-01,
7,Alameda County,390800.0,2008-10-01,
8,Alameda County,369500.0,2008-11-01,
9,Alameda County,351800.0,2008-12-01,


6. Historical Housing Affordability Index - Monthly/Quarterly
7. Crime rate - Annual
8. Combine into one dataframe.