In [1]:
# Import Dependencies
import pandas as pd
import numpy as np

In [2]:
# Read the raw listings CSV into a DataFrame and preview the first five rows.
# Note: the file carries an "Unnamed: 0" column, which is kept as-is here.
housing_data = pd.read_csv(filepath_or_buffer="../Resources/housingDataUpdated.csv")
housing_data.head(5)

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,city,zipcode,high_school,middle_school,elementary_school
0,"3157 NE MARINE DR, Portland OR 97035",65000,Floating Home - 1 Story,1,1.0,800,1964,,unknown,Multnomah,Portland,97035,Current Price:,Jefferson,Faubion
1,"17452 NE GLISAN ST #7, Portland OR 97230",72000,Manufactured - Double Wide Manufact,2,2.0,1152,1988,,unknown,Multnomah,Portland,97230,Reynolds,Reynolds,Hartley
2,"9034 SE 78TH PL, Portland OR 97206",79950,Manufactured - Double Wide Manufact,3,2.0,1344,1997,,unknown,Clackamas,Portland,97206,Current Price:,Milwaukie,Whitman
3,"16000 SE POWELL BLVD 75, Portland OR 97236",79950,Manufactured - Double Wide Manufact,3,2.0,1404,1990,,unknown,Multnomah,Portland,97236,Centennial,Centennial,Powell Butte
4,"12846 SE RAMONA ST 6, Portland OR 97236",93900,Manufactured - Double Wide Manufact,3,2.0,1297,1997,,unknown,Multnomah,Portland,97236,David Douglas,Alice Ott,Gilbert Hts


In [3]:
# Collect every listing whose lot_size is missing and report how many there are
# (shape is (rows, columns)).
null_lots = housing_data[housing_data["lot_size"].isna()]
null_lots.shape

(701, 15)

In [4]:
# Simplify home types: collapse each detailed listing type
# (e.g. "Floating Home - 1 Story") to its broad category, in place.
#
# - Matching is a literal, case-sensitive substring test (regex=False).
# - Labels are applied in a fixed order and each pass sees the result of the
#   previous one, so a value that matches several labels ends up with the
#   last matching label.
# - Rows with a missing home_type are left untouched (na=False) instead of
#   raising a TypeError on the substring test.
for home_label in ("Floating", "Condo", "Single Family", "Manufactured"):
    has_label = housing_data["home_type"].str.contains(home_label, regex=False, na=False)
    housing_data.loc[has_label, "home_type"] = home_label

# Show the remaining distinct categories as a sanity check
housing_data.home_type.unique()

array(['Floating', 'Manufactured', 'Condo', 'Single Family'], dtype=object)

In [5]:
# Print data to compare how many data points lost
print(f'Current Amount of Listings: {len(housing_data)}')

# Change lot size to 0 for floating homes and condos, so that these listings
# are not removed by the null-lot filter below. One vectorized assignment
# replaces the per-row loop; housing_data is still updated in place.
zero_lot_rows = housing_data["home_type"].isin(["Floating", "Condo"])
housing_data.loc[zero_lot_rows, "lot_size"] = 0

# Drop listings with null lot_size. dropna works row-wise, so it removes
# exactly the rows whose lot_size is missing and returns a new DataFrame
# (housing_data itself keeps all rows).
cleaned_housing_data = housing_data.dropna(subset=["lot_size"])

# Print length of data
print(f'Updated Amount of Listings: {len(cleaned_housing_data)}')

Current Amount of Listings: 1830
Updated Amount of Listings: 1732


In [6]:
# Remove listings whose high_school value is unclear ("Current Price:" or
# "Other"). The frame is modified in place, then its (rows, columns) shape is
# displayed.
cleaned_housing_data.drop(
    index=cleaned_housing_data.index[
        cleaned_housing_data["high_school"].isin(["Current Price:", "Other"])
    ],
    inplace=True,
)
cleaned_housing_data.shape

(1722, 15)

In [7]:
# Create mean cost per zipcode, most expensive zipcode first
zipcode = cleaned_housing_data[["price", "zipcode"]]
zipcodeAVG = (
    zipcode
    .groupby(["zipcode"])
    .mean()
    .sort_values(by=["price"], ascending=False)
)

# Build the lookup table: one row per zipcode with its rank and mean price.
# Rank 1 is the zipcode with the highest mean price.
zipcodeRanker = (
    zipcodeAVG
    .reset_index(drop=False)   # zipcode back to a regular column
    .reset_index(drop=False)   # 0-based sorted position becomes an "index" column
    .rename(columns={"index": "zipcode_rank", "price": "zipcodeAVGcost"})
    .assign(zipcode_rank=lambda ranked: ranked["zipcode_rank"] + 1)  # 1-based rank
)

# Merge into df. The mean-price column is already named "zipcodeAVGcost", so
# it cannot collide with the listing "price" column and no "price_y" column is
# produced; no post-merge rename is needed.
# validate="many_to_one" asserts that each zipcode appears once in the lookup.
cleaned_housing_data_final = pd.merge(
    cleaned_housing_data,
    zipcodeRanker,
    on="zipcode",
    validate="many_to_one",
)
cleaned_housing_data_final.head()

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,city,zipcode,high_school,middle_school,elementary_school,zipcode_rank,zipcodeAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",129500,Floating,1,1.0,735,1960,0.0,unknown,Multnomah,Portland,97230,Reynolds,Reynolds,Salish Pond,29,410832.85
1,"3389 NE 162ND AVE, Portland OR 97230",160000,Condo,2,2.0,1073,1979,0.0,Fremont Village Park,Multnomah,Portland,97230,Reynolds,H.B. Lee,Margaret Scott,29,410832.85
2,"19609 NE MARINE DR E1, Portland OR 97230",224500,Floating,3,2.0,1150,1945,0.0,Big Eddy Marina,Multnomah,Portland,97230,Reynolds,Reynolds,Salish Pond,29,410832.85
3,"15041 NE SISKIYOU CT, Portland OR 97230",229900,Condo,2,2.0,1638,1973,0.0,unknown,Multnomah,Portland,97230,Reynolds,H.B. Lee,Scott,29,410832.85
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",239000,Condo,2,2.0,1128,1986,0.0,SUMMERPLACE,Multnomah,Portland,97230,Reynolds,H.B. Lee,Margaret Scott,29,410832.85


In [8]:
# Write the cleaned, zipcode-ranked listings to disk without the DataFrame index
cleaned_housing_data_final.to_csv(
    path_or_buf="../Resources/housingDataUpdatedandCleaned.csv",
    index=False,
)