In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Source locations of the two dataset splits used throughout this notebook.
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/wlifferth/build-an-ml-web-app/main/train.csv'
TEST_CSV_URL = 'https://raw.githubusercontent.com/wlifferth/build-an-ml-web-app/main/test.csv'

train, test = (pd.read_csv(url) for url in (TRAIN_CSV_URL, TEST_CSV_URL))

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Let's look at the five most common values in each column.
for column in train.columns:
    top_values = train[column].value_counts().nlargest(5)
    # One column name, its top values, then a blank separator line.
    print(column, top_values, '', sep='\n')

In [None]:
# Zipcode looks interesting, but we'll deal with that later
# City also could be interesting but we might need to do some extra work with it
# Same for state, another categorical variable
# Date sold probably isn't helpful, because we know all of these were supposed to have occurred around the same time
# bathrooms--finally we're getting into our bread and butter
# bedrooms, similar
# livingArea also good
# homeType is super interesting, and it's helpful to realize we're only looking at 4 kinds of homes
# homeStatus--so this is important, some of these houses haven't actually been sold. We actually want to just filter out the ones that are pending, so we'll do that soon.
# Lot area--interesting to see that a lot of homes don't have any lot--this makes sense for condos and such
# LotUnit--this is good to know--some of our areas from above are in square feet, but others are in acres, so we'll need to unify those
# address is probably too specific to be helpful to us
# and finally price, the thing we're trying to predict

In [None]:
# Let's start simply: we only want houses that have sold, not ones that are pending,
# so build a mask on homeStatus and keep the matching rows.
is_sold = train['homeStatus'] == 'RECENTLY_SOLD'

# After filtering, homeStatus has the same value in every row, so we no longer need
# the column and drop it straight away.
train_only_sold = train.loc[is_sold].copy().drop(columns='homeStatus')
train_only_sold.head()

In [None]:
# We suspected dateSold would not help us, but let's check its distribution first.
# matplotlib's histogram is all we need here.

plt.hist(x=train_only_sold['dateSold'])

In [None]:
# Raw timestamps are awkward to read, but the histogram shows most sales happened around
# the same time, so this column is unlikely to tell us anything. Drop it as well.
train_without_date = train_only_sold.drop(columns='dateSold')

train_without_date.head()

In [None]:
# While we're at it, let's drop address too.

train_without_address = train_without_date.drop(columns='address')

# It almost fits on one screen!
train_without_address.head()

In [None]:
# Let's also drop the id column.
train_without_id = train_without_address.drop(columns='id')

train_without_id.head()

In [None]:
# Now let's graph every numerical column, one histogram per figure.

numerical_variables = ['bathrooms', 'bedrooms', 'livingArea', 'lotArea', 'price']


for variable in numerical_variables:
    values = train_without_id[variable]
    plt.figure(figsize=(8, 3))
    plt.hist(values)
    plt.title(variable)
    plt.show()

In [None]:
# All of them are super skewed! This is usually an indicator that we have some outliers that are making analysis tricky
# There are a lot of ways to deal with outliers, but if you don't have a lot of them, one of the easiest methods is to just get rid of them!

# Before we get rid of them, there's some other work we should do

In [None]:
# Right now we have a lotArea column, but we also have a lotUnit column that tells us whether the lotArea is in
# square feet or acres

# There are 43560 square feet in each acre, so if the units is acre, we should multiply the area by 43560

def convert_lot_area(row):
    """Return the lot area of ``row`` expressed in square feet.

    ``row`` is any mapping-like object with ``'lotUnit'`` and ``'lotArea'``
    entries (for example a DataFrame row). When the unit is exactly
    ``'acres'`` the area is multiplied by 43560 (square feet per acre);
    for any other unit value the area is returned unchanged.
    """
    sqft_per_acre = 43560
    in_acres = row['lotUnit'] == 'acres'
    area = row['lotArea']
    return area * sqft_per_acre if in_acres else area

# Convert every row's lot area to square feet, overwriting the lotArea column.
# NOTE: lotUnit is left as-is, so running this cell a second time would multiply
# the acre rows by 43560 again -- run it exactly once per kernel session.
lot_area_sqft = train_without_id.apply(convert_lot_area, axis='columns')
train_without_id['lotArea'] = lot_area_sqft
train_without_id.head()

In [None]:
# This now looks a lot more normal! Most homes have small yards (or no yard at all),
# while a few have bigger yards.
plt.hist(x=train_without_id['lotArea'])

In [None]:
# Great! Now we're ready to deal with outliers. If you remember from stats class, an
# outlier is a value that is more than 3 standard deviations away from the mean.
# First step: how far is each home's bedroom count from the average bedroom count?

bedroom_counts = train_without_id['bedrooms']
distance_from_mean = np.abs(bedroom_counts - bedroom_counts.mean())

distance_from_mean

In [None]:
# The outlier cut-off for bedrooms: three standard deviations.
std_x_3 = 3 * train_without_id['bedrooms'].std()

std_x_3

In [None]:
# Keep only the bedroom counts that lie within the cut-off distance of the mean.
within_cutoff = distance_from_mean < std_x_3
bedrooms_wihtout_outliers = train_without_id.loc[within_cutoff, 'bedrooms']

# Before: the full bedrooms distribution.
plt.hist(train_without_id['bedrooms'])
plt.title('bedrooms with outliers')
plt.show()

# After: the same column with the outliers filtered out.
plt.title('bedrooms without outliers')
plt.hist(bedrooms_wihtout_outliers)

In [None]:
# The textbook rule calls a value an outlier when it is more than 3 standard deviations
# away from the mean. Here we deliberately use a looser cut-off (5 standard deviations)
# so that only the most extreme rows are removed.
# We can get both the mean and the standard deviation of our columns really easily!
OUTLIER_STD_THRESHOLD = 5

train_no_outliers = train_without_id.copy()

# Fill missing numeric values with the column mean so those rows are not silently
# discarded below (a NaN distance never compares as "< threshold").
# fillna returns a new object, so the result has to be assigned back; the means are
# taken over the numeric columns only because the frame still holds text columns.
numeric_means = train_no_outliers[numerical_variables].mean()
train_no_outliers[numerical_variables] = train_no_outliers[numerical_variables].fillna(numeric_means)

# Filter one column at a time; each pass recomputes the mean and standard deviation on
# the rows that survived the previous passes.
for variable in numerical_variables:
    column = train_no_outliers[variable]
    distance_from_mean = np.abs(column.mean() - column)
    train_no_outliers = train_no_outliers[distance_from_mean < (column.std() * OUTLIER_STD_THRESHOLD)]

In [None]:
# Report how many rows the outlier filter removed.
rows_before = len(train_without_id)
rows_after = len(train_no_outliers)
print(f'Rows before outlier removal: {rows_before}')
print(f'Rows after outlier removal: {rows_after}')

In [None]:
# Re-draw the histograms, this time on the outlier-free data and with 7 bins each.
for variable in numerical_variables:
    values = train_no_outliers[variable]
    plt.figure(figsize=(8, 3))
    plt.hist(values, bins=7)
    plt.title(variable)
    plt.show()

In [None]:
# Now we can do something called bivariate analysis--where we see how much two variables interact.
# Correlation is only defined for numeric data, and the frame still contains text columns
# (lotUnit, homeType, ...), so select the numeric columns explicitly before calling corr();
# recent pandas versions raise an error instead of silently skipping the text columns.

numeric_columns = train_no_outliers.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr())

In [None]:
# So bathrooms and livingArea seem to be moderately correlated with price, but zipcode and lotArea are hardly correlated. Can anyone guess why this is?

In [None]:
# So we've gotten a sense for our numerical data, but now we need to figure out what we're going to do with our categorical data
# State
# City
# Zip Code
# Home Type

In [None]:
# A strategy we use all the time is one-hot encoding. It works best when the variable has
# a limited set of possible values (say, under 100).
# One-hot encoding turns a single categorical variable with n values into n binary variables.

# So instead of

color_values = ['red', 'blue', 'red', 'green']
colors = pd.DataFrame(data={'color': color_values})

colors

In [None]:
# we get one boolean column per colour. Every row has exactly one True, in the column
# of that row's colour (the rows are: red, blue, red, green).

colors_one_hot = pd.DataFrame({
    'is_red': [True, False, True, False],
    'is_blue': [False, True, False, False],
    'is_green': [False, False, False, True],
})

colors_one_hot

In [None]:
# Luckily, pandas gives us a really easy way to do this: get_dummies builds the
# one-hot columns for the listed categorical columns.

pd.get_dummies(data=colors, columns=['color'])

In [None]:
# Let's one-hot encode homeType!

categorical_columns = ['homeType']
train_one_hot = pd.get_dummies(data=train_no_outliers, columns=categorical_columns)

train_one_hot

In [None]:
# Now what about zipcode?
# It really has too many values to one-hot encode effectively, and there's a chance we see new zip codes we haven't seen before
# This is a great chance to think about what zipcode will tell us
# Is there some other data related to zip code that would help us?

In [None]:
# Lookup table read from a local CSV file; it provides the zip_code and median_income
# columns used by the merge that follows.
MEDIAN_INCOME_CSV = 'median_income_by_zip_code.csv'
zip_code_df = pd.read_csv(MEDIAN_INCOME_CSV)

zip_code_df['median_income']

In [None]:
# Attach the median income of each home's zip code. A left join keeps every training
# row, even when its zip code is missing from the lookup table.
train_with_median_income = pd.merge(train_one_hot, zip_code_df, how='left', left_on='zipcode', right_on='zip_code')

# Homes whose zip code had no match get the average median income instead of NaN.
# Assign the filled column back: calling fillna(inplace=True) on a selected column is
# chained assignment and is not guaranteed to update the parent frame.
mean_median_income = train_with_median_income['median_income'].mean()
train_with_median_income['median_income'] = train_with_median_income['median_income'].fillna(mean_median_income)

# The zip code columns have served their purpose as join keys; drop both of them.
train_with_median_income = train_with_median_income.drop(columns=['zipcode', 'zip_code'])

In [None]:
# Display the resulting training frame: homeType is one-hot encoded and the zip code
# columns have been replaced by median_income.
train_with_median_income