In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# ignore warnings
import warnings
warnings.filterwarnings("ignore")
import prepare
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the Zillow data through the project-local `prepare` module; whatever
# wrangle_zillow() acquires and cleans is defined there, not in this notebook.
df = prepare.wrangle_zillow()

In [None]:

# Preview the first rows of the wrangled frame.
df.head()

In [None]:
# Split into train and test sets. random_state=123 pins the shuffle so the
# split is the same on every run; no test_size is passed, so scikit-learn's
# default proportion applies. The exploration below works on `train` only.
train, test = train_test_split(df, random_state=123)

In [None]:
# Count training rows per FIPS code.
train.fips.value_counts()

In [None]:
# Convert FIPS codes to county names.
# Reference: https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt
#
# The labels are used verbatim by later cells that filter on fips, so they
# must not be changed here without updating those filters.
FIPS_TO_COUNTY = {
    6037.0: 'Los_Angeles_County',
    6059.0: 'Orange_County',
    6111.0: 'Ventura',
}

# Build a new frame with assign() rather than assigning through attribute
# access on the frame returned by train_test_split: that is the
# chained-assignment pattern pandas warns about, and the warning is hidden by
# the notebook-wide warnings filter. Codes that are not in the mapping are left
# as they are by replace().
train = train.assign(fips=train.fips.replace(FIPS_TO_COUNTY))
train.head()

In [None]:
# These columns are identifiers, not quantities: store them as object so they
# are left out of the numeric-column selection used for scaling.
id_columns = ['fips', 'regionidcity', 'regionidcounty', 'regionidzip']
train = train.astype({name: 'object' for name in id_columns})

In [None]:
# Min-max scale every numeric column of the training frame.
# NOTE(review): this covers all numeric columns, so if logerror, latitude and
# longitude are numeric they are rescaled too and the later plots and means
# are on the scaled values -- confirm that is intended.
numeric_columns = train.select_dtypes('number').columns.tolist()
scaler = MinMaxScaler().fit(train[numeric_columns])
train[numeric_columns] = scaler.transform(train[numeric_columns])

In [None]:
# Count training rows per zip-code id.
train.regionidzip.value_counts()

In [None]:
# Preview the training frame after the type casts and scaling above.
train.head()

In [None]:
# Columns kept for exploration and hypothesis testing.
analysis_columns = [
    "bathroomcnt",
    "bedroomcnt",
    "lotsizesquarefeet",
    "yearbuilt",
    "calculatedfinishedsquarefeet",
    "taxamount",
    "taxvaluedollarcnt",
    "logerror",
    "fips",
    "longitude",
    "latitude",
    "regionidzip",
]

# From here on `df` is this subset of the training frame, no longer the full
# frame loaded at the top of the notebook.
df = train[analysis_columns]

In [None]:
# Pairwise plots of the selected columns, coloured by county (fips).
sns.pairplot(df, hue="fips")

In [None]:
# Correlation heatmap of the numeric features.
# df also holds non-numeric columns (fips is mapped to county-name strings and
# regionidzip is cast to object earlier). pandas >= 2.0 no longer drops such
# columns silently inside DataFrame.corr() and raises instead, so restrict the
# frame to numeric columns explicitly before correlating.
numeric_features = df.select_dtypes('number')
sns.heatmap(numeric_features.corr(), cmap='Blues', annot=True, center=0)

In [None]:
# Plot each property by its coordinates.
sns.scatterplot(data=df, x='longitude', y='latitude')
plt.show()

In [None]:
# Same coordinate plot on a larger canvas: colour encodes county (fips) and
# marker size encodes taxvaluedollarcnt.
plt.figure(figsize=(14, 8))
sns.scatterplot(
    data=df,
    x='longitude',
    y='latitude',
    hue='fips',
    size='taxvaluedollarcnt',
    sizes=(20, 200),
)
plt.legend()
plt.show()

In [None]:
# Mean logerror over every row of the analysis frame.
population_logerror_mean = df.logerror.mean()

# Mean logerror per zip code, as a two-column frame (regionidzip, logerror).
# Only `logerror` is selected: the grouping key must not be averaged itself,
# and the tuple-style selection ['regionidzip', 'logerror'] used previously was
# removed in pandas 2.0. reset_index() turns the group key back into a column.
zipcode_logerror_mean = (
    df.groupby("regionidzip")["logerror"]
    .mean()
    .reset_index()
)

In [None]:
# Display the overall mean logerror computed above.
# NOTE(review): if logerror was among the numeric columns that were min-max
# scaled earlier, this is the mean of the scaled values -- confirm.
f"{population_logerror_mean} is the logerror of the population"

In [None]:
# Preview the per-zip-code mean logerror table.
zipcode_logerror_mean.head()

$H_0$: There is no difference between the mean logerror of the population and the mean logerror of a specific zip code.

$H_a$: There is a difference between the mean logerror of the population and the mean logerror of a specific zip code.

In [None]:
# Every distinct zip-code id present in the analysis frame.
zipcodes = df["regionidzip"].unique()

# Significance threshold used for the hypothesis tests.
alpha = 0.01

In [None]:
# Test a single zip code: the first one in the list of unique ids.
zipcode = zipcodes[0]
zipcode_logerrors = df.loc[df.regionidzip == zipcode, "logerror"]

# One-sample t-test: logerror values of this zip code against the overall
# mean logerror.
t, p = stats.ttest_1samp(zipcode_logerrors, population_logerror_mean)

In [None]:
# Display the p-value of the one-sample t-test.
p

In [None]:
# Compare the p-value with the significance threshold and report the decision.
decision = "Reject the null hypothesis" if p < alpha else "Failed to reject the null hypothesis"
print(decision)

### Is logerror significantly different for properties in LA County vs Orange County vs Ventura County?

In [None]:
# One sub-frame per county, keyed on the county labels stored in fips.
county = df["fips"]
la = df.loc[county == "Los_Angeles_County"]
orange = df.loc[county == "Orange_County"]
ventura = df.loc[county == "Ventura"]

$H_0$: There is no difference in mean logerror between LA County and Orange County.

$H_a$: There is a difference in mean logerror between LA County and Orange County.

In [None]:
# LA County vs. Orange County: independent two-sample t-test on logerror.
# NOTE(review): ttest_ind is called with its defaults; confirm that the
# equal-variance assumption is acceptable for these two groups.
alpha = .01  # significance level, i.e. a 99% confidence level

t, p = stats.ttest_ind(la.logerror, orange.logerror)

outcome = "Reject the null hypothesis" if p < alpha else "Fail to reject the null hypothesis"
print(outcome)

Takeaway: We reject the null hypothesis, which means there is a statistically significant difference in mean logerror between LA County and Orange County.

$H_0$: There is no difference in mean logerror between LA County and Ventura County.

$H_a$: There is a difference in mean logerror between LA County and Ventura County.

In [None]:
# LA County vs. Ventura County: independent two-sample t-test on logerror.
# NOTE(review): ttest_ind is called with its defaults; confirm that the
# equal-variance assumption is acceptable for these two groups.
alpha = .01  # significance level

t, p = stats.ttest_ind(la.logerror, ventura.logerror)

outcome = "Reject the null hypothesis" if p < alpha else "Failed to reject the null hypothesis"
print(outcome)

$H_0$: There is no difference in mean logerror between Orange County and Ventura County.

$H_a$: There is a difference in mean logerror between Orange County and Ventura County.

In [None]:
# Orange County vs. Ventura County: independent two-sample t-test on logerror.
# NOTE(review): ttest_ind is called with its defaults; confirm that the
# equal-variance assumption is acceptable for these two groups.
alpha = .01  # significance level

t, p = stats.ttest_ind(orange.logerror, ventura.logerror)

outcome = "Reject the null hypothesis" if p < alpha else "Failed to reject the null hypothesis"
print(outcome)

Conclusions: There is a statistically significant difference in mean logerror between LA County and Orange County; no statistically significant difference was found in the other comparisons tested.