In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): with no category given, this suppresses every warning for the
# whole session, including pandas warnings raised by later cells.
# Consider narrowing it to the specific category that is too noisy.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Let pandas print up to 100 rows before truncating displayed output.
pd.set_option('display.max_rows', 100)

In [2]:
import wrangle_zillow as w

## Acquire:

In [3]:
# Load the Zillow data through the project's wrangle_zillow module.
# The message printed below shows that this run read a cached CSV;
# how that cache is produced is defined in wrangle_zillow, not in this notebook.
df = w.acquire_zillow()

Using cached csv


In [4]:
# Preview the first rows to see which columns were returned and how sparse they are.
df.head()

Unnamed: 0,id,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,...,censustractandblock,logerror,transactiondate,airconditioningdesc,architecturalstyledesc,buildingclassdesc,heatingorsystemdesc,propertylandusedesc,storydesc,typeconstructiondesc
0,1727539,14297519,,,,3.5,4.0,,,3.5,...,60590630000000.0,0.025595,2017-01-01,,,,,Single Family Residential,,
1,1387261,17052889,,,,1.0,2.0,,,1.0,...,61110010000000.0,0.055619,2017-01-01,,,,,Single Family Residential,,
2,11677,14186244,,,,2.0,3.0,,,2.0,...,60590220000000.0,0.005383,2017-01-01,,,,,Single Family Residential,,
3,2288172,12177905,,,,3.0,4.0,,8.0,3.0,...,60373000000000.0,-0.10341,2017-01-01,,,,Central,Single Family Residential,,
4,1970746,10887214,1.0,,,3.0,3.0,,8.0,3.0,...,60371240000000.0,0.00694,2017-01-01,Central,,,Central,Condominium,,


In [5]:
# (rows, columns) of the data as acquired — the baseline for the filtering below.
df.shape

(77380, 68)

## Prepare:

In [6]:
# Keep only single-unit properties by excluding these propertylandusetypeid codes.
# NOTE(review): presumably the multi-unit / mixed-use land-use codes — confirm
# each one against the Zillow data dictionary.
not_single = [246, 248, 247, 267, 31]
# .copy() makes the filtered frame independent of the original, so the column
# assignments made later in the notebook write to a real frame instead of a
# slice (which pandas reports as SettingWithCopyWarning).
df = df[~df.propertylandusetypeid.isin(not_single)].copy()

In [7]:
# (rows, columns) after removing the non-single-unit land-use types; only the row count should change.
df.shape

(74065, 68)

In [8]:
# Summarise missing values per column. The result is indexed by column name and
# exposes a pct_rows_null column; both are used below to pick the mostly-null columns.
col_nulls = w.nulls_by_col(df)

In [9]:
# Row-wise view of missingness: how many rows are missing a given number (and share) of columns.
w.nulls_by_rows(df)

Unnamed: 0,num_cols_missing,num_rows,pct_cols_missing
0,23,2,0.338235
1,24,13,0.352941
2,25,24,0.367647
3,26,65,0.382353
4,27,316,0.397059
5,28,454,0.411765
6,29,5263,0.426471
7,30,3443,0.441176
8,31,9789,0.455882
9,32,12479,0.470588


In [10]:
# Show the value distribution (NaN included) of every column whose share of
# null rows is above the cutoff, i.e. the columns a 50% requirement would drop.
# NOTE(review): the cutoff used here is .49 rather than .5 — confirm the margin is intended.
mostly_null = col_nulls.pct_rows_null > .49
poss_imputes = col_nulls.index[mostly_null]
for col in poss_imputes:
    print(col, df[col].value_counts(dropna=False), '----------', sep='\n')

airconditioningtypeid
NaN     49335
1.0     22943
13.0     1567
5.0       167
11.0       53
Name: airconditioningtypeid, dtype: int64
----------
architecturalstyletypeid
NaN     73859
7.0       172
8.0        19
21.0        7
2.0         5
3.0         3
Name: architecturalstyletypeid, dtype: int64
----------
basementsqft
NaN       74015
900.0         2
700.0         2
100.0         2
640.0         2
515.0         2
273.0         2
912.0         2
588.0         1
819.0         1
1809.0        1
604.0         1
126.0         1
669.0         1
314.0         1
800.0         1
786.0         1
396.0         1
112.0         1
645.0         1
384.0         1
1969.0        1
252.0         1
600.0         1
1218.0        1
280.0         1
3112.0        1
1416.0        1
224.0         1
512.0         1
380.0         1
204.0         1
200.0         1
90.0          1
405.0         1
300.0         1
674.0         1
1252.0        1
254.0         1
3560.0        1
352.0         1
168.0         1
394.0

1. fireplacecnt - all NaN can be 0, not encoded
2. hashottuborspa - all NaN can be 0, encode
3. poolcnt - all NaN can be 0, encode
4. threequarterbathnbr - NaN can be 0, not encoded
5. taxdelinquencyflag - NaN can be No, can change to 1/0 encoded

The other columns are not worth 'saving' because they hold redundant information, cannot or need not be imputed, or have too many unexplained nulls.

In [11]:
# Columns in which a missing value is taken to mean "none" and is filled with 0 below.
cols = [
    'fireplacecnt',
    'hashottuborspa',
    'poolcnt',
    'threequarterbathnbr',
    'taxdelinquencyflag',
]

In [12]:
# Replace the missing entries of the selected columns with 0 in a single step.
# Note: taxdelinquencyflag keeps its existing 'Y' values, so after this fill it
# holds both 0 and 'Y' and still needs to be encoded.
df[cols] = df[cols].fillna(value=0)

In [13]:
# Check the fill: none of the printed distributions should contain a NaN entry any more.
for col in cols:
    filled_counts = df[col].value_counts(dropna=False)
    print(filled_counts)

0.0    65804
1.0     6996
2.0      994
3.0      233
4.0       35
5.0        3
Name: fireplacecnt, dtype: int64
0.0    72528
1.0     1537
Name: hashottuborspa, dtype: int64
0.0    57973
1.0    16092
Name: poolcnt, dtype: int64
0.0    63972
1.0    10009
2.0       75
3.0        8
7.0        1
Name: threequarterbathnbr, dtype: int64
0    71417
Y     2648
Name: taxdelinquencyflag, dtype: int64


In [15]:
# Drop the columns and rows that do not reach the 50% non-null threshold.
# NOTE(review): both thresholds are passed positionally; which one applies to
# columns and which to rows is defined in wrangle_zillow — confirm the order there.
df = w.handle_missing_values(df, .5, .5)

In [16]:
# (rows, columns) after the missing-value thresholds were applied; both counts are expected to shrink.
df.shape

(73668, 39)

In [18]:
# Review the remaining columns, their non-null counts and dtypes before choosing what else to drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73668 entries, 0 to 77379
Data columns (total 39 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            73668 non-null  int64  
 1   parcelid                      73668 non-null  int64  
 2   bathroomcnt                   73668 non-null  float64
 3   bedroomcnt                    73668 non-null  float64
 4   buildingqualitytypeid         46900 non-null  float64
 5   calculatedbathnbr             73603 non-null  float64
 6   calculatedfinishedsquarefeet  73664 non-null  float64
 7   finishedsquarefeet12          73465 non-null  float64
 8   fips                          73668 non-null  float64
 9   fireplacecnt                  73668 non-null  float64
 10  fullbathcnt                   73603 non-null  float64
 11  hashottuborspa                73668 non-null  float64
 12  heatingorsystemtypeid         48505 non-null  float64
 13  l

In [19]:
# Columns (not rows) to remove because they are not useful, are redundant with
# another column, or would leak information about the target.
to_drop = [
    'id',
    'parcelid',
    'calculatedbathnbr',
    'finishedsquarefeet12',
    'lotsizesquarefeet',
    'propertycountylandusecode',
    'propertylandusetypeid',
    'propertyzoningdesc',
    'rawcensustractandblock',
    'structuretaxvaluedollarcnt',
    'taxvaluedollarcnt',
    'assessmentyear',
    'landtaxvaluedollarcnt',
    'taxamount',
    'censustractandblock',
    'transactiondate',
    'heatingorsystemdesc',
    'propertylandusedesc',
]

In [20]:
# Remove the listed columns. Re-running this cell on the already reduced frame
# raises a KeyError because the labels are no longer present.
df = df.drop(to_drop, axis='columns')

In [21]:
# (rows, columns) after the column drop; the row count should be unchanged.
df.shape

(73668, 21)

In [22]:
# Inspect the remaining columns and their non-null counts to see which still contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73668 entries, 0 to 77379
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   bathroomcnt                   73668 non-null  float64
 1   bedroomcnt                    73668 non-null  float64
 2   buildingqualitytypeid         46900 non-null  float64
 3   calculatedfinishedsquarefeet  73664 non-null  float64
 4   fips                          73668 non-null  float64
 5   fireplacecnt                  73668 non-null  float64
 6   fullbathcnt                   73603 non-null  float64
 7   hashottuborspa                73668 non-null  float64
 8   heatingorsystemtypeid         48505 non-null  float64
 9   latitude                      73668 non-null  float64
 10  longitude                     73668 non-null  float64
 11  poolcnt                       73668 non-null  float64
 12  regionidcity                  72241 non-null  float64
 13  r