In [2]:
# regular imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import env

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

import warnings
warnings.filterwarnings("ignore")

# Wrangling
from sklearn.model_selection import train_test_split

In [3]:
# from our acquire.py:
def get_connection(db, user=env.user, host=env.host, password=env.password):
    return f'mysql+pymysql://{user}:{password}@{host}/{db}'
    
sql_query = '''
SELECT * FROM properties_2017
JOIN predictions_2017 USING (parcelid)
WHERE transactiondate < '2018'
AND propertylandusetypeid = 261;
'''
    
def get_zillow_data():
    df = pd.read_sql(sql_query, get_connection('zillow'))
    df = df.drop(columns='id')
    return df

In [4]:
df = get_zillow_data()

In [5]:
df.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate
0,14297519,,,,3.5,4.0,,,3.5,,...,485713.0,1023282.0,2016.0,537569.0,11013.72,,,60590630072012.0,0.03,2017-01-01
1,17052889,,,,1.0,2.0,,,1.0,,...,88000.0,464000.0,2016.0,376000.0,5672.48,,,61110010023006.0,0.06,2017-01-01
2,14186244,,,,2.0,3.0,,,2.0,,...,85289.0,564778.0,2016.0,479489.0,6488.3,,,60590218022012.0,0.01,2017-01-01
3,12177905,,,,3.0,4.0,,8.0,3.0,,...,108918.0,145143.0,2016.0,36225.0,1777.51,,,60373001001006.0,-0.1,2017-01-01
4,12095076,1.0,,,3.0,4.0,,9.0,3.0,,...,276684.0,773303.0,2016.0,496619.0,9516.26,,,60374608001014.0,-0.0,2017-01-01


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52441 entries, 0 to 52440
Data columns (total 60 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      52441 non-null  int64  
 1   airconditioningtypeid         13638 non-null  float64
 2   architecturalstyletypeid      70 non-null     float64
 3   basementsqft                  47 non-null     float64
 4   bathroomcnt                   52441 non-null  float64
 5   bedroomcnt                    52441 non-null  float64
 6   buildingclasstypeid           0 non-null      object 
 7   buildingqualitytypeid         33740 non-null  float64
 8   calculatedbathnbr             52304 non-null  float64
 9   decktypeid                    389 non-null    float64
 10  finishedfloor1squarefeet      4381 non-null   float64
 11  calculatedfinishedsquarefeet  52359 non-null  float64
 12  finishedsquarefeet12          52194 non-null  float64
 13  f

In [7]:
df.describe()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingqualitytypeid,calculatedbathnbr,decktypeid,finishedfloor1squarefeet,...,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyyear,censustractandblock,logerror
count,52441.0,13638.0,70.0,47.0,52441.0,52441.0,33740.0,52304.0,389.0,4381.0,...,14561.0,81.0,52357.0,52440.0,52441.0,52440.0,52437.0,2079.0,52318.0,52441.0
mean,12999115.87,2.44,7.1,678.98,2.3,3.3,6.26,2.3,66.0,1545.61,...,1.41,1.0,196533.75,529688.16,2016.0,333465.48,6453.51,14.1,60502401189819.95,0.02
std,3411444.02,3.85,2.67,711.83,1.02,0.95,1.72,1.02,0.0,674.87,...,0.54,0.0,254340.14,751894.55,0.0,570527.8,8755.57,2.39,1859036273451.59,0.18
min,10711855.0,1.0,2.0,38.0,0.0,0.0,1.0,1.0,66.0,184.0,...,1.0,1.0,129.0,1000.0,2016.0,161.0,49.18,4.0,60371011101000.0,-4.66
25%,11510217.0,1.0,7.0,263.5,2.0,3.0,5.0,2.0,66.0,1151.0,...,1.0,1.0,77071.0,193747.5,2016.0,76020.75,2656.22,14.0,60374012032004.25,-0.02
50%,12577645.0,1.0,7.0,512.0,2.0,3.0,6.0,2.0,66.0,1410.0,...,1.0,1.0,131811.0,373612.0,2016.0,218032.0,4647.74,15.0,60376203011013.5,0.01
75%,14129530.0,1.0,7.0,809.5,3.0,4.0,8.0,3.0,66.0,1759.0,...,2.0,1.0,226334.0,619301.25,2016.0,408606.75,7377.5,15.0,60590423261751.25,0.04
max,167687839.0,13.0,21.0,3560.0,18.0,14.0,12.0,18.0,66.0,6912.0,...,6.0,1.0,9164901.0,49061236.0,2016.0,48952198.0,586639.3,99.0,483030105084015.06,5.26


In [8]:
print(f'r0ws: {df.shape[0]}')
print(f'co1umns: {df.shape[1]}')

r0ws: 52441
co1umns: 60


In [9]:
for col in df.columns:
    print(col)
    print(df[col].value_counts())

parcelid
11991059    3
11957553    2
12478591    2
12035592    2
12443331    2
           ..
11494547    1
10976762    1
12650619    1
12650850    1
12826780    1
Name: parcelid, Length: 52320, dtype: int64
airconditioningtypeid
 1.00    11895
13.00     1568
 5.00      159
11.00       16
Name: airconditioningtypeid, dtype: int64
architecturalstyletypeid
 7.00    62
 3.00     3
 2.00     2
21.00     2
 8.00     1
Name: architecturalstyletypeid, dtype: int64
basementsqft
  900.00    2
  640.00    2
  100.00    2
  515.00    2
  273.00    2
  912.00    2
  314.00    1
  819.00    1
1,809.00    1
  604.00    1
  126.00    1
  588.00    1
  786.00    1
  669.00    1
  800.00    1
  396.00    1
  112.00    1
  645.00    1
  384.00    1
1,969.00    1
  252.00    1
  600.00    1
1,218.00    1
  280.00    1
  300.00    1
  224.00    1
  512.00    1
  380.00    1
  204.00    1
  200.00    1
   90.00    1
3,112.00    1
  405.00    1
1,252.00    1
  254.00    1
3,560.00    1
  352.00    1
  168.00

Nulls By Column

Sure, using built in methods is easy enough. But what about getting our nulls by column and nulls by row?

Let's look at nulls by column. Let's start by using .isnull():

In [10]:
df.isnull().head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate
0,False,True,True,True,False,False,True,True,False,True,...,False,False,False,False,False,True,True,False,False,False
1,False,True,True,True,False,False,True,True,False,True,...,False,False,False,False,False,True,True,False,False,False
2,False,True,True,True,False,False,True,True,False,True,...,False,False,False,False,False,True,True,False,False,False
3,False,True,True,True,False,False,True,False,False,True,...,False,False,False,False,False,True,True,False,False,False
4,False,False,True,True,False,False,True,False,False,True,...,False,False,False,False,False,True,True,False,False,False


In [11]:
df.head()

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate
0,14297519,,,,3.5,4.0,,,3.5,,...,485713.0,1023282.0,2016.0,537569.0,11013.72,,,60590630072012.0,0.03,2017-01-01
1,17052889,,,,1.0,2.0,,,1.0,,...,88000.0,464000.0,2016.0,376000.0,5672.48,,,61110010023006.0,0.06,2017-01-01
2,14186244,,,,2.0,3.0,,,2.0,,...,85289.0,564778.0,2016.0,479489.0,6488.3,,,60590218022012.0,0.01,2017-01-01
3,12177905,,,,3.0,4.0,,8.0,3.0,,...,108918.0,145143.0,2016.0,36225.0,1777.51,,,60373001001006.0,-0.1,2017-01-01
4,12095076,1.0,,,3.0,4.0,,9.0,3.0,,...,276684.0,773303.0,2016.0,496619.0,9516.26,,,60374608001014.0,-0.0,2017-01-01


In [14]:
df.isnull().sum()[:11]

parcelid                        0
airconditioningtypeid       38803
architecturalstyletypeid    52371
basementsqft                52394
bathroomcnt                     0
bedroomcnt                      0
buildingclasstypeid         52441
buildingqualitytypeid       18701
calculatedbathnbr             137
decktypeid                  52052
finishedfloor1squarefeet    48060
dtype: int64

In [15]:
df.shape[0] # Number of rows

52441

In [17]:
(df.isnull().sum()/df.shape[0]*100)[:] # Percentage of nulls in each column

parcelid                                       0.00
airconditioningtypeid                         73.99
architecturalstyletypeid                      99.87
basementsqft                                  99.91
bathroomcnt                                    0.00
bedroomcnt                                     0.00
buildingclasstypeid                          100.00
buildingqualitytypeid                         35.66
calculatedbathnbr                              0.26
decktypeid                                    99.26
finishedfloor1squarefeet                      91.65
calculatedfinishedsquarefeet                   0.16
finishedsquarefeet12                           0.47
finishedsquarefeet13                         100.00
finishedsquarefeet15                         100.00
finishedsquarefeet50                          91.65
finishedsquarefeet6                           99.69
fips                                           0.00
fireplacecnt                                  86.19
fullbathcnt 

In [18]:
nulls_col = pd.DataFrame({'num_rows_missing': df.isnull().sum(), 
              'percent_rows_missing': (df.isnull().sum() / df.shape[0] * 100)})

nulls_col.sort_values(by='num_rows_missing', ascending=False).head(10)

Unnamed: 0,num_rows_missing,percent_rows_missing
finishedsquarefeet15,52441,100.0
buildingclasstypeid,52441,100.0
finishedsquarefeet13,52441,100.0
storytypeid,52394,99.91
basementsqft,52394,99.91
yardbuildingsqft26,52378,99.88
architecturalstyletypeid,52371,99.87
typeconstructiontypeid,52365,99.86
fireplaceflag,52360,99.85
finishedsquarefeet6,52276,99.69


### nulls_by_col function

In [19]:
def nulls_by_col(df):
    num_missing = df.isnull().sum()
    rows = df.shape[0]
    prcnt_miss = num_missing / rows * 100
    cols_missing = pd.DataFrame({'num_rows_missing': num_missing, 'percent_rows_missing': prcnt_miss})
    return cols_missing.sort_values(by='num_rows_missing', ascending=False)

Nulls by Row



In [20]:
# look at the number/percent of nulls in each row
df.isnull().sum(axis=1)

0        30
1        27
2        28
3        27
4        25
         ..
52436    29
52437    28
52438    26
52439    28
52440    29
Length: 52441, dtype: int64

In [21]:
#both NaN and None are considered as null
df.head(1)[df.head(1).isna()]

Unnamed: 0,parcelid,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,calculatedbathnbr,decktypeid,...,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,logerror,transactiondate
0,,,,,,,,,,,...,,,,,,,,,,


In [23]:
#into a percentage:
df.isnull().sum(axis=1) / df.shape[1] * 100

0                      50.00
1                      45.00
2                      46.67
3                      45.00
4                      41.67
                ...         
52436                  48.33
52437                  46.67
52438                  43.33
52439                  46.67
52440                  48.33
Length: 52441, dtype: float64

In [24]:
nulls_row = pd.DataFrame({'num_cols_missing': df.isnull().sum(axis=1),
              'percent_cols_missing': df.isnull().sum(axis=1)/df.shape[1]*100})

nulls_row

Unnamed: 0,num_cols_missing,percent_cols_missing
0,30,50.00
1,27,45.00
2,28,46.67
3,27,45.00
4,25,41.67
...,...,...
52436,29,48.33
52437,28,46.67
52438,26,43.33
52439,28,46.67


In [None]:
#if we wanted parcel id back in
# nulls_row = df.merge(nulls_row,
#                         left_index=True,
#                         right_index=True)[['parcelid', 'num_cols_missing', 'percent_cols_missing']]

# nulls_row

In [25]:
nulls_row.sort_values(by='num_cols_missing', ascending=False)

Unnamed: 0,num_cols_missing,percent_cols_missing
28116,42,70.00
9001,42,70.00
41468,41,68.33
13429,40,66.67
48597,39,65.00
...,...,...
42219,20,33.33
21757,20,33.33
20841,20,33.33
10432,19,31.67


### nulls_by_row function

In [31]:
def nulls_by_row(df):
    num_missing = df.isnull().sum(axis=1)
    prcnt_miss = num_missing / df.shape[1] * 100
    rows_missing = pd.DataFrame({'num_cols_missing': num_missing, 'percent_cols_missing': prcnt_miss})
#     rows_missing = df.merge(rows_missing,
#                         left_index=True,
#                         right_index=True)[['parcelid', 'num_cols_missing', 'percent_cols_missing']]
    return rows_missing.sort_values(by='num_cols_missing', ascending=False)

### summarize function

In [32]:
def summarize(df):
    '''
    summarize will take in a single argument (a pandas dataframe) 
    and output to console various statistics on said dataframe, including:
    # .head()
    # .info()
    # .describe()
    # .value_counts()
    # observation of nulls in the dataframe
    '''
    print('SUMMARY REPORT')
    print('=====================================================\n\n')
    print('Dataframe head: ')
    print(df.head(3))
    print('=====================================================\n\n')
    print('Dataframe info: ')
    print(df.info())
    print('=====================================================\n\n')
    print('Dataframe Description: ')
    print(df.describe())
    num_cols = [col for col in df.columns if df[col].dtype != 'O']
    cat_cols = [col for col in df.columns if col not in num_cols]
    print('=====================================================')
    print('DataFrame value counts: ')
    for col in df.columns:
        if col in cat_cols:
            print(df[col].value_counts(), '\n')
        else:
            print(df[col].value_counts(bins=10, sort=False), '\n')
    print('=====================================================')
    print('nulls in dataframe by column: ')
    print(nulls_by_col(df))
    print('=====================================================')
    print('nulls in dataframe by row: ')
    print(nulls_by_row(df))
    print('=====================================================')

In [33]:
summarize(df)

SUMMARY REPORT


Dataframe head: 
   parcelid  airconditioningtypeid  architecturalstyletypeid  basementsqft  \
0  14297519                    NaN                       NaN           NaN   
1  17052889                    NaN                       NaN           NaN   
2  14186244                    NaN                       NaN           NaN   

           bathroomcnt           bedroomcnt buildingclasstypeid  \
0                 3.50                 4.00                None   
1                 1.00                 2.00                None   
2                 2.00                 3.00                None   

   buildingqualitytypeid    calculatedbathnbr  decktypeid  ...  \
0                    NaN                 3.50         NaN  ...   
1                    NaN                 1.00         NaN  ...   
2                    NaN                 2.00         NaN  ...   

   structuretaxvaluedollarcnt    taxvaluedollarcnt       assessmentyear  \
0                  485,713.00         1,023,

(106.198, 2308.1]     39149
(2308.1, 4488.2]      11821
(4488.2, 6668.3]       1003
(6668.3, 8848.4]        160
(8848.4, 11028.5]        42
(11028.5, 13208.6]       12
(13208.6, 15388.7]        2
(15388.7, 17568.8]        2
(17568.8, 19748.9]        1
(19748.9, 21929.0]        2
Name: finishedsquarefeet12, dtype: int64 

Series([], Name: finishedsquarefeet13, dtype: int64) 

Series([], Name: finishedsquarefeet15, dtype: int64) 

(171.716, 1412.3]     2185
(1412.3, 2640.6]      1905
(2640.6, 3868.9]       210
(3868.9, 5097.2]        57
(5097.2, 6325.5]        13
(6325.5, 7553.8]         7
(7553.8, 8782.1]         2
(8782.1, 10010.4]        1
(10010.4, 11238.7]       0
(11238.7, 12467.0]       1
Name: finishedsquarefeet50, dtype: int64 

(374.781, 901.8]    57
(901.8, 1423.6]     72
(1423.6, 1945.4]    16
(1945.4, 2467.2]     8
(2467.2, 2989.0]     4
(2989.0, 3510.8]     3
(3510.8, 4032.6]     2
(4032.6, 4554.4]     2
(4554.4, 5076.2]     0
(5076.2, 5598.0]     1
Name: finishedsquarefeet

(-4.666, -3.664]        2
(-3.664, -2.672]        0
(-2.672, -1.68]        26
(-1.68, -0.688]       109
(-0.688, 0.304]     50948
(0.304, 1.296]       1221
(1.296, 2.287]         76
(2.287, 3.279]         57
(3.279, 4.271]          1
(4.271, 5.263]          1
Name: logerror, dtype: int64 

2017-06-30    810
2017-04-28    617
2017-05-31    566
2017-07-28    546
2017-08-31    528
             ... 
2017-05-07      1
2017-05-13      1
2017-05-20      1
2017-06-11      1
2017-09-25      1
Name: transactiondate, Length: 257, dtype: int64 

nulls in dataframe by column: 
                              num_rows_missing  percent_rows_missing
finishedsquarefeet15                     52441                100.00
buildingclasstypeid                      52441                100.00
finishedsquarefeet13                     52441                100.00
storytypeid                              52394                 99.91
basementsqft                             52394                 99.91
yardbuildingsqf

In [46]:
#remove columns
def remove_columns(df, cols_to_remove):
    df = df.drop(columns = cols_to_remove)
    return df

In [47]:
#handle missing values
def handle_missing_values(df, prop_required_columns =0.5, prop_required_row=0.75):
    threshold = int(round(prop_required_columns * len(df.index), 0))
    #axis 1 : drop columns that have missing values
    df = df.dropna(axis=1, thresh=threshold)
    threshold = int(round(prop_required_row * len(df.columns),0))
    #axis 0 : drop rows that have missing values
    df = df.dropna(axis=1, thresh=threshold)
    return df

In [48]:
# combining everything in a cleaning function:
def data_prep(df, cols_to_remove=[], prop_required_column=0.5, prop_required_row=0.75):
    df = remove_columns(df, cols_to_remove)
    df = handle_missing_values(df, prop_required_column, prop_required_row)
    return df

In [49]:
df = data_prep(df)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52441 entries, 0 to 52440
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   parcelid                      52441 non-null  int64  
 1   bathroomcnt                   52441 non-null  float64
 2   bedroomcnt                    52441 non-null  float64
 3   buildingqualitytypeid         33740 non-null  float64
 4   calculatedbathnbr             52304 non-null  float64
 5   calculatedfinishedsquarefeet  52359 non-null  float64
 6   finishedsquarefeet12          52194 non-null  float64
 7   fips                          52441 non-null  float64
 8   fullbathcnt                   52304 non-null  float64
 9   heatingorsystemtypeid         33935 non-null  float64
 10  latitude                      52441 non-null  float64
 11  longitude                     52441 non-null  float64
 12  lotsizesquarefeet             52072 non-null  float64
 13  p