In [0]:
import pandas as pd
import numpy as np
from scipy import stats

In [0]:
# Load the house-prices dataset from the course's public S3 bucket.
HOUSEPRICES_URL = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/houseprices.csv'
houseprices = pd.read_csv(HOUSEPRICES_URL)

# info() prints its summary itself, so it can run first; the preview must be the
# cell's LAST expression, otherwise the notebook discards it and nothing is shown.
houseprices.info()
houseprices.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
id               1460 non-null int64
mssubclass       1460 non-null int64
mszoning         1460 non-null object
lotfrontage      1201 non-null float64
lotarea          1460 non-null int64
street           1460 non-null object
alley            91 non-null object
lotshape         1460 non-null object
landcontour      1460 non-null object
utilities        1460 non-null object
lotconfig        1460 non-null object
landslope        1460 non-null object
neighborhood     1460 non-null object
condition1       1460 non-null object
condition2       1460 non-null object
bldgtype         1460 non-null object
housestyle       1460 non-null object
overallqual      1460 non-null int64
overallcond      1460 non-null int64
yearbuilt        1460 non-null int64
yearremodadd     1460 non-null int64
roofstyle        1460 non-null object
roofmatl         1460 non-null object
exterior1st      1460 non-n

In [0]:
# Question: how many rows of data does the table contain?
# The length of a DataFrame is its number of rows.
row_total = len(houseprices)
print("The table contains {} rows of data ".format(row_total))

The table contains 1460 rows of data 


In [0]:
# Question: are there any nulls in any columns?
# Build a per-column flag (True when the column holds at least one null) and keep
# the names of the flagged columns; a non-empty list means the answer is yes.
has_missing = houseprices.isnull().any()
missing_columns = houseprices.columns[has_missing].tolist()
print('Missing columns are : {}'.format(missing_columns))

Missing columns are : ['lotfrontage', 'alley', 'masvnrtype', 'masvnrarea', 'bsmtqual', 'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfintype2', 'electrical', 'fireplacequ', 'garagetype', 'garageyrblt', 'garagefinish', 'garagequal', 'garagecond', 'poolqc', 'fence', 'miscfeature']


In [0]:
# Question: how many distinct values are there in the mszoning column?
zoning_column = 'mszoning'
zoning_distinct = houseprices[zoning_column].nunique()
print("There are {} unique values in the {} column.".format(zoning_distinct, zoning_column))

There are 5 unique values in the mszoning column.


In [0]:
# Question: how many distinct values are there in the street column?
street_column = "street"
street_distinct = houseprices[street_column].nunique()
print("There are {} unique values in the {} column.".format(street_distinct, street_column))

There are 2 unique values in the street column.


In [0]:
# Question: what is the range of saleprice?
# The min and max rows of the summary statistics give the range.
sale_price = houseprices['saleprice']
sale_price.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: saleprice, dtype: float64

In [0]:
# Question: what is the range of lotarea?
# The min and max rows of the summary statistics give the range.
lot_area = houseprices['lotarea']
lot_area.describe()

count      1460.000000
mean      10516.828082
std        9981.264932
min        1300.000000
25%        7553.500000
50%        9478.500000
75%       11601.500000
max      215245.000000
Name: lotarea, dtype: float64

In [0]:
# List every column label so the exact spellings are available for later cells.
column_labels = houseprices.columns
column_labels

Index(['id', 'mssubclass', 'mszoning', 'lotfrontage', 'lotarea', 'street',
       'alley', 'lotshape', 'landcontour', 'utilities', 'lotconfig',
       'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype',
       'housestyle', 'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd',
       'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype',
       'masvnrarea', 'exterqual', 'extercond', 'foundation', 'bsmtqual',
       'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfinsf1',
       'bsmtfintype2', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'heating',
       'heatingqc', 'centralair', 'electrical', 'firstflrsf', 'secondflrsf',
       'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath',
       'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'kitchenqual',
       'totrmsabvgrd', 'functional', 'fireplaces', 'fireplacequ', 'garagetype',
       'garageyrblt', 'garagefinish', 'garagecars', 'garagearea', 'garagequal',
       'garagecond', 'paved

In [0]:
# Check whether either column used in the t-test below contains nulls.
# Each name is tested on its own: `'openporchsf' or 'saleprice'` evaluates to just
# 'openporchsf' (the first truthy operand), which would leave saleprice unchecked.
any(column in missing_columns for column in ('openporchsf', 'saleprice'))

False

There are no null values in the openporchsf and saleprice columns.


In [0]:
# Question: is there a statistically significant difference between the prices of
# houses that have an open porch and houses that do not?
# A house counts as having an open porch when openporchsf is non-zero. The two
# groups' sale prices are compared with scipy's independent two-sample t-test,
# using its default settings.
has_open_porch = houseprices['openporchsf'] != 0
openporch_houses = houseprices[has_open_porch]
no_openporch_houses = houseprices[~has_open_porch]
stats.ttest_ind(openporch_houses['saleprice'], no_openporch_houses['saleprice'])

Ttest_indResult(statistic=17.31355834212178, pvalue=3.209770265256579e-61)

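The t-statistic is 17.3, meaning the difference between the two group means is about 17.3 standard errors away from zero (not 17.3 standard deviations). The p-value is extremely small (about 3.2e-61), so the difference in sale price between houses with and without an open porch is statistically significant.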

In [0]:
# Question: is there a significant correlation between lotarea and saleprice, and
# is it positive or negative?
# pearsonr reports the correlation coefficient (its sign gives the direction)
# together with a p-value.
lot_area = houseprices['lotarea']
sale_price = houseprices['saleprice']
stats.pearsonr(lot_area, sale_price)

(0.2638433538714057, 1.1231391549193063e-24)

The correlation coefficient between the lotarea and saleprice columns is 0.26 and the p-value is extremely small. We can say that there is a statistically significant, though weak, positive correlation between the two columns.