# Cleaning house listings for sale

In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')
#!pip install pydotplus

from IPython.display import Image, display #for tree plot 
import pydotplus 
from scipy import misc

import plotly.express as px

import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10, 6)


## Import the data

In [2]:
# Read the raw listings file; the comma delimiter is pandas' default.
# low_memory=False reads the file without chunked type inference.
df = pd.read_csv('RealEstateNewYork.csv', low_memory=False)

In [3]:
df.shape

(8652, 15)

In [4]:
df_clean=df.copy()

## Getting acquainted with the data

In [5]:
df_clean.describe(include='all')

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,lat,lon,county
count,8652.0,8446.0,8555.0,4448.0,5727.0,8652,6823.0,6685.0,8304.0,8632,8652,8649,8305.0,8305.0,8637
unique,,,,,,11,,,,8574,1,1126,,,63
top,,,,,,single_family,,,,378 Ohayo Mountain Rd,NY,New York City,,,Suffolk
freq,,,,,,5360,,,,4,8652,618,,,945
mean,886085.2,3.27883,2.326943,1.809128,3.451545,,227217.7,2213.703665,1954.759634,,,,41.627811,-74.663843,
std,1962067.0,1.699035,1.391906,2.429346,6.443206,,6082248.0,8857.273151,39.266962,,,,1.060644,1.705469,
min,499.0,0.0,0.0,1.0,1.0,,65.0,0.0,1720.0,,,,34.603396,-79.756718,
25%,250000.0,2.0,1.0,1.0,2.0,,5000.0,1226.0,1929.75,,,,40.749807,-74.993601,
50%,500000.0,3.0,2.0,2.0,2.0,,10454.0,1724.0,1958.0,,,,41.011198,-73.952879,
75%,850000.0,4.0,3.0,2.0,3.0,,31428.5,2392.0,1983.0,,,,42.721893,-73.738491,


In [6]:
df_clean.isnull().sum().sum()

12308

In [7]:
df_clean.isnull().sum()

price            0
beds           206
baths           97
garage        4204
stories       2925
house_type       0
lot_sqft      1829
sqft          1967
year_built     348
address         20
state            0
city             3
lat            347
lon            347
county          15
dtype: int64

## Deal with missing data

### 1. Delete missing values:

In [8]:
# Discard listings that have no city or no county, then renumber the rows from 0.
df_clean = df_clean.dropna(subset=['city', 'county'])
df_clean = df_clean.reset_index(drop=True)
df_clean

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,lat,lon,county
0,139900,3.0,2.0,,,single_family,12632.0,1190.0,1920.0,154 Maple Ave,NY,Delanson,42.734360,-74.185005,Schenectady
1,395000,4.0,3.0,2.0,2.0,single_family,30056.0,2987.0,1982.0,1169 Hidden Valley Trl,NY,Webster,43.243062,-77.440707,Monroe
2,185000,4.0,2.0,1.0,1.0,single_family,7501.0,1863.0,1965.0,7869 Oneida Trl,NY,Bridgeport,43.163020,-75.982109,Onondaga
3,440000,4.0,3.0,2.0,2.0,single_family,17860.0,1940.0,1965.0,16 Brookland Farms Rd,NY,Poughkeepsie,41.635654,-73.910101,Dutchess
4,975700,5.0,6.0,3.0,2.0,single_family,25544.0,5660.0,1999.0,7534 Plum Hollow Cir,NY,Liverpool,43.146254,-76.169151,Onondaga
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8630,1049000,6.0,2.0,1.0,,multi_family,2896.0,,1920.0,74-15 88th Ave,NY,Woodhaven,40.689428,-73.867066,Queens
8631,1495000,1.0,1.0,,16.0,coop,,,1925.0,45 5th Ave Apt 17C,NY,New York City,40.734161,-73.994557,New York
8632,1395000,3.0,1.0,,5.0,condos,,,1890.0,705 Carroll St Apt 4R,NY,New York City,40.673940,-73.977606,Kings
8633,4500000,6.0,4.0,,4.0,townhomes,,,,608 3rd St,NY,New York City,40.668649,-73.975270,Kings


In [9]:
df_clean.isnull().sum()

price            0
beds           205
baths           96
garage        4188
stories       2921
house_type       0
lot_sqft      1817
sqft          1960
year_built     347
address         20
state            0
city             0
lat            333
lon            333
county           0
dtype: int64

### 2. Fill NaNs :

In [10]:
# Impute the remaining gaps in one pass. Each numeric feature gets its own
# column median (computed before any filling), missing garage counts become 0,
# and missing addresses get a placeholder string. lat/lon are left untouched.
fill_values = {
    'year_built': df_clean['year_built'].median(),
    'beds': df_clean['beds'].median(),
    'baths': df_clean['baths'].median(),
    'garage': 0,
    'stories': df_clean['stories'].median(),
    'sqft': df_clean['sqft'].median(),
    'lot_sqft': df_clean['lot_sqft'].median(),
    'address': 'Not Specified',
}
df_clean = df_clean.fillna(value=fill_values)

In [11]:
df_clean.isnull().sum()

price           0
beds            0
baths           0
garage          0
stories         0
house_type      0
lot_sqft        0
sqft            0
year_built      0
address         0
state           0
city            0
lat           333
lon           333
county          0
dtype: int64

## Drop the unnecessary columns

In [12]:
# Remove the coordinate columns (they still contain NaNs at this point).
df_clean = df_clean.drop(['lon', 'lat'], axis=1)

## Remove duplicates

In [13]:
# Keep the first occurrence of each fully identical row and renumber from 0.
df_clean = df_clean.drop_duplicates(ignore_index=True)
df_clean

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,139900,3.0,2.0,0.0,2.0,single_family,12632.0,1190.0,1920.0,154 Maple Ave,NY,Delanson,Schenectady
1,395000,4.0,3.0,2.0,2.0,single_family,30056.0,2987.0,1982.0,1169 Hidden Valley Trl,NY,Webster,Monroe
2,185000,4.0,2.0,1.0,1.0,single_family,7501.0,1863.0,1965.0,7869 Oneida Trl,NY,Bridgeport,Onondaga
3,440000,4.0,3.0,2.0,2.0,single_family,17860.0,1940.0,1965.0,16 Brookland Farms Rd,NY,Poughkeepsie,Dutchess
4,975700,5.0,6.0,3.0,2.0,single_family,25544.0,5660.0,1999.0,7534 Plum Hollow Cir,NY,Liverpool,Onondaga
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8622,1049000,6.0,2.0,1.0,2.0,multi_family,2896.0,1725.0,1920.0,74-15 88th Ave,NY,Woodhaven,Queens
8623,1495000,1.0,1.0,0.0,16.0,coop,10454.0,1725.0,1925.0,45 5th Ave Apt 17C,NY,New York City,New York
8624,1395000,3.0,1.0,0.0,5.0,condos,10454.0,1725.0,1890.0,705 Carroll St Apt 4R,NY,New York City,Kings
8625,4500000,6.0,4.0,0.0,4.0,townhomes,10454.0,1725.0,1958.0,608 3rd St,NY,New York City,Kings


## Changing data types

In [14]:
# Cast every numeric feature in a single call: counts, areas and years become
# 64-bit integers; beds and baths are cast to float64.
df_clean = df_clean.astype({
    'year_built': np.int64,
    'beds': np.float64,
    'baths': np.float64,
    'stories': np.int64,
    'lot_sqft': np.int64,
    'sqft': np.int64,
    'garage': np.int64,
})

In [15]:
df_clean.dtypes

price           int64
beds          float64
baths         float64
garage          int64
stories         int64
house_type     object
lot_sqft        int64
sqft            int64
year_built      int64
address        object
state          object
city           object
county         object
dtype: object

## Using pandas describe() to find outliers


In [16]:
df_clean.describe(include='all')

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
count,8627.0,8627.0,8627.0,8627.0,8627.0,8627,8627.0,8627.0,8627.0,8627,8627,8627,8627
unique,,,,,,11,,,,8558,1,1122,62
top,,,,,,single_family,,,,Not Specified,NY,New York City,Suffolk
freq,,,,,,5350,,,,20,8627,609,944
mean,886679.5,3.273328,2.323751,0.93219,2.955141,,181558.9,2097.299177,1954.845369,,,,
std,1964671.0,1.678394,1.382996,1.964717,5.283079,,5409682.0,7780.62562,38.499541,,,,
min,499.0,0.0,0.0,0.0,1.0,,65.0,0.0,1720.0,,,,
25%,249999.0,2.0,1.0,0.0,2.0,,6000.0,1360.0,1930.0,,,,
50%,500000.0,3.0,2.0,1.0,2.0,,10454.0,1725.0,1958.0,,,,
75%,850000.0,4.0,3.0,2.0,2.0,,21780.0,2159.5,1981.0,,,,


## Detecting & handling outliers

### 1. Price outliers : 

In [17]:
df_clean['price'].describe()

count    8.627000e+03
mean     8.866795e+05
std      1.964671e+06
min      4.990000e+02
25%      2.499990e+05
50%      5.000000e+05
75%      8.500000e+05
max      5.800000e+07
Name: price, dtype: float64

In [18]:
z_score = (df_clean['price'] - df_clean['price'].mean())/df_clean['price'].std()

In [19]:
price_outliers = abs(z_score)>3
sum(price_outliers)

98

In [20]:
min(df_clean.price[price_outliers])

6890000

In [21]:
max(df_clean.price[price_outliers])

58000000

In [22]:
df_clean[(df_clean['price'] < 100000) | (df_clean['price'] > 6890000)]

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
89,78000,6.0,3.0,0,2,multi_family,3450,2440,1920,2128 Jerauld Ave,NY,Niagara Falls,Niagara
94,24900,4.0,2.0,0,2,single_family,33933,2662,1860,1009 State Route 48,NY,Fulton,Oswego
97,94900,3.0,2.0,1,2,multi_family,23174,2312,1860,2056 Harwood Dr,NY,Sandy Creek,Oswego
101,89900,3.0,2.0,2,2,single_family,4800,1455,1935,1900 Tracy St,NY,Endicott,Broome
141,34900,6.0,2.0,0,2,multi_family,6961,1962,1900,415 S Roberts Rd,NY,Dunkirk,Chautauqua
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8429,85500,5.0,2.0,1,3,single_family,4251,1486,1911,66 N Gordon St,NY,Gouverneur,St. Lawrence
8513,89900,3.0,2.0,0,3,single_family,3872,1308,1909,23 Dickinson St,NY,Rochester,Monroe
8530,6995000,5.0,6.0,0,20,condo_townhome_rowhome_coop,10454,4400,1989,2373 Broadway Ph 123,NY,New York City,New York
8596,49900,2.0,1.0,0,2,single_family,1307,1056,1920,418 Bissell Ave,NY,Rome,Oneida


### 1.1 Handling price outliers :

In [23]:
# Blank out prices below 100,000 or above 6,890,000 (both bounds themselves are kept).
price_out_of_range = (df_clean['price'] < 100000) | (df_clean['price'] > 6890000)
df_clean['price'] = np.where(price_out_of_range, np.nan, df_clean['price'])
df_clean.isna().sum()

price         531
beds            0
baths           0
garage          0
stories         0
house_type      0
lot_sqft        0
sqft            0
year_built      0
address         0
state           0
city            0
county          0
dtype: int64

In [24]:
# Replace the blanked-out prices with the median of the prices that remain.
median_price = df_clean['price'].median()
df_clean['price'] = df_clean['price'].fillna(median_price)
df_clean.isna().sum()

price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      0
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

### 2. Bed outliers : 

In [25]:
df_clean['beds'].describe()

count    8627.000000
mean        3.273328
std         1.678394
min         0.000000
25%         2.000000
50%         3.000000
75%         4.000000
max        26.000000
Name: beds, dtype: float64

In [26]:
z_score = (df_clean['beds'] - df_clean['beds'].mean())/df_clean['beds'].std()

In [27]:
beds_outliers = abs(z_score)>3
sum(beds_outliers)

87

In [28]:
min(df_clean.beds[beds_outliers])

9.0

In [29]:
max(df_clean.beds[beds_outliers])

26.0

In [30]:
df_clean[df_clean['beds']>13]

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
1554,999000.0,21.0,16.0,0,2,multi_family,3498739,12000,1880,34315 State Route 10,NY,Hamden,Delaware
1800,4250000.0,14.0,7.0,0,4,townhomes,2530,6180,1920,275 6th Ave,NY,New York City,Kings
3261,1800000.0,15.0,9.0,0,2,multi_family,6399,8248,1924,55 Waller Ave,NY,White Plains,Westchester
4010,625000.0,15.0,15.0,0,4,apartment,7480,8556,1965,5 Mather St,NY,Binghamton,Broome
5434,1500000.0,21.0,2.0,0,3,multi_family,9897,11832,1900,174 Alexander St,NY,Rochester,Monroe
5480,799000.0,14.0,8.0,0,2,multi_family,435600,7146,1945,181 Ulster Ave,NY,Ulster Park,Ulster
6027,1575000.0,14.0,6.0,0,2,multi_family,2056,5103,1926,1565 Saint Peters Ave,NY,Bronx,Bronx
6143,6445000.0,26.0,20.0,0,3,multi_family,5000,1725,1926,28-50 37th St,NY,Astoria,Queens
6554,535000.0,20.0,20.0,0,2,apartment,10454,8120,1910,518 E 83rd St,NY,New York,New York
7526,1200000.0,18.0,6.0,0,2,multi_family,2742,1725,1930,1686 Gates Ave,NY,Ridgewood,Queens


### 2.1 Handling bed outliers :

In [31]:
# Blank out bedroom counts above 13; the listings are dropped in the next cell.
too_many_beds = df_clean['beds'] > 13
df_clean['beds'] = df_clean['beds'].mask(too_many_beds)
df_clean.isna().sum()


price          0
beds          11
baths          0
garage         0
stories        0
house_type     0
lot_sqft       0
sqft           0
year_built     0
address        0
state          0
city           0
county         0
dtype: int64

In [32]:
# Keep only the listings whose bedroom count is still present.
has_beds = df_clean['beds'].notna()
df_clean = df_clean[has_beds]
df_clean.isna().sum()

price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      0
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

### 3. Bath outliers : 

In [33]:
df_clean['baths'].describe()

count    8616.000000
mean        2.313370
std         1.336392
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        16.000000
Name: baths, dtype: float64

In [34]:
z_score = (df_clean['baths'] - df_clean['baths'].mean())/df_clean['baths'].std()

In [35]:
baths_outliers = abs(z_score)>3
sum(baths_outliers)

113

In [36]:
min(df_clean.baths[baths_outliers])

7.0

In [37]:
max(df_clean.baths[baths_outliers])


16.0

In [38]:
df_clean[df_clean['baths']>7].reset_index(drop=True)

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,175000.0,3.0,9.0,0,2,mobile,100188,1008,2004,6 Spruce St,NY,Highland,Ulster
1,4200000.0,8.0,9.0,0,2,single_family,91476,9350,2011,215 Roses Grove Rd,NY,Southampton,Suffolk
2,535000.0,6.0,9.0,0,2,single_family,82764,8000,1958,35 Ezekills Holw,NY,Sag Harbor,Suffolk
3,5975000.0,6.0,8.0,0,2,single_family,44867,7036,2022,294 Abrahams Path,NY,East Hampton,Suffolk
4,5495000.0,6.0,10.0,4,2,single_family,84071,1725,2006,34 Parrish Pond Ln,NY,Southampton,Suffolk
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,535000.0,9.0,10.0,0,3,townhomes,10454,8000,1958,457 W 24th St,NY,New York City,New York
60,535000.0,8.0,12.0,3,2,single_family,65340,10315,2009,501 Parsonage Ln,NY,Sagaponack,Suffolk
61,1200000.0,8.0,11.0,0,2,multi_family,436,1725,1888,226 Jay St,NY,Albany,Albany
62,5799000.0,7.0,9.0,3,3,single_family,254390,8898,1860,15 Sarosca Farm Ln,NY,Purchase,Westchester


### 3.1 Handling bath outliers :

In [39]:
# Blank out bathroom counts above 7; the listings are dropped in the next cell.
too_many_baths = df_clean['baths'] > 7
df_clean['baths'] = df_clean['baths'].mask(too_many_baths)
df_clean.isna().sum()

price          0
beds           0
baths         64
garage         0
stories        0
house_type     0
lot_sqft       0
sqft           0
year_built     0
address        0
state          0
city           0
county         0
dtype: int64

In [40]:
# Keep only the listings whose bathroom count is still present.
has_baths = df_clean['baths'].notna()
df_clean = df_clean[has_baths]
df_clean.isna().sum()

price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      0
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

### 4. Garage outliers : 

In [41]:
df_clean['garage'].describe()

count    8552.000000
mean        0.928555
std         1.965238
min         0.000000
25%         0.000000
50%         1.000000
75%         2.000000
max        99.000000
Name: garage, dtype: float64

In [42]:
z_score = (df_clean['garage'] - df_clean['garage'].mean())/df_clean['garage'].std()

In [43]:
garage_outliers = abs(z_score)>3
sum(garage_outliers)

25

In [44]:
min(df_clean.garage[garage_outliers])

7

In [45]:
max(df_clean.garage[garage_outliers])

99

In [46]:
df_clean[df_clean['garage']>=7].reset_index(drop=True)

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,424900.0,4.0,2.0,10,2,single_family,220849,2210,1900,109 Crane St,NY,Charlton,Saratoga
1,525000.0,4.0,2.0,8,1,single_family,56192,1725,1997,91 Church Ln,NY,Middle Island,Suffolk
2,790000.0,4.0,3.0,8,2,single_family,683892,2800,1840,7635 State Highway 357,NY,Franklin,Delaware
3,329000.0,1.0,1.0,77,1,coop,10454,800,1958,6565 Wetherole St Apt 5A,NY,Rego Park,Queens
4,219900.0,3.0,1.0,10,1,single_family,37026,1464,1976,1678 Minsteed Rd,NY,Newark,Wayne
5,2450000.0,5.0,7.0,7,2,single_family,2644963,7374,2004,1723 Nys Route 4,NY,Stillwater,Saratoga
6,500000.0,4.0,3.0,8,2,single_family,35719,2464,1979,32 Revere Rd,NY,Queensbury,Warren
7,524900.0,4.0,3.0,12,2,single_family,334541,2240,2006,675 Taylor Rd,NY,Davenport,Delaware
8,675500.0,5.0,4.0,10,2,single_family,274428,6000,2007,13889 Henskee Rd,NY,Alden,Erie
9,1100000.0,9.0,7.0,8,2,multi_family,435600,5185,1905,22 Bellvale Lakes Rd,NY,Warwick,Orange


### 4.1 Handling garage outliers :

In [47]:
# Blank out garage counts of 7 or more. np.where turns the integer column into
# floats so that it can hold NaN.
garage_is_outlier = df_clean['garage'] >= 7
df_clean['garage'] = np.where(garage_is_outlier, np.nan, df_clean['garage'])
df_clean.isna().sum()

price          0
beds           0
baths          0
garage        25
stories        0
house_type     0
lot_sqft       0
sqft           0
year_built     0
address        0
state          0
city           0
county         0
dtype: int64

In [48]:
# Replace the blanked-out garage counts with the median of the remaining ones.
median_garage = df_clean['garage'].median()
df_clean['garage'] = df_clean['garage'].fillna(median_garage)
df_clean.isna().sum()

price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      0
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

### 5. Sqft outliers : 

In [49]:
df_clean['sqft'].describe()

count      8552.000000
mean       2058.181010
std        7793.401598
min           0.000000
25%        1352.000000
50%        1725.000000
75%        2139.000000
max      642284.000000
Name: sqft, dtype: float64

In [50]:
z_score = (df_clean['sqft'] - df_clean['sqft'].mean())/df_clean['sqft'].std()

In [51]:
sqft_outliers = abs(z_score)>3
sum(sqft_outliers)

7

In [52]:
min(df_clean.sqft[sqft_outliers])

36590

In [53]:
max(df_clean.sqft[sqft_outliers])

642284

In [54]:
df_clean[df_clean['sqft']<320].reset_index(drop=True)

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,3750000.0,2.0,2.0,0.0,2,condos,10454,0,1930,160 Central Park S Units 3101 & 3118,NY,New York,New York
1,3200000.0,6.0,7.0,0.0,2,single_family,10454,0,1899,258 Stuyvesant Ave,NY,Brooklyn,Kings
2,279000.0,1.0,1.0,0.0,2,coop,191664,260,1987,231 Dune Rd Unit 603,NY,Westhampton Beach,Suffolk
3,159000.0,2.0,2.0,0.0,1,single_family,27443,228,1970,1390 Tupper Rd,NY,Long Lake,Hamilton
4,535000.0,1.0,1.0,0.0,2,single_family,4792,286,1930,556 Alyssa Way,NY,Cambridge,Washington
5,169000.0,1.0,1.0,0.0,1,single_family,93654,300,2021,36349 County Route 22,NY,Theresa,Jefferson
6,279000.0,0.0,1.0,0.0,2,coop,10454,300,1958,25 Tudor City Pl Apt 2006,NY,New York City,New York
7,275000.0,0.0,1.0,0.0,2,coop,10454,300,1958,Not Specified,NY,New York City,New York
8,469000.0,1.0,1.0,0.0,2,condo_townhome_rowhome_coop,10454,0,1903,152 E 35th St Apt 2G,NY,New York,New York
9,4000000.0,0.0,2.0,0.0,2,single_family,10454,0,1958,10510 150th St,NY,Queens,Queens


In [55]:
df_clean[df_clean['sqft']>=36590].reset_index(drop=True)

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,1100000.0,2.0,1.0,0.0,2,coop,10454,37782,1930,253 W 16th St # 1B,NY,New York,New York
1,5250000.0,3.0,5.0,0.0,2,coop,10454,170106,1959,900 Fifth Ave Apt 7B,NY,New York,New York
2,3000000.0,2.0,2.0,1.0,2,single_family,91912,80270,1951,514 Centre Island Rd,NY,Centre Island,Nassau
3,650000.0,5.0,4.0,2.0,2,single_family,36590,36590,1993,5 Martha Pl,NY,Port Jefferson Station,Suffolk
4,695000.0,0.0,1.0,0.0,2,condos,10454,226516,1983,347 W 57th St Apt 16E,NY,New York,New York
5,1395000.0,1.0,1.0,0.0,2,coop,10454,114445,1899,60 Pineapple St Apt 5H,NY,Brooklyn,Kings
6,535000.0,0.0,1.0,0.0,2,coop,10454,642284,1961,205 W End Ave Apt 16R,NY,New York,New York


### 5.1 Handling sqft outliers :

In [56]:
# Blank out living areas that are implausibly large (>= 36,590 sqft) or
# implausibly small (<= 300 sqft) in a single pass.
sqft_is_outlier = (df_clean['sqft'] >= 36590) | (df_clean['sqft'] <= 300)
df_clean['sqft'] = np.where(sqft_is_outlier, np.nan, df_clean['sqft'])
df_clean.isna().sum()

price          0
beds           0
baths          0
garage         0
stories        0
house_type     0
lot_sqft       0
sqft          18
year_built     0
address        0
state          0
city           0
county         0
dtype: int64

In [57]:
# Replace the blanked-out living areas with the median of the remaining ones.
median_sqft = df_clean['sqft'].median()
df_clean['sqft'] = df_clean['sqft'].fillna(median_sqft)
df_clean.isna().sum()

price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      0
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

### 6. Lot Sqft outliers :

### 6.1 Removing rows where "Lot Sqft" is smaller than or equal to "Sqft" :

In [58]:
df_clean

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,139900.0,3.0,2.0,0.0,2,single_family,12632,1190.0,1920,154 Maple Ave,NY,Delanson,Schenectady
1,395000.0,4.0,3.0,2.0,2,single_family,30056,2987.0,1982,1169 Hidden Valley Trl,NY,Webster,Monroe
2,185000.0,4.0,2.0,1.0,1,single_family,7501,1863.0,1965,7869 Oneida Trl,NY,Bridgeport,Onondaga
3,440000.0,4.0,3.0,2.0,2,single_family,17860,1940.0,1965,16 Brookland Farms Rd,NY,Poughkeepsie,Dutchess
4,975700.0,5.0,6.0,3.0,2,single_family,25544,5660.0,1999,7534 Plum Hollow Cir,NY,Liverpool,Onondaga
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8622,1049000.0,6.0,2.0,1.0,2,multi_family,2896,1725.0,1920,74-15 88th Ave,NY,Woodhaven,Queens
8623,1495000.0,1.0,1.0,0.0,16,coop,10454,1725.0,1925,45 5th Ave Apt 17C,NY,New York City,New York
8624,1395000.0,3.0,1.0,0.0,5,condos,10454,1725.0,1890,705 Carroll St Apt 4R,NY,New York City,Kings
8625,4500000.0,6.0,4.0,0.0,4,townhomes,10454,1725.0,1958,608 3rd St,NY,New York City,Kings


In [59]:

# Remove listings whose lot area is not strictly larger than the living area.
# A vectorized boolean mask replaces the original row-by-row loop, which called
# drop(inplace=True) on df_clean while iterrows() was still iterating over it
# and paid for a full-frame drop on every matching row.
lot_not_larger = df_clean['lot_sqft'] <= df_clean['sqft']

# Number of listings removed (printed exactly as before).
count = int(lot_not_larger.sum())

print(count)
print()

# Keep the consistent rows only and renumber them from 0.
df_clean = df_clean.loc[~lot_not_larger].reset_index(drop=True)
df_clean

421



Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,139900.0,3.0,2.0,0.0,2,single_family,12632,1190.0,1920,154 Maple Ave,NY,Delanson,Schenectady
1,395000.0,4.0,3.0,2.0,2,single_family,30056,2987.0,1982,1169 Hidden Valley Trl,NY,Webster,Monroe
2,185000.0,4.0,2.0,1.0,1,single_family,7501,1863.0,1965,7869 Oneida Trl,NY,Bridgeport,Onondaga
3,440000.0,4.0,3.0,2.0,2,single_family,17860,1940.0,1965,16 Brookland Farms Rd,NY,Poughkeepsie,Dutchess
4,975700.0,5.0,6.0,3.0,2,single_family,25544,5660.0,1999,7534 Plum Hollow Cir,NY,Liverpool,Onondaga
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8126,1049000.0,6.0,2.0,1.0,2,multi_family,2896,1725.0,1920,74-15 88th Ave,NY,Woodhaven,Queens
8127,1495000.0,1.0,1.0,0.0,16,coop,10454,1725.0,1925,45 5th Ave Apt 17C,NY,New York City,New York
8128,1395000.0,3.0,1.0,0.0,5,condos,10454,1725.0,1890,705 Carroll St Apt 4R,NY,New York City,Kings
8129,4500000.0,6.0,4.0,0.0,4,townhomes,10454,1725.0,1958,608 3rd St,NY,New York City,Kings


### 6.2 The Outliers : 

In [60]:
df_clean['lot_sqft'].describe()

count    8.131000e+03
mean     1.905612e+05
std      5.571666e+06
min      4.360000e+02
25%      6.970000e+03
50%      1.045400e+04
75%      2.221600e+04
max      4.355564e+08
Name: lot_sqft, dtype: float64

In [61]:
z_score = (df_clean['lot_sqft'] - df_clean['lot_sqft'].mean())/df_clean['lot_sqft'].std()

In [62]:
lot_sqft_outliers = abs(z_score)>3
sum(lot_sqft_outliers)

6

In [63]:
min(df_clean.lot_sqft[lot_sqft_outliers])

18360540

In [64]:
max(df_clean.lot_sqft[lot_sqft_outliers])

435556440

In [65]:
df_clean[df_clean['lot_sqft']>=18360540].reset_index(drop=True)

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,2300000.0,3.0,4.0,2.0,2,single_family,18360540,5312.0,2008,1435 Webster St,NY,Malone,Franklin
1,165000.0,5.0,2.0,2.0,2,multi_family,213444000,1725.0,1900,613 Orchard St,NY,Schenectady,Schenectady
2,999000.0,5.0,4.0,2.0,2,multi_family,435556440,2500.0,2011,260 Zerega Ave,NY,Bronx,Bronx
3,119000.0,3.0,2.0,0.0,1,mobile,18992160,1552.0,1986,1 Apple Grv,NY,New Hartford,Oneida
4,535000.0,6.0,3.0,0.0,2,multi_family,113865840,2168.0,1890,329 9th St,NY,Troy,Rensselaer
5,535000.0,2.0,1.0,1.0,1,mobile,39639600,910.0,1974,20 Quarry Hill Est,NY,Akron,Erie


### 6.2.1 Handling Lot Sqft outliers 

In [66]:
# Blank out lot areas of 18,360,540 sqft or more; the listings are dropped in
# the next cell. np.where turns the integer column into floats to hold NaN.
lot_is_outlier = df_clean['lot_sqft'] >= 18360540
df_clean['lot_sqft'] = np.where(lot_is_outlier, np.nan, df_clean['lot_sqft'])
df_clean.isna().sum()

price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      6
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

In [67]:
# Keep only the listings whose lot area is still present.
has_lot_sqft = df_clean['lot_sqft'].notna()
df_clean = df_clean[has_lot_sqft]
df_clean.isna().sum()


price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      0
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

In [68]:
df_clean

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,139900.0,3.0,2.0,0.0,2,single_family,12632.0,1190.0,1920,154 Maple Ave,NY,Delanson,Schenectady
1,395000.0,4.0,3.0,2.0,2,single_family,30056.0,2987.0,1982,1169 Hidden Valley Trl,NY,Webster,Monroe
2,185000.0,4.0,2.0,1.0,1,single_family,7501.0,1863.0,1965,7869 Oneida Trl,NY,Bridgeport,Onondaga
3,440000.0,4.0,3.0,2.0,2,single_family,17860.0,1940.0,1965,16 Brookland Farms Rd,NY,Poughkeepsie,Dutchess
4,975700.0,5.0,6.0,3.0,2,single_family,25544.0,5660.0,1999,7534 Plum Hollow Cir,NY,Liverpool,Onondaga
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8126,1049000.0,6.0,2.0,1.0,2,multi_family,2896.0,1725.0,1920,74-15 88th Ave,NY,Woodhaven,Queens
8127,1495000.0,1.0,1.0,0.0,16,coop,10454.0,1725.0,1925,45 5th Ave Apt 17C,NY,New York City,New York
8128,1395000.0,3.0,1.0,0.0,5,condos,10454.0,1725.0,1890,705 Carroll St Apt 4R,NY,New York City,Kings
8129,4500000.0,6.0,4.0,0.0,4,townhomes,10454.0,1725.0,1958,608 3rd St,NY,New York City,Kings


### 7. Stories outliers :

In [69]:
df_clean['stories'].describe()

count    8125.000000
mean        2.992000
std         5.423095
min         1.000000
25%         2.000000
50%         2.000000
75%         2.000000
max        96.000000
Name: stories, dtype: float64

In [70]:
z_score = (df_clean['stories'] - df_clean['stories'].mean())/df_clean['stories'].std()

In [71]:
stories_outliers = abs(z_score)>3
sum(stories_outliers)

182

In [72]:
min(df_clean.stories[stories_outliers])

20

In [73]:
max(df_clean.stories[stories_outliers])

96

In [74]:
df_clean[df_clean['stories']>=20].reset_index(drop=True)

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,1049000.0,1.0,1.0,0.0,41,condos,10454.0,783.0,1999,2 Columbus Ave Apt 2D,NY,New York City,New York
1,500000.0,0.0,1.0,0.0,21,coop,10454.0,509.0,1963,225 E 57th St Apt 4R,NY,New York City,New York
2,550000.0,1.0,1.0,0.0,30,coop,10454.0,550.0,1964,140 W End Ave Apt 22E,NY,New York City,Queens
3,1499900.0,1.0,1.0,0.0,20,coop,10454.0,800.0,1962,175 W 13th St Apt 16C,NY,New York City,New York
4,235000.0,1.0,1.0,0.0,22,coop,25125.0,1725.0,1963,100 W 57th St Unit 12R,NY,New York City,New York
...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,2375000.0,2.0,3.0,0.0,40,condos,4457.0,1677.0,2018,30 E 31st St Unit 15,NY,New York City,New York
178,535000.0,5.0,6.0,0.0,20,condo_townhome_rowhome_coop,10454.0,4400.0,1989,2373 Broadway Ph 123,NY,New York City,New York
179,3811500.0,2.0,3.0,0.0,36,condos,10454.0,2380.0,2021,1059 Third Ave Unit 20A,NY,New York,New York
180,1299000.0,0.0,1.0,0.0,25,coop,10454.0,789.0,1965,20 E 9th St Unit 7G,NY,New York City,New York


### 7.1 Handling stories outliers :

In [75]:
# Blank out story counts of 20 or more; the listings are dropped in the next
# cell. np.where turns the integer column into floats to hold NaN.
stories_is_outlier = df_clean['stories'] >= 20
df_clean['stories'] = np.where(stories_is_outlier, np.nan, df_clean['stories'])
df_clean.isna().sum()

price           0
beds            0
baths           0
garage          0
stories       182
house_type      0
lot_sqft        0
sqft            0
year_built      0
address         0
state           0
city            0
county          0
dtype: int64

In [76]:
# Keep only the listings whose story count is still present.
has_stories = df_clean['stories'].notna()
df_clean = df_clean[has_stories]
df_clean.isna().sum()

price         0
beds          0
baths         0
garage        0
stories       0
house_type    0
lot_sqft      0
sqft          0
year_built    0
address       0
state         0
city          0
county        0
dtype: int64

In [77]:
# Renumber the rows from 0 after the preceding row drops; drop=True discards
# the old index labels instead of keeping them as a column.
df_clean = df_clean.reset_index(drop=True)
df_clean

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,139900.0,3.0,2.0,0.0,2.0,single_family,12632.0,1190.0,1920,154 Maple Ave,NY,Delanson,Schenectady
1,395000.0,4.0,3.0,2.0,2.0,single_family,30056.0,2987.0,1982,1169 Hidden Valley Trl,NY,Webster,Monroe
2,185000.0,4.0,2.0,1.0,1.0,single_family,7501.0,1863.0,1965,7869 Oneida Trl,NY,Bridgeport,Onondaga
3,440000.0,4.0,3.0,2.0,2.0,single_family,17860.0,1940.0,1965,16 Brookland Farms Rd,NY,Poughkeepsie,Dutchess
4,975700.0,5.0,6.0,3.0,2.0,single_family,25544.0,5660.0,1999,7534 Plum Hollow Cir,NY,Liverpool,Onondaga
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7938,1049000.0,6.0,2.0,1.0,2.0,multi_family,2896.0,1725.0,1920,74-15 88th Ave,NY,Woodhaven,Queens
7939,1495000.0,1.0,1.0,0.0,16.0,coop,10454.0,1725.0,1925,45 5th Ave Apt 17C,NY,New York City,New York
7940,1395000.0,3.0,1.0,0.0,5.0,condos,10454.0,1725.0,1890,705 Carroll St Apt 4R,NY,New York City,Kings
7941,4500000.0,6.0,4.0,0.0,4.0,townhomes,10454.0,1725.0,1958,608 3rd St,NY,New York City,Kings


## Counting categorical features

In [78]:
df_clean['house_type'].value_counts()

single_family                  5250
coop                            885
multi_family                    717
condos                          651
townhomes                       198
land                            112
mobile                          103
condo_townhome_rowhome_coop      19
apartment                         4
condop                            2
farm                              2
Name: house_type, dtype: int64

In [79]:
# Persist the cleaned listings. No `index` argument is passed, so pandas also
# writes the row index as an unnamed first column of the CSV.
df_clean.to_csv('RealEstateNewYork_Clean.csv')