In [1]:
# DC Properties
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing

In [2]:
# Load the raw DC property records into `df`.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk; chunked inference is what produces
# "DtypeWarning: Columns (...) have mixed types" and leaves such columns
# holding a mix of Python types.
# NOTE(review): the truncated output under this cell looks like the tail of
# such a warning -- confirm, and consider an explicit dtype= mapping for the
# affected columns once they are identified.
location = 'datasets/DC_Properties.csv'
df = pd.read_csv(location, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# List every column of the freshly loaded frame, before any pruning.
df.columns

Index(['Unnamed: 0', 'BATHRM', 'HF_BATHRM', 'HEAT', 'AC', 'NUM_UNITS', 'ROOMS',
       'BEDRM', 'AYB', 'YR_RMDL', 'EYB', 'STORIES', 'SALEDATE', 'PRICE',
       'QUALIFIED', 'SALE_NUM', 'GBA', 'BLDG_NUM', 'STYLE', 'STRUCT', 'GRADE',
       'CNDTN', 'EXTWALL', 'ROOF', 'INTWALL', 'KITCHENS', 'FIREPLACES',
       'USECODE', 'LANDAREA', 'GIS_LAST_MOD_DTTM', 'SOURCE', 'CMPLX_NUM',
       'LIVING_GBA', 'FULLADDRESS', 'CITY', 'STATE', 'ZIPCODE', 'NATIONALGRID',
       'LATITUDE', 'LONGITUDE', 'ASSESSMENT_NBHD', 'ASSESSMENT_SUBNBHD',
       'CENSUS_TRACT', 'CENSUS_BLOCK', 'WARD', 'SQUARE', 'X', 'Y', 'QUADRANT'],
      dtype='object')

In [5]:
# Per-column count of missing values in the raw frame; used to decide which
# columns are too sparse to keep.
df.isna().sum()

Unnamed: 0                 0
BATHRM                     0
HF_BATHRM                  0
HEAT                       0
AC                         0
NUM_UNITS              52261
ROOMS                      0
BEDRM                      0
AYB                      271
YR_RMDL                78029
EYB                        0
STORIES                52305
SALEDATE               26770
PRICE                  60741
QUALIFIED                  0
SALE_NUM                   0
GBA                    52261
BLDG_NUM                   0
STYLE                  52261
STRUCT                 52261
GRADE                  52261
CNDTN                  52261
EXTWALL                52261
ROOF                   52261
INTWALL                52261
KITCHENS               52262
FIREPLACES                 0
USECODE                    0
LANDAREA                   0
GIS_LAST_MOD_DTTM          0
SOURCE                     0
CMPLX_NUM             106696
LIVING_GBA            106696
FULLADDRESS            52917
CITY          

In [7]:
# Remove columns that will not be used further. Several of them are missing in
# tens of thousands of rows according to the null counts displayed earlier
# (e.g. CMPLX_NUM, LIVING_GBA, YR_RMDL, the building-description fields).
# NOTE(review): the null counts for CITY, STATE, NATIONALGRID, X, Y etc. are
# cut off in the saved output, so the reason for dropping those is not
# visible here -- presumably redundant location fields; confirm.
df.drop(
    columns=[
        # condo / address fields
        'CMPLX_NUM',
        'LIVING_GBA',
        'FULLADDRESS',
        'CITY',
        'STATE',
        'ASSESSMENT_SUBNBHD',
        'CENSUS_BLOCK',
        # building-description fields
        'NUM_UNITS',
        'YR_RMDL',
        'STYLE',
        'STRUCT',
        'GRADE',
        'CNDTN',
        'EXTWALL',
        'ROOF',
        'INTWALL',
        'KITCHENS',
        # alternative coordinate fields
        'NATIONALGRID',
        'X',
        'Y',
    ],
    inplace=True,
)

In [8]:
# Confirm which columns remain after the column drop.
df.columns

Index(['Unnamed: 0', 'BATHRM', 'HF_BATHRM', 'HEAT', 'AC', 'ROOMS', 'BEDRM',
       'AYB', 'EYB', 'STORIES', 'SALEDATE', 'PRICE', 'QUALIFIED', 'SALE_NUM',
       'GBA', 'BLDG_NUM', 'FIREPLACES', 'USECODE', 'LANDAREA',
       'GIS_LAST_MOD_DTTM', 'SOURCE', 'ZIPCODE', 'LATITUDE', 'LONGITUDE',
       'ASSESSMENT_NBHD', 'CENSUS_TRACT', 'WARD', 'SQUARE', 'QUADRANT'],
      dtype='object')

In [9]:
# Recount missing values on the reduced column set to see what still needs
# handling.
df.isna().sum()

Unnamed: 0               0
BATHRM                   0
HF_BATHRM                0
HEAT                     0
AC                       0
ROOMS                    0
BEDRM                    0
AYB                    271
EYB                      0
STORIES              52305
SALEDATE             26770
PRICE                60741
QUALIFIED                0
SALE_NUM                 0
GBA                  52261
BLDG_NUM                 0
FIREPLACES               0
USECODE                  0
LANDAREA                 0
GIS_LAST_MOD_DTTM        0
SOURCE                   0
ZIPCODE                  1
LATITUDE                 1
LONGITUDE                1
ASSESSMENT_NBHD          1
CENSUS_TRACT             1
WARD                     1
SQUARE                   0
QUADRANT               237
dtype: int64

In [10]:
# GBA and STORIES are each missing in roughly 52k rows (see the null counts
# displayed just before this cell), so drop the columns rather than lose
# those rows in the row-wise dropna that follows.
df.drop(columns=['GBA', 'STORIES'], inplace=True)

In [13]:
# Discard every row that still has a missing value in any remaining column
# (axis=0 / how='any' are the defaults, spelled out for the reader).
df = df.dropna(axis=0, how='any')

In [14]:
# Sanity check: every column should now report zero missing values.
df.isna().sum()

Unnamed: 0           0
BATHRM               0
HF_BATHRM            0
HEAT                 0
AC                   0
ROOMS                0
BEDRM                0
AYB                  0
EYB                  0
SALEDATE             0
PRICE                0
QUALIFIED            0
SALE_NUM             0
BLDG_NUM             0
FIREPLACES           0
USECODE              0
LANDAREA             0
GIS_LAST_MOD_DTTM    0
SOURCE               0
ZIPCODE              0
LATITUDE             0
LONGITUDE            0
ASSESSMENT_NBHD      0
CENSUS_TRACT         0
WARD                 0
SQUARE               0
QUADRANT             0
dtype: int64

In [15]:
# Inspect column dtypes after cleaning. SALEDATE is still `object` at this
# point; it is converted with pd.to_datetime in a later cell.
df.dtypes

Unnamed: 0             int64
BATHRM                 int64
HF_BATHRM              int64
HEAT                  object
AC                    object
ROOMS                  int64
BEDRM                  int64
AYB                  float64
EYB                    int64
SALEDATE              object
PRICE                float64
QUALIFIED             object
SALE_NUM               int64
BLDG_NUM               int64
FIREPLACES             int64
USECODE                int64
LANDAREA               int64
GIS_LAST_MOD_DTTM     object
SOURCE                object
ZIPCODE              float64
LATITUDE             float64
LONGITUDE            float64
ASSESSMENT_NBHD       object
CENSUS_TRACT         float64
WARD                  object
SQUARE                object
QUADRANT              object
dtype: object

In [25]:
# Write the current frame to disk without the index (the header row is
# written by default).
# NOTE(review): this cell's execution count (25) is higher than those of the
# SALEDATE filter (19) and PRICE outlier filter (24) cells that come after it
# in the notebook, so in the recorded session the file was presumably written
# from the filtered data, whereas a top-to-bottom "Run All" would write it
# before those filters run. Move this cell below them if the filtered data is
# what should be saved.
df.to_csv('DC_Dataset.csv', index=False)

In [19]:
# Restrict the data to sales dated after 1 January 1992.
year_cutoff = pd.to_datetime('1992-Jan-1')

# SALEDATE was loaded as object dtype; parse it so it can be compared with the
# cutoff timestamp.
df['SALEDATE'] = pd.to_datetime(df['SALEDATE'])

# Rows dated on or before the cutoff are removed from `df` in place.
on_or_before_cutoff = df['SALEDATE'] <= year_cutoff
df.drop(index=df.loc[on_or_before_cutoff].index, inplace=True)

In [24]:
# Trim price outliers: keep only rows whose PRICE lies within
# mean +/- 1.96 sample standard deviations (bounds inclusive).
# NOTE(review): 1.96 is presumably the two-sided 95% normal z-value; sale
# prices are not necessarily normally distributed -- confirm this is intended.
price_col = df['PRICE']
meanPrice, stdPrice = price_col.mean(), price_col.std()
toprange = meanPrice + 1.96 * stdPrice
botrange = meanPrice - 1.96 * stdPrice

# Both bounds are computed once, before any row is removed, so a single
# combined mask removes the same rows as two successive drops would.
outside_band = (price_col > toprange) | (price_col < botrange)
df = df.drop(index=price_col[outside_band].index)

In [27]:
# Frequency of each ROOMS value, most common first. The displayed counts
# include rows with 0 rooms and a long tail up to 30, which may deserve a look.
df['ROOMS'].value_counts()

6     19903
4     15813
3     13701
7     11740
5     10555
8      9173
9      4185
2      3822
10     3578
12     1784
11     1331
16      794
13      417
14      313
15      169
20       82
18       66
1        63
0        52
17       33
19       16
21        8
24        7
23        5
22        4
28        2
25        2
30        1
Name: ROOMS, dtype: int64