# terminal execution

In [47]:
! pip3 install pyarrow

Collecting pyarrow
  Downloading pyarrow-2.0.0-cp38-cp38-manylinux2014_x86_64.whl (17.8 MB)
[K     |████████████████████████████████| 17.8 MB 655 kB/s eta 0:00:01    |███████▎                        | 4.0 MB 4.5 MB/s eta 0:00:04
Installing collected packages: pyarrow
Successfully installed pyarrow-2.0.0


# user input

In [1]:
basePath = '/home/bobby/'
codePath = 'ml/src'
dataPath = 'dat'

projectName = 'melbourne_housing_snapshot'
rawInputPath = 'melb_data.csv'

# dependencies

In [2]:
import os
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

import datetime

# initialization

In [3]:
### Initialization
BASE_DIRECTORY = os.path.dirname(basePath)
CODE_DIRECTORY = os.path.join(BASE_DIRECTORY, codePath)
DATA_DIRECTORY = os.path.join(BASE_DIRECTORY, dataPath)

projectDir = os.path.join(DATA_DIRECTORY, projectName)
inDir = os.path.join(projectDir, 'in')
outDir = os.path.join(projectDir, 'out')
rawInput = os.path.join(inDir, rawInputPath)

y_eda_outfile = os.path.join(outDir, 'y_eda.feather')
y_preprocessed_outfile = os.path.join(outDir, 'y_preprocessed.feather')
y_featureengineered_outfile = os.path.join(outDir, 'y_featureengineered.feather')
y_modelled_outfile = os.path.join(outDir, 'y_modelled.feather')
y_resulted_outfile = os.path.join(outDir, 'y_resulted.feather')
y_test_outfile = os.path.join(outDir, 'y_test.feather')


x_eda_outfile = os.path.join(outDir, 'x_eda.feather')
x_preprocessed_outfile = os.path.join(outDir, 'x_preprocessed.feather')
x_featureengineered_outfile = os.path.join(outDir, 'x_featureengineered.feather')
x_modelled_outfile = os.path.join(outDir, 'x_modelled.feather')
x_resulted_outfile = os.path.join(outDir, 'x_resulted.feather')

x_test_outfile = os.path.join(outDir, 'x_test.feather')


# custom classes and functions

In [4]:
### Custom classes and functions


# data input

In [5]:
rawDataframe = pd.read_csv(rawInput)

x = rawDataframe.drop('Price', axis=1)
y = rawDataframe.Price

train_x, test_x, train_y, test_y = train_test_split(x, y, random_state = 0)
test_x.reset_index().to_feather(x_test_outfile)
test_y.to_frame().reset_index().to_feather(y_test_outfile)

# exploratory data analysis

In [7]:
rawDataframe.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [8]:
rawDataframe.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [9]:
y_eda = rawDataframe.Price
x_eda = rawDataframe.drop('Price', axis=1)
y_eda.to_frame().to_feather(y_eda_outfile)
x_eda.to_feather(x_eda_outfile)

# pre-processing

In [10]:
y_preprocessed = pd.read_feather(y_eda_outfile)
x_preprocessed = pd.read_feather(x_eda_outfile)

In [11]:
y_preprocessed.to_feather(y_preprocessed_outfile)
x_preprocessed.to_feather(x_preprocessed_outfile)

# feature engineering

In [12]:
y_featureengineered = pd.read_feather(y_preprocessed_outfile)
x_featureengineered = pd.read_feather(x_preprocessed_outfile)

In [13]:
y_featureengineered.to_feather(y_featureengineered_outfile)
x_featureengineered.to_feather(x_featureengineered_outfile)

# modeling

In [14]:
y_modelled = pd.read_feather(y_featureengineered_outfile)
x_modelled = pd.read_feather(x_featureengineered_outfile)

In [15]:
dtr = DecisionTreeRegressor(random_state=1)

dtr.fit(x, y)

ValueError: could not convert string to float: 'Abbotsford'

# validation

In [None]:
### Testing


# comparison with others

# double check

# results

In [None]:
### Results
