# Importing packages and datasets.

In [1]:
import pandas
import numpy
import pickle

import altair

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error


In [2]:
dataset_file_path = "../data/processed/"


In [3]:
dataFrame = pandas.read_csv(dataset_file_path + "cleaned.csv")
print(dataFrame.shape)


(2895, 30)


In [None]:
normalisedDataFrame = pandas.read_csv(dataset_file_path + "normalised.csv")
print(normalisedDataFrame.shape)


(2895, 30)


In [4]:
print(dataFrame.columns)


Index(['avgAnnCount', 'avgDeathsPerYear', 'TARGET_deathRate', 'incidenceRate',
       'medIncome', 'popEst2015', 'povertyPercent', 'studyPerCap', 'MedianAge',
       'MedianAgeMale', 'MedianAgeFemale', 'AvgHouseholdSize',
       'PercentMarried', 'PctNoHS18_24', 'PctHS18_24', 'PctBachDeg18_24',
       'PctHS25_Over', 'PctBachDeg25_Over', 'PctEmployed16_Over',
       'PctUnemployed16_Over', 'PctPrivateCoverage', 'PctEmpPrivCoverage',
       'PctPublicCoverage', 'PctPublicCoverageAlone', 'PctWhite', 'PctBlack',
       'PctAsian', 'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate'],
      dtype='object')


In [5]:
possibly_related_fields = [
                           "medIncome", "povertyPercent", "PctUnemployed16_Over", "PctEmployed16_Over",
                           "PctHS25_Over", "PctBachDeg25_Over",
                           "PctPrivateCoverage", "PctEmpPrivCoverage", "PctPublicCoverage", "PctPublicCoverageAlone",
                           "PctMarriedHouseholds", "PercentMarried"
                           ]


In [6]:
X = dataFrame[possibly_related_fields + ["TARGET_deathRate"]].copy()
y = X.pop("TARGET_deathRate")

train, test = train_test_split(dataFrame, test_size=0.1727, random_state=42)
df_train, df_test = train_test_split(X, test_size=0.1727, random_state=42)
y_train, y_test = train_test_split(y, test_size=0.1727, random_state=42)

print(df_train.shape, "\t", df_test.shape)
print(y_train.shape, "\t", y_test.shape)


(2395, 12) 	 (500, 12)
(2395,) 	 (500,)


In [None]:
X = normalisedDataFrame[possibly_related_fields + ["TARGET_deathRate"]].copy()
y = X.pop("TARGET_deathRate")

train, test = train_test_split(dataFrame, test_size=0.1727, random_state=42)
df_train, df_test = train_test_split(X, test_size=0.1727, random_state=42)
y_train, y_test = train_test_split(y, test_size=0.1727, random_state=42)

print(df_train.shape, "\t", df_test.shape)
print(y_train.shape, "\t", y_test.shape)


(2395, 12) 	 (500, 12)
(2395,) 	 (500,)


# Baseline Performance.

In [7]:
y_mean = y.mean()

y_base = numpy.full((y_test.shape[0], ), y_mean)
(y_base - y_test).sum()

mean_squared_error(y_test, y_base, squared=True)


712.9935696334397

# Training.

In [8]:
dtRegressor = DecisionTreeRegressor(
		criterion="squared_error", 		#“squared_error”, “friedman_mse”, “absolute_error”, “poisson”
		splitter="random",				#“best”, “random”
		max_depth=10,					#int, default=None
		min_samples_split=32,			#int or float, default=2
		min_samples_leaf=100,			#int or float, default=1
		min_weight_fraction_leaf=0.0,	#float, default=0.0
		max_features=None,				#int, float or {“auto”, “sqrt”, “log2”}, default=None
		random_state=None,				#int, RandomState instance or None, default=None
		max_leaf_nodes=None,			#int, default=None
		min_impurity_decrease=0.0,		#float, default=0.0
		ccp_alpha=0.0					#non-negative float, default=0.0
)	

regression = dtRegressor.fit(df_train, y_train)


In [None]:
model_file_path = "../models/"
model_file_name = "decision_tree.pickle"


In [11]:
pickle.dump(dtRegressor, open(model_file_path + model_file_name, "wb"))


In [12]:
dtRegressor = pickle.load(open(model_file_path + model_file_name, "rb"))


# Testing.

In [13]:
def results(test_type):

	mse = 0.0
	prediction_line = altair.Chart()
	actual_line = altair.Chart()

	if test_type == "test":
		y_preds = regression.predict(df_test)
		prediction_line = altair.Chart(pandas.DataFrame({'x': y_test, 'y': y_preds})).mark_line().encode(
      		x='x',
      		y='y',
			color='blue'
    	)
		actual_line = altair.Chart(pandas.DataFrame({'x': y_test, 'y': y_test})).mark_line().encode(
      		x='x',
      		y='y',
			color='orange'
    	)
		mse = mean_squared_error(y_test, y_preds, squared=True)

	elif test_type == "train":
		y_preds = regression.predict(df_train)
		prediction_line = altair.Chart(pandas.DataFrame({'x': y_train, 'y': y_preds})).mark_line().encode(
      		x='x',
      		y='y',
			color='blue'
    	)
		actual_line = altair.Chart(pandas.DataFrame({'x': y_train, 'y': y_train})).mark_line().encode(
      		x='x',
      		y='y',
			  color='orange'
    	)
		mse = mean_squared_error(y_train, y_preds, squared=True)

	return (actual_line, prediction_line, mse)


In [15]:
actual_line, prediction_line, mse = results("train")
print(mse)


561.5508819766216


In [16]:
actual_line, prediction_line, mse = results("test")
print(mse)


461.3304108150435
