# Importing packages and datasets.

In [1]:
import pandas
import numpy

import altair

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error


In [None]:
from google.colab import drive
drive.mount('/content/drive')
base_file_path = "/content/drive/MyDrive/"


In [2]:
base_file_path = "../data/processed/"


In [3]:
dataFrame = pandas.read_csv(base_file_path + "cleaned.csv")
print(dataFrame.shape)


(2895, 30)


In [None]:
normalisedDataFrame = pandas.read_csv(base_file_path + "normalised.csv")
print(normalisedDataFrame.shape)


(2895, 30)


In [4]:
print(dataFrame.columns)


Index(['avgAnnCount', 'avgDeathsPerYear', 'TARGET_deathRate', 'incidenceRate',
       'medIncome', 'popEst2015', 'povertyPercent', 'studyPerCap', 'MedianAge',
       'MedianAgeMale', 'MedianAgeFemale', 'AvgHouseholdSize',
       'PercentMarried', 'PctNoHS18_24', 'PctHS18_24', 'PctBachDeg18_24',
       'PctHS25_Over', 'PctBachDeg25_Over', 'PctEmployed16_Over',
       'PctUnemployed16_Over', 'PctPrivateCoverage', 'PctEmpPrivCoverage',
       'PctPublicCoverage', 'PctPublicCoverageAlone', 'PctWhite', 'PctBlack',
       'PctAsian', 'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate'],
      dtype='object')


In [5]:
possibly_related_fields = [
                           "medIncome", "povertyPercent", "PctUnemployed16_Over", "PctEmployed16_Over",
                           "PctHS25_Over", "PctBachDeg25_Over",
                           "PctPrivateCoverage", "PctEmpPrivCoverage", "PctPublicCoverage", "PctPublicCoverageAlone",
                           "PctMarriedHouseholds", "PercentMarried"
                           ]


In [6]:
X = dataFrame[possibly_related_fields + ["TARGET_deathRate"]].copy()
y = X.pop("TARGET_deathRate")

train, test = train_test_split(dataFrame, test_size=0.1727, random_state=42)
df_train, df_test = train_test_split(X, test_size=0.1727, random_state=42)
y_train, y_test = train_test_split(y, test_size=0.1727, random_state=42)

print(df_train.shape, "\t", df_test.shape)
print(y_train.shape, "\t", y_test.shape)


(2395, 12) 	 (500, 12)
(2395,) 	 (500,)


In [None]:
X = normalisedDataFrame[possibly_related_fields + ["TARGET_deathRate"]].copy()
y = X.pop("TARGET_deathRate")

train, test = train_test_split(dataFrame, test_size=0.1727, random_state=42)
df_train, df_test = train_test_split(X, test_size=0.1727, random_state=42)
y_train, y_test = train_test_split(y, test_size=0.1727, random_state=42)

print(df_train.shape, "\t", df_test.shape)
print(y_train.shape, "\t", y_test.shape)


(2395, 12) 	 (500, 12)
(2395,) 	 (500,)


# Baseline Performance.

In [7]:
y_mean = y.mean()

y_base = numpy.full((y_test.shape[0], ), y_mean)
(y_base - y_test).sum()

mean_squared_error(y_test, y_base, squared=True)


712.9935696334397

# Training.

In [None]:
nnRegressor = neuralNetwork = MLPRegressor(
		hidden_layer_sizes=[128], 				#array-like of shape(n_layers - 2,), default=(100,)
		batch_size="auto", 						#int, default=’auto’
		activation="relu", 						#{’identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’
		solver="adam", 							#{‘lbfgs’, ‘sgd’, ‘adam’}, default=’adam’
		alpha=0.0001,							#float, default=0.0001
		learning_rate ="constant", 				#{‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’
		learning_rate_init = 0.001,				#float, default=0.001
		power_t=0.5,							#float, default=0.5
		max_iter=200,							#int, default=200
		shuffle=False,							#bool, default=True, used when solver=’sgd’ or ‘adam’
		random_state=1,
		tol=1e-4,								#float, default=1e-4
		verbose=True							#bool, default=False
)

regression = nnRegressor.fit(df_train, y_train)


# Testing.

In [16]:
def results(test_type):

	mse = 0.0
	prediction_line = altair.Chart()
	actual_line = altair.Chart()

	if test_type == "test":
		y_preds = neuralNetwork.predict(df_test)
		prediction_line = altair.Chart(pandas.DataFrame({'x': y_test, 'y': y_preds})).mark_line().encode(
      		x='x',
      		y='y',
			color='blue'
    	)
		actual_line = altair.Chart(pandas.DataFrame({'x': y_test, 'y': y_test})).mark_line().encode(
      		x='x',
      		y='y',
			color='orange'
    	)
		mse = mean_squared_error(y_test, y_preds, squared=True)

	elif test_type == "train":
		y_preds = neuralNetwork.predict(df_train)
		prediction_line = altair.Chart(pandas.DataFrame({'x': y_train, 'y': y_preds})).mark_line().encode(
      		x='x',
      		y='y',
			color='blue'
    	)
		actual_line = altair.Chart(pandas.DataFrame({'x': y_train, 'y': y_train})).mark_line().encode(
      		x='x',
      		y='y',
			  color='orange'
    	)
		mse = mean_squared_error(y_train, y_preds, squared=True)

	return (actual_line, prediction_line, mse)


In [17]:
actual_line, prediction_line, mse = results("train")
print(mse)


Iteration 1, loss = 76213953.43813549
Iteration 2, loss = 12893415.53583313
Iteration 3, loss = 390520.27282529
Iteration 4, loss = 1014917.54955776
Iteration 5, loss = 366828.48774938
Iteration 6, loss = 17100.99825522
Iteration 7, loss = 36230.93849631
Iteration 8, loss = 8297.33450427


Iteration 9, loss = 4794.58934986
Iteration 10, loss = 4097.74241130
Iteration 11, loss = 3425.29102794
Iteration 12, loss = 3440.55624714
Iteration 13, loss = 3370.09591180
Iteration 14, loss = 3365.26893696
Iteration 15, loss = 3355.41166890
Iteration 16, loss = 3351.76027002
Iteration 17, loss = 3346.58142407
Iteration 18, loss = 3341.48817088
Iteration 19, loss = 3336.47008671
Iteration 20, loss = 3331.37557056
Iteration 21, loss = 3326.05323735
Iteration 22, loss = 3320.59708909
Iteration 23, loss = 3315.03093695
Iteration 24, loss = 3309.32532208
Iteration 25, loss = 3303.48372981
Iteration 26, loss = 3297.51813610
Iteration 27, loss = 3291.42784148
Iteration 28, loss = 3285.21315221
Iteration 29, loss = 3279.01892587
Iteration 30, loss = 3271.94054003
Iteration 31, loss = 3265.17756674
Iteration 32, loss = 3259.02993625
Iteration 33, loss = 3252.46852214
Iteration 34, loss = 3245.46874045
Iteration 35, loss = 3238.40948405
Iteration 36, loss = 3231.49013027
Iteration 37, loss = 

In [18]:
actual_line, prediction_line, mse = results("test")
print(mse)


Iteration 1, loss = 76213953.43813549
Iteration 2, loss = 12893415.53583313
Iteration 3, loss = 390520.27282529
Iteration 4, loss = 1014917.54955776
Iteration 5, loss = 366828.48774938
Iteration 6, loss = 17100.99825522
Iteration 7, loss = 36230.93849631
Iteration 8, loss = 8297.33450427
Iteration 9, loss = 4794.58934986


Iteration 10, loss = 4097.74241130
Iteration 11, loss = 3425.29102794
Iteration 12, loss = 3440.55624714
Iteration 13, loss = 3370.09591180
Iteration 14, loss = 3365.26893696
Iteration 15, loss = 3355.41166890
Iteration 16, loss = 3351.76027002
Iteration 17, loss = 3346.58142407
Iteration 18, loss = 3341.48817088
Iteration 19, loss = 3336.47008671
Iteration 20, loss = 3331.37557056
Iteration 21, loss = 3326.05323735
Iteration 22, loss = 3320.59708909
Iteration 23, loss = 3315.03093695
Iteration 24, loss = 3309.32532208
Iteration 25, loss = 3303.48372981
Iteration 26, loss = 3297.51813610
Iteration 27, loss = 3291.42784148
Iteration 28, loss = 3285.21315221
Iteration 29, loss = 3279.01892587
Iteration 30, loss = 3271.94054003
Iteration 31, loss = 3265.17756674
Iteration 32, loss = 3259.02993625
Iteration 33, loss = 3252.46852214
Iteration 34, loss = 3245.46874045
Iteration 35, loss = 3238.40948405
Iteration 36, loss = 3231.49013027
Iteration 37, loss = 3224.52409333
Iteration 38, loss =