# Importing packages and datasets.

In [1]:
import pandas
import numpy
import pickle

import altair

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error


In [2]:
dataset_file_path = "../data/processed/"


In [3]:
dataFrame = pandas.read_csv(dataset_file_path + "cleaned.csv")
print(dataFrame.shape)


(2895, 30)


In [4]:
normalisedDataFrame = pandas.read_csv(dataset_file_path + "normalised.csv")
print(normalisedDataFrame.shape)


(2895, 30)


In [5]:
print(dataFrame.columns)


Index(['avgAnnCount', 'avgDeathsPerYear', 'TARGET_deathRate', 'incidenceRate',
       'medIncome', 'popEst2015', 'povertyPercent', 'studyPerCap', 'MedianAge',
       'MedianAgeMale', 'MedianAgeFemale', 'AvgHouseholdSize',
       'PercentMarried', 'PctNoHS18_24', 'PctHS18_24', 'PctBachDeg18_24',
       'PctHS25_Over', 'PctBachDeg25_Over', 'PctEmployed16_Over',
       'PctUnemployed16_Over', 'PctPrivateCoverage', 'PctEmpPrivCoverage',
       'PctPublicCoverage', 'PctPublicCoverageAlone', 'PctWhite', 'PctBlack',
       'PctAsian', 'PctOtherRace', 'PctMarriedHouseholds', 'BirthRate'],
      dtype='object')


In [6]:
possibly_related_fields = [
                           "medIncome", "povertyPercent", "PctUnemployed16_Over", "PctEmployed16_Over",
                           "PctHS25_Over", "PctBachDeg25_Over",
                           "PctPrivateCoverage", "PctEmpPrivCoverage", "PctPublicCoverage", "PctPublicCoverageAlone",
                           "PctMarriedHouseholds", "PercentMarried"
                           ]


In [7]:
X = dataFrame[possibly_related_fields + ["TARGET_deathRate"]].copy()
y = X.pop("TARGET_deathRate")

train, test = train_test_split(dataFrame, test_size=0.1727, random_state=42)
df_train, df_test = train_test_split(X, test_size=0.1727, random_state=42)
y_train, y_test = train_test_split(y, test_size=0.1727, random_state=42)

print(df_train.shape, "\t", df_test.shape)
print(y_train.shape, "\t", y_test.shape)


(2395, 12) 	 (500, 12)
(2395,) 	 (500,)


In [8]:
X = normalisedDataFrame[possibly_related_fields + ["TARGET_deathRate"]].copy()
y = X.pop("TARGET_deathRate")

train, test = train_test_split(dataFrame, test_size=0.1727, random_state=42)
df_train, df_test = train_test_split(X, test_size=0.1727, random_state=42)
y_train, y_test = train_test_split(y, test_size=0.1727, random_state=42)

print(df_train.shape, "\t", df_test.shape)
print(y_train.shape, "\t", y_test.shape)


(2395, 12) 	 (500, 12)
(2395,) 	 (500,)


# Baseline Performance.

In [9]:
y_mean = y.mean()

y_base = numpy.full((y_test.shape[0], ), y_mean)
(y_base - y_test).sum()

mean_squared_error(y_test, y_base, squared=True)


712.9935696334397

# Training.

In [10]:
nnRegressor = neuralNetwork = MLPRegressor(
		hidden_layer_sizes=[128], 				#array-like of shape(n_layers - 2,), default=(100,)
		batch_size="auto", 						#int, default=’auto’
		activation="relu", 						#{’identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’
		solver="adam", 							#{‘lbfgs’, ‘sgd’, ‘adam’}, default=’adam’
		alpha=0.0001,							#float, default=0.0001
		learning_rate ="constant", 				#{‘constant’, ‘invscaling’, ‘adaptive’}, default=’constant’
		learning_rate_init = 0.001,				#float, default=0.001
		power_t=0.5,							#float, default=0.5
		max_iter=200,							#int, default=200
		shuffle=False,							#bool, default=True, used when solver=’sgd’ or ‘adam’
		random_state=1,
		tol=1e-4,								#float, default=1e-4
		verbose=True							#bool, default=False
)

regression = nnRegressor.fit(df_train, y_train)


Iteration 1, loss = 11401.77746125
Iteration 2, loss = 5537.97522267
Iteration 3, loss = 1954.92108287
Iteration 4, loss = 609.58634325
Iteration 5, loss = 535.91158588
Iteration 6, loss = 493.83980065
Iteration 7, loss = 452.59503237
Iteration 8, loss = 438.58544007
Iteration 9, loss = 420.56796724
Iteration 10, loss = 403.96876960
Iteration 11, loss = 388.45926111
Iteration 12, loss = 373.88485411
Iteration 13, loss = 360.02650462
Iteration 14, loss = 346.61129164
Iteration 15, loss = 332.92492242
Iteration 16, loss = 321.23640847
Iteration 17, loss = 311.52528663
Iteration 18, loss = 304.30659641
Iteration 19, loss = 298.57235602
Iteration 20, loss = 294.49845291
Iteration 21, loss = 291.39065495
Iteration 22, loss = 289.05474686
Iteration 23, loss = 287.14000985
Iteration 24, loss = 285.55188283
Iteration 25, loss = 284.11210721
Iteration 26, loss = 282.82425584
Iteration 27, loss = 281.61481501
Iteration 28, loss = 280.47316325
Iteration 29, loss = 279.38223496
Iteration 30, loss 



In [11]:
model_file_path = "../models/"
model_file_name = "neural_network.pickle"


In [12]:
pickle.dump(regression, open(model_file_path + model_file_name, "wb"))


In [13]:
regression = pickle.load(open(model_file_path + model_file_name, "rb"))


# Testing.

In [14]:
def results(test_type):

	mse = 0.0
	prediction_line = altair.Chart()
	actual_line = altair.Chart()

	if test_type == "test":
		y_preds = neuralNetwork.predict(df_test)
		prediction_line = altair.Chart(pandas.DataFrame({'x': y_test, 'y': y_preds})).mark_line(color='blue').encode(
      		x='x',
      		y='y'
    	)
		actual_line = altair.Chart(pandas.DataFrame({'x': y_test, 'y': y_test})).mark_line(color='orange').encode(
      		x='x',
      		y='y'
    	)
		mse = mean_squared_error(y_test, y_preds, squared=True)

	elif test_type == "train":
		y_preds = neuralNetwork.predict(df_train)
		prediction_line = altair.Chart(pandas.DataFrame({'x': y_train, 'y': y_preds})).mark_line(color='blue').encode(
      		x='x',
      		y='y'
    	)
		actual_line = altair.Chart(pandas.DataFrame({'x': y_train, 'y': y_train})).mark_line(color='orange').encode(
      		x='x',
      		y='y'
    	)
		mse = mean_squared_error(y_train, y_preds, squared=True)

	return (actual_line, prediction_line, mse)


In [15]:
actual_line, prediction_line, mse = results("train")
print(mse)


493.42380849040933


In [16]:
actual_line, prediction_line, mse = results("test")
print(mse)


448.441232003191
