# Importing packages and datasets.

In [1]:
import pandas
import numpy

import altair

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error


In [2]:
base_file_path = "../data/processed/"


In [3]:
dataFrame = pandas.read_csv(base_file_path + "cleaned.csv")
print(dataFrame.shape)


(2895, 30)


In [4]:
X = dataFrame.copy()
y = X.pop("TARGET_deathRate")

train, test = train_test_split(dataFrame, test_size=0.1727, random_state=42)
df_train, df_test = train_test_split(X, test_size=0.1727, random_state=42)
y_train, y_test = train_test_split(y, test_size=0.1727, random_state=42)

print(df_train.shape, "\t", df_test.shape)
print(y_train.shape, "\t", y_test.shape)


(2395, 29) 	 (500, 29)
(2395,) 	 (500,)


# Baseline performance.

In [5]:
y_mean = y.mean()

y_base = numpy.full((y_test.shape[0], ), y_mean)
(y_base - y_test).sum()

mean_squared_error(y_test, y_base, squared=True)


712.9935696334397

# Training.

In [6]:
regression = LinearRegression()

def training(column_name):
	x_train = df_train[column_name].values
	y = y_train.values
	
	regression.fit(x_train.reshape(-1, 1), y)


In [7]:
training("PctBachDeg25_Over")


# Testing.

In [10]:
def results(column_name, test_type):
	x_test = df_test[column_name].values
	x_train = df_train[column_name].values

	mse = 0.0
	scatter_chart = altair.Chart()
	line_chart = altair.Chart()

	if test_type == "test":
		y_preds = regression.predict(x_test.reshape(-1, 1))
		line_chart = altair.Chart(pandas.DataFrame({'x': x_test, 'y': y_preds})).mark_line().encode(
			x='x',
			y='y',
			color='blue'
		)
		scatter_chart = altair.Chart(test).mark_point().encode(
			x=column_name,
			y="TARGET_deathRate",
			color='orange'
		)
		mse = mean_squared_error(y_test, y_preds, squared=True)

	elif test_type == "train":
		y_preds = regression.predict(x_train.reshape(-1, 1))
		line_chart = altair.Chart(pandas.DataFrame({'x': x_train, 'y': y_preds})).mark_line().encode(
      		x='x',
      		y='y',
			color='blue'
    	)
		scatter_chart = altair.Chart(train).mark_point().encode(
			x=column_name,
			y="TARGET_deathRate",
			color='orange'
		)
		mse = mean_squared_error(y_train, y_preds, squared=True)

	return (line_chart, scatter_chart, mse)


# Results.

In [12]:
line, scatter, mse = results("PctBachDeg25_Over", "train")
print(mse)

#line+scatter


596.1827922009649


In [13]:
line, scatter, mse = results("PctBachDeg25_Over", "test")
print(mse)
# line+scatter


494.6006449969083
