# Importing packages and datasets.

In [6]:
import pandas
import numpy

import altair

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [None]:
# Mount Google Drive into the Colab runtime; the dataset is read from the mounted drive.
# NOTE(review): this import lives outside the imports cell and only resolves on Colab.
from google.colab import drive
drive.mount('/content/drive')
# Folder on the mounted drive that holds the CSV files; file names are appended to it.
base_file_path = "/content/drive/MyDrive/STDS/Assignment 2/Datasets/"

Mounted at /content/drive


In [5]:
# Read the cleaned dataset from the datasets folder and report its (rows, columns).
cleaned_csv_path = base_file_path + "cleaned.csv"
dataFrame = pandas.read_csv(cleaned_csv_path)
print(dataFrame.shape)

(2895, 30)


In [17]:
# Separate the regression target from the predictors without touching `dataFrame`.
X = dataFrame.copy()
y = X.pop("TARGET_deathRate")

# Hold-out fraction and seed for the split; with the 2895-row frame loaded
# above this fraction yields a 500-row test set.
TEST_SIZE = 0.1727
RANDOM_STATE = 42

# Split the full frame, the predictors and the target in ONE call so the three
# partitions are guaranteed to share the same rows. Separate calls only stay
# aligned as long as every call uses the same seed and input length.
train, test, df_train, df_test, y_train, y_test = train_test_split(
    dataFrame, X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Sanity check: features, targets and the full frame describe the same rows.
assert df_train.index.equals(y_train.index) and df_train.index.equals(train.index)
assert df_test.index.equals(y_test.index) and df_test.index.equals(test.index)

print(df_train.shape, "\t", df_test.shape)
print(y_train.shape, "\t", y_test.shape)

(2395, 29) 	 (500, 29)
(2395,) 	 (500,)


# Baseline performance.

In [12]:
# Baseline model: predict one constant (the mean target) for every test row and
# score it with mean squared error, giving a reference for the regressions below.
# NOTE(review): the mean is taken over all of `y`, which includes the test rows;
# `y_train.mean()` would avoid that leakage but changes the reported number.
y_mean = y.mean()

# One constant prediction per test sample.
y_base = numpy.full((y_test.shape[0], ), y_mean)

# MSE is the default metric of mean_squared_error; the `squared` keyword is
# deprecated in recent scikit-learn releases, so it is not passed.
mean_squared_error(y_test, y_base)

712.9935696334397

# Training and Testing Functions.

In [26]:
def train_results(column_name):
  """Fit a one-feature linear regression and evaluate it on the training split.

  Reads the notebook-level `df_train`, `y_train` and `train` objects.

  Args:
    column_name: Name of the `df_train` column used as the only predictor.

  Returns:
    A tuple `(train_line_chart, train_scatter_chart, mse)`: an Altair line
    chart of the fitted values against the feature, an Altair point chart of
    `TARGET_deathRate` against the feature for the `train` frame, and the
    mean squared error of the fit on the training targets.
  """
  x_train = df_train[column_name].values
  # The estimator needs a 2-D feature matrix, hence the single-column reshape.
  features = x_train.reshape(-1, 1)

  reg = LinearRegression().fit(features, y_train.values)
  y_train_preds = reg.predict(features)

  fitted = pandas.DataFrame({'x': x_train, 'y': y_train_preds})
  train_line_chart = altair.Chart(fitted).mark_line().encode(
      x='x',
      y='y'
    )
  train_scatter_chart = altair.Chart(train).mark_point().encode(x=column_name, y="TARGET_deathRate")

  # MSE is the default; the deprecated `squared` keyword is intentionally omitted.
  mse = mean_squared_error(y_train, y_train_preds)

  return (train_line_chart, train_scatter_chart, mse)

In [27]:
def test_results(column_name):
  """Fit a one-feature linear regression on the training split and score it on the test split.

  Reads the notebook-level `df_train`, `df_test`, `y_train`, `y_test` and
  `test` objects.

  Args:
    column_name: Name of the column (present in `df_train` and `df_test`)
      used as the only predictor.

  Returns:
    A tuple `(test_line_chart, test_scatter_chart, mse)`: an Altair line chart
    of the predictions against the test feature values, an Altair point chart
    of `TARGET_deathRate` against the feature for the `test` frame, and the
    mean squared error of the predictions on the test targets.
  """
  x_train = df_train[column_name].values
  x_test = df_test[column_name].values

  # The estimator needs 2-D feature matrices, hence the single-column reshapes.
  reg = LinearRegression().fit(x_train.reshape(-1, 1), y_train.values)
  y_test_preds = reg.predict(x_test.reshape(-1, 1))

  predicted = pandas.DataFrame({'x': x_test, 'y': y_test_preds})
  test_line_chart = altair.Chart(predicted).mark_line().encode(
      x='x',
      y='y'
    )
  test_scatter_chart = altair.Chart(test).mark_point().encode(x=column_name, y="TARGET_deathRate")

  # MSE is the default; the deprecated `squared` keyword is intentionally omitted.
  mse = mean_squared_error(y_test, y_test_preds)

  return (test_line_chart, test_scatter_chart, mse)

In [None]:
def results(column_name, test_type):
  """Fit a one-feature linear regression on the training split and evaluate it.

  The model is always fitted on `df_train[column_name]` against `y_train`;
  `test_type` only selects which split the predictions, charts and error are
  computed on. Reads the notebook-level `df_train`, `df_test`, `y_train`,
  `y_test`, `train` and `test` objects.

  Args:
    column_name: Name of the column used as the only predictor.
    test_type: `"train"` to evaluate on the training split, `"test"` to
      evaluate on the held-out split.

  Returns:
    A tuple `(line_chart, scatter_chart, mse)`: an Altair line chart of the
    predictions against the feature, an Altair point chart of
    `TARGET_deathRate` against the feature for the selected split, and the
    mean squared error of the predictions on that split.

  Raises:
    ValueError: If `test_type` is neither `"train"` nor `"test"`.
  """
  x_train = df_train[column_name].values

  # Pick the evaluation split up front so features, targets and the scatter
  # frame always come from the same partition.
  if test_type == "train":
    x_eval, y_eval, eval_frame = x_train, y_train, train
  elif test_type == "test":
    x_eval, y_eval, eval_frame = df_test[column_name].values, y_test, test
  else:
    # Fail loudly instead of returning empty charts and a misleading MSE of 0.0.
    raise ValueError(f'test_type must be "train" or "test", got {test_type!r}')

  # The estimator needs 2-D feature matrices, hence the single-column reshapes.
  reg = LinearRegression().fit(x_train.reshape(-1, 1), y_train.values)

  # Predict on the SAME rows that are plotted and scored.
  y_preds = reg.predict(x_eval.reshape(-1, 1))

  line_chart = altair.Chart(pandas.DataFrame({'x': x_eval, 'y': y_preds})).mark_line().encode(
      x='x',
      y='y'
    )
  scatter_chart = altair.Chart(eval_frame).mark_point().encode(x=column_name, y="TARGET_deathRate")

  # MSE is the default; the deprecated `squared` keyword is intentionally omitted.
  mse = mean_squared_error(y_eval, y_preds)

  return (line_chart, scatter_chart, mse)

# Results.

In [25]:
# Regress the target on PctBachDeg25_Over and evaluate on the training split:
# print the MSE, then display the fitted line layered over the scatter of points.
line, scatter, mse = results("PctBachDeg25_Over", "train")
print(mse)
line+scatter

596.1827922009649


In [28]:
# Same single-feature model (PctBachDeg25_Over), now evaluated on the test split:
# print the MSE, then display the prediction line layered over the test scatter.
line, scatter, mse = results("PctBachDeg25_Over", "test")
print(mse)
line+scatter

494.6006449969083


In [30]:
# Test-split evaluation for PctPublicCoverageAlone, using the standalone
# test_results helper: print the MSE, then display the line layered over the scatter.
line, scatter, mse = test_results("PctPublicCoverageAlone")
print(mse)
line+scatter

561.7041757751923
