In [87]:
import numpy as np
import pandas as pd
#from sklearn.datasets import load_boston -> https://scikit-learn.org/1.0/modules/generated/sklearn.datasets.load_boston.html
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from numpy.linalg import inv
%matplotlib inline

def print_regression_metrics(y_true, y_pred):
    """Print the MSE and RMSE of ``y_pred`` against ``y_true``.

    Both values are written to stdout on a single line, rounded to two
    decimal places. Nothing is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
    report = f'MSE = {mse:.2f}, RMSE = {rmse:.2f}'
    print(report)

def linreg_linear(data, target):
    """Return least-squares coefficients ``theta`` for ``data @ theta ~ target``.

    This solves the same problem as the normal equations
    ``theta = inv(X^T X) @ (X^T y)``, but goes through ``np.linalg.lstsq``
    so that no explicit matrix inverse is formed. For a full-rank ``data``
    the result matches the normal-equation solution up to floating-point
    error; for rank-deficient ``data`` (e.g. duplicated or collinear
    columns) the minimum-norm solution is returned instead of raising
    ``LinAlgError`` or amplifying round-off.

    Parameters
    ----------
    data : array-like, 2-D
        Design matrix with one row per sample. No intercept column is added
        here; include a column of ones in ``data`` if a bias term is wanted.
    target : array-like
        Values to fit, with one entry (or row) per row of ``data``.

    Returns
    -------
    numpy.ndarray
        Coefficients, one per column of ``data``.
    """
    # rcond=None selects NumPy's machine-precision based cutoff for small
    # singular values and avoids the legacy-default FutureWarning.
    theta, _residuals, _rank, _singular_values = np.linalg.lstsq(data, target, rcond=None)
    return theta

# Read the whitespace-separated file, skipping its first 22 lines.
# The separator is a raw string so that ``\s`` reaches the regex engine
# unchanged instead of being parsed as an (invalid) string escape sequence.
raw_df = pd.read_csv('../../data/boston.csv', sep=r"\s+", skiprows=22, header=None)
#raw_df.info()

# Features: every even-numbered row placed side by side with the first two
# fields of the following odd-numbered row. Target: the third field of each
# odd-numbered row.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
# Optional intercept column (currently disabled):
#data = np.hstack([np.ones(data.shape[0])[:, np.newaxis], data])

# Fit on the full data set and report the in-sample error.
theta = linreg_linear(data, target)
print_regression_metrics(target, data.dot(theta))

#with split data
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(data, target, test_size=0.2, random_state=77)
theta = linreg_linear(X_train, y_train)
y_pred = X_valid.dot(theta)
y_train_pred = X_train.dot(theta)


#print_regression_metrics(y_valid, y_pred)
#print_regression_metrics(y_train, y_train_pred)


MSE = 24.17, RMSE = 4.92


In [None]:
from sklearn.linear_model import LinearRegression
# Reference fit with scikit-learn's LinearRegression on the full data set,
# scored on the same rows it was trained on. ``fit`` returns the estimator
# itself, so construction and fitting are chained.
model = LinearRegression().fit(data, target)
print_regression_metrics(target, model.predict(data))


In [None]:
# Refit after discarding every row whose feature at column index 11 is below 50.
# NOTE(review): column 11 and the threshold 50 are unexplained here — confirm
# which feature this is and why 50 is the cutoff.
rows_to_drop = data[:, 11] < 50
data_b = data[~rows_to_drop]
target_b = target[~rows_to_drop]

model = LinearRegression()
model.fit(data_b, target_b)
print_regression_metrics(target_b, model.predict(data_b))



In [None]:
# Standard deviation of each column of ``data``, then the position and value
# of the largest one.
std_dev_x = data.std(axis=0)
print(std_dev_x)
index = np.argmax(std_dev_x)
print("Index=", index, ", Value=", std_dev_x[index])

In [98]:
from sklearn import preprocessing

# Optional intercept column (currently disabled):
#data_fix = np.hstack([np.ones(data.shape[0])[:, np.newaxis], data])

# Rescale every feature with RobustScaler, then refit the closed-form
# regression on the scaled matrix and report the in-sample error.
# NOTE(review): no intercept column is included, so after the scaler centres
# the features the model has no constant term to absorb the target's offset.
# That is presumably why the error is much larger than on the unscaled
# data — confirm whether this is intended.
scaler = preprocessing.RobustScaler()
data_ds = scaler.fit_transform(data)

theta = linreg_linear(data_ds, target)
print_regression_metrics(target, data_ds @ theta)

[[-0.06959315  1.44       -0.57164988 ... -1.33928571  0.26190191
  -0.63768116]
 [-0.06375455  0.         -0.20294345 ... -0.44642857  0.26190191
  -0.22188906]
 [-0.06376011  0.         -0.20294345 ... -0.44642857  0.06667466
  -0.73263368]
 ...
 [-0.05445006  0.          0.17350891 ...  0.69642857  0.26190191
  -0.57171414]
 [-0.04086745  0.          0.17350891 ...  0.69642857  0.09641444
  -0.48775612]
 [-0.05816351  0.          0.17350891 ...  0.69642857  0.26190191
  -0.34782609]]
MSE = 230.02, RMSE = 15.17
