In [None]:
# Mount Google Drive at /content/drive so the project folder can be reached
# from the Colab runtime.
from google.colab import drive 
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the project folder on the mounted Drive
# (later cells open 'wine.csv' by relative path), then list its contents.
%cd '/content/drive/My Drive/Computer Statistic/Project03-LinearRegression '
%ls

/content/drive/My Drive/Computer Statistic/Project03-LinearRegression 
 18127248.pdf  'Linear Regression.ipynb'         wine.csv
 18127248.zip  'Linear Regression Report.docx'


In [None]:
import pandas as pd
# The file is read with ';' as the column separator.
df = pd.read_csv('wine.csv', sep=';')
# Bare last expression: the notebook renders the loaded DataFrame.
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1194,7.0,0.745,0.12,1.8,0.114,15.0,64,0.99588,3.22,0.59,9.5,6
1195,6.2,0.430,0.22,1.8,0.078,21.0,56,0.99633,3.52,0.60,9.5,6
1196,7.9,0.580,0.23,2.3,0.076,23.0,94,0.99686,3.21,0.58,9.5,6
1197,7.7,0.570,0.21,1.5,0.069,4.0,9,0.99458,3.16,0.54,9.8,6


In [None]:
import numpy as np
import pandas as pd

In [None]:
def preparing_data(file_name):
  """Load a ';'-separated CSV and split it into a design matrix and labels.

  Args:
    file_name: path of the CSV file; it must contain a `quality` column.

  Returns:
    (x, y): `x` holds every column except `quality`, followed by one extra
    column of ones (the bias term); `y` holds the `quality` values.
  """
  frame = pd.read_csv(file_name, sep=';')
  targets = frame.quality.values
  features = frame.drop('quality', axis=1).values
  # the intercept is modelled as a constant feature stored in the last column
  ones_column = np.ones((features.shape[0], 1))
  return np.concatenate((features, ones_column), axis=1), targets

In [None]:
def linear_regression(dataset, label):
  """Fit least-squares weights through the Moore-Penrose pseudo-inverse.

  Args:
    dataset: design matrix, one row per sample.
    label: target value for each row of `dataset`.

  Returns:
    The weight vector `pinv(dataset) @ label`.
  """
  pseudo_inverse = np.linalg.pinv(dataset)
  return pseudo_inverse @ label
def get_error(dataset, label, para):
  """Return the mean absolute error of the linear model `para`.

  Args:
    dataset: design matrix, one row per sample.
    label: expected value for each row of `dataset`.
    para: weight vector applied to `dataset` with `np.dot`.

  Returns:
    The mean of |dataset . para - label|.
  """
  residuals = np.dot(dataset, para) - label
  return np.mean(abs(residuals))

In [None]:
# Fit one model on the complete data set. The error printed below is computed
# on the same rows that were used for fitting (no hold-out split here).
data_points, label = preparing_data("wine.csv")
para_model = linear_regression(data_points, label)
error = get_error(data_points, label, para_model)
print(f"error: {error}")
# The last entry of the parameter vector belongs to the bias column that
# preparing_data appends.
print(f"parameter: {para_model}")

error: 0.4997901216695821
parameter: [ 4.75247531e-02 -1.06874258e+00 -2.68710829e-01  3.49742662e-02
 -1.59729560e+00  3.48788138e-03 -3.79835506e-03 -3.94690810e+01
 -2.45575908e-01  7.73840794e-01  2.69377496e-01  4.29171625e+01]


In [None]:
def kFold_validation_set(file_name, n_splits=10, random_state=None):
  """Split the data set in `file_name` into k-fold train/validation parts.

  Args:
    file_name: CSV path handed to `preparing_data`.
    n_splits: number of folds (default 10, the value previously hard-coded).
    random_state: seed forwarded to `RepeatedKFold`. The default `None`
      keeps the former unseeded shuffle; pass an int to make the folds
      reproducible across runs.

  Returns:
    Four lists with one entry per fold, in this order: training matrices,
    training labels, validation matrices, validation labels.
  """
  from sklearn.model_selection import RepeatedKFold

  x, y = preparing_data(file_name)
  # a single repeat, i.e. one shuffled k-fold pass over the rows
  kf = RepeatedKFold(n_splits=n_splits, n_repeats=1, random_state=random_state)

  lst_training_datasets = []
  lst_training_labels = []
  lst_testing_datasets = []
  lst_testing_labels = []
  for train_index, test_index in kf.split(x):
    lst_training_datasets.append(x[train_index])
    lst_training_labels.append(y[train_index])
    lst_testing_datasets.append(x[test_index])
    lst_testing_labels.append(y[test_index])

  return lst_training_datasets, lst_training_labels, lst_testing_datasets, lst_testing_labels

In [None]:
def get_attribute(training_set, testing_set, k):
  """Reduce both matrices to attribute column `k` plus their last column.

  Args:
    training_set: 2-D training matrix.
    testing_set: 2-D validation matrix.
    k: index of the attribute column to keep.

  Returns:
    (new_training_set, new_testing_set), each with two columns: column `k`
    followed by the last column of the corresponding input.
  """
  def attribute_with_bias(matrix):
    # keep column k and the final column, each reshaped to an (n, 1) column
    n_rows, last_col = matrix.shape[0], matrix.shape[1] - 1
    columns = [matrix[:, idx].reshape((n_rows, 1)) for idx in (k, last_col)]
    return np.concatenate(columns, axis=1)

  new_training_set = attribute_with_bias(training_set)
  new_testing_set = attribute_with_bias(testing_set)
  return new_training_set, new_testing_set

In [None]:
def linear_regression_for_attribute(train_x, train_y, test_x, test_y, k):
  """Cross-validate a model that uses only attribute `k` plus the last column.

  Args:
    train_x, train_y: per-fold training matrices and labels.
    test_x, test_y: per-fold validation matrices and labels.
    k: index of the attribute column to evaluate.

  Returns:
    (error, para): the mean validation error over the folds and the
    parameter vector of the fold with the lowest validation error.
  """
  fold_weights = []
  fold_errors = []
  for fold, fold_train in enumerate(train_x):
    reduced_train, reduced_test = get_attribute(fold_train, test_x[fold], k)
    weights = linear_regression(reduced_train, train_y[fold])
    fold_weights.append(weights)
    fold_errors.append(get_error(reduced_test, test_y[fold], weights))

  fold_errors = np.array(fold_errors)
  best_fold = np.argmin(fold_errors)
  return np.mean(fold_errors), fold_weights[best_fold]

In [None]:
def find_best_attribute(train_x, train_y, test_x, test_y, n_attributes):
  """Pick the single attribute with the lowest mean validation error.

  Args:
    train_x, train_y: per-fold training matrices and labels.
    test_x, test_y: per-fold validation matrices and labels.
    n_attributes: columns 0 .. n_attributes-1 are tried one at a time.

  Returns:
    (error, para, index) of the best attribute, where `error` and `para`
    come from `linear_regression_for_attribute` for that column.
  """
  results = [
      linear_regression_for_attribute(train_x, train_y, test_x, test_y, column)
      for column in range(n_attributes)
  ]
  errors = [err for err, _ in results]
  best = np.argmin(np.array(errors))
  return errors[best], results[best][1], best

In [None]:
# Build the cross-validation folds, then evaluate each attribute on its own.
training_datasets, training_labels, testing_datasets, testing_labels = kFold_validation_set("wine.csv")
# 11 is passed as n_attributes, so columns 0..10 are tried one at a time.
error, para, attribute = find_best_attribute(training_datasets, training_labels, testing_datasets, testing_labels, 11)
print(f"error: {error}")
# para has two entries: the weight of the chosen column and the weight of the
# last (bias) column kept by get_attribute.
print(f"parameter: {para}")
print(f"attribute index: {attribute}")

error: 0.5658150426292187
parameter: [0.37357799 1.79191128]
attribute index: 10


In [None]:
"""
drop one attribute (column k) from every fold of training_set and testing_set, in place
training_set is the list of per-fold training matrices
testing_set is the list of per-fold validation matrices
"""
def drop_attribute(training_set, testing_set, k):
  """Remove column `k` from every fold, updating both lists in place.

  Args:
    training_set: list of per-fold training matrices.
    testing_set: list of per-fold validation matrices.
    k: index of the column to remove.

  Returns:
    `k`, unchanged.
  """
  for folds in (training_set, testing_set):
    for position, fold in enumerate(folds):
      # np.delete builds a new array; the list slot is rebound to it
      folds[position] = np.delete(fold, k, axis=1)
  return k

In [None]:
"""
for each fold, drop attribute k, fit the model on the remaining columns and
measure its error on that fold's validation split
return the mean error over the folds and the parameter of the fold with the min error
"""
def linear_regression_for_validation(train_x, train_y, test_x, test_y, k):
  """Cross-validate a linear model fitted without attribute `k`.

  Args:
    train_x, train_y: per-fold training matrices and labels.
    test_x, test_y: per-fold validation matrices and labels.
    k: index of the column left out of every fold.

  Returns:
    (error, para): the mean validation error over the folds and the
    parameter vector of the fold with the lowest validation error.
  """
  theta = []
  error = []

  for idx in range(len(train_x)):
    # np.delete returns a new array and leaves its input untouched, so the
    # caller's folds need no defensive copy before the column is removed.
    reduced_train = np.delete(train_x[idx], k, axis=1)
    reduced_test = np.delete(test_x[idx], k, axis=1)

    # fit without attribute k and score on this fold's validation split
    para = linear_regression(reduced_train, train_y[idx])
    theta.append(para)
    error.append(get_error(reduced_test, test_y[idx], para))

  error = np.array(error)
  min_err_idx = np.argmin(error)
  return np.mean(error), theta[min_err_idx]

In [None]:
"""
among n attributes, we find the worst attribute, i.e. the one whose removal
gives the min validation error, drop it, and return the parameter found without it
"""
def find_worst_attribute(train_x, train_y, test_x, test_y, n_attributes):
  """Drop the attribute whose removal gives the lowest validation error.

  Each of the columns 0 .. n_attributes-1 is left out in turn and scored with
  `linear_regression_for_validation`; the column whose removal yields the
  smallest mean error is then removed from every fold.

  Args:
    train_x, train_y: per-fold training matrices and labels.
    test_x, test_y: per-fold validation matrices and labels.
    n_attributes: number of leading columns considered for removal.

  Returns:
    (train_x, test_x, para, error, k): new fold lists without the dropped
    column, the parameter vector and mean error obtained without it, and the
    index `k` of the dropped column (relative to the matrices passed in).
    The lists passed in by the caller are left unchanged.
  """
  results = [
      linear_regression_for_validation(train_x, train_y, test_x, test_y, attribute)
      for attribute in range(n_attributes)
  ]
  error = np.array([err for err, _ in results])
  worst_idx = np.argmin(error)

  # drop_attribute rebinds list slots in place; work on shallow copies so the
  # caller's lists keep their full-width folds and the cell can be re-run.
  train_x = list(train_x)
  test_x = list(test_x)
  k = drop_attribute(train_x, test_x, worst_idx)
  return train_x, test_x, results[worst_idx][1], error[worst_idx], k

In [None]:
def backward_feature_elimination(train_x, train_y, test_x, test_y, n_attributes):
  """Repeatedly drop the least useful attribute and keep the best step.

  One attribute is removed per step through `find_worst_attribute` until a
  single attribute is left; the step with the lowest error is reported.

  Args:
    train_x, train_y: per-fold training matrices and labels.
    test_x, test_y: per-fold validation matrices and labels.
    n_attributes: number of attribute columns to start from (at least 2).

  Returns:
    (error, para, drops): the lowest error seen, the parameter vector of
    that step, and the column indices dropped up to and including it. Each
    index refers to the matrices as they were at that step, i.e. columns
    removed earlier are no longer counted.

  Raises:
    ValueError: if `n_attributes` is below 2, since no elimination step can
      be performed.
  """
  if n_attributes < 2:
    raise ValueError("backward_feature_elimination needs at least 2 attributes")

  # shallow copies: columns are dropped from these lists, never from the
  # caller's, so the same folds can be reused after this call
  train_x = list(train_x)
  test_x = list(test_x)

  theta = []
  errors = []
  k_drops = []
  while n_attributes > 1:
    train_x, test_x, para, error, k_drop = find_worst_attribute(train_x, train_y, test_x, test_y, n_attributes)
    n_attributes -= 1
    theta.append(para)
    errors.append(error)
    k_drops.append(k_drop)
    print(f"error when training {n_attributes} attributes: {error}")

  errors = np.array(errors)
  best_idx = np.argmin(errors)
  return errors[best_idx], theta[best_idx], k_drops[0:best_idx+1]

In [None]:
# Build a new set of folds for the backward-elimination experiment.
training_datasets, training_labels, testing_datasets, testing_labels = kFold_validation_set("wine.csv")

In [None]:
# Backward elimination starting from 11 attribute columns; yields the lowest
# error, the matching parameter vector and the list of dropped column indices.
error, para, drop = backward_feature_elimination(training_datasets, training_labels, testing_datasets, testing_labels, 11)

error when training 10 attributes: 0.5040513352847633
error when training 9 attributes: 0.5036998645974882
error when training 8 attributes: 0.5047378588206431
error when training 7 attributes: 0.506026038431801
error when training 6 attributes: 0.5063159647293907
error when training 5 attributes: 0.5070031546589473
error when training 4 attributes: 0.5089255811212384
error when training 3 attributes: 0.5190606727362768
error when training 2 attributes: 0.5290291201045382
error when training 1 attributes: 0.5661938000802602


In [None]:
# Lowest error returned by backward_feature_elimination.
error

0.5036998645974882

In [None]:
# Parameter vector of the elimination step that produced that lowest error.
para

array([ 5.82164677e-02, -9.33518538e-01,  4.44289036e-02, -1.68113884e+00,
        4.17682145e-03, -4.10197708e-03, -6.35447247e+01,  7.71792947e-01,
        2.36343112e-01,  6.62041085e+01])

In [None]:
# Column indices dropped up to the best step; each index is relative to the
# matrices at the moment of dropping (earlier removals are not counted).
drop

[2, 7]

In [None]:
def sum_combine(array):
  for i in array:
    