In [1]:
import csv
import math
import numpy as np
import pandas as pd

## system_equations

In [2]:
def system_equations(matrix):
  """Solve a linear system given as an augmented matrix, by recursive elimination.

  Each row of ``matrix`` holds the coefficients of one equation followed by
  its right-hand side, so a system with k unknowns is a (k, k+1) array.
  The first unknown is eliminated from all equations, the reduced system is
  solved recursively, and the first unknown is then recovered by
  back-substitution.

  Parameters
  ----------
  matrix : 2-D array-like
    Augmented matrix of the system. It is never modified; integer input
    is accepted and converted to float internally.

  Returns
  -------
  numpy.ndarray
    1-D float array with the solved values, in the same order as the
    coefficient columns.

  Raises
  ------
  ValueError
    If no remaining equation has a non-zero coefficient for the unknown
    being eliminated (the system is singular).
  """
  #work on a float copy: the caller's array stays untouched and integer
  #input is not truncated by the divisions below
  work = np.array(matrix, dtype=float)

  #get the number of the rows and columns of the matrix
  row_num, col_num = work.shape

  #if the equation contains one variable, return its value
  if row_num == 1:
    if work[0, -2] == 0:
      raise ValueError("singular system: zero coefficient for the last unknown")
    return np.array([work[0, -1] / work[0, -2]])

  #pivot: move the row with the largest leading coefficient to the top so
  #a zero (or tiny) leading coefficient in another row never gets divided by
  pivot = int(np.argmax(np.abs(work[:, 0])))
  if work[pivot, 0] == 0:
    raise ValueError("singular system: an unknown has a zero coefficient in every equation")
  if pivot != 0:
    work[[0, pivot]] = work[[pivot, 0]]

  #normalise the pivot row so its leading coefficient is 1
  work[0] = work[0] / work[0, 0]

  #reduce the degree of the matrix: remove the first unknown from every
  #other row by subtracting the matching multiple of the pivot row
  newMatrix = work[1:, 1:] - np.outer(work[1:, 0], work[0, 1:])

  #send the reduced matrix until the degree becomes (1)
  bs = system_equations(newMatrix)

  #substitute the known values into the pivot row to get the remaining unknown
  start = col_num - bs.shape[0] - 1
  z = work[0, -1] - np.dot(work[0, start:start + bs.shape[0]], bs)

  return np.concatenate((np.array([z]), bs))

## printRegression

In [11]:
def printRegression(datafile,bs,dataset_head):
  """Print a text report for one solved regression.

  datafile     -- name shown in the banner line.
  bs           -- 1-D array of coefficients; bs[0] is printed as the
                  constant term and bs[i] as the multiplier of x<i>.
  dataset_head -- object printed as the preview of the dataset.

  Nothing is returned; everything goes to standard output. A coefficient
  that is neither greater nor less than zero adds no term to the equation
  line, and that line is not terminated by a newline.
  """
  #banner with the dataset name
  print("\n\n\n----------------||||||", datafile , "Dataset ||||||----------------\n\n")

  #preview of the dataset
  print("The first 5 rows in the dataset:\n\n")
  print(dataset_head,"\n\n\n")

  #one line per coefficient, fixed-point with six decimals
  for index, coefficient in enumerate(bs):
    print("B" + str(index) +" = ","{:f}".format(coefficient))

  #the regression equation, written on a single line
  print("\n\nRegression Equation:\n")
  print("y = ",bs[0],end="")

  for index, coefficient in enumerate(bs[1:], start=1):
    if coefficient>0:
      term = " + {:.10f} x{:.0f}".format(coefficient,index)
    elif coefficient<0:
      #print the sign explicitly and the magnitude after it
      term = " - {:.10f} x{:.0f}".format(abs(coefficient),index)
    else:
      continue
    print(term,end="")

## getRegression

In [4]:
def getRegression(datafile):
  """Build and solve the regression system for ``<datafile>.csv``, then print it.

  The file is read with pandas. Every column except the last is used as a
  feature (x) and the last column as the target (y). The function fills an
  augmented matrix with the sample count, the column sums and the sums of
  products, passes it to ``system_equations`` and hands the result to
  ``printRegression``. Nothing is returned.
  """
  #read the dataset and keep its first rows for the report
  frame = pd.read_csv(datafile+'.csv')
  preview = frame.head()

  #continue with a plain numpy array
  samples = frame.to_numpy()

  #dim = number of features = (number of columns - 1)
  dim = len(samples[0])-1
  #index of the right-hand-side column of the system matrix
  rhs = dim+1

  #matrix: the augmented matrix of the system of equations
  matrix = np.zeros((dim+1,dim+2))

  #per-column totals: the first dim entries are the features, entry dim is the target
  totals = samples.sum(axis=0)
  feature_totals = totals[0:dim]

  #first row: sample count, feature sums, target sum
  matrix[0,0] = samples.size/(dim+1)
  matrix[0,1:dim+1] = feature_totals
  matrix[0,rhs] = totals[dim]

  #first column below the count: the feature sums again
  matrix[1:,0] = feature_totals

  #accumulate the sums of products, one sample at a time
  for sample in samples:
    target = sample[dim]
    for a in range(dim):
      x_a = sample[a]
      #sum-product of (x and y)
      matrix[a+1,rhs] += x_a*target
      #sum-product of (x and each x)
      for b in range(dim):
        matrix[a+1,b+1] += x_a*sample[b]

  #solve the system, then print the regression equation
  bs = system_equations(matrix)
  printRegression(datafile,bs,preview)

## Start

In [None]:
#run the regression report for each dataset, in the listed order
for datafile in (
  "wire_pull_strength",
  "cancer_reg",
  "Fish",
  "insurance",
  "prices-split-adjusted",
  "random",
  "winequality-red",
  "winequality-white",
):
  getRegression(datafile)

In [12]:
#run the regression report for each test dataset, in the listed order
for datafile in ("test0", "test1", "test2", "test3", "test4", "test5"):
  getRegression(datafile)




----------------|||||| test0 Dataset ||||||----------------


The first 5 rows in the dataset:


   X1   X2   X3    Y
0  67  353  964   64
1  11  464  973   51
2  33  436  833   74
3   2  487  722  121
4  15  437  802  118 



B0 =  106.930765
B1 =  -0.191269
B2 =  -0.021384
B3 =  0.042501


Regression Equation:

y =  106.930764580302 - 0.1912687839 x1 - 0.0213836928 x2 + 0.0425014908 x3


----------------|||||| test1 Dataset ||||||----------------


The first 5 rows in the dataset:


   X1   X2   X3    Y
0  67  353  964   64
1  11  464  973   51
2  33  436  833   74
3   2  487  722  121
4  15  437  802  118 



B0 =  107.251748
B1 =  0.155616
B2 =  -0.035580
B3 =  0.018662


Regression Equation:

y =  107.25174794438986 + 0.1556164135 x1 - 0.0355800317 x2 + 0.0186620952 x3


----------------|||||| test2 Dataset ||||||----------------


The first 5 rows in the dataset:


   X1   X2   X3    Y
0  73   52  841  124
1  96   59  747  138
2  72  124  774  101
3  70  245  866   57
4  27  3