# Preload libraries and functions
First of all, let's import libraries that will be used in this example

In [5]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as prep
import threading as thrd
import time
import multiprocessing as mp
from multiprocessing import Pool

import matplotlib.pyplot as plt
# import tensorflow as tf
# from tensorflow import keras
#matplot inline


# np.set_printoptions(precision=8)

Define functions and class

In [6]:
class MethodException(Exception):
    def __init__(self, msg, value):
        self.msg=msg
        self.value=value
        
# def PolynomialFeature
def polyFeature(X, degree=2):
    poly = prep.PolynomialFeatures(degree,include_bias = False)
    polyArray = poly.fit_transform(X)
    return polyArray

def normalizeFeature(X, method):
    try:
        if X.ndim == 1:  # Reshape the 1d array
            X=np.reshape(X,(-1,1))
        X_norm=np.ones((X.shape[0],X.shape[1]+1), dtype=np.float64)
        if method == 'std':
            X_norm[:,1:]=(X-X.mean(0))/X.std(0)     
        elif method == 'range':
            X_norm[:,1:]=(X-X.min(0))/(X.max(0)-X.min(0))
        else:
            raise MethodException('method should be either \'std\' or \'range\'(case sensitive)', method)
    except MethodException as ex:
        print(f'The error is: {ex.msg}, here the input method is \'{ex.value}\'')
    else:
        return X_norm

# cost (error) function
def computeCost(X, y, theta):
    inner = np.power(((X @ theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))

# normal equation
def normalEq(X,y):
    theta=np.linalg.pinv(X.T@X)@X.T@y
    return theta

# normal equation with regulization
def normalRegEq(X,y,beta):
    L=np.eye(X.shape[1])
    L[0,0]=0
    theta=np.linalg.inv(X.T@X+beta*L)@X.T@y
    return theta
    
# def linearRegCostFunction(X, y, theta, reg):


# Data Import and Initialization

In [7]:
df = pd.read_excel('obs_data_w.xlsx', sheet_name=0) #sheet_name='Sheet1')
print(df)

         V    T  uncertainty             J
0     0.00  100     0.001000  0.000000e+00
1     0.05  100     0.001000  6.417136e-90
2     0.10  100     0.001000  1.081096e-88
3     0.15  100     0.001000  1.719633e-87
4     0.20  100     0.001000  2.725747e-86
5     0.25  100     0.001000  4.319561e-85
6     0.30  100     0.001000  6.845225e-84
7     0.35  100     0.001000  1.084764e-82
8     0.40  100     0.001000  1.719028e-81
9     0.45  100     0.001000  2.724148e-80
10    0.50  100     0.001000  4.316963e-79
11    0.55  100     0.001000  6.841100e-78
12    0.60  100     0.001000  1.084111e-76
13    0.65  100     0.001000  1.717992e-75
14    0.70  100     0.001000  2.722506e-74
15    0.75  100     0.001000  4.314361e-73
16    0.80  100     0.001000  6.836977e-72
17    0.85  100     0.001000  1.083457e-70
18    0.90  100     0.001000  1.716957e-69
19    0.95  100     0.001000  2.720865e-68
20    1.00  100     0.001000  4.311761e-67
21    1.05  100     0.001000  6.832857e-66
22    1.10 

## Train, cross validation and test data

### Random Method 1: Sample function of df

In [8]:
tcv=df.sample(frac=0.8) # random_state=200
test=df.drop(tcv.index)
test=test.sample(frac=1) # shuffle the rows

print(test.shape)

(1579, 4)


### Random Method 2: random array

In [None]:
# idx=np.arange(0,len(df))
# np.random.shuffle(idx)
# temp_split=math.floor(0.8*len(df))
# #idx[0:20]
# train=df.iloc[idx[0:temp_split],:]
# test=df.iloc[idx[temp_split:],:] # Output all the elements after temp_split
# print(train.shape)
# print(test.shape)

## Split the data into data and results

In [9]:
test_data=test.loc[:,['V','T']]
test_unc=test.uncertainty.values
test_J=test.J.values

# print(test_data)
# J=J.tolist()

## Add the polynominal features

In [10]:
degree = 2
print(f'The polynominal degree is {degree}.\n')


# test data
test_data_poly = polyFeature(test_data, degree)
print('test data = \n', test_data_poly, '\n')


The polynominal degree is 2.

test data = 
 [[2.5000e-01 1.1300e+02 6.2500e-02 2.8250e+01 1.2769e+04]
 [2.9500e+00 1.0300e+02 8.7025e+00 3.0385e+02 1.0609e+04]
 [1.9000e+00 1.8400e+02 3.6100e+00 3.4960e+02 3.3856e+04]
 ...
 [3.3000e+00 1.4900e+02 1.0890e+01 4.9170e+02 2.2201e+04]
 [2.9000e+00 1.6900e+02 8.4100e+00 4.9010e+02 2.8561e+04]
 [0.0000e+00 1.3000e+02 0.0000e+00 0.0000e+00 1.6900e+04]] 



## Data Normalization (Two methods, range or stand deviation)  

(__We should try both__)

In [11]:
# Normalization method
method='std'

# test data
test_data_norm=normalizeFeature(test_data_poly, method)
print('normalized test_data = \n', test_data_norm, '\n')

# test the normalize method is right
# t=test_data_poly[:,1]
# print(t.ndim)
# t_norm=normalizeFeature(t,'std')
# print(t_norm)

normalized test_data = 
 [[ 1.         -1.39867327 -0.57614405 -1.02530894 -1.38122722 -0.39111011]
 [ 1.          1.21128154 -0.6592906   1.25359043  0.14721885 -0.41529809]
 [ 1.          0.19629911  0.01419652 -0.08961502  0.40094312 -0.154975  ]
 ...
 [ 1.          1.54960901 -0.27681643  1.83056871  1.18901346 -0.28548928]
 [ 1.          1.16294904 -0.11052331  1.17644019  1.18014004 -0.21426913]
 [ 1.         -1.64033575 -0.43479489 -1.04179404 -1.53789849 -0.34485061]] 



# Model Training

## Method 1: Normal Equation

### Training without regulation

In [None]:
theta=normalEq(train_data_norm, train_J) 
print(f'theta = {theta}')
error_train=computeCost(train_data_norm, train_J, theta)
error_test=computeCost(test_data_norm, test_J, theta)

print('The training error is ', error_train)
print('The test error is ', error_test)

# OTHER PRINT EXPRESSION WITH FORMAT
# print('The training error is %.10f'%error)
# print('error = {:.10f}'.format(error))
# print(f'error = {error}')
# print('error= ', error)

### Training with regulation

#### Multiprocess with process

In [30]:
featureSize=test_data_norm.shape[1]
beta_range=np.geomspace(0.001,15, num = 100, endpoint = True, dtype=np.float64)
beta_range=np.insert(beta_range,0,0) # insert a zero element in the first index
theta_reg=np.zeros((beta_range.size, featureSize))
error_train_reg=np.zeros(beta_range.size)
error_cv_reg=np.zeros(beta_range.size)

def train_model_loop(num, epoch):
    # theta_reg_multismpl=np.zeros((epoch, featureSize))
    error_train_reg_multismpl=np.zeros(epoch)
    error_cv_reg_multismpl=np.zeros(epoch)
    beta_array=np.zeros(num)

    for i in range(0,num):
        for index, beta in enumerate(beta_range):
            prs=[]
            for j in range(0,epoch,1):
                pr = mp.Process(target=train_model_single, name=f'Process {i}-{index}-{j}', args=(i,j,beta, 
                                 error_train_reg_multismpl, error_cv_reg_multismpl))
                prs.append(pr)
            for pr in prs:
                pr.daemon=True
                pr.start()
            for pr in prs:
                pr.join()
            print(error_train_reg_multismpl.sum(0))
            error_train_reg[index] = error_train_reg_multismpl.sum(0)/epoch
            error_cv_reg[index] = error_cv_reg_multismpl.sum(0)/epoch
        idx=error_cv_reg.argmin()
        beta_array[i] = beta_range[idx]
    return beta_array

def train_model_single(i, j, beta, error_train_reg_multismpl, error_cv_reg_multismpl):
    print(f'{mp.current_process().name} is running...\n')
    # Split into train set and cv set
    train=tcv.sample(frac=0.75, random_state = int((157*j+71*i)/3))
    cv=tcv.drop(train.index)

    train_data=train.loc[:,['V','T']]
    train_unc=train.uncertainty.values
    train_J=train.J.values

    cv_data=cv.loc[:,['V','T']]
    cv_unc=cv.uncertainty.values
    cv_J=cv.J.values

    # add polynominal features
    train_data_poly = polyFeature(train_data, degree)
    cv_data_poly = polyFeature(cv_data, degree)

    # train data normalization
    train_data_norm=normalizeFeature(train_data_poly, method)

    # cross valiation data normalization
    cv_data_norm=normalizeFeature(cv_data_poly, method)

    theta_reg_multismpl=normalRegEq(train_data_norm, train_J, beta)
    error_train_reg_multismpl[j]=computeCost(train_data_norm, train_J, theta_reg_multismpl)
    error_cv_reg_multismpl[j]=computeCost(cv_data_norm, cv_J, theta_reg_multismpl)
    
    print(f'{mp.current_process().name} ended. \n')


#### Multiprocess with pool

In [None]:
featureSize=test_data_norm.shape[1]
beta_range=np.geomspace(0.001,15, num = 100, endpoint = True, dtype=np.float64)
beta_range=np.insert(beta_range,0,0) # insert a zero element in the first index
theta_reg=np.zeros((beta_range.size, featureSize))
error_train_reg=np.zeros(beta_range.size)
error_cv_reg=np.zeros(beta_range.size)

def train_model_loop(num, epoch):
    # theta_reg_multismpl=np.zeros((epoch, featureSize))
    error_train_reg_multismpl=np.zeros(epoch)
    error_cv_reg_multismpl=np.zeros(epoch)
    beta_array=np.zeros(num)
    
    pl=Pool(processes=2)

    for i in range(0,num):
        for index, beta in enumerate(beta_range):
            prs=[]
            for j in range(0,epoch,1):
                pl.apply_async(target=train_model_single, name=f'Process {i}-{index}-{j}', args=(i,j,beta, 
                                 error_train_reg_multismpl, error_cv_reg_multismpl))
            pl.close()
            pl.join()
            print('Sub-process(es) done.')
            error_train_reg[index] = error_train_reg_multismpl.sum(0)/epoch
            error_cv_reg[index] = error_cv_reg_multismpl.sum(0)/epoch
        idx=error_cv_reg.argmin()
        beta_array[i] = beta_range[idx]
    return beta_array

def train_model_single(i, j, beta, error_train_reg_multismpl, error_cv_reg_multismpl):
    print(f'{mp.current_process().name} is running...\n')
    # Split into train set and cv set
    train=tcv.sample(frac=0.75, random_state = int((157*j+71*i)/3))
    cv=tcv.drop(train.index)

    train_data=train.loc[:,['V','T']]
    train_unc=train.uncertainty.values
    train_J=train.J.values

    cv_data=cv.loc[:,['V','T']]
    cv_unc=cv.uncertainty.values
    cv_J=cv.J.values

    # add polynominal features
    train_data_poly = polyFeature(train_data, degree)
    cv_data_poly = polyFeature(cv_data, degree)

    # train data normalization
    train_data_norm=normalizeFeature(train_data_poly, method)

    # cross valiation data normalization
    cv_data_norm=normalizeFeature(cv_data_poly, method)

    theta_reg_multismpl=normalRegEq(train_data_norm, train_J, beta)
    error_train_reg_multismpl[j]=computeCost(train_data_norm, train_J, theta_reg_multismpl)
    error_cv_reg_multismpl[j]=computeCost(cv_data_norm, cv_J, theta_reg_multismpl)
    
    print(f'{mp.current_process().name} ended. \n')

In [31]:
t_start=time.time()
print("The number of CPU is:" + str(mp.cpu_count()))
beta_array=train_model_loop(1, 10)
t_end=time.time()
print(f'The time cost is {t_end-t_start}s')

# plot error vs. beta
plt.figure()
plt.semilogx(beta_range, error_train_reg, label = 'Train error')
plt.semilogx(beta_range, error_cv_reg, label = 'Val error')
plt.xlabel('regulization coefficient (beta)')
plt.ylabel('error')
plt.legend()

# print(error_cv_reg_multismpl.shape)
print(beta_array)
idx=error_cv_reg.argmin()
beta_best = beta_range[idx]
theta_best = theta_reg[idx]
error_test_reg=computeCost(test_data_norm, test_J, theta_best)

# print('The best value of THETA is ', theta_best)
print('The best value of BETA is ', beta_best)
# print('The test error with best THETA and BETA is ', error_test_reg)

The number of CPU is:4
Process 0-0-0 is running...
Process 0-0-1 is running...

Process 0-0-2 is running...

Process 0-0-3 is running...

Process 0-0-4 is running...

Process 0-0-5 is running...


Process 0-0-7 is running...

Process 0-0-8 is running...
Process 0-0-0 ended. 
Process 0-0-9 is running...
Process 0-0-1 ended. 



Process 0-0-6 is running...


Process 0-0-9 ended. 

Process 0-0-4 ended. 

Process 0-0-2 ended. 

Process 0-0-3 ended. 

Process 0-0-5 ended. 

Process 0-0-6 ended. 

Process 0-0-7 ended. 

Process 0-0-8 ended. 

0.0
Process 0-1-0 is running...

Process 0-1-1 is running...

Process 0-1-2 is running...
Process 0-1-3 is running...


Process 0-1-4 is running...
Process 0-1-3 ended. 
Process 0-1-2 ended. 
Process 0-1-5 is running...


Process 0-1-5 ended. 

Process 0-1-6 is running...

Process 0-1-7 is running...

Process 0-1-8 is running...


Process 0-1-0 ended. 
Process 0-1-1 ended. 



Process 0-1-9 is running...

Process 0-1-4 ended. 

Process 0-1-6 ended. 

Pr



Process 0-15-0 ended. 
Process 0-15-7 is running...

Process 0-15-8 is running...

Process 0-15-6 ended. 


Process 0-15-9 is running...
Process 0-15-4 ended. 
Process 0-15-3 ended. 



Process 0-15-7 ended. 

Process 0-15-5 ended. 
Process 0-15-2 ended. 


Process 0-15-8 ended. 

Process 0-15-9 ended. 

0.0
Process 0-16-0 is running...

Process 0-16-1 is running...

Process 0-16-2 is running...

Process 0-16-3 is running...

Process 0-16-4 is running...

Process 0-16-5 is running...

Process 0-16-6 is running...

Process 0-16-1 ended. 

Process 0-16-7 is running...

Process 0-16-8 is running...

Process 0-16-3 ended. 
Process 0-16-2 ended. 


Process 0-16-9 is running...


Process 0-16-5 ended. 
Process 0-16-0 ended. 

Process 0-16-6 ended. 


Process 0-16-4 ended. 
Process 0-16-7 ended. 

Process 0-16-8 ended. 

Process 0-16-9 ended. 

0.0
Process 0-17-0 is running...

Process 0-17-1 is running...

Process 0-17-2 is running...

Process 0-17-3 is running...

Process 0-17-4 is runnin


Process 0-30-2 ended. 
Process 0-30-1 ended. 
Process 0-30-3 ended. 



Process 0-30-9 is running...

Process 0-30-4 ended. 


Process 0-30-7 ended. 

Process 0-30-5 ended. 
Process 0-30-6 ended. 
Process 0-30-8 ended. 


Process 0-30-9 ended. 

0.0
Process 0-31-0 is running...

Process 0-31-1 is running...

Process 0-31-2 is running...

Process 0-31-3 is running...

Process 0-31-4 is running...

Process 0-31-5 is running...

Process 0-31-0 ended. 
Process 0-31-1 ended. 


Process 0-31-6 is running...

Process 0-31-7 is running...

Process 0-31-3 ended. 
Process 0-31-8 is running...


Process 0-31-2 ended. 

Process 0-31-9 is running...

Process 0-31-5 ended. 
Process 0-31-4 ended. 


Process 0-31-6 ended. 
Process 0-31-7 ended. 


Process 0-31-8 ended. 

Process 0-31-9 ended. 

0.0
Process 0-32-0 is running...

Process 0-32-1 is running...

Process 0-32-2 is running...

Process 0-32-3 is running...

Process 0-32-4 is running...
Process 0-32-2 ended. 
Process 0-32-5 is running...



P

Process 0-45-3 ended. 

Process 0-45-9 is running...


Process 0-45-5 ended. 

Process 0-45-6 ended. 

Process 0-45-4 ended. 
Process 0-45-7 ended. 


Process 0-45-9 ended. 

Process 0-45-8 ended. 

0.0
Process 0-46-0 is running...

Process 0-46-1 is running...

Process 0-46-2 is running...

Process 0-46-3 is running...

Process 0-46-4 is running...

Process 0-46-5 is running...

Process 0-46-6 is running...

Process 0-46-7 is running...
Process 0-46-8 is running...
Process 0-46-4 ended. 
Process 0-46-5 ended. 




Process 0-46-3 ended. 
Process 0-46-2 ended. 
Process 0-46-9 is running...



Process 0-46-1 ended. 
Process 0-46-0 ended. 


Process 0-46-8 ended. 
Process 0-46-6 ended. 


Process 0-46-7 ended. 

Process 0-46-9 ended. 

0.0
Process 0-47-0 is running...

Process 0-47-1 is running...

Process 0-47-2 is running...

Process 0-47-3 is running...

Process 0-47-4 is running...

Process 0-47-5 is running...

Process 0-47-6 is running...

Process 0-47-7 is running...

Process 0-47-


Process 0-60-6 ended. 

Process 0-60-4 ended. 

Process 0-60-5 ended. 

Process 0-60-7 ended. 

Process 0-60-8 ended. 
Process 0-60-9 ended. 


0.0
Process 0-61-0 is running...

Process 0-61-1 is running...

Process 0-61-2 is running...

Process 0-61-3 is running...

Process 0-61-4 is running...

Process 0-61-5 is running...

Process 0-61-6 is running...

Process 0-61-7 is running...

Process 0-61-2 ended. 

Process 0-61-8 is running...

Process 0-61-9 is running...

Process 0-61-1 ended. 
Process 0-61-0 ended. 
Process 0-61-3 ended. 
Process 0-61-4 ended. 




Process 0-61-5 ended. 

Process 0-61-6 ended. 

Process 0-61-7 ended. 

Process 0-61-8 ended. 

Process 0-61-9 ended. 

0.0
Process 0-62-0 is running...

Process 0-62-1 is running...

Process 0-62-2 is running...

Process 0-62-3 is running...

Process 0-62-4 is running...

Process 0-62-0 ended. 
Process 0-62-5 is running...


Process 0-62-2 ended. 
Process 0-62-1 ended. 
Process 0-62-3 ended. 

Process 0-62-6 is running...



P

Process Process 0-72-7:
Process Process 0-72-4:
Traceback (most recent call last):
  File "/Users/wandy/anaconda3/envs/NUSworkshop/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/wandy/anaconda3/envs/NUSworkshop/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process Process 0-72-9:
  File "/Users/wandy/anaconda3/envs/NUSworkshop/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
KeyboardInterrupt
  File "/Users/wandy/anaconda3/envs/NUSworkshop/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-30-78eb86c8f88d>", line 61, in train_model_single
    print(f'{mp.current_process().name} ended. \n')
Traceback (most recent call last):
  File "/Users/wandy/anaconda3/envs/NUSworkshop/lib/python3.6/site-packages/ipykernel/iostream.py", line 406, in w

KeyboardInterrupt: 

# Prediction

In [None]:
beta_array.mean()

In [None]:
t=data(test_data)

In [24]:
print(error_train_reg)
print(error_cv_reg_multismpl)
len(error_train_reg)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]


NameError: name 'error_cv_reg_multismpl' is not defined

In [None]:
t.polyFeature(2)
t.array.shape

In [None]:
x=1