In [1]:
import numpy as np

In [2]:
def get_data(path):
    """Load a whitespace-delimited numeric table and split it into X and Y.

    Parameters
    ----------
    path : anything accepted by ``np.loadtxt`` (file name, file-like, ...)

    Returns
    -------
    (X, Y) : tuple of ndarrays
        ``X`` holds every column except the first and the last one
        (column 0 is skipped); ``Y`` holds the last column. Both are
        independent copies of the loaded table.
    """
    table = np.loadtxt(path)
    # Copy so the returned arrays do not keep the whole table alive as views.
    features = table[:, 1:-1].copy()
    targets = table[:, -1].copy()
    return features, targets

In [42]:
# Every backslash in the Windows path is escaped: a bare "\L" is an invalid
# escape sequence that newer Python versions warn about at compile time.
X, Y = get_data("D:\\movedFromC\\123\\LabML\\session1\\deathRate_data.txt")


In [3]:
def normalize_and_add_ones(X):
    """Min-max scale every column of X to [0, 1] and prepend a column of ones.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)

    Returns
    -------
    ndarray of shape (n_samples, n_features + 1)
        Column 0 is all ones; the remaining columns are
        ``(X - column_min) / (column_max - column_min)``.
        A column whose values are all equal has a zero range; it is mapped
        to 0 instead of producing NaN from a 0/0 division.
    """
    X = np.asarray(X, dtype=float)

    # Per-column extrema; broadcasting applies them to every row, so there is
    # no need to materialise full (n_samples, n_features) min/max matrices.
    col_min = X.min(axis=0)
    col_max = X.max(axis=0)

    # Guard against division by zero for constant columns: the numerator is
    # already 0 there, so dividing by 1 yields 0.
    col_range = col_max - col_min
    safe_range = np.where(col_range == 0, 1.0, col_range)

    # feature scaling
    X_normalized = (X - col_min) / safe_range

    ones = np.ones((X.shape[0], 1))

    return np.column_stack((ones, X_normalized))

In [26]:
class RidgeRegression:
    """Linear regression with an L2 penalty (ridge regression).

    The instance keeps no state: the fitting methods return the weight
    vector ``W`` and ``predict`` takes it as an explicit argument.
    """

    def __init__(self):
        """Nothing to initialise; the model is stateless."""
        return

    def fit(self, X_train, Y_train, LAMBDA):
        """Closed-form ridge solution.

        Solves ``(X^T X + LAMBDA * I) W = X^T Y`` for ``W``.

        Parameters
        ----------
        X_train : ndarray of shape (n_samples, n_features)
        Y_train : ndarray with ``n_samples`` entries along axis 0
        LAMBDA : regularisation strength

        Returns
        -------
        W : ndarray, the learned weights

        Raises
        ------
        AssertionError
            If ``X_train`` is not 2-D or the sample counts disagree.
        numpy.linalg.LinAlgError
            If the regularised Gram matrix is singular.
        """
        assert len(X_train.shape) == 2 and X_train.shape[0] == Y_train.shape[0]

        gram = (X_train.transpose().dot(X_train)
                + LAMBDA * np.identity(X_train.shape[1]))
        # Solving the linear system is cheaper and numerically more stable
        # than forming the explicit inverse and multiplying by it.
        W = np.linalg.solve(gram, X_train.transpose().dot(Y_train))
        return W

    def predict(self, W, X_new):
        """Return the predictions ``X_new . W``."""
        Y_new = X_new.dot(W)
        return Y_new

    def compute_RSS(self, Y_new, Y_predicted):
        """Return the mean of the squared residuals.

        Note that the sum of squares is divided by the number of samples
        (``Y_new.shape[0]``), so the value is an average, not a raw sum.
        """
        loss = 1. / Y_new.shape[0] * np.sum((Y_new - Y_predicted) ** 2)
        return loss

    def fit_gradient(self, X_train, Y_train, LAMBDA, learning_rate, max_num_epoch=100, batch_size=128):
        """Fit the weights with mini-batch gradient descent.

        Parameters
        ----------
        X_train, Y_train : training inputs and targets
        LAMBDA : regularisation strength
        learning_rate : step size applied to each mini-batch gradient
        max_num_epoch : maximum number of passes over the data
        batch_size : number of rows per mini-batch

        Returns
        -------
        W : ndarray, the learned weights

        Weights are initialised from ``np.random.randn`` and the rows are
        reshuffled every epoch, so results depend on NumPy's global random
        state. Training stops early once the loss changes by less than 1e-5
        between two consecutive epochs.
        """
        # init W
        W = np.random.randn(X_train.shape[1])
        last_loss = 10e+8
        num_rows = X_train.shape[0]
        for _ in range(max_num_epoch):
            # Shuffle the rows; X and Y use the same permutation so that
            # every sample stays paired with its target. Indexing creates
            # new arrays, so the caller's data is not modified.
            order = np.arange(num_rows)
            np.random.shuffle(order)
            X_train = X_train[order]
            Y_train = Y_train[order]
            for start in range(0, num_rows, batch_size):
                X_train_sub = X_train[start: start + batch_size]
                Y_train_sub = Y_train[start: start + batch_size]
                grad = X_train_sub.transpose().dot(X_train_sub.dot(W) - Y_train_sub) + LAMBDA * W
                W = W - learning_rate * grad
            Y_predicted = self.predict(W, X_train)
            new_loss = self.compute_RSS(Y_train, Y_predicted)
            if np.abs(new_loss - last_loss) < 1e-5:
                break
            # Remember this epoch's loss for the next convergence check.
            last_loss = new_loss
        return W

    def get_the_best_LAMBDA(self, X_train, Y_train):
        """Pick LAMBDA by 5-fold cross-validation with a two-stage scan.

        Stage 1 scans the integers 0..49. Stage 2 scans, in steps of 0.001,
        the interval from ``max(0, best - 1)`` up to (not including)
        ``best + 1`` around the stage-1 winner.

        Returns
        -------
        The LAMBDA with the lowest average validation loss (an ``int`` if
        stage 2 found nothing better than stage 1, otherwise a ``float``).

        Raises
        ------
        ValueError
            If there are fewer rows than folds (a fold would be empty).
        """
        num_folds = 5
        num_rows = X_train.shape[0]
        if num_rows < num_folds:
            raise ValueError(
                "need at least %d samples for %d-fold cross-validation, got %d"
                % (num_folds, num_folds, num_rows))

        # The folds do not depend on LAMBDA, so build them once instead of
        # once per candidate value.
        row_ids = np.arange(num_rows)
        leftover = num_rows % num_folds
        # Split the largest prefix divisible by num_folds into equal parts ...
        valid_ids = np.split(row_ids[: num_rows - leftover], num_folds)
        # ... and give the remaining rows to the last fold.
        valid_ids[-1] = np.append(valid_ids[-1], row_ids[num_rows - leftover:])
        folds = []
        for ids in valid_ids:
            # Training rows are all rows not used for validation in this fold.
            train_ids = np.setdiff1d(row_ids, ids)
            folds.append((X_train[train_ids], Y_train[train_ids],
                          X_train[ids], Y_train[ids]))

        def cross_validation(LAMBDA):
            # Average validation loss over all folds for one LAMBDA.
            aver_RSS = 0
            for X_fit, Y_fit, X_valid, Y_valid in folds:
                W = self.fit(X_fit, Y_fit, LAMBDA)
                Y_predicted = self.predict(W, X_valid)
                aver_RSS += self.compute_RSS(Y_valid, Y_predicted)
            return aver_RSS / num_folds

        def range_scan(best_LAMBDA, minimum_RSS, LAMBDA_values):
            # Keep the candidate with the strictly lowest loss seen so far.
            for curr_LAMBDA in LAMBDA_values:
                curr_RSS = cross_validation(curr_LAMBDA)
                if minimum_RSS > curr_RSS:
                    best_LAMBDA = curr_LAMBDA
                    minimum_RSS = curr_RSS
            return best_LAMBDA, minimum_RSS

        # Start from +inf so the first finite loss always wins, whatever the
        # scale of the targets.
        best_LAMBDA, minimum_RSS = range_scan(0, np.inf, range(50))
        LAMBDA_values = [k * 1. / 1000
                         for k in range(max(0, best_LAMBDA - 1) * 1000,
                                        (best_LAMBDA + 1) * 1000, 1)]

        best_LAMBDA, minimum_RSS = range_scan(best_LAMBDA, minimum_RSS, LAMBDA_values)
        return best_LAMBDA
        
        
        
    

In [27]:
if __name__ == '__main__':
    # Every backslash in the Windows path is escaped: a bare "\L" is an
    # invalid escape sequence that newer Python versions warn about.
    X, Y = get_data("D:\\movedFromC\\123\\LabML\\session1\\deathRate_data.txt")
    X = normalize_and_add_ones(X)

    # The first 50 rows are used for training, the remaining rows for testing.
    X_train = X[: 50]
    Y_train = Y[: 50]
    X_test = X[50 :]
    Y_test = Y[50 :]

    ridge_regression = RidgeRegression()
    # Choose LAMBDA on the training rows only, refit with it, then report
    # the loss on the held-out rows.
    best_LAMBDA = ridge_regression.get_the_best_LAMBDA(X_train, Y_train)
    print('Best LAMBDA is: ', best_LAMBDA)
    W_learned = ridge_regression.fit(X_train, Y_train, best_LAMBDA)
    Y_predicted = ridge_regression.predict(W_learned, X_test)
    print(ridge_regression.compute_RSS(Y_test, Y_predicted))

Best LAMBDA is:  0.002
1527.0698078029754
