In [1]:
import os
import random
import zipfile

import numpy as np

In [2]:
def load_data(filename):
    """Load whitespace-separated numeric samples from a text file.

    Every non-blank line is split on whitespace and each field is converted
    to ``float``. Blank lines (for example a trailing empty line) are skipped.

    Args:
        filename: Path of the file to read. If the path contains the
            substring ``'train'`` each line must hold exactly two values
            (``x y``); otherwise only the first value of each line is used.

    Returns:
        ``(xs, ys)`` as numpy arrays for a training file, or ``(xs, None)``
        for any other file.

    Raises:
        ValueError: If a field cannot be parsed as a float, or if a training
            file has a line count/column layout that does not unpack into
            exactly two columns.
    """
    rows = []
    with open(filename, 'r') as f:
        for line in f:
            # str.split() with no argument already discards surrounding
            # whitespace (including the newline), so no strip() is needed.
            fields = line.split()
            if not fields:
                # Skip blank lines instead of storing an empty row, which
                # would break the column unpacking below.
                continue
            # Materialise the row eagerly; a lazy map object could only be
            # consumed once.
            rows.append([float(field) for field in fields])

    if 'train' in filename:
        if not rows:
            # Nothing to transpose: return two empty arrays rather than
            # failing on the tuple unpacking.
            return np.asarray([]), np.asarray([])
        # zip(*rows) transposes the list of (x, y) rows into two columns.
        xs, ys = zip(*rows)
        return np.asarray(xs), np.asarray(ys)
    else:
        # Files without labels: keep only the first column.
        xs = [row[0] for row in rows]
        return np.asarray(xs), None

In [36]:
class LinearRegression:
    """Univariate linear regression ``y = w[0] * x + w[1]``.

    Attributes:
        w: Parameter vector of shape (2,), laid out as ``[slope, intercept]``.
        lr: Learning rate used by :meth:`train`.
        epoch: Number of passes over the training data made by :meth:`train`.
    """

    def __init__(self):
        super().__init__()
        # Small random start; train() refines it, LSE() overwrites it.
        self.w = 0.05 * np.random.randn(2)
        self.lr = 0.0001
        self.epoch = 1000

    @staticmethod
    def _design_matrix(x):
        """Return the (n, 2) matrix with columns ``[x, 1]`` for a 1-D ``x``."""
        x = np.asarray(x, dtype=float)
        # Column order must match the [slope, intercept] layout of self.w.
        return np.stack([x, np.ones_like(x)], axis=1)

    def predict(self, x):
        """Return the model output for each entry of the 1-D input ``x``."""
        return np.dot(self._design_matrix(x), self.w)

    def train(self, x, y):
        """Fit ``self.w`` with stochastic gradient descent on squared error.

        Runs ``self.epoch`` passes; in each pass the samples are visited one
        at a time in a freshly shuffled order and ``self.w`` is updated in
        place with step size ``self.lr``.

        Args:
            x: 1-D sequence of inputs.
            y: 1-D sequence of targets, same length as ``x``.
        """
        X = self._design_matrix(x)
        y = np.asarray(y, dtype=float)
        n_samples = len(X)

        for _ in range(self.epoch):
            # Rebuild and shuffle the visiting order every epoch.
            order = list(range(n_samples))
            random.shuffle(order)

            for j in order:
                residual = y[j] - np.dot(X[j], self.w)
                # The gradient of 0.5 * residual**2 w.r.t. w is -residual * X[j];
                # descending along it therefore *adds* lr * residual * X[j].
                self.w += self.lr * residual * X[j]

    def LSE(self, x, y):
        """Set ``self.w`` to the ordinary least-squares estimate.

        Uses ``np.linalg.lstsq`` rather than explicitly inverting ``X^T X``:
        it gives the same answer when ``X^T X`` is invertible and still
        returns the minimum-norm solution when it is singular (for example
        when every ``x`` is identical), where ``np.linalg.inv`` would raise.

        Args:
            x: 1-D sequence of inputs.
            y: 1-D sequence of targets, same length as ``x``.
        """
        X = self._design_matrix(x)
        y = np.asarray(y, dtype=float)
        # lstsq returns (solution, residuals, rank, singular_values).
        self.w, *_ = np.linalg.lstsq(X, y, rcond=None)

In [9]:
def evaluate(ys, ys_pred):
    """Score predictions by the root-mean-square of their errors.

    Args:
        ys: Array of reference values.
        ys_pred: Array of predicted values, broadcast-compatible with ``ys``.

    Returns:
        The square root of the mean squared absolute difference.
    """
    error_magnitude = np.abs(ys - ys_pred)
    mean_squared_error = np.mean(error_magnitude ** 2)
    return np.sqrt(mean_squared_error)

In [10]:
def zip_fun():
    """Pack ``output/predict.npy`` into ``output/predict.zip``.

    Both paths are resolved against the current working directory. The
    archive is written with the standard-library ``zipfile`` module, so no
    external ``zip`` executable is required and the process working
    directory is never changed. The archive holds a single entry named
    ``predict.npy``; an existing ``predict.zip`` is overwritten.

    Raises:
        FileNotFoundError: If ``output/predict.npy`` does not exist.
    """
    output_dir = os.path.join(os.getcwd(), "output")
    source_path = os.path.join(output_dir, "predict.npy")
    archive_path = os.path.join(output_dir, "predict.zip")

    # Check up front so a missing input does not leave an empty archive behind.
    if not os.path.isfile(source_path):
        raise FileNotFoundError(source_path)

    with zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        # arcname keeps the entry flat, as when zipping from inside output/.
        archive.write(source_path, arcname="predict.npy")

In [9]:
# Input locations, relative to the notebook's working directory.
train_file = './input/train.txt'
test_file = './input/test_X.txt'
# The path contains 'train', so load_data returns both inputs and targets.
x_train, y_train = load_data(train_file)

In [11]:
# The test path does not contain 'train', so load_data returns None as the
# second value; it is discarded here.
x_test, _ = load_data(test_file)

In [12]:
# Bare last expression: the notebook displays the loaded test inputs.
x_test

array([ 0.1 ,  0.2 ,  0.45,  0.6 ,  0.85,  0.95,  1.15,  1.2 ,  1.3 ,
        1.45,  1.5 ,  1.6 ,  1.65,  1.8 ,  1.9 ,  2.  ,  2.05,  2.1 ,
        2.15,  2.2 ,  2.25,  2.95,  3.  ,  3.15,  3.25,  3.3 ,  3.35,
        3.6 ,  3.75,  3.85,  3.9 ,  4.25,  4.3 ,  4.5 ,  4.65,  5.  ,
        5.1 ,  5.15,  5.3 ,  5.35,  5.6 ,  5.65,  6.25,  6.3 ,  6.35,
        6.4 ,  6.45,  6.7 ,  6.85,  7.1 ,  7.2 ,  7.25,  7.4 ,  7.5 ,
        7.55,  7.6 ,  7.95,  8.2 ,  8.25,  8.45,  8.5 ,  8.55,  8.7 ,
        9.  ,  9.05,  9.25,  9.35,  9.4 ,  9.45,  9.65,  9.95, 10.05,
       10.35, 10.4 , 10.45, 10.5 , 10.65, 10.7 , 10.75, 10.85, 10.9 ,
       11.  , 11.05, 11.1 , 11.15, 11.2 , 11.25, 11.45, 11.65, 11.8 ,
       11.9 , 12.15, 12.3 , 12.35, 12.4 , 12.95, 13.05, 13.1 , 13.25,
       13.4 , 13.45, 13.55, 13.65, 13.7 , 13.75, 13.95, 14.05, 14.1 ,
       14.15, 14.3 , 14.35, 14.45, 14.55, 14.8 , 14.85, 14.95, 15.05,
       15.5 , 16.  , 16.1 , 16.25, 16.45, 16.55, 16.6 , 16.65, 16.75,
       16.8 , 16.95,

In [37]:
if __name__ == '__main__':
    # Which fitting routine to use: 'GD' (stochastic gradient descent via
    # LinearRegression.train) or 'LSE' (least squares via LinearRegression.LSE).
    solver = 'LSE'
    train_file = './input/train.txt'
    test_file = './input/test_X.txt'
    # Load the data.
    x_train, y_train = load_data(train_file)
    x_test, _ = load_data(test_file)

    # Fit a linear regression model f such that y = f.predict(x).
    f = LinearRegression()
    if solver == 'GD':
        f.train(x_train, y_train)
    elif solver == 'LSE':
        f.LSE(x_train, y_train)
    else:
        raise TypeError("Wrong solver !")
    y_train_pred = f.predict(x_train)
    std = evaluate(y_train, y_train_pred)
    # Report the solver that actually ran; the 'GD' path keeps its
    # historical "SGD" label so its output text is unchanged.
    solver_label = 'SGD' if solver == 'GD' else solver
    print('The std on training data via {} is ：{:f}'.format(solver_label, std))

    preds = f.predict(x_test)
    # np.save does not create missing directories, so make sure it exists.
    os.makedirs('./output', exist_ok=True)
    np.save('./output/predict', preds)
    zip_fun()


The std on training data via SGD is ：2.041070
