In [8]:
import pandas as pd
import numpy as np
import os
import sys
import typing as t

# CSV_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data\\train.csv')
CSV_PATH = r"c:\Users\24717\Projects\MachineLearning\NTU_ML2017_Hung-yi-Lee_HW\HW1\data\train.csv"


In [9]:
def prefix_process(df_data: pd.DataFrame) -> t.Tuple[np.array, np.array, np.array]:
    """
    数据预处理
    """
    x_list, y_list = [], []
    # df替换指定元素，将空数据填充为0
    df_data = df_data.replace(['NR'], [0.0])
    # astype() 转换array中元素数据类型
    array = np.array(df_data).astype(float)
    # 将数据集拆分为多个数据帧
    for i in range(0, 4320, 18):
        for j in range(24-9):
            mat = array[i:i+18, j:j+9]
            label = array[i+9, j+9] # 第10行是PM2.5
            x_list.append(mat)
            y_list.append(label)
    x = np.array(x_list)
    y = np.array(y_list)
    return x, y, len(y)

In [5]:
def train(
    x_train: np.array, 
    y_label: np.array, 
    iteration: int,
):
    """
    训练模型
    """
    bias = 0 # 初始偏置值
    weights = np.ones(9) # 权重初始化
    learning_rate = 1  # 初始学习曲率
    reg_rate = 0.001  # 正则化系数
    bg_sum = 0 # 初始化bias的梯度下降和
    wg_sum = np.zeros(9) # 初始化weight的梯度下降和
    train_lens = len(x_train)

    for i in range(iteration):
        b_g = 0
        w_g = np.zeros(9)
        for j in range(train_lens):
            for m in range(9):
                w_g[m] += (y_label[j] - np.dot(weights, x_train[j, 9, :]) - bias) * (-x_train[j, 9, m])
            b_g += (y_label[j] - np.dot(weights, x_train[j, 9, :]) - bias) * (-1)
        b_g /= train_lens
        for m in range(9):
            w_g[m] = w_g[m] / train_lens + reg_rate * np.sum(weights)
        
        # 是否使用adgrad优化自适应学习率算法
        wg_sum += w_g ** 2
        bg_sum += b_g ** 2
        weights -= learning_rate / np.sqrt(wg_sum) * w_g
        bias -= learning_rate / np.sqrt(bg_sum) * b_g

        if i % 200 == 0 :
             loss = 0
             for j in range(train_lens):
                loss += (y_label[j] - np.dot(weights, x_train[j, 9, :]) - bias) ** 2
             print('after {} epochs, the loss on train data is:'.format(i), loss/train_lens)
             print(f'weight: {weights}, bias: {bias}')
    return weights, bias


In [6]:
def validate(x_val: np.array, y_val: np.array, weights: np.array, bias: float):
    """
    简单交叉验证测试
    """
    loss = 0
    for i in range(len(x_val)):
        loss += (y_val[i] - np.dot(weights, x_val[i, 9, :] - bias)) ** 2
    return loss / 400

In [None]:
def s_fold_validation(x_val: np.array, y_val: np.array, weights: np.array, bias: float):
    pass

In [7]:
def main():
    """
    入口
    """
    df_data = pd.read_csv(CSV_PATH, encoding = 'gb18030', usecols=range(3, 27))
    x_train, y_label, lens = prefix_process(df_data)
    split_pos = 3200
    train_x, train_y = x_train[0: split_pos], y_label[0: split_pos]
    test_x, test_y = x_train[split_pos:], y_label[split_pos:]
    weight, bias = train(train_x, train_y, 500)
    loss = validate(test_x, test_y, weight, bias)
    print('The loss on val data is:', loss)
    

main()

after 0 epochs, the loss on train data is: 955.3009375
weight: [0.00000000e+00 1.11022302e-16 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00], bias: -1.0
after 200 epochs, the loss on train data is: 49.86806930542829
weight: [ 0.04987391  0.00985012  0.01943023 -0.00319608  0.0197318   0.03941956
 -0.1943336   0.1987887   0.84691785], bias: 0.1789462375459398
after 400 epochs, the loss on train data is: 46.200829394245595
weight: [ 4.18816070e-02 -3.10392757e-04  3.30540588e-02 -4.12878290e-02
  3.58532224e-02  1.46547656e-01 -3.29580796e-01  1.28575672e-01
  9.53500758e-01], bias: 0.7666605100922935
The loss on val data is: 48.95745778506594


In [None]:
weight = [0.00411392,-0.0422015, 0.20925909, -0.21168959, -0.04279669,0.44531037
 -0.53739735, 0.04967271, 1.05177267]
bias = 2.0752944127920574
