In [1]:
import numpy as np
import pandas as pd
import csv

In [2]:
def data_process(data):
    """Turn the raw training table into sliding-window samples and labels.

    The first three columns are dropped and every remaining column is treated
    as one hourly reading. Rows are consumed in consecutive blocks of 18 rows;
    from each block only the row at offset 9 is used (comments elsewhere in
    this notebook say this is the PM2.5 series -- confirm against train.csv).
    Within that row, each run of 9 consecutive hours forms one sample and the
    hour that follows it is the label, so a row with 24 hourly columns yields
    15 samples.

    The number of blocks and the number of hours are derived from the input
    instead of being hard-coded, so tables of any size are supported. Trailing
    rows that do not fill a complete block of 18 are ignored.

    Args:
        data: pandas DataFrame whose columns from index 3 onward hold hourly
            readings; the string 'NR' is treated as 0.0.

    Returns:
        Tuple ``(X, Y)`` of float arrays. ``X`` has shape ``(n_samples, 9)``
        and ``Y`` has shape ``(n_samples,)``.
    """
    rows_per_block = 18  # rows that make up one block in the raw table
    target_row = 9       # offset of the row used for both features and labels
    window = 9           # hours of history per sample

    # Keep only the hourly readings and replace the 'NR' marker with 0.
    hourly = data.iloc[:, 3:]
    hourly = hourly.replace(['NR'], [0.0])
    array = np.array(hourly, dtype=float)

    n_blocks = array.shape[0] // rows_per_block
    n_hours = array.shape[1] if array.ndim == 2 else 0

    x = []
    y = []
    for block in range(n_blocks):
        series = array[block * rows_per_block + target_row]
        # Slide a 9-hour window across the row; the next hour is the label.
        for start in range(n_hours - window):
            x.append(series[start: start + window])
            y.append(series[start + window])

    # reshape keeps X two-dimensional even when no sample was produced.
    X = np.array(x, dtype=float).reshape(-1, window)
    Y = np.array(y, dtype=float)

    return X, Y

In [3]:
def build_model(train_x, train_y, epochs):
    """Fit a linear model ``y ~ weights . x + bias`` with full-batch Adagrad.

    Each epoch computes the gradient of half the mean squared error over the
    whole training set, adds an L2 penalty gradient (``reg_rate * weights``)
    to the weight gradient, and applies an Adagrad update in which the
    learning rate is divided by the square root of the accumulated squared
    gradients. The training loss is printed every 200 epochs.

    The number of samples and features is taken from ``train_x`` rather than
    being fixed, and the gradients are computed with matrix operations
    instead of per-sample Python loops.

    Args:
        train_x: array-like of shape ``(n_samples, n_features)``.
        train_y: array-like of shape ``(n_samples,)``.
        epochs: number of full passes over the training data.

    Returns:
        Tuple ``(weights, bias)`` where ``weights`` is a float array of shape
        ``(n_features,)`` and ``bias`` is a scalar.

    Raises:
        ValueError: if ``train_x`` contains no samples.
    """
    train_x = np.asarray(train_x, dtype=float)
    train_y = np.asarray(train_y, dtype=float)
    n_samples, n_features = train_x.shape
    if n_samples == 0:
        raise ValueError('train_x must contain at least one sample')

    bias = 0.0                      # intercept term
    weights = np.ones(n_features)   # one weight per feature
    bg_sum = 0.0                    # running sum of squared bias gradients
    wg_sum = np.zeros(n_features)   # running sum of squared weight gradients
    lr = 1                          # base learning rate
    reg_rate = 0.001                # L2 regularisation coefficient

    for epoch in range(epochs):
        # The residual is computed once per epoch and shared by both gradients.
        residual = train_y - train_x.dot(weights) - bias

        # Gradients averaged over the training set.
        b_grad = -residual.mean()
        w_grad = -train_x.T.dot(residual) / n_samples

        # L2 penalty applies to the weights only, not to the bias.
        w_grad = w_grad + reg_rate * weights

        # Adagrad accumulators.
        bg_sum += b_grad ** 2
        wg_sum += w_grad ** 2

        # A zero accumulator means every gradient so far was exactly zero;
        # skip the step there instead of computing 0/0 (which would give NaN).
        if bg_sum > 0:
            bias -= lr * b_grad / bg_sum ** 0.5
        weights -= np.divide(lr * w_grad, np.sqrt(wg_sum),
                             out=np.zeros_like(w_grad), where=wg_sum > 0)

        # Report the training loss every 200 epochs.
        if epoch % 200 == 0:
            loss = np.mean((train_y - train_x.dot(weights) - bias) ** 2)
            print('after {} epochs, the loss on train data is:'.format(epoch), loss)

    return weights, bias

In [4]:
def val_model(val_x, val_y, weights, bias):
    """Return the mean squared error of the linear model on a validation set.

    The error is averaged over every row of ``val_x`` instead of a fixed
    count, so validation sets of any size are supported.

    Args:
        val_x: array-like of shape ``(n_samples, n_features)``.
        val_y: array-like of shape ``(n_samples,)``.
        weights: array of shape ``(n_features,)``.
        bias: scalar intercept.

    Returns:
        Mean of ``(val_y - val_x . weights - bias) ** 2`` over all samples.
    """
    val_x = np.asarray(val_x, dtype=float)
    val_y = np.asarray(val_y, dtype=float)
    residual = val_y - val_x.dot(weights) - bias
    return np.mean(residual ** 2)

In [5]:
def test_data_process(data):
    """测试集预处理"""
    data = data.iloc[:, 2: ]
    # 将空数据填充为0
    data = data.replace(['NR'], [0.0])
    array = np.array(data, dtype=float)
    
    x = []
    
    for i in range(0, 4320, 18):
        tmp_x = array[i+9, :]
        x.append(tmp_x)
    
    X = np.array(x, dtype=float)
    return X

In [6]:
def test(test_x, weights, bias):
    """测试模型"""
    y = np.dot(test_x, weights) 
    return y

In [7]:
def main():
    """Run the whole pipeline: load, train, validate, predict, write results.

    Reads '../data/train.csv' and '../data/test.csv' (both decoded as
    gb18030), trains on the first 3200 samples, validates on samples
    3200-3599, and writes one prediction per test sample to 'result.csv'
    with the columns ``id`` and ``value``.
    """
    train_data = pd.read_csv('../data/train.csv', encoding='gb18030')
    X, Y = data_process(train_data)

    # Split into training and validation sets.
    train_x, train_y = X[0: 3200], Y[0: 3200]
    val_x, val_y = X[3200: 3600], Y[3200: 3600]

    # Train the model.
    epoch = 2000
    w, b = build_model(train_x, train_y, epoch)

    # Validate the model.
    loss = val_model(val_x, val_y, w, b)
    print('The loss on val data is: ', loss)

    # Preprocess the test set and predict. The test file has no header row.
    test_data = pd.read_csv('../data/test.csv', header=None, encoding='gb18030')
    test_x = test_data_process(test_data)
    test_y = test(test_x, w, b)

    # Write the result file. Iterating over the predictions themselves, not a
    # fixed count, writes exactly one row per prediction.
    with open('result.csv', mode='w', newline='') as result:
        csv_writer = csv.writer(result)
        header = ['id', 'value']
        csv_writer.writerow(header)
        for i, value in enumerate(test_y):
            csv_writer.writerow(['id_' + str(i), value])

In [None]:
# Entry point: run the full pipeline only when this code is executed as the
# main program (not when it is imported as a module).
if __name__ == '__main__':
    main()

after 0 epochs, the loss on train data is: 955.3009375
after 200 epochs, the loss on train data is: 49.86823677027294
after 400 epochs, the loss on train data is: 46.20101423801225
after 600 epochs, the loss on train data is: 44.88913061600436
after 800 epochs, the loss on train data is: 44.26903588227097
after 1000 epochs, the loss on train data is: 43.95010919056686
after 1200 epochs, the loss on train data is: 43.78092633274225
after 1400 epochs, the loss on train data is: 43.68982565130423
after 1600 epochs, the loss on train data is: 43.6403143032977
