In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
train_data = pd.read_csv('train.csv')
# 前两列是名称不是数据, 后24列为一天24小时的测量值
# 每一行代表某天某测量指标一天的测量值
test_data = pd.read_csv('test.csv')
test = test_data[test_data['AMB_TEMP'] == 'PM2.5']
# 待预测的测试集x
test_x = test.iloc[:, 2:]
train = train_data[train_data['columns'] == 'PM2.5'].iloc[:, 2:]
train_x = []
train_y = []
# 将每一天的PM2.5的数据以10为一组划分,前9个为x, 第10个为y,例如0..9是x,10为y.
# 最后一组是14..22是x,23为y
for i in range(15):
    x = train.iloc[:, i:i+9].values
    y = train.iloc[:, i+9].values
    train_x.append(x)
    train_y.append(y)

# 测试数据
train_x = np.vstack(train_x)  # 3600 x 9
train_y = np.hstack(train_y)  # 3600
train_x = np.array(train_x,float)
train_y = np.array(train_y,float)

# 进行标准缩放，即数据归一化
ss = StandardScaler()

# 进行数据拟合
ss.fit(train_x)
# 进行数据转换
train_x = ss.transform(train_x)

ss.fit(test_x)
test_x = ss.transform(test_x)

  return self.partial_fit(X, y)


In [3]:
from sklearn.metrics import r2_score


class LinearRegression(object):
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None
        self._weights = None

    def fit_norm(self, X, y):
        bias = np.ones(X.shape[0])
        X = np.c_[bias, X]
        self._weights = np.linalg.inv(X.T@X)@X.T@y
        self.coef_ = self._weights[1:]
        self.intercept_ = self._weights[0]

        return self

    def fit_gd(self, X, y, eta=0.001, n_iters=64):

        def J(weights, X, y):
            return np.sum(y - (X@weights)**2)/len(y)

        def dJ(weights, X, y):
            return X.T @ ((X@weights) - y) * 2 / len(y)

        def gradient_descent(X, y, initial_weights, eta, n_iters=1e4, epsilon=1e-8):
            weights = initial_weights
            for i in range(n_iters):
                gradient = dJ(weights, X, y)
                last_gradient = gradient
                weights -= eta*gradient
                if (abs(J(weights, X, y) - J(last_gradient, X, y)) < epsilon):
                    break
            return weights

        bias = np.ones(X.shape[0])
        X = np.c_[bias, X]
        initial_weights = np.zeros(X.shape[1])
        self._weights = gradient_descent(X, y, initial_weights, eta, n_iters)
        self.coef_ = self._weights[1:]
        self.intercept_ = self._weights[0]
        return self

    def predict(self, X_pred):
        bias = np.ones(X_pred.shape[0])
        X_pred = np.c_[bias, X_pred]
        return X_pred@self._weights

    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return r2_score(y_pred, y_test)

In [4]:
lr = LinearRegression()
lr.fit_gd(train_x,train_y,n_iters=10000)
lr.score(train_x,train_y)

0.8156157460570261

In [5]:
etas = [1e-5,1e-4,1e-3,1e-2,1e-1,1]
scores = []
for eta in etas:
    lr.fit_gd(train_x,train_y,eta=eta,n_iters=10000)
    scores.append(lr.score(train_x,train_y))

In [13]:
eta=etas[np.argmax(scores)]
lr.fit_gd(train_x,train_y,eta=eta,n_iters=20000)
lr.score(train_x,train_y)

0.8302731904056851

In [16]:
sampleSubmission = pd.read_csv('sampleSubmission.csv',index_col='id')
sampleSubmission['value'] = lr.predict(test_x)
sampleSubmission.to_csv('result.csv')