```
Author： 孙兰昌-北京邮电大学-计算机学院
Blog: https://blog.csdn.net/sunlanchang/article/details/102679472
Github: https://github.com/sunlanchang/blog/blob/master/linear_model.ipynb
```

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Preprocess train data 

## Before you start, please move the train.csv and test.csv to current directory.

In [2]:
df_train = pd.read_csv('train.csv',encoding = 'Big5')
df_train.describe()

Unnamed: 0,日期,測站,測項,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
count,4320,4320,4320,4320,4320,4320,4320,4320,4320,4320,...,4320,4320,4320,4320,4320,4320,4320,4320,4320,4320
unique,240,1,18,369,361,351,355,353,342,356,...,423,411,409,423,405,374,366,374,382,370
top,2014/12/6,豐原,O3,NR,NR,NR,NR,NR,NR,NR,...,NR,NR,NR,NR,NR,NR,NR,NR,NR,NR
freq,18,4320,240,221,225,229,226,229,230,226,...,220,219,221,221,222,223,225,224,226,224


### take a look at '測項' feature

In [3]:
tmp = df_train.loc[df_train['測項']=='RAINFALL']
def helper(row):
#     print(row.values.tolist().count('NR'))
    pass
    
# tmp.apply(helper, axis=1)

### concat each day data to a DataFrame

In [4]:
df_train_cat = pd.DataFrame()
for time in df_train['日期'].unique():
    tmp = df_train.loc[df_train['日期'] == str(time), '0':'23']
    tmp_col_name = list(tmp.columns)
    tmp.columns = [time+'_'+col_name for col_name in tmp_col_name]
    tmp.reset_index(drop=True, inplace=True)#promise index is same when merging
    df_train_cat = pd.concat([df_train_cat, tmp], axis=1)
df_train_cat.drop([10], inplace=True)#remove NR row
df_train_cat = df_train_cat.astype('float')
print(df_train_cat.shape)

(17, 5760)


## Train data is composed with every 10 hours df_train_cat data

## train data shape (576, 153) means 576 examples and 153 features

In [5]:
label = []
columns = list(df_train_cat.columns)
flag = True
for start in range(0, df_train_cat.shape[1], 10):
    train_data_2d = df_train_cat.loc[:, columns[start]:columns[start+8]]
    label.append(train_data_2d.loc[9,train_data_2d.columns[-1]])
    if flag:
        train_data_example_1d = train_data_2d.values.reshape(1,-1)
        flag = False
    else:
        train_data_example_1d = np.vstack((train_data_example_1d, train_data_2d.values.reshape(1,-1)))
label = np.array(label).reshape(-1,1)
train_data_all = train_data_example_1d
print(train_data_all.shape)
print(label.shape)

(576, 153)
(576, 1)


# Split all labeled data to train dataset and validation dataset

In [6]:
val_proportion = 0.2 #the proportion of validation data
mid = int(val_proportion * train_data_all.shape[0])
indices = np.random.permutation(train_data_all.shape[0])
val_idx, train_idx  = indices[:mid], indices[mid:]
X_train, y_train = train_data_all[train_idx,:], label[train_idx,:]
X_val, y_val = train_data_all[val_idx, :], label[val_idx, :]
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(461, 153) (461, 1) (115, 153) (115, 1)


# Test data preprocess

In [7]:
df_test = pd.read_csv('test.csv', names=[num for num in range(11)], encoding = 'Big5')
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,id_0,AMB_TEMP,25.0,26.0,26.0,25.0,22.0,21.0,21.0,21.0,21.0
1,id_0,CH4,1.6,1.7,1.7,1.7,1.7,1.7,1.7,1.7,1.7
2,id_0,CO,0.3,0.36,0.47,0.48,0.41,0.41,0.39,0.39,0.45
3,id_0,NMHC,0.1,0.22,0.2,0.21,0.08,0.1,0.14,0.12,0.15
4,id_0,NO,1.2,2.2,3.2,1.8,1.3,0.6,0.7,0.9,3.6


In [8]:
df_test.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
count,4320,4320,4320,4320,4320,4320,4320,4320,4320,4320,4320
unique,240,18,433,426,430,422,425,424,416,410,416
top,id_185,O3,NR,NR,NR,NR,NR,NR,NR,NR,NR
freq,18,240,225,222,228,222,226,221,224,230,228


## Same preprocess like train data

In [9]:
start_idx = 0
df_test_cat = pd.DataFrame()
for id_ in df_test[0].unique():
    tmp = df_test.loc[df_test[0] == str(id_), 2:10]
    tmp_col_name = list(tmp.columns)
    tmp.columns = [num for num in range(start_idx, start_idx + 9)]
    start_idx += 9
    tmp.reset_index(drop=True, inplace=True)#promise index is same when merging
    df_test_cat = pd.concat([df_test_cat, tmp], axis=1)
df_test_cat.drop([10], inplace=True) #remove NR row
df_test_cat = df_test_cat.astype('float')
print(df_test_cat.shape)

(17, 2160)


In [10]:
columns = list(df_test_cat.columns)
flag = True
for start in range(0, df_test_cat.shape[1], 9):
    test_data_2d = df_test_cat.loc[:, columns[start]:columns[start+8]]
    if flag:
        test_data_example_1d = test_data_2d.values.reshape(1,-1)
        flag = False
    else:
        test_data_example_1d = np.vstack((test_data_example_1d, test_data_2d.values.reshape(1,-1)))
X_test = test_data_example_1d
X_test.shape

(240, 153)

# Implement Linear Regression

## Standardize train and val data

In [11]:
def standardization(data):
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    return (data - mu) / sigma

In [12]:
X_train = standardization(X_train)
X_val = standardization(X_val)

## Train Linear Regression model

## Notice
- W is a m * n matrix, but b is a scalar. 
- When I set b a m * 1 vector, I got a lot of error which caused by the size of b vector. 
- In order to calculate gridient of b, you need to sum vector of db element-wise, just like the code. That confused me a lot.

In [13]:
(m, n) = X_train.shape #m examples, n features
W = np.random.rand(n, 1)
# b = np.random.rand(m, 1)
b = 0

epoch = 200000
lr = 0.0001
for ep in range(epoch):
    y_hat = X_train.dot(W) + b
    tmp_train = y_hat - y_train
    loss = tmp_train.T.dot(tmp_train) / (2 * m)
    dW = np.dot(X_train.T, np.dot(X_train, W) + b - y_train) / m
    db = np.dot(np.ones(shape=[1, X_train.shape[0]]),  np.dot(X_train, W) + b - y_train) / m # this is  right.
    dp = np.sum(np.dot(X_train, W) + b - y_train) / m # this is also right.
    W += - lr * dW
    b += - lr * db

    m_val = X_val.shape[0]
    y_hat = X_val.dot(W) + b
    tmp_val = y_hat - y_val
    loss_val = tmp_val.T.dot(tmp_val) / (2 * m_val)
    if ep % 10000 == 0:
        print('train loss: {}, val loss: {}'.format(loss, loss_val))
#         pass


train loss: [[473.25105808]], val loss: [[506.85180047]]
train loss: [[43.31075257]], val loss: [[50.30960388]]
train loss: [[10.98255212]], val loss: [[18.4697661]]
train loss: [[5.07783257]], val loss: [[11.9838766]]
train loss: [[3.33609128]], val loss: [[9.51441868]]
train loss: [[2.4873367]], val loss: [[7.9955848]]
train loss: [[1.95013164]], val loss: [[6.88096775]]
train loss: [[1.57348018]], val loss: [[6.01416238]]
train loss: [[1.29623708]], val loss: [[5.32069764]]
train loss: [[1.08565753]], val loss: [[4.75488429]]
train loss: [[0.9218369]], val loss: [[4.28598733]]
train loss: [[0.79188977]], val loss: [[3.89240929]]
train loss: [[0.68712585]], val loss: [[3.55850063]]
train loss: [[0.60149325]], val loss: [[3.27262857]]
train loss: [[0.53066215]], val loss: [[3.02595518]]
train loss: [[0.47146194]], val loss: [[2.81163922]]
train loss: [[0.42152391]], val loss: [[2.62430008]]
train loss: [[0.37904754]], val loss: [[2.45964777]]
train loss: [[0.34264369]], val loss: [[2.

## Standardize test data

In [14]:
X_test = standardization(X_test)

## Predict data

In [15]:
y_predict = np.dot(X_test, W) + b
y_predict

array([[31.23789135],
       [10.91846024],
       [33.53879837],
       [20.29396106],
       [45.76892444],
       [55.26823308],
       [31.98168383],
       [16.14821886],
       [ 3.82161982],
       [20.89239662],
       [ 2.77983564],
       [37.4800314 ],
       [27.09427362],
       [ 9.46954022],
       [15.79451477],
       [ 3.62477217],
       [21.1547981 ],
       [ 6.05173989],
       [16.2041509 ],
       [29.04795692],
       [ 5.1728246 ],
       [57.908577  ],
       [ 3.94079159],
       [11.24381152],
       [38.28109898],
       [ 9.90178504],
       [10.39157355],
       [10.66980927],
       [15.39603933],
       [14.74749505],
       [30.225952  ],
       [10.29197359],
       [ 0.53467943],
       [14.56075386],
       [25.52752142],
       [25.17374716],
       [19.45838083],
       [11.89537182],
       [33.12955657],
       [23.24193354],
       [ 2.0443079 ],
       [16.61158279],
       [18.56372224],
       [39.2747357 ],
       [16.51318661],
       [31

## Generate submission csv file

In [16]:
data_submit = {'id': df_test[0].unique(), 'value': y_predict.reshape(1, -1)[0]}
df_submit = pd.DataFrame(data_submit)
df_submit.head()

Unnamed: 0,id,value
0,id_0,31.237891
1,id_1,10.91846
2,id_2,33.538798
3,id_3,20.293961
4,id_4,45.768924


In [17]:
df_submit.to_csv('submission.csv', index=False)

In [20]:
!head submission.csv

id,value
id_0,31.696346780163733
id_1,10.731900860202735
id_2,35.064320391250064
id_3,20.2139549069601
id_4,46.505006706933585
id_5,57.78130099082912
id_6,32.90253364779179
id_7,17.897677615084568
id_8,3.1701880072903563


## Refference

https://en.wikipedia.org/wiki/Matrix_calculus

https://blog.csdn.net/nomadlx53/article/details/50849941

https://www.jb51.net/article/146990.htm