In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the dataset stored in the HDF5 file into a DataFrame.
df = pd.read_hdf('./data/data_final.h5')

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called bare: wrapping it in print() emits an extra "None" line.
df.info()
# The last expression of the cell is rendered as the cell output.
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 344736 entries, 0 to 344735
Data columns (total 8 columns):
STA_DATETIME    344736 non-null float64
STATION_ID      344736 non-null int64
LATITUDE        344736 non-null float64
LONGITUDE       344736 non-null float64
VOLUME          344736 non-null float64
OCC             344736 non-null float64
SPEED           344736 non-null float64
NUM_SAMPLES     344736 non-null int64
dtypes: float64(6), int64(2)
memory usage: 23.7 MB
None


Unnamed: 0,STA_DATETIME,STATION_ID,LATITUDE,LONGITUDE,VOLUME,OCC,SPEED,NUM_SAMPLES
0,0.0,400000,38.081167,-122.547606,62.0,0.0,64.0,0
1,300.0,400000,38.081167,-122.547606,46.0,0.0,64.0,0
2,600.0,400000,38.081167,-122.547606,68.0,0.0,65.0,0
3,900.0,400000,38.081167,-122.547606,86.0,0.0,65.0,0
4,1200.0,400000,38.081167,-122.547606,95.0,0.0,64.0,0


## 划分数据集

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Select features and target by column name instead of by position, so that a
# change in column order cannot silently swap the target.
# These are the first six columns and the seventh column of the frame,
# i.e. the same selection as iloc[:, :6] / iloc[:, [6]]; NUM_SAMPLES is not used.
FEATURE_COLUMNS = ['STA_DATETIME', 'STATION_ID', 'LATITUDE', 'LONGITUDE', 'VOLUME', 'OCC']
TARGET_COLUMN = 'SPEED'

X = df[FEATURE_COLUMNS]
# Double brackets keep y two-dimensional (a one-column DataFrame).
y = df[[TARGET_COLUMN]]

In [6]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

## 标准化数据 
`StandardScaler()`

$x'=\frac{x-\overline{X}}{S}$

In [7]:
from sklearn.preprocessing import StandardScaler
# One scaler for the features and one for the target, so each keeps its own
# fitted statistics.
ss_X, ss_y = StandardScaler(), StandardScaler()

In [8]:
# Learn the scaling statistics from the training split only ...
ss_X.fit(X_train)
ss_y.fit(y_train)

# ... then apply them to both splits. The names are rebound to the scaled data.
X_train, X_test = ss_X.transform(X_train), ss_X.transform(X_test)
y_train, y_test = ss_y.transform(y_train), ss_y.transform(y_test)

## 评价标准
- $R^2$
- $MSE$
- $MAE$

In [9]:
# r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def show_score(method, y_true, y_pred):
    """Print R-squared, MSE and MAE for several models side by side.

    The three arguments are parallel sequences: ``method[i]`` is the label
    printed for a model, ``y_true[i]`` its ground-truth values and
    ``y_pred[i]`` its predictions. Each metric is printed under its own
    heading as one line of tab-terminated ``label: score`` entries.
    Nothing is returned.
    """
    # (heading, metric function) pairs, in the order the sections are printed.
    sections = [
        ('\nR-squared 得分\n', r2_score),
        ('\nMSE 得分\n', mean_squared_error),
        ('\nMAE 得分\n', mean_absolute_error),
    ]
    rows = [''] * len(sections)
    for idx, label in enumerate(method):
        tag = str(label) + ': '
        for k, section in enumerate(sections):
            metric = section[1]
            rows[k] += tag + str(metric(y_true[idx], y_pred[idx])) + '\t'
    for section, row in zip(sections, rows):
        print(section[0] + row)

## 模型

### LinearRegression，SGDRegressor

In [10]:
# LinearRegression
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator, so construction and training are chained.
lr = LinearRegression().fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

In [11]:
# SGDRegressor
from sklearn.linear_model import SGDRegressor

# SGDRegressor expects a 1-D target, so take the single column of the 2-D arrays.
y_sgdr_train = y_train[:,0]
y_sgdr_test = y_test[:,0]

# SGD is a stochastic optimizer; pinning random_state (same value as the
# train/test split seed) makes the fitted model and its scores repeatable.
sgdr = SGDRegressor(max_iter=1000, random_state=33)
sgdr.fit(X_train, y_sgdr_train)
sgdr_y_predict = sgdr.predict(X_test)

#### 得分 - LinearRegression，SGDRegressor

In [12]:
# Labels, ground truths and predictions are parallel lists (one entry per model).
# The target was standardized earlier, so MSE and MAE are in standardized units.
methed, y_true, y_pred = (
    ['LinearRegression', 'SGDRegressor'],
    [y_test, y_sgdr_test],
    [lr_y_predict, sgdr_y_predict],
)
show_score(methed, y_true, y_pred)


R-squared 得分
LinearRegression: 0.112960470898	SGDRegressor: 0.113014326057	

MSE 得分
LinearRegression: 0.870643886719	SGDRegressor: 0.870591026994	

MAE 得分
LinearRegression: 0.677551433274	SGDRegressor: 0.677447073247	


### KNeighborsRegressor
- weights = distance
- weights = uniform

In [13]:
# KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor

In [14]:
# KNN regressor with weights = 'distance'; fit() returns the estimator itself,
# so construction and training are chained.
dis_knr = KNeighborsRegressor(weights='distance').fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

In [15]:
# KNN regressor with weights = 'uniform'; fit() returns the estimator itself,
# so construction and training are chained.
uni_knr = KNeighborsRegressor(weights='uniform').fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

#### 得分 - KNeighborsRegressor

In [16]:
# Both KNN variants are scored against the same standardized test target.
methed, y_true, y_pred = (
    ['KNN-distance', 'KNN-uniform'],
    [y_test, y_test],
    [dis_knr_y_predict, uni_knr_y_predict],
)
show_score(methed, y_true, y_pred)


R-squared 得分
KNN-distance: 0.893691810102	KNN-uniform: 0.876195974434	

MSE 得分
KNN-distance: 0.104343236808	KNN-uniform: 0.121515687265	

MAE 得分
KNN-distance: 0.184975559414	KNN-uniform: 0.201668438081	


### DecisionTreeRegressor

In [17]:
# DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor

# The tree builder permutes features randomly when searching for splits, so an
# unseeded tree can differ between runs. random_state pins it (same value as
# the train/test split seed).
dtr = DecisionTreeRegressor(random_state=33)
dtr.fit(X_train, y_train)
dtr_y_predict = dtr.predict(X_test)

#### 得分 - DecisionTreeRegressor

In [18]:
# show_score takes parallel lists, so the single model is wrapped in
# one-element lists.
methed, y_true, y_pred = ['DecisionTreeRegressor'], [y_test], [dtr_y_predict]
show_score(methed, y_true, y_pred)


R-squared 得分
DecisionTreeRegressor: 0.949351864665	

MSE 得分
DecisionTreeRegressor: 0.049711977828	

MAE 得分
DecisionTreeRegressor: 0.117338188147	


### Ensemble Methods
- RandomForestRegressor
- ExtraTreesRegressor
- GradientBoostingRegressor

In [19]:
# RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor

# RandomForestRegressor is given a 1-D target: take the single column of the
# 2-D arrays.
y_rfr_train = y_train[:,0]
y_rfr_test = y_test[:,0]

# A random forest is stochastic (bootstrap samples, random feature choices);
# pinning random_state makes the model and its scores repeatable.
rfr = RandomForestRegressor(random_state=33)
rfr.fit(X_train, y_rfr_train)
rfr_y_predict = rfr.predict(X_test)

In [20]:
# ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesRegressor

# ExtraTreesRegressor is given a 1-D target: take the single column of the
# 2-D arrays.
y_etr_train = y_train[:,0]
y_etr_test = y_test[:,0]

# Extra-trees draws its split thresholds at random; pinning random_state makes
# the model and its scores repeatable.
etr = ExtraTreesRegressor(random_state=33)
etr.fit(X_train, y_etr_train)
etr_y_predict = etr.predict(X_test)

In [21]:
# GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# GradientBoostingRegressor expects a 1-D target: take the single column of
# the 2-D arrays.
y_gbr_train = y_train[:,0]
y_gbr_test = y_test[:,0]

# random_state seeds the individual trees of the ensemble, so repeated runs
# produce the same model and scores.
gbr = GradientBoostingRegressor(random_state=33)
gbr.fit(X_train, y_gbr_train)
gbr_y_predict = gbr.predict(X_test)

#### 得分 - RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

In [22]:
# Each ensemble is scored against its own 1-D view of the standardized test
# target.
methed, y_true, y_pred = (
    ['RandomForestRegressor', 'ExtraTreesRegressor', 'GradientBoostingRegressor'],
    [y_rfr_test, y_etr_test, y_gbr_test],
    [rfr_y_predict, etr_y_predict, gbr_y_predict],
)
show_score(methed, y_true, y_pred)


R-squared 得分
RandomForestRegressor: 0.96716332804	ExtraTreesRegressor: 0.962733018571	GradientBoostingRegressor: 0.426370021136	

MSE 得分
RandomForestRegressor: 0.0322297335852	ExtraTreesRegressor: 0.036578155193	GradientBoostingRegressor: 0.563027258596	

MAE 得分
RandomForestRegressor: 0.106344162137	ExtraTreesRegressor: 0.116503383112	GradientBoostingRegressor: 0.546573537821	


### XGBRegressor
- max_depth = default(3)
- max_depth = 10
- max_depth = 20

In [23]:
# XGBRegressor
from xgboost import XGBRegressor



In [24]:
# max_depth = 3, written out explicitly: the library default is not the same in
# every xgboost release, and this model is reported under the label 'XGBR-d3'.
d3_xgbr = XGBRegressor(max_depth=3)
d3_xgbr.fit(X_train,y_train)
d3_xgbr_y_predict = d3_xgbr.predict(X_test)

In [25]:
# max_depth = 10; fit() returns the estimator, so the calls are chained.
d10_xgbr = XGBRegressor(max_depth=10).fit(X_train, y_train)
d10_xgbr_y_predict = d10_xgbr.predict(X_test)

In [26]:
# max_depth = 20; fit() returns the estimator, so the calls are chained.
d20_xgbr = XGBRegressor(max_depth=20).fit(X_train, y_train)
d20_xgbr_y_predict = d20_xgbr.predict(X_test)

#### 得分 - XGBRegressor

In [27]:
# All three depths are scored against the same standardized test target.
methed, y_true, y_pred = (
    ['XGBR-d3', 'XGBR-d10', 'XGBR-d20'],
    [y_test, y_test, y_test],
    [d3_xgbr_y_predict, d10_xgbr_y_predict, d20_xgbr_y_predict],
)
show_score(methed, y_true, y_pred)


R-squared 得分
XGBR-d3: 0.420290790829	XGBR-d10: 0.900011815206	XGBR-d20: 0.970165393093	

MSE 得分
XGBR-d3: 0.568994123126	XGBR-d10: 0.0981400478544	XGBR-d20: 0.0292831573615	

MAE 得分
XGBR-d3: 0.549450947576	XGBR-d10: 0.222173981266	XGBR-d20: 0.103590061312	
