[View in Colaboratory](https://colab.research.google.com/github/zhongqin0820/Misc-Algorithm-Implement/blob/master/Colab/xgboost_taxi.ipynb)

# 配置环境

## Kaggle API
注意：运行下面的认证单元格时，需要打开输出中的链接完成谷歌 OAuth2 授权，并把得到的验证码粘贴回输入框。

In [0]:
!pip install -U -q kaggle

In [5]:
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth

# Authenticate the Colab user, then look up kaggle.json in their Google Drive
# and copy it to the location used below so the kaggle CLI can pick it up.
auth.authenticate_user()
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
if not kaggle_api_key:
    # Fail with a readable message instead of an IndexError on [0] below.
    raise FileNotFoundError("No file named 'kaggle.json' was found in Google Drive")
filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
# The context manager guarantees the credentials file is flushed and closed
# before its permissions are tightened.
with io.FileIO(filename, 'wb') as fh:
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        status, done = downloader.next_chunk()
        print("Download %d%%." % int(status.progress() * 100))
# Mode must be octal: decimal 600 is 0o1130, not owner read/write only.
os.chmod(filename, 0o600)

Download 100%.


In [0]:
!kaggle competitions list

### 下载数据集到Colab虚拟机
使用
```
!kaggle competitions download -c nyc-taxi-trip-duration -p /content/kaggle
```
指定数据下载到目录`/content/kaggle`

In [0]:
!kaggle competitions download -c nyc-taxi-trip-duration -p /content/kaggle

In [7]:
!ls /content/kaggle # list the content in /content/kaggle

sample_submission.zip  test.zip  train.zip


## XGBoost
下载依赖包

In [0]:
!pip install -U -q numpy
!pip install -U -q pandas
!pip install -U -q matplotlib
!pip install -U -q sklearn
!pip install -U -q xgboost # to use XGBoost

# 训练模型

### 预处理
使用zipfile解压数据；也可以在用pandas读取数据时直接指定压缩格式，从而省去手动解压。

In [0]:
import zipfile
import os

# Unpack every zip archive found in the download directory into one folder.
archive_dir = '/content/kaggle/'
extract_dir = '/content/kaggle/taxi/'
for entry in sorted(os.listdir(archive_dir)):
    archive_path = os.path.join(archive_dir, entry)
    # Skip anything that is not a zip archive. After the first run the
    # listing also contains the extraction directory itself, which would
    # otherwise make ZipFile() fail when this cell is re-run.
    if not zipfile.is_zipfile(archive_path):
        continue
    print(archive_path)
    # The context manager closes the archive handle once it is extracted.
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extract_dir)

In [10]:
!ls /content/kaggle/taxi/

sample_submission.csv  test.csv  train.csv


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline
dataPath = '/content/kaggle/taxi/'

In [0]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
# GridSearchCV is provided by sklearn.model_selection; the legacy
# sklearn.grid_search module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

需要处理header的情况，以及注意features和label的维数是否一致的问题

In [0]:
df = pd.read_csv(dataPath+'train.csv')

In [0]:
# Downcast the training columns to narrower dtypes.
df['passenger_count'] = df['passenger_count'].astype(np.uint8)
df['vendor_id'] = df['vendor_id'].astype(np.uint8)
df['trip_duration'] = df['trip_duration'].astype(np.uint32)
for c in [c for c in df.columns if c.endswith('tude')]:
    # Assign with df[c] so the column is replaced. `df.loc[:, c] = ...`
    # writes into the existing column on current pandas and keeps its
    # original dtype, which makes the float32 cast a no-op.
    df[c] = df[c].astype(np.float32)

In [0]:
df.info()
df.describe()

In [0]:
# Load the test set and apply the same dtype downcasts as the training frame
# (the test file has no trip_duration column to convert).
df_test = pd.read_csv(dataPath+'test.csv')
df_test['passenger_count'] = df_test['passenger_count'].astype(np.uint8)
df_test['vendor_id'] = df_test['vendor_id'].astype(np.uint8)
for c in [c for c in df_test.columns if c.endswith('tude')]:
    # Assign with df_test[c] so the column is replaced. `.loc[:, c] = ...`
    # writes into the existing column on current pandas and keeps its
    # original dtype, which makes the float32 cast a no-op.
    df_test[c] = df_test[c].astype(np.float32)
df_test.info()

In [0]:
# Model inputs: passenger count plus pickup/dropoff coordinates.
features = df[[
    'passenger_count',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]]
# Double brackets keep the label as a single-column DataFrame.
target = df[['trip_duration']]

In [30]:
# print(features.shape,type(features))
# print(target.shape,type(target))
print(features.head())
print(target.head())

   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0                1        -73.982155        40.767937         -73.964630   
1                1        -73.980415        40.738564         -73.999481   
2                1        -73.979027        40.763939         -74.005333   
3                1        -74.010040        40.719971         -74.012268   
4                1        -73.973053        40.793209         -73.972923   

   dropoff_latitude  
0         40.765602  
1         40.731152  
2         40.710087  
3         40.706718  
4         40.782520  
   trip_duration
0            455
1            663
2           2124
3            429
4            435


### 训练
主要是关于regressor的定义吧？（此处还有待进一步的学习！）

In [0]:
def rmsle(evaluator, X, real):
    """Return the root mean squared logarithmic error of a fitted model.

    The signature is ``(estimator, X, y)`` so the function can be passed as
    the ``scoring`` callable of ``cross_val_score``. The returned value is an
    error, so lower is better.

    Args:
        evaluator: Object exposing ``predict(X)``.
        X: Feature matrix handed to ``evaluator.predict``.
        real: True target values. May be a list, ndarray, pandas Series or a
            single-column DataFrame; only the values are used, never the
            index labels.

    Returns:
        The RMSLE as a Python float.

    Raises:
        ValueError: If there are no samples, or if the number of predictions
            differs from the number of true values.
    """
    predicted = np.asarray(evaluator.predict(X), dtype=np.float64).ravel()
    # Convert to a flat array so rows are matched by position; indexing a
    # pandas object with real[i] would be a label lookup instead.
    actual = np.asarray(real, dtype=np.float64).ravel()
    if predicted.shape != actual.shape:
        raise ValueError(
            "predictions and targets differ in length: {} vs {}".format(
                predicted.shape[0], actual.shape[0]))
    if predicted.size == 0:
        raise ValueError("rmsle is undefined for zero samples")

    print("Number predicted less than 0: {}".format(np.where(predicted < 0)[0].shape))

    # The logarithm is undefined below -1, so negative predictions count as 0.
    predicted = np.maximum(predicted, 0)
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))

In [32]:
# Hyper-parameters of the gradient-boosted regressor, gathered in one place.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
reg = xgb.XGBRegressor(**xgb_params)
# Four random 90/10 train/validation splits, each scored with rmsle.
cv = ShuffleSplit(n_splits=4, test_size=0.1, random_state=0)
fold_scores = cross_val_score(reg, features, np.ravel(target), cv=cv, scoring=rmsle)
print(fold_scores)
# Final fit on the full training set; left as the last expression so the
# notebook displays the fitted estimator.
reg.fit(features, target)

Number predicted less than 0: (3,)
Number predicted less than 0: (7,)
Number predicted less than 0: (8,)
Number predicted less than 0: (6,)
[0.64925179 0.65498191 0.66591876 0.65287333]


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.08, max_delta_step=0,
       max_depth=7, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

### 测试
1、包括与训练数据一致的数据处理方式

2、测试结果的保存

3、模型文件？？

In [0]:
testf = df_test[['passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]

In [35]:
pred = reg.predict(testf)
print(np.where(pred < 0)[0].shape)

(54,)


In [36]:
submi = pd.read_csv(dataPath + 'sample_submission.csv')
submi.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [0]:
# Validate before the integer cast: once cast to int a missing prediction can
# no longer be detected as null, so a check placed after the cast never fires.
if np.isnan(pred).any():
    raise ValueError("Model produced NaN predictions; refusing to write the submission")
# Negative predictions are replaced with 0 before writing the submission.
pred[pred < 0] = 0
df_test['trip_duration'] = pred.astype(int)
out = df_test[['id', 'trip_duration']]
out.to_csv('pred_xgboost.csv', index=False)

In [39]:
!ls .

datalab  kaggle  pred_xgboost.csv


In [40]:
mysub = pd.read_csv('pred_xgboost.csv')
mysub.head()

Unnamed: 0,id,trip_duration
0,id3004672,900
1,id3505355,711
2,id1217141,619
3,id2150126,1226
4,id1598245,684


In [43]:
!ls .

datalab  kaggle  pred_xgboost.csv


### 保存模型结果
可以使用`plot_tree(模型)`查看训练的`xgboost`树结构

In [0]:
from xgboost import plot_tree
plot_tree(reg)

In [0]:
import pickle

# Serialize the fitted regressor; the context manager flushes and closes the
# file, which a bare open() passed straight to pickle.dump never does.
with open('xgb_model.sav', 'wb') as model_file:
    pickle.dump(reg, model_file, protocol=2)

# 上传结果

In [44]:
!kaggle competitions submit -c nyc-taxi-trip-duration -f pred_xgboost.csv -m "Commit from colab"

Successfully submitted to New York City Taxi Trip Duration