[View in Colaboratory](https://colab.research.google.com/github/zhongqin0820/Misc-Algorithm-Implement/blob/master/Colab/xgboost_taxi.ipynb)

# 配置环境

## Kaggle API
注意此处使用的链接中谷歌OAuth2的验证码

In [0]:
!pip install -U -q kaggle

In [2]:
from googleapiclient.discovery import build
import io, os
from googleapiclient.http import MediaIoBaseDownload
from google.colab import auth
auth.authenticate_user()
drive_service = build('drive', 'v3')
results = drive_service.files().list(
        q="name = 'kaggle.json'", fields="files(id)").execute()
kaggle_api_key = results.get('files', [])
filename = "/content/.kaggle/kaggle.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
fh = io.FileIO(filename, 'wb')
downloader = MediaIoBaseDownload(fh, request)
done = False
while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress() * 100))
os.chmod(filename, 600)

Download 100%.


In [3]:
!kaggle competitions list

ref                                              deadline             category            reward  teamCount  userHasEntered  
-----------------------------------------------  -------------------  ---------------  ---------  ---------  --------------  
house-prices-advanced-regression-techniques      2030-01-01 00:00:00  Getting Started  Knowledge       4806           False  
digit-recognizer                                 2030-01-01 00:00:00  Getting Started  Knowledge       2642           False  
titanic                                          2030-01-01 00:00:00  Getting Started  Knowledge      10380           False  
imagenet-object-localization-challenge           2029-12-31 07:00:00  Research         Knowledge         20           False  
competitive-data-science-predict-future-sales    2019-01-01 23:59:00  Playground           Kudos       1061           False  
tgs-salt-identification-challenge                2018-10-19 23:59:00  Featured          $100,000        613    

### 下载数据集到GoogleDrive
使用
```
!kaggle competitions download -c nyc-taxi-trip-duration -p /content/kaggle
```
指定数据下载到目录`/content/kaggle`

In [4]:
!kaggle competitions download -c nyc-taxi-trip-duration -p /content/kaggle

sample_submission.zip: Skipping, found more recently modified local copy (use --force to force download)
test.zip: Skipping, found more recently modified local copy (use --force to force download)
train.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
!ls /content/kaggle # list the content in /content/kaggle

sample_submission.zip  taxi  test.zip  train.zip


## XGBoost
下载依赖包

In [0]:
!pip install -U -q numpy
!pip install -U -q pandas
!pip install -U -q matplotlib
!pip install -U -q sklearn
!pip install -U -q xgboost # to use XGBoost

### 预处理
使用zipfile解压数据， 也可以直接使用pandas读取数据的时候指定数据的解压方式等。

In [0]:
import zipfile
import os
for file in os.listdir(r'/content/kaggle/'):
    print('/content/kaggle/'+file)
    f = zipfile.ZipFile('/content/kaggle/'+file, 'r')
    for file in f.namelist():
        f.extract(file,"/content/kaggle/taxi/")

In [8]:
!ls /content/kaggle/taxi/

sample_submission.csv  test.csv  train.csv


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
%matplotlib inline
dataPath = '/content/kaggle/taxi/'

In [0]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV

需要处理header的情况，以及注意features和label的维数是否一致的问题

In [11]:
df = pd.read_csv(dataPath+'train.csv')
df.passenger_count = df.passenger_count.astype(np.uint8)
df.vendor_id = df.vendor_id.astype(np.uint8)
df.trip_duration = df.trip_duration.astype(np.uint32)
for c in [c for c in df.columns if c.endswith('tude')]:
    df.loc[:,c] = df[c].astype(np.float32)
print(df.memory_usage().sum()/2**20)
df.pickup_datetime=pd.to_datetime(df.pickup_datetime)
df.dropoff_datetime=pd.to_datetime(df.dropoff_datetime)
df['pu_hour'] = df.pickup_datetime.dt.hour
df['yday'] = df.pickup_datetime.dt.dayofyear
df['wday'] = df.pickup_datetime.dt.dayofweek
df['month'] = df.pickup_datetime.dt.month
# print(df.head())

75.11792755126953


In [0]:
# features = df[['passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'pu_hour', 'yday', 'wday', 'month']]
features = df[['passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]
target = df['trip_duration']
features.to_csv('tf.csv', index=False)
target.to_csv('tl.csv', index=False)

In [0]:
features = pd.read_csv('tf.csv', skiprows=1)
target = pd.read_csv('tl.csv')

In [14]:
print(features.shape)
print(target.shape)
print(features.head())

(1458643, 5)
(1458643, 1)
   1  -73.982155  40.767937  -73.96463  40.765602
0  1  -73.980415  40.738564  -73.99948  40.731150
1  1  -73.979030  40.763940  -74.00533  40.710087
2  1  -74.010040  40.719970  -74.01227  40.706720
3  1  -73.973050  40.793210  -73.97292  40.782520
4  6  -73.982860  40.742195  -73.99208  40.749184


### 训练
主要是关于regressor的定义吧？（此处还有待进一步的学习！）

In [0]:
def rmsle(evaluator,X,real):
    sum = 0.0
    predicted = evaluator.predict(X)
    print("Number predicted less than 0: {}".format(np.where(predicted < 0)[0].shape))

    predicted[predicted < 0] = 0
    for x in range(len(predicted)):
        p = np.log(predicted[x]+1)
        r = np.log(real[x]+1)
        sum = sum + (p-r)**2
    return (sum/len(predicted))**0.5

In [0]:
reg = xgb.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)
cv = ShuffleSplit(n_splits=4, test_size=0.1, random_state=0)
print(cross_val_score(reg, features, np.ravel(target), cv=cv, scoring=rmsle))
reg.fit(features, target)

### 测试
1、包括与训练数据一致的数据处理方式

2、测试结果的保存

3、模型文件？？

In [29]:
tdf = pd.read_csv(dataPath+'test.csv')
tdf.passenger_count = tdf.passenger_count.astype(np.uint8)
for c in [c for c in tdf.columns if c.endswith('tude')]:
    tdf.loc[:,c] = tdf[c].astype(np.float32)
print(tdf.memory_usage().sum()/2**20)
testf = tdf[['passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]

testf.to_csv('test.csv', index=False)
# print(testf.head())
tfeatures = pd.read_csv('test.csv', skiprows=2)
# print(tfeatures.head())

29.212614059448242


In [30]:
tfeatures = pd.read_csv('test.csv', skiprows=1)
print(tfeatures.head())

   1  -73.98813   40.73203  -73.99017   40.75668
0  1 -73.964200  40.679993 -73.959810  40.655403
1  1 -73.997440  40.737583 -73.986160  40.729523
2  1 -73.956070  40.771900 -73.986430  40.730470
3  1 -73.970215  40.761475 -73.961510  40.755890
4  1 -73.991300  40.749798 -73.980515  40.786550


In [0]:
pred = reg.predict(tfeatures)
print(np.where(pred < 0)[0].shape)

In [0]:
pred[pred < 0] = 0
tdf['trip_duration']=pred.astype(int)
out = tdf[['id','trip_duration']]
out['trip_duration'].isnull().values.any()
out.to_csv('pred_xgboost.csv',index=False)

### 保存模型结果
可以使用`plot_tree(模型)`查看训练的`xgboost`树结构

In [0]:
from xgboost import plot_tree
plot_tree(reg)

In [0]:
import pickle
pickle.dump(reg, open('xgb_model.sav','wb'),protocol=2)

### 问题

输入和输出数据不匹配...