In [1]:
#Solution to Kaggle Taxi Fare Prediction competition
#https://www.kaggle.com/c/new-york-city-taxi-fare-prediction
#Placement: 303/1488

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import os
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from subprocess import call
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
#inputs
TEST_PATH = 'Data/test.csv'
TRAIN_PATH = 'Data/train_1mln.csv'
#TRAIN_PATH = 'Data/train.csv'


In [4]:
#calculate distance between coordinates using Haversine formula ( https://en.wikipedia.org/wiki/Haversine_formula )
def calc_distance(lat1, lon1, lat2, lon2):
    radius = 6371 # km
    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c

    return d

#use apply function to calculate distance
def get_distance(df):
    df['distance'] = df.apply(lambda row: calc_distance(row['pickup_latitude'], 
                                 row['pickup_longitude'], 
                                 row['dropoff_latitude'], 
                                 row['dropoff_longitude']), axis=1)
    return df

#set max distance, drop values above 100km 
def lower_distance(df):
    df = df[df['distance'] < 100]
    return df

In [5]:
#drop missings, unreal values
def clean_data(df):
    df = df.dropna(how = 'any', axis = 'rows')
    df = df[(df['passenger_count'] > 0) & (df['passenger_count'] < 7)]
    df = df[(df['fare_amount'] > 0) & (df['fare_amount'] < 100)]
    df = df[(df['pickup_latitude'] != 0) & (df['pickup_longitude'] != 0) &
                        (df['dropoff_latitude'] != 0) & (df['dropoff_longitude'] != 0)]
    return df

In [6]:
#Extract hour, day, month and year  from datetime
def get_date_features(df):
    df['hour'] = df.pickup_datetime.dt.hour
    df['dayofweek'] = df.pickup_datetime.dt.dayofweek
    df['month'] = df.pickup_datetime.dt.month
    df['year'] = df.pickup_datetime.dt.year
    
    return df

In [9]:
#Define bounding box - max cords extracted from test set
BB = (-74.55, -72.75, 40.45, 41.85)
    
# Select trips only within boundingbox - max cordinates values extracted from test_set
def select_within_boundingbox(df, BB):
    df = df[(df.pickup_longitude >= BB[0]) & (df.pickup_longitude <= BB[1]) & \
           (df.pickup_latitude >= BB[2]) & (df.pickup_latitude <= BB[3]) & \
           (df.dropoff_longitude >= BB[0]) & (df.dropoff_longitude <= BB[1]) & \
           (df.dropoff_latitude >= BB[2]) & (df.dropoff_latitude <= BB[3])]
    return df


#Remove incorrect values - locations on water
#This trick is taken from https://www.kaggle.com/breemen/nyc-taxi-fare-data-exploration
def remove_datapoints_from_water(df):
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        return (dx*(longitude - BB[0])/(BB[1]-BB[0])).astype('int'), \
               (dy - dy*(latitude - BB[2])/(BB[3]-BB[2])).astype('int')

    # read nyc mask and turn into boolean map with
    # land = True, water = False
    nyc_mask = plt.imread('Data/nyc_mask.png')[:,:,0] > 0.9
    
    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude, 
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)    
    # calculate boolean index
    idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
    df = df[idx]
    # return only datapoints on land
    return df



In [10]:
%%time

traintypes = {'fare_amount': 'float32',
              'pickup_datetime': 'str', 
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8'}

cols = list(traintypes.keys())
chunksize = 1000 
df_list = [] # list to hold the batch dataframe

for df_chunk in tqdm(pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)):
     
    # Neat trick from https://www.kaggle.com/btyuhas/bayesian-optimization-with-xgboost
    # Using parse_dates would be much slower!
    df_chunk['pickup_datetime'] = df_chunk['pickup_datetime'].str.slice(0, 16)
    df_chunk['pickup_datetime'] = pd.to_datetime(df_chunk['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
    
    df_chunk = clean_data(df_chunk)
    df_chunk = get_date_features(df_chunk)
    df_chunk = get_distance(df_chunk)
    df_chunk = lower_distance(df_chunk)
    df_chunk = select_within_boundingbox(df_chunk, BB)
    df_chunk = remove_datapoints_from_water(df_chunk)
    df_chunk = df_chunk.drop('pickup_datetime', axis=1)
    
    X = df_chunk.drop(["fare_amount"],axis=1)
    y = df_chunk.fare_amount
    
    
    
    
    df_list.append(df_chunk)

66it [00:09,  7.32it/s]


KeyboardInterrupt: 


55it [27:55, 30.46s/it]
IndexError                                Traceback (most recent call last)
<timed exec> in <module>()

<ipython-input-12-d5092148ffb0> in remove_datapoints_from_water(df)
     16                                       nyc_mask.shape[1], nyc_mask.shape[0], BB)    
     17     # calculate boolean index
---> 18     idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]
     19     df = df[idx]
     20     # return only datapoints on land

IndexError: index 1262 is out of bounds for axis 0 with size 1262


In [16]:
# Merge all dataframes into one dataframe
df = pd.concat(df_list)

# Delete the dataframe list to release memory
del df_list

# See what we have loaded
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52878537 entries, 0 to 55423855
Data columns (total 11 columns):
fare_amount          float32
pickup_longitude     float32
pickup_latitude      float32
dropoff_longitude    float32
dropoff_latitude     float32
passenger_count      uint8
hour                 int64
dayofweek            int64
month                int64
year                 int64
distance             float64
dtypes: float32(5), float64(1), int64(4), uint8(1)
memory usage: 3.4 GB


In [7]:
def XGBoost_first_iter(X_train,X_test,y_train,y_test,num_rounds=1000):
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dtest = xgb.DMatrix(X_test,label=y_test)
    xgb_model = xgb.train(params={
            'eval_metric':'rmse',
            'objective':'reg:linear',
            'max_depth': '8',
            'tree_method':'approx'}
            ,dtrain=dtrain,num_boost_round=num_rounds, 
            early_stopping_rounds=20,evals=[(dtest,'test')])
    xgb_model.save_model('taxi_fare_2nd.model')

def XGBoost(X_train,X_test,y_train,y_test,num_rounds=1000):
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dtest = xgb.DMatrix(X_test,label=y_test)
    xgb_model = xgb.train(params={
            'eval_metric':'rmse',
            'objective':'reg:linear',
            'max_depth': '8',
            'tree_method':'approx'}
            ,dtrain=dtrain,num_boost_round=num_rounds, 
            early_stopping_rounds=20,evals=[(dtest,'test')],xgb_model='taxi_fare_2nd.model')
    call(["mv", "taxi_fare_2nd.model taxi_fare_2nd.model.bak"])
    call(["rm", "taxi_fare_2nd.model"])
    xgb_model.save_model('taxi_fare_2nd.model')

In [17]:
df.shape

(52878537, 11)

In [18]:
df.to_csv('52mln_clean.csv')

In [2]:
TRAIN_PATH = '52mln_clean.csv'

In [4]:
def lower_distance(df_chunk):
    df_chunk = df_chunk[df_chunk['distance'] < 101]
    return df_chunk

In [5]:
chunksize = 500000 # 5 million rows at one go. Or try 10 million

In [None]:
df.shape

In [2]:
TRAIN_PATH = 'Data/26mln_clean.csv'

In [3]:
df = df.drop('Unnamed: 0', axis=1)

NameError: name 'df' is not defined

In [6]:
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,dayofweek,month,year,distance
0,4.5,-73.844315,40.721317,-73.84161,40.712276,1,17,0,6,2009,1.030742
1,16.9,-74.016045,40.711304,-73.97927,40.782005,1,16,1,1,2010,8.45
2,5.7,-73.982735,40.76127,-73.99124,40.75056,2,0,3,8,2011,1.389632
3,7.7,-73.98713,40.733143,-73.99157,40.75809,1,4,5,4,2012,2.799211
4,5.3,-73.968094,40.76801,-73.95666,40.783764,1,7,1,3,2010,1.999081


In [7]:
df.shape

(52878537, 11)

In [4]:
%%time
# Set columns to most suitable type to optimize for memory usage
traintypes = {'fare_amount': 'float32',
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8',
              'hour':'uint8',
              'dayofweek':'uint8',
              'month':'uint8',
              'year':'int',
              'distance':'float32'}

cols = list(traintypes.keys())

chunksize = 1000000 # 1 million rows at one go. Or try 10 million


X_train_list = [] # list to hold the batch dataframe
X_test_list = []
y_train_list = []
y_test_list = []

for df in tqdm(pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)):
#     df = df.drop('Unnamed: 0', axis=1)
    df = df.drop('pickup_longitude', axis=1)
    df = df.drop('pickup_latitude', axis=1)
    df = df.drop('dropoff_longitude', axis=1)
    df = df.drop('dropoff_latitude', axis=1)
    
    X = df.drop(["fare_amount"],axis=1)
    y = df.fare_amount
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.005,random_state=123)
    
    X_train_list.append(X_train)
    X_test_list.append(X_test)
    y_train_list.append(y_train)
    y_test_list.append(y_test)
    del X, y, X_train, X_test, y_train, y_test
    

27it [00:49,  1.96s/it]

CPU times: user 1min 13s, sys: 2.38 s, total: 1min 15s
Wall time: 50 s





In [5]:
# Merge all dataframes into one dataframe
X_train = pd.concat(X_train_list)
X_test = pd.concat(X_test_list)
y_train = pd.concat(y_train_list)
y_test = pd.concat(y_test_list)



# Delete the dataframe list to release memory
del X_train_list, X_test_list, y_train_list, y_test_list, df


In [6]:
X_train.shape

(26670328, 6)

In [7]:
y_test.shape

(134022,)

In [8]:
del df

NameError: name 'df' is not defined

In [10]:
def XGBoost(X_train,X_test,y_train,y_test,num_rounds=1000):
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dtest = xgb.DMatrix(X_test,label=y_test)

    return xgb.train(params={
                    'eta':'0.3',
                    'max_depth':'8',
                    'eval_metric':'rmse',
                    'objective':'reg:linear',
                    'tree_method':'approx'}
                    ,dtrain=dtrain,num_boost_round=num_rounds, 
                    early_stopping_rounds=50,evals=[(dtest,'test')],)

In [9]:
?xgb.train()

In [12]:
xgbm = XGBoost(X_train,X_test,y_train,y_test)


[22:28:24] Tree method is selected to be 'approx'
[22:28:40] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 498 extra nodes, 0 pruned nodes, max_depth=8
[0]	test-rmse:10.4083
Will train until test-rmse hasn't improved in 50 rounds.
[22:28:53] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 504 extra nodes, 0 pruned nodes, max_depth=8
[1]	test-rmse:7.84749
[22:29:06] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 508 extra nodes, 0 pruned nodes, max_depth=8
[2]	test-rmse:6.21149
[22:29:18] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 510 extra nodes, 0 pruned nodes, max_depth=8
[3]	test-rmse:5.21877
[22:29:30] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 508 extra nodes, 0 pruned nodes, max_depth=8
[4]	test-rmse:4.65111
[22:29:42] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 506 extra nodes, 0 pruned nodes, max_depth=8
[5]	test-rmse:4.33947
[22:29:54] /works

[22:40:11] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 172 extra nodes, 0 pruned nodes, max_depth=8
[56]	test-rmse:3.97469
[22:40:24] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 396 extra nodes, 0 pruned nodes, max_depth=8
[57]	test-rmse:3.97499
[22:40:36] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 440 extra nodes, 0 pruned nodes, max_depth=8
[58]	test-rmse:3.97505
[22:40:49] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 412 extra nodes, 0 pruned nodes, max_depth=8
[59]	test-rmse:3.9758
[22:41:01] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 432 extra nodes, 0 pruned nodes, max_depth=8
[60]	test-rmse:3.9757
[22:41:13] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 332 extra nodes, 0 pruned nodes, max_depth=8
[61]	test-rmse:3.97589
[22:41:25] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 336 extra nodes, 0 pruned nodes, max_dept

[22:51:32] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 82 extra nodes, 0 pruned nodes, max_depth=8
[112]	test-rmse:3.97353
[22:51:44] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 124 extra nodes, 0 pruned nodes, max_depth=8
[113]	test-rmse:3.97353
[22:51:57] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 368 extra nodes, 0 pruned nodes, max_depth=8
[114]	test-rmse:3.97368
[22:52:09] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 174 extra nodes, 0 pruned nodes, max_depth=8
[115]	test-rmse:3.97365
[22:52:21] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 416 extra nodes, 0 pruned nodes, max_depth=8
[116]	test-rmse:3.97359
[22:52:33] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 148 extra nodes, 0 pruned nodes, max_depth=8
[117]	test-rmse:3.97357
[22:52:45] /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 178 extra nodes, 0 pruned nodes, m

In [None]:
dataset.shape

In [62]:
dataset.to_csv('24mln_taxifare.csv')

In [None]:
dataset.head()

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24191409 entries, 0 to 24191408
Data columns (total 11 columns):
fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dayofweek            int64
year                 int64
hour                 int64
month                int64
distance             float64
dtypes: float64(6), int64(5)
memory usage: 2.0 GB


In [4]:
dataset = dataset.drop('Unnamed: 0', axis=1)

# XGB TUNING 

General Approach for Parameter Tuning
We will use an approach similar to that of GBM here. The various steps to be performed are:

1. Choose a relatively high learning rate. Generally a learning rate of 0.1 works but somewhere between 0.05 to 0.3 should work for different problems. Determine the optimum number of trees for this learning rate. XGBoost has a very useful function called as “cv” which performs cross-validation at each boosting iteration and thus returns the optimum number of trees required.
2. Tune tree-specific parameters ( max_depth, min_child_weight, gamma, subsample, colsample_bytree) for decided learning rate and number of trees. Note that we can choose different parameters to define a tree and I’ll take up an example here.
3. Tune regularization parameters (lambda, alpha) for xgboost which can help reduce model complexity and enhance performance.
4. Lower the learning rate and decide the optimal parameters.

In [12]:
#Import libraries:
import pandas as pd
import numpy as np

from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams


In [23]:
dataset.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dayofweek,year,hour,month,distance
0,4.5,-73.844315,40.721317,-73.84161,40.712276,1,0,2009,17,6,1.030742
1,16.9,-74.016045,40.711304,-73.97927,40.782005,1,1,2010,16,1,8.45
2,5.7,-73.982735,40.76127,-73.99124,40.75056,2,3,2011,0,8,1.389632
3,7.7,-73.98713,40.733143,-73.99157,40.75809,1,5,2012,4,4,2.799211
4,5.3,-73.968094,40.76801,-73.95666,40.783764,1,1,2010,7,3,1.999081


In [14]:
# IDcol = 'ID'
# train = dataset
# target = 'fare_amount'
# predictors = dataset.drop(target, axis=1)

In [15]:
del predictors

NameError: name 'predictors' is not defined

In [34]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['fare_amount'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['fare_amount'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['fare_amount'], dtrain_predprob))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [4]:
del dataset

In [3]:
dataset = pd.read_csv("24mln_taxifare.csv")
dataset = dataset.drop('Unnamed: 0', axis=1)
X = dataset.drop(["fare_amount"],axis=1)
y = dataset.fare_amount
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.005,random_state=123)

In [19]:
def XGBoost(X_train,X_test,y_train,y_test,num_rounds=500):
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dtest = xgb.DMatrix(X_test,label=y_test)

    return xgb.train(
                     params={
                         'objective':'reg:linear',
                         'eval_metric':'rmse'},
                    early_stopping_rounds=20,
                    evals=[(dtest,'test')],)

In [20]:
xgbm = XGBoost(X_train,X_test,y_train,y_test)

NameError: name 'X_train' is not defined

In [36]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'reg:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)


In [37]:
#xgbm = XGBoost(X_train,X_test,y_train,y_test)
predictors = dataset.drop(target, axis=1)

In [None]:
modelfit(xgb1, dataset, predictors)

In [30]:
type(dataset)

pandas.core.frame.DataFrame

In [7]:
from sklearn.externals import joblib 

In [8]:
joblib.dump(xgbm, 'taxi_fare_xgboost_3.9.pkl.pkl')

NameError: name 'xgbm' is not defined

In [10]:
test_1 = pd.read_csv('taxi_fare_test_file.csv')

In [12]:
xgbm_pred = xgbm.predict(xgb.DMatrix(test_1), ntree_limit = xgbm.best_ntree_limit)

ValueError: feature_names mismatch: ['Unnamed: 0', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dayofweek', 'hour', 'month', 'distance'] ['Unnamed: 0', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dayofweek', 'year', 'hour', 'distance']
expected month in input data
training data did not have the following fields: year

In [17]:
test_1 = test_1.drop('Unnamed: 0', axis=1)

KeyError: "labels ['Unnamed: 0'] not contained in axis"

In [20]:
dataset

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dayofweek,hour,month,distance
0,0,4.50,-73.844315,40.721317,-73.841610,40.712276,1,0,17,6,1.030742
1,1,16.90,-74.016045,40.711304,-73.979270,40.782005,1,1,16,1,8.450000
2,2,5.70,-73.982735,40.761270,-73.991240,40.750560,2,3,0,8,1.389632
3,3,7.70,-73.987130,40.733143,-73.991570,40.758090,1,5,4,4,2.799211
4,4,5.30,-73.968094,40.768010,-73.956660,40.783764,1,1,7,3,1.999081
5,5,12.10,-74.000960,40.731630,-73.972890,40.758232,1,3,9,1,3.787118
6,6,7.50,-73.980000,40.751663,-73.973800,40.764843,1,1,20,11,1.555860
7,7,16.50,-73.951300,40.774140,-73.990100,40.751050,1,2,17,1,4.155500
8,8,9.00,-74.006460,40.726710,-73.993080,40.731630,1,0,13,12,1.253181
9,9,8.90,-73.980660,40.733870,-73.991540,40.758137,2,2,1,9,2.849590


In [18]:
test_1

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dayofweek,year,hour,distance
0,-73.973320,40.763805,-73.981430,40.743835,1,1,2015,13,2.323260
1,-73.986860,40.719383,-73.998886,40.739200,1,1,2015,13,2.425353
2,-73.982520,40.751260,-73.979650,40.746140,1,5,2011,11,0.618412
3,-73.981160,40.767807,-73.990450,40.751637,1,5,2012,21,1.960778
4,-73.966050,40.789776,-73.988560,40.744427,1,5,2012,21,5.387280
5,-73.960980,40.765550,-73.979180,40.740050,1,5,2012,21,3.222969
6,-73.949010,40.773205,-73.959625,40.770893,1,3,2011,12,0.929905
7,-73.777280,40.646637,-73.985085,40.759370,1,3,2011,12,21.540153
8,-74.014100,40.709637,-73.995110,40.741364,1,3,2011,12,3.873826
9,-73.969580,40.765520,-73.980680,40.770725,1,1,2014,15,1.099625


In [19]:
xgbm_pred = xgbm.predict(xgb.DMatrix(test_1), ntree_limit = xgbm.best_ntree_limit)

ValueError: feature_names mismatch: ['Unnamed: 0', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dayofweek', 'hour', 'month', 'distance'] ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'dayofweek', 'year', 'hour', 'distance']
expected Unnamed: 0, month in input data
training data did not have the following fields: year

In [132]:
submision = pd.DataFrame(xgbm_pred)

In [133]:
submision.to_csv('submission_kaggle.csv')

In [None]:
submision['key'] = test_1['']

In [136]:
TEST_PATH

'Data/test.csv'

In [137]:
TEST_PATH = 'Data/test.csv'

In [138]:
test_join = pd.read_csv(TEST_PATH)

In [141]:
submision['key'] = test_join['key']

In [143]:
submision.to_csv('submission_kaggle.csv')

In [3]:
TRAIN_PATH = 'Data/54mln_clean.csv'

In [4]:
siema = pd.read_csv(TRAIN_PATH, nrows=200000)

In [5]:
siema


Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,dayofweek,month,year,distance
0,0,4.500000,2009-06-15 17:26:00,-73.844315,40.721317,-73.841614,40.712276,1,17,0,6,2009,1.030742
1,1,16.900000,2010-01-05 16:52:00,-74.016045,40.711304,-73.979271,40.782005,1,16,1,1,2010,8.450000
2,2,5.700000,2011-08-18 00:35:00,-73.982735,40.761269,-73.991241,40.750561,2,0,3,8,2011,1.389632
3,3,7.700000,2012-04-21 04:30:00,-73.987129,40.733143,-73.991570,40.758091,1,4,5,4,2012,2.799211
4,4,5.300000,2010-03-09 07:51:00,-73.968094,40.768009,-73.956657,40.783764,1,7,1,3,2010,1.999081
5,5,12.100000,2011-01-06 09:50:00,-74.000961,40.731628,-73.972893,40.758232,1,9,3,1,2011,3.787118
6,6,7.500000,2012-11-20 20:35:00,-73.980003,40.751663,-73.973801,40.764843,1,20,1,11,2012,1.555860
7,7,16.500000,2012-01-04 17:22:00,-73.951302,40.774139,-73.990097,40.751049,1,17,2,1,2012,4.155500
8,8,9.000000,2012-12-03 13:10:00,-74.006462,40.726711,-73.993080,40.731628,1,13,0,12,2012,1.253181
9,9,8.900000,2009-09-02 01:11:00,-73.980659,40.733871,-73.991539,40.758137,2,1,2,9,2009,2.849590


In [12]:
def proba():
     return 1,2,3 

In [13]:
a,b,c = proba()

In [14]:
a


1

In [15]:
b

2

In [16]:
c

3

In [17]:
for siema in (1,2,3,4,0.3):
    print(siema)

1
2
3
4
0.3
