# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction
***
- 使用並觀察特徵組合在計程車費率預測競賽的影響

In [69]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

# Location of the course data set; the Windows-style path is kept as configured.
data_path = r'C:\Users\user\Desktop\GitHub\ML100-Days\alldata'
taxi_csv = data_path + r'\taxi_data1.csv'
df = pd.read_csv(taxi_csv)

# Set the regression target aside so that df holds feature columns only.
target_column = 'fare_amount'
train_Y = df[target_column]
df = df.drop(columns=[target_column])
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [70]:
# Time-feature decomposition: parse the timestamp column once, then read each
# calendar/clock component through the vectorized .dt accessor instead of
# formatting every row back to a string and casting it to int.
# The raw values look like '2011-10-21 23:54:10 UTC'; the trailing ' UTC' is
# matched as literal text, so the parsed column is timezone-naive.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')

# Cast explicitly so the new columns are int64 whatever integer width .dt yields.
pickup_dt = df['pickup_datetime'].dt
df['pickup_year'] = pickup_dt.year.astype('int64')
df['pickup_month'] = pickup_dt.month.astype('int64')
df['pickup_day'] = pickup_dt.day.astype('int64')
df['pickup_hour'] = pickup_dt.hour.astype('int64')
df['pickup_minute'] = pickup_dt.minute.astype('int64')
df['pickup_second'] = pickup_dt.second.astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [71]:
# Score the current features with linear regression and gradient boosting.

# Drop the raw timestamp column so only numeric columns reach the scaler.
df = df.drop(columns=['pickup_datetime'])

# NOTE(review): the scaler is fit on every row before cross-validation, so the
# min/max it learns are shared by all folds.
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df)

# Each printed score is the mean over 5 cross-validation folds.
Linear = LinearRegression()
linear_cv = cross_val_score(Linear, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_cv.mean()}')

GDBT = GradientBoostingRegressor()
gdbt_cv = cross_val_score(GDBT, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {gdbt_cv.mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.026876871475636888
Gradient Boosting Reg Score : 0.7108861722668829


In [80]:
# Add three features: longitude difference, latitude difference, and the
# straight-line distance between pickup and dropoff in coordinate space.
lon_delta = df['dropoff_longitude'] - df['pickup_longitude']
lat_delta = df['dropoff_latitude'] - df['pickup_latitude']
df['longitude_diff'] = lon_delta
df['latitude_diff'] = lat_delta
df['distance_2D'] = (lon_delta**2 + lat_delta**2)**0.5

# Disabled experiment: bucket pickup_month into four seasons.
# df['season'] = df['pickup_month'].apply(lambda x: 1 if x<4 else ( 2 if x >=4 and x<7 else (3 if x>=7 and x<10 else 4))).astype('int64')

# Preview the new columns next to the coordinates they were derived from.
preview_columns = [
    'distance_2D',
    'longitude_diff',
    'latitude_diff',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]
df[preview_columns].head()

Unnamed: 0,distance_2D,longitude_diff,latitude_diff,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,0.009761,0.009452,-0.002437,-73.99058,40.761071,-73.981128,40.758634
1,0.018307,-0.001244,0.018265,-73.988403,40.723431,-73.989647,40.741695
2,0.00814,0.003756,-0.007222,-74.015785,40.71511,-74.012029,40.707888
3,0.021056,0.019292,-0.008437,-73.977322,40.787275,-73.95803,40.778838
4,0.032964,0.007193,0.03217,-73.989683,40.729717,-73.98249,40.761887


In [73]:
# Result: accuracy goes up.
# Refit the scaler on the widened feature table, then repeat both 5-fold scores.
train_X = scaler.fit_transform(df)

linear_cv = cross_val_score(Linear, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_cv.mean()}')

gdbt_cv = cross_val_score(GDBT, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {gdbt_cv.mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.0271910027355875
Gradient Boosting Reg Score : 0.8046304110869194


# 作業1
* 參考今日教材，試著使用經緯度一圈的長度比這一概念，組合出一個新特徵，再觀察原特徵加上新特徵是否提升了正確率?
<br />
#### ANS: 否

In [74]:
from math import radians, cos, sin, asin, sqrt
 
def circle_distance(lon1, lat1, lon2, lat2, radius_km=6371):
    """Return the great-circle distance between two points, in metres.

    The distance is computed with the haversine formula on a sphere.

    Args:
        lon1: Longitude of the first point, in degrees.
        lat1: Latitude of the first point, in degrees.
        lon2: Longitude of the second point, in degrees.
        lat2: Latitude of the second point, in degrees.
        radius_km: Radius of the sphere in kilometres. Defaults to 6371,
            the Earth radius this function previously hard-coded.

    Returns:
        The distance along the sphere's surface in metres. NaN inputs
        propagate to a NaN result.
    """
    # Convert degrees to radians.
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can leave `a` marginally outside [0, 1], which
    # would make sqrt/asin raise ValueError. Plain comparisons are used for the
    # clamp so that a NaN value falls through unchanged.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    central_angle = 2 * asin(sqrt(a))
    # Kilometres to metres.
    return central_angle * radius_km * 1000
# Compute the great-circle distance for every trip and store it as a feature.
trip_endpoints = zip(
    df['pickup_longitude'],
    df['pickup_latitude'],
    df['dropoff_longitude'],
    df['dropoff_latitude'],
)
Dist = [circle_distance(*endpoints) for endpoints in trip_endpoints]
df['Real_distance'] = Dist
df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,longitude_diff,latitude_diff,distance_2D,season,Real_distance
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,0.009452,-0.002437,0.009761,4,840.94978
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,-0.001244,0.018265,0.018307,1,2033.65113
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0.003756,-0.007222,0.00814,1,863.19813
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,0.019292,-0.008437,0.021056,2,1875.760391
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,0.007193,0.03217,0.032964,2,3628.101569


In [81]:
# Observe the result: rescale the full feature table (now including
# Real_distance) and repeat both 5-fold scores.
train_X = scaler.fit_transform(df)

linear_cv = cross_val_score(Linear, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_cv.mean()}')

gdbt_cv = cross_val_score(GDBT, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {gdbt_cv.mean()}')

  return self.partial_fit(X, y)


Linear Reg Score : 0.3670790275352388
Gradient Boosting Reg Score : 0.8052786861331536


# 作業2
* 試著只使用新特徵估計目標值(忽略原特徵)，效果跟作業1的結果比較起來效果如何?

In [100]:
# Score both models using the Real_distance column as the only feature.
distance_only = df[['Real_distance']]
train_X = scaler.fit_transform(distance_only)

linear_cv = cross_val_score(Linear, train_X, train_Y, cv=5)
print(f'Linear Reg Score : {linear_cv.mean()}')

gdbt_cv = cross_val_score(GDBT, train_X, train_Y, cv=5)
print(f'Gradient Boosting Reg Score : {gdbt_cv.mean()}')

Linear Reg Score : 0.0011536096142396256
Gradient Boosting Reg Score : 0.715704780543987


In [107]:
import statsmodels.api as sm
# Build the design matrix from the unscaled feature table plus a constant column.
train_X = sm.add_constant(df)

# Ordinary least squares fit; print the full regression summary.
OLS = sm.OLS(endog=train_Y, exog=train_X).fit()
print(OLS.summary())

# GLSAR fit with rho=1, limited to a single iteration of iterative_fit.
glsar_model = sm.GLSAR(train_Y, train_X, rho=1).iterative_fit(maxiter=1)
glsar_summary = glsar_model.summary()
print(f'\n\n{glsar_summary}')

                            OLS Regression Results                            
Dep. Variable:            fare_amount   R-squared:                       0.399
Model:                            OLS   Adj. R-squared:                  0.397
Method:                 Least Squares   F-statistic:                     236.3
Date:                Mon, 14 Jan 2019   Prob (F-statistic):               0.00
Time:                        20:53:34   Log-Likelihood:                -17210.
No. Observations:                5000   AIC:                         3.445e+04
Df Residuals:                    4985   BIC:                         3.455e+04
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const             -1142.9934    117.78

In [88]:
# Notebook cell that echoes the current value of `rho`.
# NOTE(review): `rho` is not assigned in any cell visible here — confirm where it is defined.
rho

4.075220522079644

In [93]:
# Notebook cell that echoes the current value of `order`.
# NOTE(review): `order` is not assigned in any cell visible here — confirm where it is defined.
order

array([[   0,    1,    2, ..., 4997, 4998, 4999],
       [   1,    0,    1, ..., 4996, 4997, 4998],
       [   2,    1,    0, ..., 4995, 4996, 4997],
       ...,
       [4997, 4996, 4995, ...,    0,    1,    2],
       [4998, 4997, 4996, ...,    1,    0,    1],
       [4999, 4998, 4997, ...,    2,    1,    0]])