In [2]:
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from sklearn.ensemble import GradientBoostingRegressor 


In [3]:
# Read the training rows, the test rows and the sample submission from ./data.
# Each file becomes its own DataFrame; nothing is modified at load time.
train = pd.read_csv(filepath_or_buffer="./data/train.csv")
test = pd.read_csv(filepath_or_buffer="./data/test.csv")
samplesub = pd.read_csv(filepath_or_buffer="./data/sample_submission.csv")

## Check Missing Values

In [18]:
# Per-column summary of `train` (one row per column), extended with
#   MV_count - number of missing values in that column
#   MV_ratio - MV_count as a fraction of all rows
# The counts align on column name because the describe() result is transposed.
desc = (
    train.describe(include='all')
    .T
    .assign(MV_count=train.isna().sum())
    .assign(MV_ratio=lambda summary: summary['MV_count'] / len(train))
)
desc

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,MV_count,MV_ratio
TripType,647054,,,,58.5845,157.636,3.0,27.0,39.0,40.0,999.0,0,0.0
VisitNumber,647054,,,,96167.6,55545.5,5.0,49268.0,97074.0,144316.0,191347.0,0,0.0
Weekday,647054,,,,3.4997,2.00343,1.0,2.0,3.0,5.0,7.0,0,0.0
Upc,642925,,,,30607000000.0,91201300000.0,834.0,3400000000.0,7050100000.0,30065300000.0,978971000000.0,4129,0.006381
ScanCount,647054,,,,1.10888,0.700776,-12.0,1.0,1.0,1.0,71.0,0,0.0
DepartmentDescription,645693,68.0,GROCERY DRY GOODS,70402.0,,,,,,,,1361,0.002103
FinelineNumber,642925,,,,3726.88,2780.97,0.0,1404.0,3352.0,5501.0,9998.0,4129,0.006381


## Data Preprocessing

## 1. Imputation

### Upc

In [50]:
# Visits that contain at least one row where BOTH Upc and FinelineNumber are missing.
#
# The two conditions must be combined element-wise with `&`. The earlier form
# `train['Upc'].isnull().any()==True and train['FinelineNumber'].isnull()==True`
# reduced the Upc test to a single scalar, so Python's `and` simply returned the
# FinelineNumber mask (the Upc condition was never applied per row), and it would
# have raised a KeyError through `train[False]` if Upc had no missing values.
missing_upc_and_fineline = train['Upc'].isnull() & train['FinelineNumber'].isnull()

# The lambda only emits a placeholder per group; the informative part of the
# result is its index (the affected VisitNumbers) and its length.
train[missing_upc_and_fineline].groupby('VisitNumber')['Upc'].apply(lambda x:'hi')

VisitNumber
8         hi
259       hi
409       hi
479       hi
484       hi
496       hi
521       hi
585       hi
619       hi
787       hi
845       hi
897       hi
1004      hi
1036      hi
1064      hi
1073      hi
1170      hi
1179      hi
1236      hi
1355      hi
1392      hi
1402      hi
1425      hi
1487      hi
1597      hi
1603      hi
1627      hi
1652      hi
1664      hi
1676      hi
          ..
188212    hi
188234    hi
188248    hi
188267    hi
188355    hi
188391    hi
188403    hi
188502    hi
188580    hi
188619    hi
188661    hi
188673    hi
188743    hi
188760    hi
188775    hi
188839    hi
188896    hi
188898    hi
188901    hi
188907    hi
188940    hi
189509    hi
189818    hi
190133    hi
190182    hi
190230    hi
190309    hi
190408    hi
190651    hi
191080    hi
Name: Upc, Length: 2754, dtype: object

## 2. Encoding

### Weekday

In [12]:
# before encoding: number of rows per weekday label, most frequent first
train.loc[:, 'Weekday'].value_counts()

Sunday       133975
Saturday     122096
Friday        96247
Monday        83130
Tuesday       72529
Wednesday     71115
Thursday      67962
Name: Weekday, dtype: int64

In [16]:
# after
def weekday_preprocessing(dataframe):
    """Encode the 'Weekday' column of ``dataframe`` as integer codes 1-7.

    The codes are NOT calendar order: they follow the frequency ranking of the
    weekday names in the training data (Sunday is the most frequent -> 1,
    Thursday the least frequent -> 7).

    The function is safe to call again on a frame whose 'Weekday' column has
    already been encoded: values that are already valid codes are passed
    through unchanged instead of being turned into NaN. This keeps the
    notebook cell that applies it re-runnable.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a 'Weekday' column holding weekday names and/or codes
        that were produced by an earlier call.

    Returns
    -------
    pandas.Series
        The encoded weekday values, indexed like ``dataframe``. Values that
        are neither a known weekday name nor a valid code become NaN.
        ``dataframe`` itself is not modified.
    """
    weekday_dict={
        'Sunday':1,
        'Saturday':2,
        'Friday':3,
        'Monday':4,
        'Tuesday':5,
        'Wednesday':6,
        'Thursday':7,
    }
    weekday = dataframe['Weekday']
    encoded = weekday.map(weekday_dict)

    # Values that are already one of the target codes (e.g. on a second run of
    # the cell) would be mapped to NaN by the dictionary lookup above.
    already_encoded = weekday.isin(list(weekday_dict.values()))
    if not already_encoded.any():
        return encoded

    restored = encoded.where(~already_encoded, weekday)
    # The lookup yields floats/objects once NaN was involved; go back to plain
    # integers when every value could be resolved.
    if restored.notna().all():
        restored = restored.astype('int64')
    return restored

# Overwrite the weekday names with their integer codes, then look at the
# distribution again: the per-code counts should match the per-name counts
# shown before the encoding.
train['Weekday'] = train.pipe(weekday_preprocessing)
train['Weekday'].value_counts()

1    133975
2    122096
3     96247
4     83130
5     72529
6     71115
7     67962
Name: Weekday, dtype: int64

In [20]:
# Spot-check the last five rows of the frame after the Weekday encoding.
train.tail(n=5)

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
647049,39,191346,1,32390000000.0,1,PHARMACY OTC,1118.0
647050,39,191346,1,7874205000.0,1,FROZEN FOODS,1752.0
647051,39,191346,1,4072.0,1,PRODUCE,4170.0
647052,8,191347,1,4190008000.0,1,DAIRY,1512.0
647053,8,191347,1,3800060000.0,1,GROCERY DRY GOODS,3600.0


### ScanCount

- VisitNumber로 groupby 할때 sum으로 대체

In [19]:
# after preprocessing
# Rebuild the whole summary from the current state of `train`. Only refreshing
# the two MV_* columns on a `desc` built in an earlier cell leaves every other
# statistic (count, mean, unique, ...) stale, and makes this cell fail when it
# is run on a kernel where `desc` does not exist.
desc = train.describe(include='all').transpose()
desc['MV_count'] = train.isnull().sum()
desc['MV_ratio'] = desc['MV_count']/train.shape[0]
desc

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,MV_count,MV_ratio
TripType,647054,,,,58.5845,157.636,3.0,27.0,39.0,40.0,999.0,0,0.0
VisitNumber,647054,,,,96167.6,55545.5,5.0,49268.0,97074.0,144316.0,191347.0,0,0.0
Weekday,647054,,,,3.4997,2.00343,1.0,2.0,3.0,5.0,7.0,0,0.0
Upc,642925,,,,30607000000.0,91201300000.0,834.0,3400000000.0,7050100000.0,30065300000.0,978971000000.0,4129,0.006381
ScanCount,647054,,,,1.10888,0.700776,-12.0,1.0,1.0,1.0,71.0,0,0.0
DepartmentDescription,645693,68.0,GROCERY DRY GOODS,70402.0,,,,,,,,1361,0.002103
FinelineNumber,642925,,,,3726.88,2780.97,0.0,1404.0,3352.0,5501.0,9998.0,4129,0.006381


## Feature importance via Gradient Boosting model

In [44]:
# Split the modelling frame into the target (TripType) and the predictors
# (every other column).
# NOTE(review): `data` is not defined in any cell visible here; presumably it is
# the preprocessed/aggregated version of `train` - confirm where it is built.
y = data.loc[:, 'TripType']
X = data.drop('TripType', axis=1)

In [45]:
# Fit a gradient boosting model on all predictors and time the cell.
start_time=time.time()

gb = GradientBoostingRegressor()
gb.fit(X, y)
# Feature names must come from X, the frame the model was fitted on.
# `data.columns` still contains the target 'TripType', which makes the name
# array one element longer than `gb.feature_importances_` and shifts/mislabels
# the points in the importance plot.
features = X.columns.values

end_time=time.time()
print("total time in the current cell ",end_time-start_time,"s")

total time in the current cell  35.37821698188782 s


In [46]:
# Scatter plot of the fitted model's importance score for every feature.
# NOTE(review): `go` and `py` are not imported in the cells visible here;
# presumably plotly's graph objects and its notebook plotting module - confirm.
trace = go.Scatter(
    x=features,
    y=gb.feature_importances_,
    text=features,  # hover label: the feature name
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1,
        'size': 13,
        # colour each marker by its importance and show the colour bar
        'color': gb.feature_importances_,
        'colorscale': 'Portland',
        'showscale': True,
    },
)

layout = go.Layout(
    title='Gradient Boosting Machine Feature Importance',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    xaxis={
        'ticklen': 5,
        'showgrid': False,
        'zeroline': False,
        'showline': False,
    },
    yaxis={
        'title': 'Feature Importance',
        'ticklen': 5,
        'gridwidth': 2,
        'showgrid': False,
        'zeroline': False,
    },
)

fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig)