In [5]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.insert(0, 'src/models/')
sys.path.insert(0, 'src/')
sys.path.insert(0, '../conf')
sys.path.insert(0, '../')
sys.path.insert(0, 'conf/')
from utils import Dataset
from keras.callbacks import Callback
from keras.models import load_model
from keras import backend as K


Using TensorFlow backend.


In [53]:
# Show up to 999 columns / rows so wide or long frames are not truncated in cell output.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, 999)

In [84]:
def _smallest_int_dtype(lo, hi):
    """Return the narrowest NumPy integer dtype holding [lo, hi], or None if a bound is NaN."""
    if lo != lo or hi != hi:
        # NaN bounds only occur for an empty column; there is nothing to size against.
        return None
    if lo >= 0:
        ladder = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        ladder = (np.int8, np.int16, np.int32, np.int64)
    for candidate in ladder[:-1]:
        bounds = np.iinfo(candidate)
        # Inclusive comparison: a dtype's own extreme values are representable in it.
        if bounds.min <= lo and hi <= bounds.max:
            return candidate
    return ladder[-1]


def _safe_float_dtype(series, preferred):
    """Return `preferred`, widened to float32/float64 if the finite values would overflow it."""
    values = np.asarray(series, dtype=np.float64)
    finite = values[np.isfinite(values)]
    peak = np.fabs(finite).max() if len(finite) else 0.0
    for candidate in (preferred, np.float32, np.float64):
        if peak <= np.finfo(candidate).max:
            return candidate
    return np.float64


def reduce_mem_usage(props, na_fill=-999, int_tol=0.01, float_dtype=np.float16):
    """Downcast the numeric columns of `props` to smaller dtypes to save memory.

    The frame is modified IN PLACE and also returned.

    For every numeric column:
      * NaN values are replaced by `na_fill` (integer dtypes cannot hold NaN) and the
        column name is recorded in the returned list;
      * if the column is integral (sum of |value - int(value)| below `int_tol`), it is
        cast to the narrowest signed/unsigned integer dtype that holds its range;
      * otherwise it is cast to `float_dtype`, widened to float32/float64 when the
        values would overflow `float_dtype` and turn into inf.

    Non-numeric columns (object, string, datetime, category, ...) are left untouched.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink; mutated in place.
    na_fill : number, default -999
        Sentinel written in place of NaN.
    int_tol : float, default 0.01
        Tolerance of the "is this column integral" test; tune per task.
    float_dtype : numpy float dtype, default numpy.float16
        Preferred dtype for non-integral columns. float16 keeps only about three
        significant decimal digits, so pass numpy.float32 when precision matters.

    Returns
    -------
    (pandas.DataFrame, list)
        The same frame object and the names of the columns that contained
        non-finite values (NaN or +/-inf).
    """
    # Float divisor so the MB figures are not truncated by Python 2 integer division.
    bytes_per_mb = 1024.0 ** 2
    start_mem_usg = props.memory_usage().sum() / bytes_per_mb
    print("Memory usage of the dataframe is :", start_mem_usg, "MB")

    NAlist = []
    for col in props.columns:
        # Only numeric columns can be downcast; anything else is skipped rather than
        # crashing inside np.isfinite / astype.
        if not pd.api.types.is_numeric_dtype(props[col].dtype):
            continue

        print("**************************")
        print("columns: ", col)
        print("dtype before", props[col].dtype)

        # Integer dtypes do not support NaN, so NaN has to be filled first.
        if not np.isfinite(props[col]).all():
            NAlist.append(col)
            # Assign back instead of fillna(inplace=True) on the selected column,
            # which is not guaranteed to write through to the frame.
            props[col] = props[col].fillna(na_fill)

        # fillna leaves +/-inf alone; such a column cannot be probed as int64.
        still_nonfinite = not np.isfinite(props[col]).all()

        # Take the range AFTER filling so the sentinel is part of it; otherwise a
        # non-negative column with NaNs would be cast to an unsigned dtype while
        # containing a negative sentinel.
        mmax = props[col].max()
        mmin = props[col].min()

        # Test whether the column can be converted to an integer without loss.
        isInt = False
        if not still_nonfinite:
            asint = props[col].astype(np.int64)
            isInt = np.fabs(props[col] - asint).sum() < int_tol

        if isInt:
            target = _smallest_int_dtype(mmin, mmax)
            if target is not None:
                props[col] = props[col].astype(target)
        else:
            props[col] = props[col].astype(_safe_float_dtype(props[col], float_dtype))

        print("dtype after", props[col].dtype)
        print("********************************")
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / bytes_per_mb
    print("Memory usage is: ",mem_usg," MB")
    # Guard the ratio against a zero-byte starting size.
    ratio = 100 * mem_usg / start_mem_usg if start_mem_usg else 0.0
    print("This is ",ratio,"% of the initial size")
    return props, NAlist

# train queries 数据集
##1. 1 个int（sid）、1 个float（pid）和3 个object
##2. 50W条数据
##3. pid 缺失163979
##4. o 中最多的是 116.37,39.86（南三环西路辅路），d 中最多的是 116.32,39.89（西三环南路）；北新华街（116.39,39.91）在 o、d 的前十中均出现

In [27]:
# Load the phase-1 training queries and an address lookup table. The lookup table has a
# "lntlat" column holding coordinate strings in the same "x,y" text form as the o / d
# columns of the queries (presumably reverse-geocoded addresses — TODO confirm the source).
tr_queries = pd.read_csv('../../../input/kdd2019_regular/phase1/train_queries.csv')
lntlat_adress = pd.read_csv('../../../input/kdd2019_regular/phase1/lntlat_adress_6525.csv')

In [46]:
tr_queries.head(2)

Unnamed: 0,sid,pid,req_time,o,d
0,3000821,,2018-11-02 17:54:30,"116.29,39.97","116.32,39.96"
1,3085857,210736.0,2018-11-16 10:53:10,"116.39,39.84","116.33,39.79"


In [29]:
# Python 2 print statement: writes the column dtypes and the describe() summary of the
# numeric columns one after the other on the same output stream.
print tr_queries.dtypes,tr_queries.describe()

sid           int64
pid         float64
req_time     object
o            object
d            object
dtype: object                 sid            pid
count  5.000000e+05  336021.000000
mean   1.762560e+06  158240.721684
std    1.015331e+06   33720.115235
min    2.000000e+00  100000.000000
25%    8.122398e+05  127774.000000
50%    1.848410e+06  161617.000000
75%    2.662461e+06  187360.000000
max    3.396588e+06  216977.000000


In [30]:
tr_queries.isnull().sum()   # ---> number of missing values per column

sid              0
pid         163979
req_time         0
o                0
d                0
dtype: int64

In [31]:
tr_queries.nunique()  # --> distinct values per column (shows whether a column is unique per row)

sid         500000
pid          46191
req_time    464211
o             5460
d             4923
dtype: int64

In [42]:
tr_queries.o.value_counts()[:10]# ---> the ten most frequent origins (column o)

116.37,39.86    5993
116.32,39.89    5930
116.42,39.90    5819
116.41,39.91    5616
116.37,39.91    5246
116.40,39.91    5156
116.39,39.90    4366
116.35,39.94    3674
116.39,39.91    3048
116.41,39.90    3048
Name: o, dtype: int64

In [39]:
# Address rows whose lntlat matches one of the ten most frequent o values.
# Rows come back in lntlat_adress index order, NOT ordered by frequency.
lntlat_adress[lntlat_adress.lntlat.isin(tr_queries.o.value_counts()[:10].index)][['direction','distance','district','lntlat','street','street_number']]

Unnamed: 0,direction,distance,district,lntlat,street,street_number
8,附近,48.0,西城区,"116.39,39.91",北新华街,29号
46,北,60.0,东城区,"116.42,39.90",珠市口东大街,5-17号
64,附近,9.0,西城区,"116.35,39.94",展览馆路,9号
65,附近,41.0,丰台区,"116.32,39.89",西三环南路,64号
71,附近,15.0,东城区,"116.41,39.90",大席胡同,26号
85,,,东城区,"116.41,39.91",东交民巷,
86,,,丰台区,"116.37,39.86",南三环西路辅路,
126,附近,16.0,西城区,"116.39,39.90",后孙公园胡同,15号
204,附近,11.0,西城区,"116.37,39.91",闹市口大街,4-19号-临
419,附近,49.0,西城区,"116.40,39.91",人民大会堂西路,22号


In [44]:
tr_queries.d.value_counts()[:10]   # ---> the ten most frequent values of column d (presumably destinations; the earlier comment said "origins")

116.32,39.89    16503
116.37,39.86    13124
116.42,39.90    11256
116.39,39.90     7575
116.39,39.91     7571
116.41,39.91     6348
116.37,39.91     5665
116.45,39.93     5357
116.27,40.00     4222
116.40,39.93     3782
Name: d, dtype: int64

In [45]:
# Address rows whose lntlat matches one of the ten most frequent d values.
# Rows come back in lntlat_adress index order, NOT ordered by frequency.
lntlat_adress[lntlat_adress.lntlat.isin(tr_queries.d.value_counts()[:10].index)][['direction','distance','district','lntlat','street','street_number']]

Unnamed: 0,direction,distance,district,lntlat,street,street_number
8,附近,48.0,西城区,"116.39,39.91",北新华街,29号
11,附近,24.0,朝阳区,"116.45,39.93",朝外大街,145号
46,北,60.0,东城区,"116.42,39.90",珠市口东大街,5-17号
65,附近,41.0,丰台区,"116.32,39.89",西三环南路,64号
67,,,海淀区,"116.27,40.00",北闸内小桥,
85,,,东城区,"116.41,39.91",东交民巷,
86,,,丰台区,"116.37,39.86",南三环西路辅路,
126,附近,16.0,西城区,"116.39,39.90",后孙公园胡同,15号
204,附近,11.0,西城区,"116.37,39.91",闹市口大街,4-19号-临
247,附近,33.0,西城区,"116.40,39.93",景山西街,50号


# train_plans 数据集

In [47]:
# Load the phase-1 plan table; each row carries the plan list for a query as a text column.
tr_plans = pd.read_csv('../../../input/kdd2019_regular/phase1/train_plans.csv')
# Shape after loading: (491054, 3)

In [61]:
import ast
# Turn the textual plan lists into real Python lists of dicts. literal_eval only accepts
# Python literals, so it is safe on file content. The cell is not re-runnable: a second
# run would hand already-parsed lists to literal_eval.
tr_plans['plans'] = tr_plans['plans'].apply(ast.literal_eval)

In [59]:
tr_plans.shape

(491054, 3)

In [78]:
tr_plans.head()

Unnamed: 0,sid,plan_time,plans
0,1709112,2018-11-04 11:45:04,"[{u'distance': 32303, u'price': 600, u'eta': 5..."
1,3327773,2018-10-16 19:09:29,"[{u'distance': 3427, u'price': 300, u'eta': 62..."
2,3285959,2018-11-25 15:05:22,"[{u'distance': 36234, u'price': 1100, u'eta': ..."
3,1616170,2018-10-03 14:42:40,"[{u'distance': 11450, u'price': 400, u'eta': 2..."
4,351369,2018-10-05 19:52:37,"[{u'distance': 18495, u'price': 600, u'eta': 4..."


In [64]:
# Python 2 print statement: writes the column dtypes followed by the describe() summary
# of the numeric columns.
print tr_plans.dtypes,tr_plans.describe()

sid           int64
plan_time    object
plans        object
dtype: object                 sid
count  4.910540e+05
mean   1.762291e+06
std    1.015707e+06
min    2.000000e+00
25%    8.106305e+05
50%    1.848142e+06
75%    2.663208e+06
max    3.396588e+06


In [81]:
# Expand each query's list of plan dicts into one row per plan. `rank` is the 1-based
# position of the plan inside its list and `sid` ties the row back to its query.
# Only the sid and plans columns are needed, so they are iterated with zip() instead of
# DataFrame.iterrows(), which builds a throwaway Series for every row.
df_tmp_all = []
for sid, plans in zip(tr_plans['sid'], tr_plans['plans']):
    df_tmp = pd.DataFrame(plans)
    df_tmp['rank'] = np.arange(df_tmp.shape[0]) + 1
    df_tmp['sid'] = np.repeat(sid, df_tmp.shape[0])
    df_tmp_all.append(df_tmp)
# NOTE: the per-query 0..k-1 indexes are kept, so index labels repeat in the result.
tr_plans_expand = pd.concat(df_tmp_all)

In [83]:
# Drop the list of per-query frames; its content now lives in tr_plans_expand.
del df_tmp_all

In [85]:
# Downcast the numeric columns. reduce_mem_usage assigns the converted columns back into
# the frame it receives, so tr_plans_expand itself is changed; the returned
# (frame, NAlist) tuple is not stored and is only echoed as the cell output.
reduce_mem_usage(tr_plans_expand)

('Memory usage of the dataframe is :', 121, 'MB')
**************************
('columns: ', 'distance')
('dtype before', dtype('int64'))
('dtype after', dtype('uint32'))
********************************
**************************
('columns: ', 'eta')
('dtype before', dtype('int64'))
('dtype after', dtype('uint32'))
********************************
**************************
('columns: ', 'transport_mode')
('dtype before', dtype('int64'))
('dtype after', dtype('uint8'))
********************************
**************************
('columns: ', 'rank')
('dtype before', dtype('int64'))
('dtype after', dtype('uint8'))
********************************
**************************
('columns: ', 'sid')
('dtype before', dtype('int64'))
('dtype after', dtype('uint32'))
********************************
___MEMORY USAGE AFTER COMPLETION:___
('Memory usage is: ', 65, ' MB')
('This is ', 53, '% of the initial size')


(    distance    eta  price  transport_mode  rank      sid
 0      32303   5087    600               9     1  1709112
 1      33678   3149                      3     2  1709112
 2      33678   3269  10000               4     3  1709112
 3      32099   5996    600               2     4  1709112
 4      30446   6494    900               1     5  1709112
 0       3427    627    300               2     1  3327773
 1       3251    573                      3     2  3327773
 2       3251    693   1600               4     3  3327773
 3       3227    977                      6     4  3327773
 4       3227   2936                      5     5  3327773
 0      36234   9318   1100               7     1  3285959
 1      29545   3843                      3     2  3285959
 2      29545   4143   9100               4     3  3285959
 3      33295  10577    900               1     4  3285959
 0      11450   2801    400               2     1  1616170
 1       9100   2751                      6     2  16161

In [107]:
tr_plans_expand.head()

Unnamed: 0,distance,eta,price,transport_mode,rank,sid
0,32303,5087,600,9,1,1709112
1,33678,3149,0,3,2,1709112
2,33678,3269,10000,4,3,1709112
3,32099,5996,600,2,4,1709112
4,30446,6494,900,1,5,1709112


In [93]:
# Normalise price to an integer column: missing values and empty strings both become 0.
tr_plans_expand['price'] = (
    tr_plans_expand['price']
    .fillna(0)
    .replace('', 0)
    .astype(int)
)

In [94]:
tr_plans_expand.groupby(['transport_mode']).price.mean() #---> mean-price checking

transport_mode
1      502.345424
2      539.708916
3        0.000000
4     5805.797245
5        0.000000
6        0.000000
7      844.193436
8     3085.709651
9      518.866908
10    2640.551983
11     816.554801
Name: price, dtype: float64

In [99]:
# Ratio of mean distance to mean eta per transport mode, from a single groupby pass.
(
    tr_plans_expand
    .groupby(['transport_mode'])[['distance', 'eta']]
    .mean()
    .pipe(lambda avg: avg['distance'] / avg['eta'])
)  #---> speed checking

transport_mode
1     3.738075
2     5.546162
3     8.366171
4     7.735960
5     1.116283
6     3.311160
7     5.338080
8     6.236155
9     5.883500
10    7.079487
11    5.488053
dtype: float64

In [104]:
tr_plans_expand.transport_mode.value_counts()  # --> distribution of transport_mode over all recommended plans

3     477118
4     433568
1     331795
7     237309
2     207019
6     196602
9     135523
5     132812
10     87865
11     24881
8      12330
Name: transport_mode, dtype: int64

In [114]:
tr_plans_expand[tr_plans_expand['rank'] ==1].transport_mode.value_counts() # --> transport_mode distribution among the first-ranked plans

2     145217
7      81656
1      72974
5      51944
9      50675
3      31272
4      22129
10     17149
6       8984
11      6180
8       2874
Name: transport_mode, dtype: int64

In [115]:
tr_plans_expand[tr_plans_expand['rank'] ==2].transport_mode.value_counts() # --> transport_mode distribution among the second-ranked plans

3     378123
6      51459
4      33506
9       9290
10      9225
7       3358
2       2620
1       2062
5        268
11       257
8         59
Name: transport_mode, dtype: int64

##类型猜测
1.价格低，距离和时间成正比 (公交)
    distance    9671.500000
    eta         3504.000000
    price        300.000000
    dj             0.032273
    sd             2.720400
2.价格低距离和时间比较短 (公交)
    distance    14158.000000
    eta          2833.000000
    price         500.000000
    dj              0.033622
    sd              5.033312
3.价格0   距离长，时间短  (自驾)
    distance    12802.000000
    eta          1811.000000
    price           0.000000
    dj              0.000000
    sd              6.948261
4.价格高，距离时间成正比（打车)
    distance    14349.000000
    eta          2151.500000
    price        4400.000000
    dj              0.318007
    sd              6.504455
5. 0价格，距离短 4000米内，时间4000秒内 (步行)
    distance    1792.000000
    eta         1603.000000
    price          0.000000
    dj             0.000000
    sd             1.121212
6. 0价格，距离短 10000米内，时间3000秒内 略快 (自行车)
    distance    3800.500000
    eta         1147.000000
    price          0.000000
    dj             0.000000
    sd             3.312303
7.价格略高(30-40),距离7500以内，时间略长 (地铁)
    distance    22619.000000
    eta          4598.000000
    price         700.000000
    dj              0.034208
    sd              4.847191
8.价格较高（40-140)，距离长，时间略短 快 （打车+步行)
    distance    24463.000000
    eta          4748.000000
    price        2400.000000
    dj              0.106034
    sd              5.222663
9.有的有价格(<25)，有的无;距离长，时间略短 (公交)
    distance    16730.000000
    eta          3073.000000
    price         500.000000
    dj              0.029211
    sd              5.440238
10.价格较低(<10),距离较长（10000-50000),时间<10000 (地铁)
    distance    23016.000000
    eta          3419.000000
    price        2100.000000
    dj              0.107647
    sd              6.738737
11.价格低，距离短（<6000),时间不短 （公交)
    distance    25063.000000
    eta          4933.000000
    price         700.000000
    dj              0.030078
    sd              5.013298
推荐次数多：1 3 4 7
推荐次数少：8 11

In [None]:
# sids that occur in the plan table but not in the query table.
set(tr_plans.sid.values) - set(tr_queries.sid.values)

# train_clicks 数据集

In [100]:
# Load the phase-1 click log.
tr_click = pd.read_csv('../../../input/kdd2019_regular/phase1/train_clicks.csv')

In [101]:
tr_click.shape

(453336, 3)

In [102]:
tr_click.head()

Unnamed: 0,sid,click_time,click_mode
0,2848914,2018-11-17 18:42:17,1
1,2629085,2018-10-12 16:28:13,3
2,602598,2018-11-11 16:38:42,2
3,2022975,2018-10-14 12:28:11,9
4,988425,2018-11-12 16:47:30,7


In [103]:
tr_click.click_mode.value_counts()  # ---> very similar to the mode distribution of the first-ranked plans

2     136491
7      78209
1      70369
9      48864
5      47480
3      24626
10     14882
4      12606
6      11863
11      6089
8       1857
Name: click_mode, dtype: int64