In [1]:
import pandas as pd
import numpy as np
import math
import time
import datetime

from sklearn.linear_model import Ridge
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
import warnings
warnings.filterwarnings('ignore')

In [2]:
path_train = 'train.csv'
path_test = 'test.csv'
path_test_out = 'model/'

In [3]:
train = pd.read_csv(path_train)
test = pd.read_csv(path_test)
print(train.shape)
print(test.shape)

(69306, 10)
(6400, 10)


In [4]:
def label_process(data):
    pre_label = data.drop_duplicates()
    return pre_label['Y'].values
pre_label = label_process(train[["TERMINALNO","Y"]])
pre_label.shape

(100,)

In [5]:
train.head()

Unnamed: 0,TERMINALNO,TIME,TRIP_ID,LONGITUDE,LATITUDE,DIRECTION,HEIGHT,SPEED,CALLSTATE,Y
0,1,1476923580,1,122.985168,41.103741,12,39.402588,2.15,0,0.0
1,1,1476923640,1,122.984398,41.104904,24,39.311157,4.11,0,0.0
2,1,1476923700,1,122.986496,41.106388,74,34.178955,2.99,0,0.0
3,1,1476923760,1,122.989769,41.106884,115,37.765381,7.59,0,0.0
4,1,1476923820,1,122.991089,41.105442,151,36.049194,0.24,0,0.0


In [6]:
train = train.drop('Y',axis=1)

In [7]:
# feature = pd.concat([train,test],keys=['train','test'])
# feature.head()

In [8]:
def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    value = time.localtime(value)
    dt = time.strftime(format, value)
    return dt

In [9]:
def conver_time(data):
    data['Conver_TIME'] = data.TIME.apply(timestamp_datetime)
#     data['month'] = data.Conver_TIME.apply(lambda x: int(x[5:7]))
    data['hour'] = data.Conver_TIME.apply(lambda x: int(x[11:13]))
#     data = data.drop('TIME',axis=1)
#     data = data.drop('Conver_TIME',axis=1)
    return data

In [10]:
train = conver_time(train)
test = conver_time(test)

In [11]:
# train = train.sort_values(["TIME"])
# train.head(50)

In [12]:
def feature_process(data):
    set_data = set(data['TERMINALNO'])
    columns=['p_id',
             'maxTime',
             'phonerisk',
             'dir_risk',
             'height_risk',
             'speed_max',
             'speed_mean',
             'speed_var',
             'height_max',
             'height_mean',
             'height_var',
             'sp_he_mean',
             'zao',
             'wan',
             'shenye',
             'weizhi_ratio',
             'huchu_ratio',
             'huru_ratio',
             'liantong_ratio',
             'duanlian_ratio',
             'call_spped0',
             'call_spped1',
             'call_spped2',
             'call_spped3',
             'call_spped4',
            ]
    feature = pd.DataFrame(columns=columns)
    
    # 针对每个用户进行分析
    for p_id in set_data:
        tempData = data.loc[data['TERMINALNO'] == p_id]
        tempData = tempData.sort_values(["TIME"])

        tempTime = tempData["TIME"].iloc[0]
        tempSpeed = tempData["SPEED"].iloc[0]
        tempDir = tempData["DIRECTION"].iloc[0]
        tempHeight = tempData["HEIGHT"].iloc[0]
        
        # 根据时间信息判断最长时间
        maxTime = 0
        maxTimelist = []

        # 用户行驶过程中，打电话危机上升
        phonerisk = 0

        # Direction 突变超过
        dir_risk = 0

        # Height 高度的危险值
        height_risk = 0
        zao=0
        wan=0
        shenye=0
        
        weizhi = 0
        huchu = 0
        huru = 0
        liantong = 0
        duanlian = 0
        
        for index, row in tempData.iterrows():
            
            hour = row['hour']
            if 7 <= hour <= 9:
                zao = 1
            elif 17 <= hour <= 19:
                wan = 1
            elif 0 <= hour < 7:
                shenye = 1

            if tempSpeed > 0 and row['CALLSTATE'] != 4:
                if row["CALLSTATE"] == 0:
                    phonerisk += math.exp(tempSpeed / 10) * 0.02
                else:
                    phonerisk += math.exp(tempSpeed / 10)
       
            if row["TIME"] - tempTime == 60:
                maxTime += 60
                tempTime = row["TIME"]

                # 判断方向变化程度与具有车速之间的危险系数
                dir_change = (min(abs(row["DIRECTION"] - tempDir), abs(360 + tempDir - row["DIRECTION"])) / 90.0)
                if tempSpeed != 0 and row["SPEED"] > 0:
                    dir_risk += math.pow((row["SPEED"] / 10), dir_change)
                
                # 海拔变化大的情况下和速度的危险系数
                height_risk += math.pow(abs(row["SPEED"] - tempSpeed) / 10,(abs(row["HEIGHT"] - tempHeight) / 100))
                
                tempHeight = row["HEIGHT"]

            elif row["TIME"] - tempTime > 60:
                maxTimelist.append(maxTime)
                maxTime = 0
                tempTime = row["TIME"]

                tempDir = row["DIRECTION"]
                tempHeight = row["HEIGHT"]
                tempSpeed = row["SPEED"]
                
            if row["CALLSTATE"] == 0:
                weizhi += 1
            elif row["CALLSTATE"] == 1:
                huchu += 1
            elif row["CALLSTATE"] == 2:
                huru += 1
            elif row["CALLSTATE"] == 3:
                liantong += 1
            elif row["CALLSTATE"] == 4:
                duanlian += 1
                

#         call0_spped = call_speed.loc[0]
#         call1_spped = call_speed.loc[1]
#         call2_spped = call_speed.loc[2]
#         call3_spped = call_speed.loc[3]
#         call4_spped = call_speed.loc[4]

        speed_max = tempData["SPEED"].max()
        speed_mean = tempData["SPEED"].mean()
        speed_var = tempData["SPEED"].var()
        
        height_max = tempData["HEIGHT"].max()
        height_mean = tempData["HEIGHT"].mean()
        height_var = tempData['HEIGHT'].var()
        
        sp_he_mean = speed_mean * height_mean

        maxTimelist.append(maxTime)
        maxTime = max(maxTimelist)
        
        total_callstate = len(tempData["CALLSTATE"])
        weizhi_ratio = weizhi / float(total_callstate)
        huchu_ratio = huchu / float(total_callstate)
        huru_ratio = huru / float(total_callstate)
        liantong_ratio = liantong / float(total_callstate)
        duanlian_ratio = duanlian / float(total_callstate)
        
        tempfeature = pd.DataFrame({'p_id':p_id,
                                    'maxTime':maxTime,
                                    'phonerisk':phonerisk,
                                    'dir_risk':dir_risk,
                                    'height_risk':height_risk,
                                    'speed_max':speed_max,
                                    'speed_mean':speed_mean,
                                    'speed_var':speed_var,
                                    'height_max':height_max,
                                    'height_mean':height_mean,
                                    'height_var':height_var,
                                    'sp_he_mean':sp_he_mean,
#                                     'call0_spped':call0_spped,
#                                     'call0_spped':call0_spped,
#                                     'call2_spped':call2_spped,
#                                     'call3_spped':call3_spped,
#                                     'call4_spped':call4_spped,
                                    'zao':zao,
                                    'wan':wan,
                                    'shenye':shenye,
                                    'weizhi_ratio':weizhi_ratio,
                                    'huchu_ratio':huchu_ratio,
                                    'huru_ratio':huru_ratio,
                                    'liantong_ratio':liantong_ratio,
                                    'duanlian_ratio':duanlian_ratio
                                    },
                                    index=['0'],
                                    columns=columns
                                    )
        
        CALLSTATE_SET = set(tempData['CALLSTATE'])
        call_speed = tempData["SPEED"].groupby(tempData['CALLSTATE']).mean()
        for call_set in CALLSTATE_SET:
            tempfeature['call_spped'+str(call_set)] = call_speed.loc[call_set]
            
        feature = feature.append(tempfeature,ignore_index=True)

        
    # feature = feature.values
    return feature

In [13]:
feature_train = feature_process(train)
feature_train = feature_train.fillna(method='pad')

feature_test = feature_process(test)
feature_test = feature_test.fillna(method='pad')

In [14]:
print(feature_train.shape)
print(feature_test.shape)

(100, 25)
(100, 25)


In [20]:
feature_train.head()

Unnamed: 0,p_id,maxTime,phonerisk,dir_risk,height_risk,speed_max,speed_mean,speed_var,height_max,height_mean,...,weizhi_ratio,huchu_ratio,huru_ratio,liantong_ratio,duanlian_ratio,call_spped0,call_spped1,call_spped2,call_spped3,call_spped4
0,1.0,3540.0,22.698271,759.505736,253.833059,32.779999,17.48984,138.297543,224.06958,47.848093,...,0.489796,0.0,0.0,0.003401,0.506803,15.924931,,,8.3,19.063912
1,2.0,3180.0,67.556593,3646.161361,376.75398,36.119999,9.287734,153.23893,526.300537,71.400718,...,0.396978,0.0,0.0,0.038462,0.56456,18.403149,,,3.626071,3.26382
2,3.0,3600.0,41.291745,597.444549,840.111283,25.440001,7.987331,37.641736,125.748291,43.942672,...,0.870197,0.0,0.001038,0.0,0.128764,7.940239,,0.0,3.626071,8.37
3,4.0,2700.0,110.310492,659.554444,656.575946,33.310001,6.312753,39.532636,115.885498,31.65925,...,0.228535,0.007576,0.003788,0.008838,0.751263,5.144309,4.985,22.036667,4.367143,6.625193
4,5.0,5520.0,39.709307,616.796195,776.837669,53.48,7.695846,67.545555,117.702576,29.461915,...,1.0,0.0,0.0,0.0,0.0,7.695846,4.985,22.036667,4.367143,6.625193


In [30]:
call_speed

CALLSTATE
0    5.505000
1    6.105000
3    8.355000
4    7.013714
Name: SPEED, dtype: float64

In [16]:
#########################################################################################################################
# Training Data

In [17]:
feature_train = feature_train.values
feature_test = feature_test.values

In [18]:
print(feature_train.shape)
print(feature_test.shape)

(100, 18)
(100, 18)


In [19]:
################################################################### 线性模型 ###############################################
# linreg = Ridge(normalize=True,max_iter=2000,solver="sparse_cg")
# linreg.fit(feature_train[:,1:],pre_label)
# predict_y_Ridge = linreg.predict(feature_test[:, 1:])

In [20]:
def xgb_model(X_train, y_train, X_test):
    model = xgb.XGBRegressor(
        learning_rate=0.001,
        n_estimators=1800,
        max_depth=6,
        min_child_weight=5,
        seed=0,
        subsample=0.8,
        colsample_bytree=0.3,
        gamma=0.1,
        reg_alpha=3,
        reg_lambda=1,
        metrics='auc')
    model.fit(X_train, y_train)
    result = model.predict(X_test)
    return result

In [21]:
predict_y = xgb_model(feature_train[:,1:], pre_label, feature_test[:,1:])

In [22]:
print('***************** Sub Data *********************')
submission = pd.DataFrame(columns=['Id','Pred'])
submission['Id'] = feature_test[:,0]
submission['Pred'] = predict_y
submission.to_csv(path_test_out+ 'sub.csv',index=False)

***************** Sub Data *********************
