In [1]:
import pandas as pd
import numpy as np
from math import radians, atan, tan, sin, acos, cos
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
import sklearn.cross_validation
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [50]:
def getDistance(latA, lonA, latB, lonB):  
    ra = 6378140  # radius of equator: meter  
    rb = 6356755  # radius of polar: meter  
    flatten = (ra - rb) / ra  # Partial rate of the earth  
    # change angle to radians  
    radLatA = radians(latA)  
    radLonA = radians(lonA)  
    radLatB = radians(latB)  
    radLonB = radians(lonB)  
  
    try: 
        pA = atan(rb / ra * tan(radLatA))  
        pB = atan(rb / ra * tan(radLatB))  
        x = acos(sin(pA) * sin(pB) + cos(pA) * cos(pB) * cos(radLonA - radLonB))  
        c1 = (sin(x) - x) * (sin(pA) + sin(pB))**2 / cos(x / 2)**2  
        c2 = (sin(x) + x) * (sin(pA) - sin(pB))**2 / sin(x / 2)**2  
        dr = flatten / 8 * (c1 - c2)  
        distance = ra * (x + dr)  
        return distance  # meter   
    except:
        return 0.0000001
    
def score(latA, lonA, latB, lonB): #计算分数
    res = 0
    for i in range(len(latA)):
        d = getDistance(latA[i], lonA[i], latB[i], lonB[i])
        res += 1. / (1. + np.exp(-(d-1000)/250.))
    return res / len(latA)

In [3]:
good_train = pd.read_csv('D:/bmsz/good_train.csv', low_memory=False)
good_test = pd.read_csv('D:/bmsz/good_test.csv', low_memory=False)
train = pd.read_csv('D:/bmsz/train_new.csv', low_memory=False)
test = pd.read_csv('D:/bmsz/test_new.csv', low_memory=False)

In [4]:
(good_test['start_block'] == -1).sum()
(good_test['out_id'].isin(good_train['out_id'])).sum()

11519

58097

In [15]:
'''
def get_max_dis(df):
    res = 0
    for i in range(len(df)):
        res = max(res, getDistance(df.iloc[i]['start_lat'], df.iloc[i]['start_lon'],df.iloc[i]['end_lat'],df.iloc[i]['end_lon']))
    return res
train.groupby('out_id').apply(lambda x: get_max_dis(x))
'''

out_id
2016061820000b           206944.848219
358962079107966           24909.258931
358962079111695          143699.995891
358962079120563          178366.584784
4A23256745CBA3B0         250413.641220
663321706008871           91845.271091
673691705008931             161.011714
698851707000191          108482.103825
698851707000261          154578.047108
815821712000052          253334.882242
815821801000740          241059.915664
851181601004171          159585.840557
851181601028851           57893.116428
851181601046893          102604.588370
851181601098191          139512.353761
851181601098201          345258.073679
851181601098271           71853.448111
851181601099451          108017.236530
861021508004521           93769.527639
861021508005421           42768.666843
861021508005811          166350.629345
861021509004701           81130.381466
861021509011351          404987.965482
861021509014661          147442.346020
861021509015321           39289.273786
861021509017441   

In [7]:
good_train.groupby('out_id').apply(lambda x: len(x)).reset_index().describe()
train.groupby('out_id').apply(lambda x: len(x)).reset_index().describe()

Unnamed: 0,0
count,5817.0
mean,177.707065
std,98.088275
min,2.0
25%,101.0
50%,156.0
75%,240.0
max,510.0


Unnamed: 0,0
count,5817.0
mean,257.145264
std,101.075236
min,87.0
25%,175.0
50%,238.0
75%,328.0
max,582.0


#### 加入时间特征

In [38]:
def add_time_feature(df):
    #2018/01/01-2018/10/31的节日与特殊工作日
    special_holiday = ['2018-01-01'] + ['2018-02-%d' % d for d in range(15, 22)] + \
                      ['2018-04-%2d' % d for d in range(5, 8)] + \
                      ['2018-04-%d' % d for d in range(29, 31)] + ['2018-05-01'] +\
                      ['2018-06-%d' % d for d in range(16, 19)] + \
                      ['2018-09-%d' % d for d in range(22, 25)] + \
                      ['2018-10-%2d' % d for d in range(1, 8)]
    special_workday = ['2018-02-%d' % d for d in [11, 24]] + \
                      ['2018-04-08'] + ['2018-04-28'] + \
                      ['2018-09-%d' % d for d in range(29, 31)]
    date = 'start_time'
    tmp_col = df[date].map(pd.Timestamp)
    df['hour'] = tmp_col.map(lambda t: t.hour)
    df['half'] = tmp_col.map(lambda t: t.minute // 30)
    df['day'] = tmp_col.map(lambda t: t.dayofweek)
    tmp_date_col = df[date].map(lambda t: t[:10])
    df['is_special_holiday'] = tmp_date_col.isin(special_holiday).astype(int)
    df['normal_weekend'] = (df['day'] >= 5 & ~df['is_special_holiday']).astype(int)
add_time_feature(train)
add_time_feature(test)

In [56]:
train = train.drop(['r_key', 'start_time', 'end_time'], axis=1)
test_r_key = test['r_key']
test = test.drop(['r_key', 'start_time'], axis=1)

In [59]:
train.head()

Unnamed: 0,out_id,start_lat,start_lon,end_lat,end_lon,hour,half,day,is_special_holiday,normal_weekend
0,2016061820000b,33.783415,111.60366,33.779811,111.605885,10,0,5,0,1
1,2016061820000b,34.810763,115.549264,34.814875,115.549374,17,1,0,0,0
2,2016061820000b,34.640284,115.539024,34.813136,115.559243,14,1,1,0,0
3,2016061820000b,34.81828,115.542039,34.813141,115.559217,17,0,1,0,0
4,2016061820000b,34.813278,115.55926,34.786126,115.874361,18,0,1,0,0


In [52]:
car_data_dict = {}
count = 3
for car_id, data in train.groupby('out_id'):
    car_data_dict[car_id] = data.reset_index(), None #第二个用于存放学习器
    count -= 1
    if count == 0: break

In [82]:
def train_one_car(car_id):
    data = car_data_dict[car_id][0]
    blocks_data = np.concatenate([data[['end_lat', 'end_lon']].values, data[['start_lat', 'start_lon']].values])
    db = DBSCAN(eps=5e-3, min_samples=2, p=1, leaf_size=10, n_jobs=-1).fit(blocks_data)
    print(len(data))
    print(len(db.labels_))
    #data['end_block'] = db.labels_
    print(db.labels_)
    print(np.unique(db.labels_))
    print((db.labels_ == -1).sum())

In [83]:
train_one_car('2016061820000b')

128
256
[ 0  1  2  2  3  2  4  5 -1  3  2  6 -1  7  7  7  2  7  2  8  9  3  6 10  6
  3  2  4  2  4  2  4 -1 -1  4 11  2  4  4 12  4 13 14  1 -1  1  2 15  1  4
  2  1  2  4  8 16  2  2 -1 17  2 18 19  2  2  4  4  8  8  3  2  2 19 13  2
 20  4  1 21 22 23 24 25  0  9  4  8  4 14 -1  2  2  4  2 13 -1 -1  4  4  2
  8  2 26 27 28  2  4  4  4  4  4  4 29 -1 15  2  1  8 30  2 31  4  1 32 27
  5  1 12  0  1  4  1  2  3  2 -1  5 -1 -1  2 33  7  7 33  4  7 -1  1  8  9
  3  6 10  6  3  2  4  2  4  2  4 -1  2 17 11  2  2  4  2  2  4  4  2  4  4
  2  4  2  4  2  4  2  4 19 16  4  2  4 17  2 18 19 19  2  2  4  2  4  3  1
  2 19 13  2 20  4 -1 21 22 23 24 25  4  2  4  2  4  2  4  4  2 -1 -1 13  1
  2  4  4  2  8  2 26 27 28  2  2  2  2  2  2 -1 29  4  4  4  4  8 30  4 31
  4  2 32 28  4  2]
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33]
19


In [32]:
b = test['start_time'].map(lambda t: t[:10])

In [71]:
unique(np.array([1,2]))

NameError: name 'unique' is not defined