In [12]:
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import lightgbm as lgb
import datetime
from sklearn.model_selection import train_test_split

In [2]:
# Load the two training tables from the working directory.
# `flow` is the table every later cell reads; the columns used from it are
# date_dt, district_code, city_code, dwell, flow_in and flow_out.
flow = pd.read_csv("flow_train.csv")
# `transition` is loaded here but not referenced again in the visible cells.
transition = pd.read_csv("transition_train.csv")

In [3]:
# Sorted lists of the distinct keys found in `flow`; an item's position in its
# list is later used as that item's integer id.
districts = sorted(flow['district_code'].unique())
dates = sorted(flow['date_dt'].unique())
cities = sorted(flow['city_code'].unique())

# district_code -> city_code, taken from the distinct (district, city) pairs.
district_city_pairs = flow[['district_code', 'city_code']].drop_duplicates()
d2c = dict(district_city_pairs.values)
def dateStrlist(year,month, day, lenth):
    """Return `lenth` consecutive calendar days formatted as 'YYYYMMDD'.

    The first entry is the date (year, month, day); every following entry is
    one day later than the previous one. A `lenth` of zero or less yields an
    empty list.
    """
    return [
        (datetime.date(year, month, day) + datetime.timedelta(days=offset)).strftime('%Y%m%d')
        for offset in range(lenth)
    ]

In [4]:
# Integer-encode the key columns of a copy of `flow`: every value is replaced
# by its position in the matching sorted list (dates / districts / cities).
# A dict lookup is used instead of list.index, which rescans the whole list
# for every row of the table.
date_to_idx = {value: idx for idx, value in enumerate(dates)}
district_to_idx = {value: idx for idx, value in enumerate(districts)}
city_to_idx = {value: idx for idx, value in enumerate(cities)}

trans = flow.copy()
trans['date_dt'] = trans['date_dt'].map(date_to_idx)
trans['district_code'] = trans['district_code'].map(district_to_idx)
trans['city_code'] = trans['city_code'].map(city_to_idx)

In [5]:
# Build one wide table of log-scaled series and their lags.
# Row r corresponds to the r-th date (ascending date_dt). For every district
# index d and every measure m in (dwell, flow_in, flow_out):
#   "<m><d>_b0"  = log(value + 1) on that date
#   "<m><d>_b<i>" = the same series i rows earlier, for i = 1..29
# The first i rows of a lag-i column have no history and are filled with 0.
#
# The columns are collected in a dict and the frame is created once at the
# end: inserting thousands of columns one at a time is slow and leaves the
# DataFrame fragmented.
measures = ('dwell', 'flow_in', 'flow_out')
lag_columns = {}
for d in range(len(districts)):
    # Filter and sort this district's rows once instead of once per measure.
    district_rows = trans[trans.district_code == d].sort_values(by = 'date_dt')
    logged = {m: np.log(np.array(district_rows[m] + 1)) for m in measures}
    for m in measures:
        lag_columns['%s%d_b0' % (m, d)] = logged[m]
    for i in range(1, 30):
        for m in measures:
            lag_columns['%s%d_b%d' % (m, d, i)] = np.concatenate([np.zeros(i), logged[m][:-i]])

# `columns=` pins the insertion order explicitly (b0 columns first, then lag 1,
# lag 2, ... within each district), matching the order of the original loop.
df = pd.DataFrame(lag_columns, columns = list(lag_columns))

In [6]:
# Calendar-like features derived purely from the row position in `df`
# (row 0 is the first date); no real calendar lookup is involved.
row_pos = np.arange(len(df))

# w0..w6: one-hot encoding of the row position modulo 7 (a 7-row cycle).
for slot in range(7):
    df["w%d" % slot] = (row_pos % 7 == slot).astype(np.int8)

# md / mm: position inside, and index of, consecutive 30-row blocks.
df["md"] = row_pos % 30
df["mm"] = row_pos // 30

In [21]:
def featureExtractor(fe, dist, col, n_districts=98):
    """Assemble the model features for one (district, measure) series.

    Parameters
    ----------
    fe : pandas.DataFrame
        Rows of the wide lag table. It must contain "<col><dist>_b0" ..
        "<col><dist>_b29", "<col><i>_b0" for every i in range(n_districts),
        "w0" .. "w6", "md" and "mm".
    dist : int
        Integer index of the district whose lags are expanded.
    col : str
        Measure prefix used in the column names (e.g. 'dwell').
    n_districts : int, optional
        How many districts' "<col><i>_b0" columns are copied as
        cross-district features. Defaults to 98, the value that used to be
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed like `fe`, with columns in this order: for each
        lag j = 1..29 the lag itself followed by its first, second (j > 1)
        and third (j > 2) finite differences ending at lag j; then
        max/min/mean/std over the most recent 3, 7, 15 and 30 values; then
        w0..w6; then the current value of every district; then md and mm.
    """
    def lag(j):
        # Series holding the value j rows back for the selected district.
        return fe['%s%d_b%d' % (col, dist, j)]

    # Columns are gathered in a dict and the frame is built once: inserting
    # a couple of hundred columns one by one is slow and fragments the frame.
    features = {}

    for j in range(1, 30):
        features["%s%d_b%d" % (col, dist, j)] = lag(j)

        # First-order difference (j starts at 1, so lag j-1 always exists).
        features["delt%d_%d" % (j-1, j)] = lag(j-1) - lag(j)

        # Second-order difference.
        if j > 1:
            features["delt%d_%d" % (j-2, j)] = lag(j-2) + lag(j) - 2*lag(j-1)

        # Third-order difference.
        if j > 2:
            features["delt%d_%d" % (j-3, j)] = lag(j-3) - 3*lag(j-2) + 3*lag(j-1) - lag(j)

    # Rolling statistics over the most recent `span` values (lags 0..span-1).
    for span in [3,7,15,30]:
        window = fe[['%s%d_b%d' % (col, dist, j) for j in range(span)]]
        features["max%d" % span] = window.max(axis = 1)
        features["min%d" % span] = window.min(axis = 1)
        features["mean%d" % span] = window.mean(axis = 1)
        features["std%d" % span] = window.std(axis = 1)

    # Positional 7-row-cycle indicators.
    for i in range(7):
        features["w%d" % i] = fe["w%d" % i]

    # Current value of the same measure in every district.
    for i in range(n_districts):
        features["%s%d_b0" % (col, i)] = fe["%s%d_b0" % (col, i)]

    features['md'] = fe['md']
    features['mm'] = fe['mm']

    # Explicit index/columns keep the row labels of `fe` and the insertion
    # order above, which is the column order the models are trained on.
    return pd.DataFrame(features, index = fe.index, columns = list(features))

# 分别建模预测

In [None]:
# Direct multi-horizon forecasting: for every horizon (days ahead), district
# and measure, fit one LightGBM regressor that maps the features of row t to
# the log-scaled value at row t + horizon, then predict from the last row.
scores = []        # validation MSE of every fitted model
pred_result= []    # rows of [horizon, district index, dwell, flow_in, flow_out] (log scale)

start_index = 30   # skip the earliest rows, whose lag columns are zero-padded
eval_days = 15     # > 0: hold out validation data and report RMSE; <= 0: fit on all rows
horizon_days = 15  # horizons 1..15, matching the 15 dates generated for the submission

# Features of the last observed row depend only on (district, measure), not on
# the horizon, so they are computed once and reused.
latest_features = {}

for day in range(1, horizon_days + 1):
    for d in range(len(districts)):
        pred_values = []
        for col in ['dwell', 'flow_in', 'flow_out']:
            predict_span = day
            # Target: the series shifted `predict_span` rows into the future.
            label = df["%s%d_b0"%(col,d)][start_index + predict_span:]
            feature = featureExtractor(df[start_index:-predict_span], d, col)
            if eval_days <= 0:
                lgbr = lgb.LGBMRegressor(n_jobs = -1, learning_rate = 0.1, num_leaves = 5, subsample = 0.63, random_state = 2018, n_estimators = 50, colsample_bytree = 0.23, min_child_samples = 20, objective ="regression")
                lgbr.fit(feature, label)
            else:
                # Seeded so that the reported scores are reproducible.
                # NOTE(review): a random split of overlapping lagged rows puts
                # near-duplicates of validation rows into training, and the same
                # split also drives early stopping, so the RMSE printed below is
                # likely optimistic. A chronological hold-out (the last
                # `eval_days` rows) would avoid this.
                train_x, eval_x, train_y, eval_y = train_test_split(feature, label, test_size = 0.3, random_state = 2018)
                lgbr = lgb.LGBMRegressor(n_jobs = -1, learning_rate = 0.02, num_leaves = 5, subsample = 0.63, random_state = 2018, n_estimators = 300, colsample_bytree = 0.23, min_child_samples = 20, objective ="regression")
                lgbr.fit(train_x, train_y, eval_set = (eval_x, eval_y), early_stopping_rounds = 20, verbose  = False, eval_metric = 'rmse')
                pred = lgbr.predict(eval_x)
                mse = np.mean(np.square(pred - eval_y))
                print(d, col, np.sqrt(mse), end = '\t')
                scores.append(mse)

            # Predict for the online test from the most recent row.
            if (d, col) not in latest_features:
                latest_features[(d, col)] = featureExtractor(df[-1:], d, col)
            test_x = latest_features[(d, col)]
            pred = lgbr.predict(test_x)
            pred_values.append(pred[0])

        pred_result.append([day, d, *pred_values])
        print(end = ".")
    print("\n", day)
print(np.sqrt(np.mean(scores)))

0 dwell 0.08796120229561617	0 flow_in 0.18939057648350013	0 flow_out 0.1777987160029279	.1 dwell 0.14163623051710028	1 flow_in 0.15541085418915307	1 flow_out 0.17341861808989847	.2 dwell 0.0904053325231948	2 flow_in 0.22020335962205234	2 flow_out 0.17755079526926362	.3 dwell 0.14862307660077442	3 flow_in 0.2158159605618133	3 flow_out 0.21379920430368862	.4 dwell 0.1473923549124637	4 flow_in 0.20880347981039094	4 flow_out 0.15073727442611246	.5 dwell 0.1296320649063666	5 flow_in 0.17830237143134897	5 flow_out 0.1662367143292317	.6 dwell 0.1162943888244019	6 flow_in 0.1805513403939677	6 flow_out 0.13283242720125826	.7 dwell 0.14672162627213667	7 flow_in 0.1257748723342385	7 flow_out 0.1201988398613064	.8 dwell 0.18194851093590755	8 flow_in 0.18917324232622365	8 flow_out 0.16297584788040748	.9 dwell 0.08205772504916749	9 flow_in 0.19414862926226797	9 flow_out 0.22161545020472403	.10 dwell 0.15059985651019717	10 flow_in 0.14590895797669615	10 flow_out 0.20116416463982978	.11 dwell 0.118624

89 flow_in 0.17289782455157357	89 flow_out 0.14908262532373068	.90 dwell 0.09688858181785649	90 flow_in 0.14988367854214818	90 flow_out 0.1930360060164631	.91 dwell 0.10422064721960594	91 flow_in 0.12637793805026323	91 flow_out 0.13695590956646855	.92 dwell 0.11131690924353317	92 flow_in 0.12825675511714824	92 flow_out 0.14578840435856352	.93 dwell 0.0894788715647674	93 flow_in 0.17716386651774246	93 flow_out 0.20275723190385492	.94 dwell 0.09297645248568452	94 flow_in 0.14298053008726727	94 flow_out 0.14788946552543958	.95 dwell 0.12609907210378424	95 flow_in 0.20684302046141279	95 flow_out 0.22026893429152894	.96 dwell 0.10894537099327935	96 flow_in 0.2187301721587604	96 flow_out 0.1951717365591377	.97 dwell 0.10517051740757442	97 flow_in 0.07066599976266977	97 flow_out 0.08062213399019076	.
 1


0 dwell 0.13534963155595917	0 flow_in 0.2038283996528442	0 flow_out 0.1765232718119226	.1 dwell 0.14101051108423293	1 flow_in 0.13354824421103084	1 flow_out 0.13496771709615046	.2 dwell 0.1320190152495475	2 flow_in 0.20933518780518945	2 flow_out 0.22644703938492527	.3 dwell 0.11797143936285945	3 flow_in 0.23679968376090674	3 flow_out 0.24953055574547914	.4 dwell 0.12251611416518014	4 flow_in 0.2103306909449868	4 flow_out 0.16689338335730564	.5 dwell 0.17333894889249193	5 flow_in 0.16127316262199062	5 flow_out 0.17709142213533355	.6 dwell 0.12663825296428213	6 flow_in 0.13909866512396712	6 flow_out 0.12398553595052031	.7 dwell 0.2184137770944919	7 flow_in 0.20673629019083323	7 flow_out 0.15083398571219112	.8 dwell 0.2367249062031093	8 flow_in 0.20107351229079473	8 flow_out 0.1965015683533711	.9 dwell 0.10500632754798396	9 flow_in 0.23567403760920733	9 flow_out 0.1823590978601634	.10 dwell 0.1834979128996748	10 flow_in 0.2304829778834182	10 flow_out 0.19538100378567436	.11 dwell 0.086661

89 flow_in 0.14201509937541973	89 flow_out 0.18347752150876023	.90 dwell 0.13556060954986218	90 flow_in 0.18513910371838826	90 flow_out 0.21959628314486684	.91 dwell 0.11951080478063783	91 flow_in 0.11697481106533725	91 flow_out 0.14104722042364942	.92 dwell 0.14960566769548578	92 flow_in 0.15833825634496124	92 flow_out 0.17644565596909917	.93 dwell 0.143112224802021	93 flow_in 0.17046661060819576	93 flow_out 0.1715636809791687	.94 dwell 0.09974102599448677	94 flow_in 0.21048559715874732	94 flow_out 0.20984579856980318	.95 dwell 0.11949893427478984	95 flow_in 0.21674210812763678	95 flow_out 0.21884486853079535	.96 dwell 0.11650639383772578	96 flow_in 0.1473854484258594	96 flow_out 0.1930095863389694	.97 dwell 0.05278084701949965	97 flow_in 0.11978056462979218	97 flow_out 0.11897826068241507	.
 2


0 dwell 0.16076490983662337	0 flow_in 0.1813336385552128	0 flow_out 0.18268483950862427	.1 dwell 0.1479118935745131	1 flow_in 0.16041303949709626	1 flow_out 0.15905988476715252	.2 dwell 0.20461210478886627	2 flow_in 0.25085610148523074	2 flow_out 0.25084038460915287	.3 dwell 0.106516687895757	3 flow_in 0.24174735991700624	3 flow_out 0.21639167727279818	.4 dwell 0.14173747963522593	4 flow_in 0.22939501847121127	4 flow_out 0.17950182508660273	.5 dwell 0.16342671787557173	5 flow_in 0.16115417694874698	5 flow_out 0.22129089382899694	.6 dwell 0.11284511711147675	6 flow_in 0.14484209838401157	6 flow_out 0.16166271574890229	.7 dwell 0.0944483218303436	7 flow_in 0.1767462966878136	7 flow_out 0.21415578939633637	.8 dwell 0.08783199913874157	8 flow_in 0.21154139323848864	8 flow_out 0.15827764779934064	.9 dwell 0.13735558238924447	9 flow_in 0.25878965958231986	9 flow_out 0.20050501091197326	.10 dwell 0.18478488216859879	10 flow_in 0.22601446113072376	10 flow_out 0.2400271919642953	.11 dwell 0.125

89 flow_in 0.16211497741672098	89 flow_out 0.17121215763319977	.90 dwell 0.13267382747225487	90 flow_in 0.139607898141119	90 flow_out 0.19178979813584895	.91 dwell 0.11586135325311946	91 flow_in 0.13098613513188537	91 flow_out 0.15033600276665693	.92 dwell 0.16704719811577273	92 flow_in 0.1781686192640363	92 flow_out 0.12936789062791	.93 dwell 0.09137601331431022	93 flow_in 0.2208308411515051	93 flow_out 0.17194427567236406	.94 dwell 0.13595321633907537	94 flow_in 0.16918247624929442	94 flow_out 0.17805701447186198	.95 dwell 0.14060907635356376	95 flow_in 0.17872197392361494	95 flow_out 0.21093410210105507	.96 dwell 0.17571595371302723	96 flow_in 0.21294210490000692	96 flow_out 0.18799538561736226	.97 dwell 0.061625218259457513	97 flow_in 0.1778971013563492	97 flow_out 0.11531289427011723	.
 3


0 dwell 0.2355188073190747	0 flow_in 0.19090303971234535	0 flow_out 0.17272565627896966	.1 dwell 0.1448160306053656	1 flow_in 0.15221217397022324	1 flow_out 0.16331519999051006	.2 dwell 0.1604385163007306	2 flow_in 0.23727724023834415	2 flow_out 0.2606449721575398	.3 dwell 0.09540421881935597	3 flow_in 0.19802378564855627	3 flow_out 0.27183081724679936	.4 dwell 0.13102354500165203	4 flow_in 0.2053280411912927	4 flow_out 0.17131980742914907	.5 dwell 0.16046896879212405	5 flow_in 0.18880126227299812	5 flow_out 0.1713422785604108	.6 dwell 0.11566293056202961	6 flow_in 0.17990821358204245	6 flow_out 0.14056958120086935	.7 dwell 0.12443409749098096	7 flow_in 0.13982166343801736	7 flow_out 0.18336140592189873	.8 dwell 0.16923867978370197	8 flow_in 0.2119882036846819	8 flow_out 0.15825208239823887	.9 dwell 0.15640641408756134	9 flow_in 0.18155455464182055	9 flow_out 0.20842241044633808	.10 dwell 0.1349074232239994	10 flow_in 0.24590426145560945	10 flow_out 0.24948008426340484	.11 dwell 0.1256

89 flow_in 0.1526486963570232	89 flow_out 0.16866592733021582	.90 dwell 0.15751533652714092	90 flow_in 0.17096541465521062	90 flow_out 0.14942071736198437	.91 dwell 0.13219814999794943	91 flow_in 0.13484069672508675	91 flow_out 0.15982324097670914	.92 dwell 0.11857176742910241	92 flow_in 0.22036253415250462	92 flow_out 0.15424603715802668	.93 dwell 0.0864593828454666	93 flow_in 0.20893253330125341	93 flow_out 0.17430035182234346	.94 dwell 0.14478695396195565	94 flow_in 0.1694088074684212	94 flow_out 0.1620764879317399	.95 dwell 0.10038241539900067	95 flow_in 0.16628479277480457	95 flow_out 0.18260252552765194	.96 dwell 0.11077840330295588	96 flow_in 0.17879557533687684	96 flow_out 0.1337148872882956	.97 dwell 0.06926894029851859	97 flow_in 0.12914772379868136	97 flow_out 0.14347341250251797	.
 4


0 dwell 0.15260455354750527	0 flow_in 0.15713109971545655	0 flow_out 0.2177137435754578	.1 dwell 0.12680141191761063	1 flow_in 0.1663190603525994	1 flow_out 0.1531201124243793	.2 dwell 0.1728676564880237	2 flow_in 0.22457714077317703	2 flow_out 0.2366138976171852	.3 dwell 0.143513347340008	3 flow_in 0.16994989436508173	3 flow_out 0.2476816043344194	.4 dwell 0.11476607736503837	4 flow_in 0.23873774213503754	4 flow_out 0.1493856521882768	.5 dwell 0.1738635294634146	5 flow_in 0.18247486624379836	5 flow_out 0.1729271159919708	.6 dwell 0.06392987253797168	6 flow_in 0.1373357490221606	6 flow_out 0.13321578965283115	.7 dwell 0.1621928950731202	7 flow_in 0.14795021131277297	7 flow_out 0.18990730255793825	.8 dwell 0.16159310874226213	8 flow_in 0.20223120704416914	8 flow_out 0.20631077708601303	.9 dwell 0.1406388704674427	9 flow_in 0.22569760211919965	9 flow_out 0.16433226409491403	.10 dwell 0.2079618436104163	10 flow_in 0.23636379941042332	10 flow_out 0.22567473104752736	.11 dwell 0.14940585032

89 flow_out 0.1223240113082496	.90 dwell 0.16766164105203762	90 flow_in 0.18863234854140537	90 flow_out 0.18623119557049503	.91 dwell 0.13826394027336433	91 flow_in 0.18743199461348564	91 flow_out 0.13574837134746803	.92 dwell 0.12558402600201493	92 flow_in 0.1602409341258286	92 flow_out 0.18079177251355483	.93 dwell 0.10975572638554687	93 flow_in 0.17813477722237767	93 flow_out 0.22004410085910572	.94 dwell 0.1431534283978573	94 flow_in 0.2035352238657998	94 flow_out 0.1850829553941994	.95 dwell 0.12747587286433637	95 flow_in 0.18725329787412634	95 flow_out 0.20103781602033635	.96 dwell 0.14920567277778307	96 flow_in 0.1833351663964084	96 flow_out 0.18434487152777218	.97 dwell 0.09431081066584213	97 flow_in 0.14870697541024255	97 flow_out 0.145993954893978	.
 5
0 dwell 0.18634785852212515	0 flow_in 0.17563789892836854	0 flow_out 0.22475989336821905	.1 dwell 0.14848739180992396	1 flow_in 0.19275908727224814	1 flow_out 0.2070545550442808	.2 dwell 0.14554883032500043	2 flow_in 0.21335174

In [17]:
# Overall RMSE: square root of the mean of all per-model validation MSEs
# collected in `scores`.
overall_mse = np.mean(scores)
print(np.sqrt(overall_mse))

0.17508066427718813


In [None]:
# Hand-recorded numbers left in an unexecuted cell; presumably RMSE values
# from earlier experiments — TODO confirm and move them into a markdown note.
0.12855791101761666
0.15971174916526226
0.16813657806652177

In [249]:
# Same summary as printed earlier: root of the mean validation MSE in `scores`.
mean_validation_mse = np.mean(scores)
print(np.sqrt(mean_validation_mse))

In [250]:
# Turn the raw prediction rows into the submission layout.
pred_df = pd.DataFrame(pred_result)
pred_df.columns = ['date_dt', 'district', 'dwell', 'flow_in', 'flow_out']

# Horizon k (1-based) -> k-th calendar day counted from 2018-03-02.
datelist = dateStrlist(2018,3, 2, 15)
pred_df['date_dt'] = pred_df['date_dt'].apply(lambda horizon: datelist[horizon-1])

# District index -> original district code, then the city that code belongs to.
pred_df['district'] = pred_df['district'].apply(lambda idx: districts[idx])
pred_df['city'] = pred_df['district'].apply(lambda code: d2c[code])

# Undo the log(x + 1) scaling so the predictions are back in original units.
for value_col in ['dwell', 'flow_in', 'flow_out']:
    pred_df[value_col] = pred_df[value_col].apply(lambda logged: np.exp(logged)-1)

In [251]:
# Write the predictions without header or index, columns in submission order.
submission_columns = ['date_dt', 'city', 'district', 'dwell', 'flow_in', 'flow_out']
submission = pred_df[submission_columns]
submission.to_csv("predict112702.csv", header = None, index = None)

In [50]:
# One-hot encode the district code as the only feature for a `dwell` target.
# NOTE(review): `train` and `test` are not defined in any visible cell of this
# notebook; this cell relies on state from a cell that is no longer present.
train_x = pd.get_dummies(train['district_code'])
train_y = train['dwell']
# Encoding the two frames independently yields different column sets when a
# district occurs in only one of them, so align test to the training columns
# (absent districts become all-zero columns, unseen ones are dropped).
test_x = pd.get_dummies(test['district_code']).reindex(columns = train_x.columns, fill_value = 0)
test_y = test['dwell']

# 统一建模预测

In [None]:
# For now, the previous 30 days of data are used to predict the values 1-15
# days ahead. Later we could try predicting 1 day ahead from today, 2 days
# ahead from two days ago, and so on, and blend the per-horizon models.
#
# Builds one long table (`dataDf`) for a single unified model: for every
# horizon, district and measure it stacks the training rows (with target `y`)
# and the last observed row (marked with y = -1 as the row to predict), tagged
# with 'type', 'district', 'city' and 'span'.
#
# NOTE(review): featureExtractor names the lag columns after the district index
# and the measure, so the combined table is the union of all those names and is
# very wide and mostly NaN — memory use grows quickly with the number of blocks.
horizon_days = 15
start_index = 30
frames = []
for span in range(1, horizon_days + 1):
    for d in range(len(districts)):
        for col in ['dwell', 'flow_in', 'flow_out']:
            predict_span = span
            label = df["%s%d_b0"%(col,d)][start_index + predict_span:]
            feature = featureExtractor(df[start_index:-predict_span], d, col)
            # `label` carries row labels shifted by `predict_span` relative to
            # `feature`; assigning the Series directly would align on the index
            # and store the same-day value instead of the future target.
            feature['y'] = label.values
            test_x = featureExtractor(df[-1:], d, col)
            test_x['y'] = -1
            colDf = pd.concat([feature, test_x])
            colDf['type'] = col
            colDf['district'] = d
            colDf['city'] = d2c[districts[d]]
            colDf['span'] = span
            frames.append(colDf)
        print(end = ".")
    print("\n", span)

# Concatenate once at the end; appending inside the loop copies the growing
# table on every iteration.
dataDf = pd.concat(frames, sort=True) if frames else pd.DataFrame()

..................................................................................................
 1
.............

In [64]:
# Display the wide lag/feature table `df` as the cell output.
df

Unnamed: 0,dwell0_b0,flow_in0_b0,flow_out0_b0,dwell0_b1,flow_in0_b1,flow_out0_b1,dwell0_b2,flow_in0_b2,flow_out0_b2,dwell0_b3,...,flow_out97_b29,w0,w1,w2,w3,w4,w5,w6,md,mm
0,4.543220,4.093229,4.009962,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1,0,0,0,0,0,0,0,0
1,4.579500,4.106401,4.005465,4.543220,4.093229,4.009962,0.000000,0.000000,0.000000,0.000000,...,0.000000,0,1,0,0,0,0,0,1,0
2,4.580097,4.123510,4.170273,4.579500,4.106401,4.005465,4.543220,4.093229,4.009962,0.000000,...,0.000000,0,0,1,0,0,0,0,2,0
3,4.529092,4.018738,4.027490,4.580097,4.123510,4.170273,4.579500,4.106401,4.005465,4.543220,...,0.000000,0,0,0,1,0,0,0,3,0
4,4.361707,3.763853,3.746899,4.529092,4.018738,4.027490,4.580097,4.123510,4.170273,4.579500,...,0.000000,0,0,0,0,1,0,0,4,0
5,3.491880,3.371285,3.383013,4.361707,3.763853,3.746899,4.529092,4.018738,4.027490,4.580097,...,0.000000,0,0,0,0,0,1,0,5,0
6,3.460190,3.311975,3.317476,3.491880,3.371285,3.383013,4.361707,3.763853,3.746899,4.529092,...,0.000000,0,0,0,0,0,0,1,6,0
7,3.862646,3.602979,3.512751,3.460190,3.311975,3.317476,3.491880,3.371285,3.383013,4.361707,...,0.000000,1,0,0,0,0,0,0,7,0
8,4.539870,4.175699,4.123698,3.862646,3.602979,3.512751,3.460190,3.311975,3.317476,3.491880,...,0.000000,0,1,0,0,0,0,0,8,0
9,4.498475,4.125440,4.163414,4.539870,4.175699,4.123698,3.862646,3.602979,3.512751,3.460190,...,0.000000,0,0,1,0,0,0,0,9,0


In [None]:
            # Scratch fragment of a training-loop body: it is indented for, and
            # reads `eval_days`, `feature`, `label`, `d`, `col` and `scores` from,
            # an enclosing loop that is not part of this cell, so it cannot run
            # on its own.
            if eval_days <= 0:
                # No hold-out requested: fit on every available row.
                lgbr = lgb.LGBMRegressor(n_jobs = 4, learning_rate = 0.1, num_leaves = 5, subsample = 0.63, random_state = 2018, n_estimators = 50, colsample_bytree = 0.23, min_child_samples = 20, objective ="regression")
                lgbr.fit(feature, label)
            else:
                # Chronological hold-out: the last `eval_days` rows are kept for
                # evaluation and everything before them is used for training.
                train_x = feature[:-eval_days]
                train_y = label[:-eval_days]
                eval_x = feature[-eval_days:]
                eval_y = label[-eval_days:]
                lgbr = lgb.LGBMRegressor(learning_rate = 0.05, num_leaves = 5, subsample = 0.63, random_state = 2018, n_estimators = 50, colsample_bytree = 0.23, min_child_samples = 20, objective ="regression")
                # The eval set is passed for metric tracking; early_stopping_rounds is None here.
                lgbr.fit(train_x, train_y, eval_set = (eval_x, eval_y), early_stopping_rounds = None, verbose  = False, eval_metric = 'rmse')
                pred = lgbr.predict(eval_x)
                # Print this model's hold-out RMSE and record its MSE in `scores`.
                print(d, col, np.sqrt(np.mean(np.square(pred - eval_y))), end = '\t')
                scores.append(np.mean(np.square(pred - eval_y)))