In [70]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from sklearn.model_selection import KFold
from scipy import sparse
import warnings
import time
import sys
import os
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

In [71]:
train_df = pd.read_csv("./jinnan_round1_train_20181227.csv")
test_df = pd.read_csv("./jinnan_round1_testA_20181227.csv")

In [72]:
def TimeToInt(str, previous = 0):
    now = 0
    try:
        time = str.split(":")
        now = (int(time[0]) * 60 + int(time[1])) * 60
        if now < previous:
            now += 86400
    except:
        now = previous
    return now

def getInt(str):
    val = str.split(":")
    return int(val[0]) * 60 + int(val[1])

def TimeToDuration(str):
    duration = 0
    try:
        time = str.split("-")
        duration = getInt(time[1]) - getInt(time[0])
    except:
        duration = 0
    if duration < 0:
        duration += 24 * 60
    return duration

In [73]:
train_df['timeA5'] = train_df['A5'].apply(lambda x: TimeToInt(x))
train_df['timeA9'] = train_df.apply(lambda x: TimeToInt(x.A9, x.timeA5), axis = 1)
train_df['timeA11'] = train_df.apply(lambda x: TimeToInt(x.A11, x.timeA9), axis = 1)
train_df['timeA14'] = train_df.apply(lambda x: TimeToInt(x.A14, x.timeA11), axis = 1)
train_df['timeA16'] = train_df.apply(lambda x: TimeToInt(x.A16, x.timeA14), axis = 1)
train_df['timeA24'] = train_df.apply(lambda x: TimeToInt(x.A24, x.timeA16), axis = 1)
train_df['timeA26'] = train_df.apply(lambda x: TimeToInt(x.A26, x.timeA24), axis = 1)
train_df['timeB5'] = train_df.apply(lambda x: TimeToInt(x.B5, x.timeA26), axis = 1)
train_df['timeB7'] = train_df.apply(lambda x: TimeToInt(x.B7, x.timeB5), axis = 1)
train_df = train_df.drop(['A5','A7','A9','A11','A14','A16','A24','A26','B5','B7'], axis=1)
col = ['timeA5','timeA9','timeA11','timeA14','timeA16','timeA24','timeA26','timeB5','timeB7']
train_df[col] = train_df[col].fillna(0)

train_df['durationA20'] = train_df['A20'].apply(lambda x: TimeToDuration(x))
train_df['durationA28'] = train_df['A28'].apply(lambda x: TimeToDuration(x))
train_df['durationB4'] = train_df['B4'].apply(lambda x: TimeToDuration(x))
train_df['durationB9'] = train_df['B9'].apply(lambda x: TimeToDuration(x))
train_df['durationB10'] = train_df['B10'].apply(lambda x: TimeToDuration(x))
train_df['durationB11'] = train_df['B11'].apply(lambda x: TimeToDuration(x))
train_df = train_df.drop(['A20', 'A28', 'B4', 'B9', 'B10', 'B11'], axis=1)
col = ['durationA20','durationA28','durationB4','durationB9','durationB10','durationB11']
train_df[col] = train_df[col].fillna(0)

col = ['A1','A2','A3','A4','A6','A8','A10','A12','A13','A15','A17','A18','A19','A21','A22','A23','A25','A27','B1','B2','B3','B6','B8','B12','B13','B14']
train_df[col] = train_df[col].fillna(0)

train_df.head()

Unnamed: 0,sample id,A1,A2,A3,A4,A6,A8,A10,A12,A13,...,timeA24,timeA26,timeB5,timeB7,durationA20,durationA28,durationB4,durationB9,durationB10,durationB11
0,sample_1528,300,0.0,405.0,700,38.0,0.0,100,102,0.2,...,79200,81000,115200,127800,30,30,60,90,90,0
1,sample_1698,300,0.0,405.0,700,29.0,0.0,101,103,0.2,...,72000,75600,82800,108000,60,60,60,90,90,60
2,sample_639,300,0.0,405.0,700,29.0,0.0,102,103,0.2,...,72000,75600,82800,90000,30,60,60,90,90,60
3,sample_483,300,0.0,405.0,700,38.0,0.0,100,102,0.2,...,27000,28800,55800,64800,30,60,60,90,90,0
4,sample_617,300,0.0,405.0,700,29.0,0.0,101,103,0.2,...,100800,104400,111600,118800,60,60,60,90,90,60


In [74]:
test_df['timeA5'] = test_df['A5'].apply(lambda x: TimeToInt(x))
test_df['timeA9'] = test_df.apply(lambda x: TimeToInt(x.A9, x.timeA5), axis = 1)
test_df['timeA11'] = test_df.apply(lambda x: TimeToInt(x.A11, x.timeA9), axis = 1)
test_df['timeA14'] = test_df.apply(lambda x: TimeToInt(x.A14, x.timeA11), axis = 1)
test_df['timeA16'] = test_df.apply(lambda x: TimeToInt(x.A16, x.timeA14), axis = 1)
test_df['timeA24'] = test_df.apply(lambda x: TimeToInt(x.A24, x.timeA16), axis = 1)
test_df['timeA26'] = test_df.apply(lambda x: TimeToInt(x.A26, x.timeA24), axis = 1)
test_df['timeB5'] = test_df.apply(lambda x: TimeToInt(x.B5, x.timeA26), axis = 1)
test_df['timeB7'] = test_df.apply(lambda x: TimeToInt(x.B7, x.timeB5), axis = 1)
test_df = test_df.drop(['A5','A7','A9','A11','A14','A16','A24','A26','B5','B7'], axis=1)
col = ['timeA5','timeA9','timeA11','timeA14','timeA16','timeA24','timeA26','timeB5','timeB7']
test_df[col] = test_df[col].fillna(0)

test_df['durationA20'] = test_df['A20'].apply(lambda x: TimeToDuration(x))
test_df['durationA28'] = test_df['A28'].apply(lambda x: TimeToDuration(x))
test_df['durationB4'] = test_df['B4'].apply(lambda x: TimeToDuration(x))
test_df['durationB9'] = test_df['B9'].apply(lambda x: TimeToDuration(x))
test_df['durationB10'] = test_df['B10'].apply(lambda x: TimeToDuration(x))
test_df['durationB11'] = test_df['B11'].apply(lambda x: TimeToDuration(x))
test_df = test_df.drop(['A20', 'A28', 'B4', 'B9', 'B10', 'B11'], axis=1)
col = ['durationA20','durationA28','durationB4','durationB9','durationB10','durationB11']
test_df[col] = test_df[col].fillna(0)

col = ['A1','A2','A3','A4','A6','A8','A10','A12','A13','A15','A17','A18','A19','A21','A22','A23','A25','A27','B1','B2','B3','B6','B8','B12','B13','B14']
test_df[col] = test_df[col].fillna(0)

test_df.head()

Unnamed: 0,sample id,A1,A2,A3,A4,A6,A8,A10,A12,A13,...,timeA24,timeA26,timeB5,timeB7,durationA20,durationA28,durationB4,durationB9,durationB10,durationB11
0,sample_1656,300,0.0,405.0,700,29,0.0,101,103.0,0.2,...,43200,46800,54000,61200,60,60,60,90,90,60
1,sample_1548,300,0.0,405.0,700,39,80.0,100,102.0,0.2,...,75600,77400,114600,122400,30,30,80,60,90,0
2,sample_769,300,0.0,405.0,700,80,0.0,102,104.0,0.2,...,43200,46800,54000,61200,60,60,60,180,0,0
3,sample_1881,300,0.0,405.0,700,29,0.0,102,103.0,0.2,...,100800,104400,111600,118800,60,60,60,90,90,60
4,sample_1807,300,0.0,405.0,700,30,0.0,101,104.0,0.2,...,100800,104400,111600,118800,60,60,60,90,90,60


In [75]:
def toFloat(str):
    try:
        return float(str)
    except:
        return float(0)
train_set = train_df
train_set.loc[train_set['A25'].apply(lambda x: toFloat(x))]
train_set.head()

Unnamed: 0,sample id,A1,A2,A3,A4,A6,A8,A10,A12,A13,...,timeA24,timeA26,timeB5,timeB7,durationA20,durationA28,durationB4,durationB9,durationB10,durationB11
0,sample_1528,300,0.0,405.0,700,38.0,0.0,100,102,0.2,...,79200,81000,115200,127800,30,30,60,90,90,0
1,sample_1698,300,0.0,405.0,700,29.0,0.0,101,103,0.2,...,72000,75600,82800,108000,60,60,60,90,90,60
2,sample_639,300,0.0,405.0,700,29.0,0.0,102,103,0.2,...,72000,75600,82800,90000,30,60,60,90,90,60
3,sample_483,300,0.0,405.0,700,38.0,0.0,100,102,0.2,...,27000,28800,55800,64800,30,60,60,90,90,0
4,sample_617,300,0.0,405.0,700,29.0,0.0,101,103,0.2,...,100800,104400,111600,118800,60,60,60,90,90,60


In [76]:
train_set.columns

Index([u'sample id', u'A1', u'A2', u'A3', u'A4', u'A6', u'A8', u'A10', u'A12',
       u'A13', u'A15', u'A17', u'A18', u'A19', u'A21', u'A22', u'A23', u'A25',
       u'A27', u'B1', u'B2', u'B3', u'B6', u'B8', u'B12', u'B13', u'B14',
       u'score', u'timeA5', u'timeA9', u'timeA11', u'timeA14', u'timeA16',
       u'timeA24', u'timeA26', u'timeB5', u'timeB7', u'durationA20',
       u'durationA28', u'durationB4', u'durationB9', u'durationB10',
       u'durationB11'],
      dtype='object')

In [77]:
test_set = test_df
test_set.columns

Index([u'sample id', u'A1', u'A2', u'A3', u'A4', u'A6', u'A8', u'A10', u'A12',
       u'A13', u'A15', u'A17', u'A18', u'A19', u'A21', u'A22', u'A23', u'A25',
       u'A27', u'B1', u'B2', u'B3', u'B6', u'B8', u'B12', u'B13', u'B14',
       u'timeA5', u'timeA9', u'timeA11', u'timeA14', u'timeA16', u'timeA24',
       u'timeA26', u'timeB5', u'timeB7', u'durationA20', u'durationA28',
       u'durationB4', u'durationB9', u'durationB10', u'durationB11'],
      dtype='object')

In [78]:
target = train_df['score']
del train_df['score']
data = pd.concat([train_df,test_df],axis=0,ignore_index=True)
data = data.fillna(-1)

cate_columns = [f for f in data.columns if f != 'sample id']

#label encoder
for f in cate_columns:
    data[f] = data[f].map(dict(zip(data[f].unique(), range(0, data[f].nunique()))))
train = data[:train_df.shape[0]]
test  = data[train_df.shape[0]:]

# one-hot
X_train = pd.DataFrame()
X_test = pd.DataFrame()
enc = OneHotEncoder()
for f in cate_columns:
    enc.fit(data[f].values.reshape(-1, 1))
    X_train = sparse.hstack((X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
    X_test = sparse.hstack((X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')

In [79]:
y_train = target.values

param = {'num_leaves': 30,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 30,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'mse',
         "lambda_l1": 0.1,
         "verbosity": -1}

# 五折交叉验证
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    num_round = 10000
    clf = lgb.train(param, 
                    trn_data, 
                    num_round, 
                    valid_sets = [trn_data, val_data], 
                    verbose_eval = 200, 
                    early_stopping_rounds = 100)
    oof[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)))


fold n°1
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000429823	valid_1's l2: 0.000281404
[400]	training's l2: 0.000354302	valid_1's l2: 0.000251709
[600]	training's l2: 0.000323227	valid_1's l2: 0.00024473
[800]	training's l2: 0.000303147	valid_1's l2: 0.000242659
[1000]	training's l2: 0.00028816	valid_1's l2: 0.000241292
[1200]	training's l2: 0.000277617	valid_1's l2: 0.00024098
Early stopping, best iteration is:
[1135]	training's l2: 0.000280703	valid_1's l2: 0.000240847
fold n°2
Training until validation scores don't improve for 100 rounds.
[200]	training's l2: 0.000353716	valid_1's l2: 0.000748895
[400]	training's l2: 0.000282967	valid_1's l2: 0.000669562
[600]	training's l2: 0.000256757	valid_1's l2: 0.000638213
[800]	training's l2: 0.000242378	valid_1's l2: 0.000619329
[1000]	training's l2: 0.000233277	valid_1's l2: 0.000606814
[1200]	training's l2: 0.000226567	valid_1's l2: 0.00059778
[1400]	training's l2: 0.000222067	valid_1's l2: 0.000

In [80]:
sub_df = pd.read_csv('submission.csv', header=None)
sub_df[1] = predictions
sub_df.to_csv("sub_jinnan.csv", index=False, header=None)