In [1]:
from sklearn import datasets
from sklearn.impute import SimpleImputer
from datetime import datetime

import tensorflow as tf
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)

In [2]:
#读取数据
train = pd.read_csv('data/jinnan_round1_train_20181227.csv')
mytest = pd.read_csv('data/jinnan_round1_testA_20181227.csv')

In [9]:
#返回时间差，单位分钟
def delta_time(data, t1, t2, format1='%H:%M:%S', format2='%H:%M:%S'):
    new_col = t1 + '-' + t2
    data[new_col] = (pd.to_datetime(data[t1], format=format1) - pd.to_datetime(data[t2],  format=format2)).dt.total_seconds()/60
    data.loc[data[new_col] < 0, new_col]  = data[data[new_col] < 0][new_col] + 24*60
    return data


#将时间区间字段分割为两列时间字段
def split_time(data, split_col):
    col1 = split_col + 'a'
    col2 = split_col + 'b'
    data[[col1, col2]] = data[split_col].str.split('-', expand=True)   
    data = data.drop([split_col], axis=1)

    for col in [col1, col2]:
        data = clearn_time(data, col)
    return data


#清洗时间字段
def clearn_time(data, col):
    data[col] = data[col].str.replace(';', ':')    
    data[col] = data[col].str.replace('分', '')
    data[col] = data[col].str.replace('；', ':')
    data[col] = data[col].str.replace('"', ':')
    data[col] = data[col].str.replace('::', ':')
    return data


#根据colm名称确定时间转换的format
def judge_format(time_col):
    if time_col[-1] in ['a', 'b']:
        myformat = '%H:%M'
    else:
        myformat = '%H:%M:%S'
    return myformat


#数据清洗
def clearn_data(data):
    #NAN值太多，直接删除列
    data = data.drop(['A2', 'A7', 'A8', 'B11'], axis=1)
    #fillna with mean value
    my_inputer = SimpleImputer(copy=False)
    data[['A1','A3','A4', 'A21', 'A23', 'B1', 'B2', 'B3', 'B8', 'B12', 'B13']] = my_inputer.fit_transform(data[['A1','A3','A4', 'A21', 'A23', 'B1', 'B2', 'B3', 'B8', 'B12', 'B13']])
    
    #原时间格式有误
    data.loc[314, 'A9'] = '23:00:00'
    #根据上下文时间猜测前后时间间隔均为一小时，不一定准确
    data.loc[1320, 'A11'] = '22:30:00'
    data.loc[1079, 'A11'] = '00:30:00'
    data.loc[998, 'A16'] = '12:00:00'
    #data.loc[538, 'A9] = 700
    data = data.drop([386, 538, 586, 1140])    
    
    for col in ['A20', 'A28', 'B4', 'B9', 'B10']:
        data = split_time(data, col)    
    
    data.loc[641, 'A20a'] = '18:00'
    data.loc[197, 'A20a'] = '18:00'
    data.loc[358, 'A20a'] = '18:00'
    data.loc[700, 'A26'] = '13:00:00'#根据前后时间猜
    data.loc[141, 'A28a'] = '14:00'
    data.loc[17, 'A28b'] = '00:00'
    data.loc[17, 'B4a'] = '00:00'
    data.loc[600, 'B4a'] = '19:05'
    data.loc[161, 'B4b'] = '17:00'
    data.loc[237, 'B4b'] = '16:00'
    
    time_cols = ['A5', 'A9', 'A11', 'A14', 'A16', 'A20a', 'A20b', 'A24', 'A26', 'A28a', 'A28b', 'B4a', 'B4b', 'B7', 'B9a', 'B9b', 'B10a', 'B10b']
    for i in range(len(time_cols) - 1):
        format1 = judge_format(time_cols[i+1])
        format2 = judge_format(time_cols[i])      
        data = delta_time(data, time_cols[i+1], time_cols[i], format1=format1, format2=format2)
        
       
    
    #data = delta_time(data, 'A9', 'A5')
    #data = delta_time(data, 'A14', 'A11')
    #data = delta_time(data, 'A16', 'A14')    
    #data = split_time(data, 'A20')
    
    #data['A20a'] = data['A20a'].str.replace(';', ':')
    #data['A20b'] = data['A20b'].str.replace(';', ':')
    #data['A20b'] = data['A20b'].str.replace('分', '')
    
    #data = delta_time(data, 'A20a', 'A16', format1='%H:%M', format2='%H:%M:%S')
    #data = delta_time(data, 'A20b', 'A20a', format1='%H:%M', format2='%H:%M')
    #data = delta_time(data, 'A24', 'A20b', format1='%H:%M:%S', format2='%H:%M')
    
    #data = delta_time(data, 'A26', 'A24')
    
    #data = split_time(data, 'A28')
    
    #data['A28a'] = data['A28a'].str.replace('；', ':')
    
    #data = delta_time(data, 'A28a', 'A26', format1='%H:%M', format2='%H:%M:%S')
    #data = delta_time(data, 'A28b', 'A28a', format1='%H:%M', format2='%H:%M')
    
    #data = split_time(data, 'B4')
    
    #data['B4a'] = data['B4a'].str.replace('"', ':')
    
    #data['B4a'] = data['B4a'].str.replace('；', ':')
    #data['B4a'] = data['B4a'].str.replace(';', ':')
    #data['B4b'] = data['B4b'].str.replace(';', ':')
    #data['B4b'] = data['B4b'].str.replace('；', ':')
    
    #data = delta_time(data, 'B4a', 'A28b', format1='%H:%M', format2='%H:%M')
    #data = delta_time(data, 'B4b', 'B4a', format1='%H:%M', format2='%H:%M')
    #data = delta_time(data, 'B7', 'B4b', format1='%H:%M:%S', format2='%H:%M')
    
    #data = split_time(data, 'B9')
    #data['B9a'] = data['B9a'].str.replace('；', ':')
    #data['B9a'] = data['B9a'].str.replace(';', ':')
    #data['B9b'] = data['B9b'].str.replace('::', ':')
    #data['B9b'] = data['B9b'].str.replace(';', ':')
    #data = delta_time(data, 'B9a', 'B7', format1='%H:%M', format2='%H:%M:%S')
    #data = delta_time(data, 'B9b', 'B9a', format1='%H:%M', format2='%H:%M')
    
    #data = split_time(data, 'B10')
    #data['B10a'] = data['B10a'].str.replace('::', ':')
    #data['B10a'] = data['B10a'].str.replace(';', ':')
    #data['B10a'] = data['B10a'].str.replace('；', ':')
    #data['B10b'] = data['B10b'].str.replace(';', ':')
    #data['B10b'] = data['B10b'].str.replace('；', ':')    
    #data = delta_time(data, 'B10a', 'B9b', format1='%H:%M', format2='%H:%M')
    #data = delta_time(data, 'B10b', 'B10a', format1='%H:%M', format2='%H:%M')
    return data


#特征工程
def make_feature(data):
    return data

In [10]:
clearn_train = clearn_data(train)
my_inputer = SimpleImputer(copy=False)
clearn_train[['A24-A20b', 'A26-A24', 'A28a-A26', 'B10a-B9b', 'B10b-B10a']] = my_inputer.fit_transform(clearn_train[['A24-A20b', 'A26-A24', 'A28a-A26', 'B10a-B9b', 'B10b-B10a']])

clearn_train

Unnamed: 0,sample_id,A1,A3,A4,A5,A6,A9,A10,A11,A12,A13,A14,A15,A16,A17,A18,A19,A21,A22,A23,A24,A25,A26,A27,B1,B2,B3,B5,B6,B7,B8,B12,B13,B14,result,A20a,A20b,A28a,A28b,B4a,B4b,B9a,B9b,B10a,B10b,A9-A5,A11-A9,A14-A11,A16-A14,A20a-A16,A20b-A20a,A24-A20b,A26-A24,A28a-A26,A28b-A28a,B4a-A28b,B4b-B4a,B7-B4b,B9a-B7,B9b-B9a,B10a-B9b,B10b-B10a
0,sample_1528,300.0,405.0,700.0,13:30:00,38.0,15:30:00,100,16:30:00,102,0.2,17:30:00,103.0,18:30:00,104.0,0.2,300,50.0,9.0,5.0,22:00:00,75,22:30:00,70,350.0,3.5,3.5,8:00:00,65,11:30:00,45.0,800.0,0.15,400,0.879,21:00,21:30,6:30,7:00,7:00,8:00,11:30,13:00,14:00,15:30,120.0,60.0,60.0,60.0,150.0,30.0,30.0,30.0,480.0,30.0,0.0,60.0,210.0,0.0,90.0,60.0,90.0
1,sample_1698,300.0,405.0,700.0,14:00:00,29.0,16:00:00,101,17:00:00,103,0.2,18:00:00,104.0,19:00:00,105.0,0.2,200,50.0,9.0,5.0,20:00:00,80,21:00:00,73,320.0,3.5,3.5,23:00:00,80,6:00:00,45.0,1200.0,0.15,400,0.902,19:00,20:00,21:00,22:00,22:00,23:00,6:00,7:30,7:30,9:00,120.0,60.0,60.0,60.0,0.0,60.0,0.0,60.0,0.0,60.0,0.0,60.0,420.0,0.0,90.0,0.0,90.0
2,sample_639,300.0,405.0,700.0,14:00:00,29.0,16:00:00,102,17:00:00,103,0.2,18:00:00,104.0,19:00:00,105.0,0.2,200,50.0,9.0,5.0,20:00:00,79,21:00:00,73,320.0,3.5,3.5,23:00:00,80,1:00:00,45.0,1200.0,0.15,400,0.936,19:00,19:30,21:00,22:00,22:00,23:00,1:00,2:30,2:30,4:00,120.0,60.0,60.0,60.0,0.0,30.0,30.0,60.0,0.0,60.0,0.0,60.0,120.0,0.0,90.0,0.0,90.0
3,sample_483,300.0,405.0,700.0,1:30:00,38.0,3:00:00,100,4:00:00,102,0.2,5:00:00,103.0,6:00:00,104.0,0.2,200,50.0,10.0,5.0,7:30:00,70,8:00:00,78,290.0,3.5,3.5,15:30:00,65,18:00:00,45.0,800.0,0.15,400,0.902,6:30,7:00,13:30,14:30,14:30,15:30,19:00,20:30,21:30,23:00,90.0,60.0,60.0,60.0,30.0,30.0,30.0,30.0,330.0,60.0,0.0,60.0,150.0,60.0,90.0,60.0,90.0
4,sample_617,300.0,405.0,700.0,22:00:00,29.0,0:00:00,101,1:00:00,103,0.2,2:00:00,104.0,3:00:00,105.0,0.2,200,50.0,9.0,5.0,4:00:00,80,5:00:00,73,320.0,3.5,3.5,7:00:00,80,9:00:00,45.0,1200.0,0.15,420,0.983,3:00,4:00,5:00,6:00,6:00,7:00,9:00,10:30,10:30,12:00,120.0,60.0,60.0,60.0,0.0,60.0,0.0,60.0,0.0,60.0,0.0,60.0,120.0,0.0,90.0,0.0,90.0
5,sample_373,300.0,405.0,700.0,2:00:00,39.0,3:30:00,100,4:30:00,103,0.2,5:30:00,104.0,6:30:00,102.0,0.2,300,50.0,9.0,5.0,12:00:00,70,12:30:00,75,334.452742,3.5,3.5,20:00:00,65,3:00:00,45.0,800.0,0.15,420,0.935,11:30,12:00,17:30,18:00,18:00,20:00,3:00,4:30,5:30,7:00,90.0,60.0,60.0,60.0,300.0,30.0,0.0,30.0,300.0,30.0,0.0,120.0,420.0,0.0,90.0,60.0,90.0
6,sample_577,300.0,405.0,700.0,8:00:00,29.0,10:00:00,101,11:00:00,103,0.2,12:00:00,104.0,13:00:00,105.0,0.2,200,50.0,9.0,5.0,14:00:00,80,15:00:00,73,320.0,3.5,3.5,17:00:00,80,19:00:00,45.0,1200.0,0.15,400,0.902,13:00,14:00,15:00,16:00,16:00,17:00,19:00,20:30,20:30,22:00,120.0,60.0,60.0,60.0,0.0,60.0,0.0,60.0,0.0,60.0,0.0,60.0,120.0,0.0,90.0,0.0,90.0
7,sample_212,300.0,405.0,700.0,6:00:00,29.0,8:00:00,101,9:00:00,102,0.2,10:00:00,103.0,11:00:00,103.0,0.2,200,50.0,9.0,5.0,12:00:00,78,13:00:00,73,320.0,3.5,3.5,15:00:00,80,17:00:00,45.0,1200.0,0.15,400,0.891,11:00,12:00,13:00,14:00,14:00,15:00,17:00,18:30,18:30,20:00,120.0,60.0,60.0,60.0,0.0,60.0,0.0,60.0,0.0,60.0,0.0,60.0,120.0,0.0,90.0,0.0,90.0
8,sample_521,300.0,405.0,700.0,17:30:00,21.0,19:00:00,100,20:00:00,101,0.2,21:00:00,102.0,22:00:00,105.0,0.2,300,50.0,9.0,5.0,23:30:00,70,0:00:00,78,340.0,3.5,3.5,4:30:00,65,6:30:00,45.0,800.0,0.15,420,0.947,22:30,23:00,2:00,3:00,3:00,4:30,6:30,8:00,8:30,10:00,90.0,60.0,60.0,60.0,30.0,30.0,30.0,30.0,120.0,60.0,0.0,90.0,120.0,0.0,90.0,30.0,90.0
9,sample_1026,300.0,405.0,700.0,11:00:00,21.0,12:30:00,100,13:30:00,102,0.2,14:30:00,103.0,15:30:00,105.0,0.2,200,50.0,9.0,5.0,17:00:00,70,17:30:00,78,280.0,3.5,3.5,0:00:00,64,2:00:00,50.0,800.0,0.15,400,0.902,16:00,16:30,22:00,23:00,23:00,0:00,6:00,7:30,8:00,9:30,90.0,60.0,60.0,60.0,30.0,30.0,30.0,30.0,270.0,60.0,0.0,60.0,120.0,240.0,90.0,30.0,90.0


In [50]:
mytest

Unnamed: 0,sample_id,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,A17,A18,A19,A20,A21,A22,A23,A24,A25,A26,A27,A28,B1,B2,B3,B4,B5,B6,B7,B8,B9,B10,B11,B12,B13,B14
0,sample_1656,300,,405.0,700,6:00:00,29,,,8:00:00,101,9:00:00,103.0,0.2,10:00:00,105.0,11:00:00,106.0,0.2,200,11:00-12:00,50,9,5,12:00:00,80.0,13:00:00,73.0,13:00-14:00,320.0,3.5,3.5,14:00-15:00,15:00:00,79,17:00:00,45,17:00-18:30,18:30-20:00,20:00-21:00,1200,0.15,400
1,sample_1548,300,,405.0,700,12:30:00,39,12:50:00,80.0,14:20:00,100,15:20:00,102.0,0.2,16:20:00,103.0,17:20:00,102.0,0.2,300,20:00-20:30,50,9,5,21:00:00,76.0,21:30:00,75.0,6:00-6:30,350.0,3.5,3.5,6:30-7:50,7:50:00,65,10:00:00,45,12:00-13:00,14:00-15:30,,800,0.15,385
2,sample_769,300,,405.0,700,6:00:00,80,,,8:00:00,102,9:00:00,104.0,0.2,10:00:00,104.0,11:00:00,105.0,0.2,200,11:00-12:00,50,9,5,12:00:00,80.0,13:00:00,73.0,13:00-14:00,320.0,3.5,3.5,14:00-15:00,15:00:00,80,17:00:00,45,17:00-20:00,,,1200,0.15,440
3,sample_1881,300,,405.0,700,22:00:00,29,,,0:00:00,102,1:00:00,103.0,0.2,2:00:00,104.0,3:00:00,105.0,0.2,200,3:00-4:00,50,9,5,4:00:00,78.0,5:00:00,73.0,5:00-6:00,320.0,3.5,3.5,6:00-7:00,7:00:00,80,9:00:00,45,9:00-10:30,10:30-12:00,12:00-13:00,1200,0.15,400
4,sample_1807,300,,405.0,700,22:00:00,30,,,0:00:00,101,1:00:00,104.0,0.2,2:00:00,106.0,3:00:00,107.0,0.2,200,3:00-4:00,50,9,5,4:00:00,79.0,5:00:00,72.0,5:00-6:00,320.0,3.5,3.5,6:00-7:00,7:00:00,79,9:00:00,45,9:00-10:30,10:30-12:00,12:00-13:00,1200,0.15,400
5,sample_145,300,,405.0,700,14:00:00,29,,,16:00:00,101,17:00:00,103.0,0.2,18:00:00,104.0,19:00:00,105.0,0.2,200,19:00-20:00,50,9,5,20:00:00,80.0,21:00:00,73.0,21:00-22:00,320.0,3.5,3.5,22:00-23:00,23:00:00,80,1:00:00,45,1:00-2:30,2:30-4:00,4:00-5:00,1200,0.15,400
6,sample_1212,300,,405.0,700,6:00:00,29,,,8:00:00,101,9:00:00,102.0,0.2,10:00:00,103.0,11:00:00,104.0,0.2,200,11:00-12:00,50,9,5,12:00:00,78.0,13:00:00,73.0,13:00-14:00,320.0,3.5,3.5,14:00-15:00,15:00:00,75,17:00:00,45,17:00-18:30,18:30-20:00,20:00-21:00,1200,0.15,400
7,sample_944,300,,405.0,700,10:00:00,24,,,11:00:00,100,12:00:00,103.0,0.2,13:00:00,106.0,14:00:00,106.0,0.2,300,14:10-14:40,50,9,5,15:00:00,70.0,15:30:00,75.0,15:30-16:30,350.0,3.5,3.5,17:00-18:00,18:00:00,64,20:00:00,45,20:10-21:10,21:30-22:40,,900,0.15,400
8,sample_829,300,,405.0,700,21:00:00,30,,,23:00:00,102,0:00:00,103.0,0.2,1:00:00,104.0,2:00:00,105.0,0.2,200,2:00-3:00,50,9,5,3:00:00,80.0,4:00:00,73.0,4:00-5:00,320.0,3.5,3.5,5:00-6:00,6:00:00,80,8:00:00,45,8:00-12:00,,,1200,0.15,440
9,sample_616,300,,405.0,700,14:00:00,28,,,16:00:00,102,17:00:00,103.0,0.2,18:00:00,104.0,19:00:00,105.0,0.2,200,19:00-20:00,50,9,5,20:00:00,80.0,21:00:00,73.0,21:00-22:00,320.0,3.5,3.5,22:00-23:00,23:00:00,80,1:00:00,45,1:00-2:30,2:30-4:00,4:00-5:00,1200,0.15,400


In [5]:
mytest.loc[86, 'A5'] = '22:00:00'

#NAN值太多，直接删除列
test = mytest.drop(['A2', 'A7', 'A8', 'B11'], axis=1)
#fillna with mean value
my_inputer = SimpleImputer(copy=False)
test[['A1','A3','A4', 'A21', 'A23', 'B1', 'B2', 'B3', 'B8', 'B12', 'B13']] = my_inputer.fit_transform(test[['A1','A3','A4', 'A21', 'A23', 'B1', 'B2', 'B3', 'B8', 'B12', 'B13']])
    
test = delta_time(test, 'A9', 'A5')
test = delta_time(test, 'A14', 'A11')
test = delta_time(test, 'A16', 'A14')

test = split_time(test, 'A20')
test.loc[641, 'A20a'] = '18:00'
test.loc[197, 'A20a'] = '18:00'
test.loc[358, 'A20a'] = '18:00'
test['A20a'] = test['A20a'].str.replace(';', ':')
test['A20b'] = test['A20b'].str.replace(';', ':')
test['A20b'] = test['A20b'].str.replace('分', '')

test = delta_time(test, 'A20a', 'A16', format1='%H:%M', format2='%H:%M:%S')
test = delta_time(test, 'A20b', 'A20a', format1='%H:%M', format2='%H:%M')
test = delta_time(test, 'A24', 'A20b', format1='%H:%M:%S', format2='%H:%M')
test.loc[700, 'A26'] = '13:00:00'#根据前后时间猜
test = delta_time(test, 'A26', 'A24')

test = split_time(test, 'A28')
test.loc[141, 'A28a'] = '14:00'
test['A28a'] = test['A28a'].str.replace('；', ':')
test['A28b'] = test['A28b'].str.replace('；', ':')
test.loc[17, 'A28b'] = '00:00'
test = delta_time(test, 'A28a', 'A26', format1='%H:%M', format2='%H:%M:%S')
test = delta_time(test, 'A28b', 'A28a', format1='%H:%M', format2='%H:%M')

test = split_time(test, 'B4')
test.loc[17, 'B4a'] = '00:00'
test['B4a'] = test['B4a'].str.replace('"', ':')
test.loc[600, 'B4a'] = '19:05'
test['B4a'] = test['B4a'].str.replace('；', ':')
test['B4a'] = test['B4a'].str.replace(';', ':')
test['B4b'] = test['B4b'].str.replace(';', ':')
test['B4b'] = test['B4b'].str.replace('；', ':')
test.loc[161, 'B4b'] = '17:00'
test.loc[237, 'B4b'] = '16:00'
test = delta_time(test, 'B4a', 'A28b', format1='%H:%M', format2='%H:%M')
test = delta_time(test, 'B4b', 'B4a', format1='%H:%M', format2='%H:%M')
test = delta_time(test, 'B7', 'B4b', format1='%H:%M:%S', format2='%H:%M')

test = split_time(test, 'B9')
test['B9a'] = test['B9a'].str.replace('；', ':')
test['B9a'] = test['B9a'].str.replace(';', ':')
test['B9b'] = test['B9b'].str.replace('::', ':')
test['B9b'] = test['B9b'].str.replace(';', ':')
test = delta_time(test, 'B9a', 'B7', format1='%H:%M', format2='%H:%M:%S')
test = delta_time(test, 'B9b', 'B9a', format1='%H:%M', format2='%H:%M')

test = split_time(test, 'B10')
test['B10a'] = test['B10a'].str.replace('::', ':')
test['B10a'] = test['B10a'].str.replace(';', ':')
test['B10a'] = test['B10a'].str.replace('；', ':')
test['B10b'] = test['B10b'].str.replace(';', ':')
test['B10b'] = test['B10b'].str.replace('；', ':')    
test = delta_time(test, 'B10a', 'B9b', format1='%H:%M', format2='%H:%M')
test = delta_time(test, 'B10b', 'B10a', format1='%H:%M', format2='%H:%M')

my_inputer = SimpleImputer(copy=False)
test[['A25','A27','A20a-A16', 'A20b-A20a', 'A24-A20b', 'B10a-B9b', 'B10b-B10a']] = my_inputer.fit_transform(test[['A25','A27','A20a-A16', 'A20b-A20a', 'A24-A20b', 'B10a-B9b', 'B10b-B10a']])
test = test.drop([641, 197, 358, 700, 600, 161, 237])

In [72]:
clearn_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1392 entries, 0 to 1395
Data columns (total 61 columns):
sample_id    1392 non-null object
A1           1392 non-null float64
A3           1392 non-null float64
A4           1392 non-null float64
A5           1392 non-null object
A6           1392 non-null float64
A9           1392 non-null object
A10          1392 non-null int64
A11          1392 non-null object
A12          1392 non-null int64
A13          1392 non-null float64
A14          1392 non-null object
A15          1392 non-null float64
A16          1392 non-null object
A17          1392 non-null float64
A18          1392 non-null float64
A19          1392 non-null int64
A21          1392 non-null float64
A22          1392 non-null float64
A23          1392 non-null float64
A24          1391 non-null object
A25          1392 non-null object
A26          1390 non-null object
A27          1392 non-null int64
B1           1392 non-null float64
B2           1392 non-null float64


In [6]:
my_inputer = SimpleImputer(copy=False)
clearn_train[['A24-A20b', 'A26-A24', 'A28a-A26', 'B10a-B9b', 'B10b-B10a']] = my_inputer.fit_transform(clearn_train[['A24-A20b', 'A26-A24', 'A28a-A26', 'B10a-B9b', 'B10b-B10a']])
#clearn_train = clearn_train.drop([641, 197, 358, 700, 600, 161, 237])

In [71]:
a = ((pd.to_datetime(clearn_train['B4b'], format='%H:%M') - pd.to_datetime(clearn_train['B7'],  format='%H:%M:%S')).dt.total_seconds()/60)
b = pd.to_datetime(clearn_train['B10b'], format='%H:%M')
c = clearn_train['A24'].value_counts()
d = train[train['A9'].notnull() == True]


In [10]:
clearn_train[clearn_train.notnull() == True]

Unnamed: 0,sample_id,A1,A3,A4,A5,A6,A9,A10,A11,A12,A13,A14,A15,A16,A17,A18,A19,A21,A22,A23,A24,A25,A26,A27,B1,B2,B3,B5,B6,B7,B8,B12,B13,B14,result,A9-A5,A14-A11,A16-A14,A20a,A20b,A20a-A16,A20b-A20a,A24-A20b,A26-A24,A28a,A28b,A28a-A26,A28b-A28a,B4a,B4b,B4a-A28b,B4b-B4a,B7-B4b,B9a,B9b,B9a-B7,B9b-B9a,B10a,B10b,B10a-B9b,B10b-B10a
0,sample_1528,300.0,405.0,700.0,13:30:00,38.0,15:30:00,100,16:30:00,102,0.2,17:30:00,103.0,18:30:00,104.0,0.2,300,50.0,9.0,5.0,22:00:00,75,22:30:00,70,350.0,3.5,3.5,8:00:00,65,11:30:00,45.0,800.0,0.15,400,0.879,120.0,60.0,60.0,21:00,21:30,150.0,30.0,30.0,30.0,6:30,7:00,480.0,30.0,7:00,8:00,0.0,60.0,210.0,11:30,13:00,0.0,90.0,14:00,15:30,60.0,90.0
1,sample_1698,300.0,405.0,700.0,14:00:00,29.0,16:00:00,101,17:00:00,103,0.2,18:00:00,104.0,19:00:00,105.0,0.2,200,50.0,9.0,5.0,20:00:00,80,21:00:00,73,320.0,3.5,3.5,23:00:00,80,6:00:00,45.0,1200.0,0.15,400,0.902,120.0,60.0,60.0,19:00,20:00,0.0,60.0,0.0,60.0,21:00,22:00,0.0,60.0,22:00,23:00,0.0,60.0,420.0,6:00,7:30,0.0,90.0,7:30,9:00,0.0,90.0
2,sample_639,300.0,405.0,700.0,14:00:00,29.0,16:00:00,102,17:00:00,103,0.2,18:00:00,104.0,19:00:00,105.0,0.2,200,50.0,9.0,5.0,20:00:00,79,21:00:00,73,320.0,3.5,3.5,23:00:00,80,1:00:00,45.0,1200.0,0.15,400,0.936,120.0,60.0,60.0,19:00,19:30,0.0,30.0,30.0,60.0,21:00,22:00,0.0,60.0,22:00,23:00,0.0,60.0,120.0,1:00,2:30,0.0,90.0,2:30,4:00,0.0,90.0
3,sample_483,300.0,405.0,700.0,1:30:00,38.0,3:00:00,100,4:00:00,102,0.2,5:00:00,103.0,6:00:00,104.0,0.2,200,50.0,10.0,5.0,7:30:00,70,8:00:00,78,290.0,3.5,3.5,15:30:00,65,18:00:00,45.0,800.0,0.15,400,0.902,90.0,60.0,60.0,6:30,7:00,30.0,30.0,30.0,30.0,13:30,14:30,330.0,60.0,14:30,15:30,0.0,60.0,150.0,19:00,20:30,60.0,90.0,21:30,23:00,60.0,90.0
4,sample_617,300.0,405.0,700.0,22:00:00,29.0,0:00:00,101,1:00:00,103,0.2,2:00:00,104.0,3:00:00,105.0,0.2,200,50.0,9.0,5.0,4:00:00,80,5:00:00,73,320.0,3.5,3.5,7:00:00,80,9:00:00,45.0,1200.0,0.15,420,0.983,120.0,60.0,60.0,3:00,4:00,0.0,60.0,0.0,60.0,5:00,6:00,0.0,60.0,6:00,7:00,0.0,60.0,120.0,9:00,10:30,0.0,90.0,10:30,12:00,0.0,90.0
5,sample_373,300.0,405.0,700.0,2:00:00,39.0,3:30:00,100,4:30:00,103,0.2,5:30:00,104.0,6:30:00,102.0,0.2,300,50.0,9.0,5.0,12:00:00,70,12:30:00,75,334.452742,3.5,3.5,20:00:00,65,3:00:00,45.0,800.0,0.15,420,0.935,90.0,60.0,60.0,11:30,12:00,300.0,30.0,0.0,30.0,17:30,18:00,300.0,30.0,18:00,20:00,0.0,120.0,420.0,3:00,4:30,0.0,90.0,5:30,7:00,60.0,90.0
6,sample_577,300.0,405.0,700.0,8:00:00,29.0,10:00:00,101,11:00:00,103,0.2,12:00:00,104.0,13:00:00,105.0,0.2,200,50.0,9.0,5.0,14:00:00,80,15:00:00,73,320.0,3.5,3.5,17:00:00,80,19:00:00,45.0,1200.0,0.15,400,0.902,120.0,60.0,60.0,13:00,14:00,0.0,60.0,0.0,60.0,15:00,16:00,0.0,60.0,16:00,17:00,0.0,60.0,120.0,19:00,20:30,0.0,90.0,20:30,22:00,0.0,90.0
7,sample_212,300.0,405.0,700.0,6:00:00,29.0,8:00:00,101,9:00:00,102,0.2,10:00:00,103.0,11:00:00,103.0,0.2,200,50.0,9.0,5.0,12:00:00,78,13:00:00,73,320.0,3.5,3.5,15:00:00,80,17:00:00,45.0,1200.0,0.15,400,0.891,120.0,60.0,60.0,11:00,12:00,0.0,60.0,0.0,60.0,13:00,14:00,0.0,60.0,14:00,15:00,0.0,60.0,120.0,17:00,18:30,0.0,90.0,18:30,20:00,0.0,90.0
8,sample_521,300.0,405.0,700.0,17:30:00,21.0,19:00:00,100,20:00:00,101,0.2,21:00:00,102.0,22:00:00,105.0,0.2,300,50.0,9.0,5.0,23:30:00,70,0:00:00,78,340.0,3.5,3.5,4:30:00,65,6:30:00,45.0,800.0,0.15,420,0.947,90.0,60.0,60.0,22:30,23:00,30.0,30.0,30.0,30.0,2:00,3:00,120.0,60.0,3:00,4:30,0.0,90.0,120.0,6:30,8:00,0.0,90.0,8:30,10:00,30.0,90.0
9,sample_1026,300.0,405.0,700.0,11:00:00,21.0,12:30:00,100,13:30:00,102,0.2,14:30:00,103.0,15:30:00,105.0,0.2,200,50.0,9.0,5.0,17:00:00,70,17:30:00,78,280.0,3.5,3.5,0:00:00,64,2:00:00,50.0,800.0,0.15,400,0.902,90.0,60.0,60.0,16:00,16:30,30.0,30.0,30.0,30.0,22:00,23:00,270.0,60.0,23:00,0:00,0.0,60.0,120.0,6:00,7:30,240.0,90.0,8:00,9:30,30.0,90.0


In [11]:
time_col = ['A5', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7', 'A20a', 'A20b', 'A28a', 'A28b', 'B4a', 'B4b', 'B9a', 'B9b', 'B10a', 'B10b']
clearn_train.loc[1304, 'A25'] = 80
y_train = clearn_train['result']
x_train = clearn_train.drop(['result', 'sample_id'], axis=1).drop(time_col, axis=1)
x_test = test.drop(['sample_id'], axis=1).drop(time_col, axis=1)

In [None]:
clearn_train

In [87]:
x = tf.placeholder(tf.float32, [None, 40])
y = tf.placeholder(tf.float32, [None, 1])

weights_L1 = tf.Variable(tf.random_normal([40,40]))
biases_L1 = tf.Variable(tf.random_normal([40,40]))
fx_L1 = tf.matmul(x,weights_L1) + biases_L1
L1 = tf.nn.tanh(fx_L1)

weights_L2 = tf.Variable(tf.random_normal([40,40]))
biases_L2 = tf.Variable(tf.random_normal([1,40]))
fx_L2 = tf.matmul(x,weights_L2) + biases_L2
prediction = tf.nn.tanh(fx_L2)

loss = tf.reduce_mean(tf.square(y - prediction))

train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(200):
        sess.run(train_step, feed_dict={x:x_train,y:y_train})
        
    prediction_y = sess.run(prediction, feed_dict={x:x_test})

ValueError: Cannot feed value of shape (1392,) for Tensor 'Placeholder_13:0', which has shape '(?, 1)'

In [12]:
from sklearn.linear_model import LinearRegression
# 定义线性回归模型
model = LinearRegression(fit_intercept=True, normalize=False, 
    copy_X=True, n_jobs=1)

linreg = LinearRegression()
linreg.fit(x_train, y_train)
y_pred = linreg.predict(x_test)

In [17]:
y_pred.shape

(150,)

In [25]:
my_result = pd.read_csv('data/jinnan_round1_submit_20181227.csv', header=None)

In [26]:
my_result

Unnamed: 0,0,1
0,sample_1656,0.01
1,sample_1548,0.01
2,sample_769,0.01
3,sample_1881,0.01
4,sample_1807,0.01
5,sample_145,0.01
6,sample_1212,0.01
7,sample_944,0.01
8,sample_829,0.01
9,sample_616,0.01


In [27]:
my_result[1] = y_pred

In [28]:
my_result

Unnamed: 0,0,1
0,sample_1656,0.927448
1,sample_1548,0.899055
2,sample_769,0.935809
3,sample_1881,0.925015
4,sample_1807,0.928041
5,sample_145,0.925668
6,sample_1212,0.923863
7,sample_944,0.919226
8,sample_829,0.937864
9,sample_616,0.925385


In [31]:
my_result.to_csv('my_submit_20190112.csv', index=0, header=0)