In [1]:
import pandas as pd
import numpy as np

"""
vehicle_id STRING 车辆唯一标志码
charge_start_time INT 充电开始时间
charge_end_time INT 充电结束时间
mileage FLOAT 充电开始时刻车辆仪表里程（km）
charge_start_soc INT 充电开始时刻动力电池 SOC
charge_end_soc INT 充电结束时刻动力电池 SOC
charge_start_U FLOAT 充电开始时刻动力电池总电压（V）
charge_end_U FLOAT 充电结束时刻动力电池总电压（V）
charge_start_I FLOAT 充电开始时刻动力电池总电流（A）
charge_end_I FLOAT 充电结束时刻动力电池总电流（A）
charge_max_temp FLOAT 充电过程中电池系统温度探针最大值（℃）
charge_min_temp FLOAT 充电过程中电池系统温度探针最小值（℃）
charge_energy FLOAT 此充电过程的充电能量（kWh）
"""

# Load the labelled charging sessions and preview the first rows.
train_csv_path = 'data/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,mileage,charge_start_soc,charge_end_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp,charge_energy
0,305,2,20180527121119,20180527130020,67716.9,52.0,91.0,362.4,396.7,-33.5,-17.0,33.0,24.0,15.182
1,1448,4,20180306031848,20180306031858,93773.7,100.0,100.0,550.1,549.8,-4.0,-4.0,35.0,29.0,0.006
2,1313,4,20180130142926,20180130151346,86975.4,50.0,,531.7,,-20.0,,29.0,,34.464
3,1641,4,20180407093409,20180407093419,101538.2,100.0,100.0,551.5,551.0,-2.0,-1.0,28.0,24.0,0.002
4,435,3,20171202184133,20171202191313,142198.0,5.2,23.2,331.8,361.1,-84.8,-26.0,18.0,8.0,14.134


In [2]:
# Count the missing values in each column.
data.isna().sum()

Unnamed: 0             0
vehicle_id             0
charge_start_time      0
charge_end_time        0
mileage                0
charge_start_soc      27
charge_end_soc       198
charge_start_U         0
charge_end_U         137
charge_start_I         0
charge_end_I         317
charge_max_temp        7
charge_min_temp      448
charge_energy          0
dtype: int64

In [3]:
# Impute missing values with the per-vehicle mean of each column.
columns_with_missing = ['charge_start_soc', 'charge_end_soc',
                        'charge_end_U', 'charge_end_I',
                        'charge_max_temp', 'charge_min_temp']

for column in columns_with_missing:
    # transform('mean') returns one value per row (the mean of that row's
    # vehicle), aligned with data's own index, so no set_index/reset_index
    # round trip is needed.
    per_vehicle_mean = data.groupby('vehicle_id')[column].transform('mean')

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selected from the frame: that form operates on an intermediate
    # object and is not guaranteed to update `data`.
    # The second fillna is a fallback for vehicles that have no observed
    # value at all in this column (their per-vehicle mean is NaN).
    data[column] = (
        data[column]
        .fillna(per_vehicle_mean)
        .fillna(data[column].mean())
    )

In [4]:
# Hold out the last 30% of the rows (in file order, no shuffling) for evaluation.
num = len(data)
split_at = int(num * 0.7)

train = data.iloc[:split_at]
test = data.iloc[split_at:]
print(num, len(train), len(test))

2765 1935 830


In [5]:
# Columns fed to the model; the regression target is charge_energy.
selected_features = [
    'vehicle_id',
    'charge_start_time',
    'charge_end_time',
    'mileage',
    'charge_start_soc',
    'charge_end_soc',
    'charge_start_U',
    'charge_end_U',
    'charge_start_I',
    'charge_end_I',
    'charge_max_temp',
    'charge_min_temp',
]
target_column = 'charge_energy'

X_train, y_train = train[selected_features], train[target_column]
X_test, y_test = test[selected_features], test[target_column]

In [6]:
# Feature vectorization: turn every row into a {feature: value} dict and let
# DictVectorizer map those dicts to a dense numeric array.
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer(sparse=False)

# The orient must be spelled 'records' in full; the abbreviated 'record'
# is rejected by current pandas versions.
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
X_test = dict_vec.transform(X_test.to_dict(orient='records'))

In [7]:
# Fit the model and predict on the hold-out split.
from sklearn.ensemble import ExtraTreesRegressor

# Extremely randomized trees regressor; seeded so that re-running the
# notebook reproduces the same predictions and score.
model = ExtraTreesRegressor(random_state=42)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate: square root of the summed squared relative errors.
# Relative error grows without bound as the true value approaches zero, so
# sessions with a near-zero charge_energy can dominate this score.
y_true = np.asarray(y_test, dtype=float)
relative_error = (y_pred - y_true) / y_true
e = float(np.sum(relative_error ** 2))

e ** 0.5

345.26257690723389

![title](0.png)

In [8]:
# Load the unlabelled submission set and preview it.
# Note: this rebinds `test`, which previously held the hold-out split.
submission_csv_path = 'data/testA.csv'
test = pd.read_csv(submission_csv_path)
test.head()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,mileage,charge_start_soc,charge_end_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp
0,1,20171116161033,20171116170903,71710.6,44.0,90.8,370.5,383.9,-29.8,-40.1,21.0,13.0
1,1,20171210190533,20171210193743,75816.9,34.0,60.4,365.1,379.9,-25.9,-40.4,14.0,9.0
2,1,20180104213535,20180105042334,79867.3,39.2,100.0,364.0,399.9,-7.6,-7.8,11.0,2.0
3,1,20180215211935,20180216051456,86867.6,28.8,,362.9,399.9,-7.6,-7.6,8.0,3.0
4,2,20180525192134,20180526025112,67266.3,35.0,99.0,350.8,397.0,-9.4,-5.0,32.0,27.0


In [9]:
# Flag the columns that contain at least one missing value.
test.isna().any()

vehicle_id           False
charge_start_time    False
charge_end_time      False
mileage              False
charge_start_soc     False
charge_end_soc        True
charge_start_U       False
charge_end_U         False
charge_start_I       False
charge_end_I         False
charge_max_temp      False
charge_min_temp      False
dtype: bool

In [10]:
# Fill the missing end-of-charge SOC values with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# test['charge_end_soc'] acts on an intermediate object and is not guaranteed
# to update `test` itself.
test['charge_end_soc'] = test['charge_end_soc'].fillna(test['charge_end_soc'].mean())

In [11]:
# Vectorize the submission rows with the already-fitted vectorizer and predict.
# The orient must be spelled 'records' in full; the abbreviated 'record'
# is rejected by current pandas versions.
x = dict_vec.transform(test[selected_features].to_dict(orient='records'))
y = model.predict(x)

In [12]:
# Vehicle id of every submission row, in row order.
ids = test['vehicle_id'].tolist()
len(ids)

27

In [13]:
# Write the submission file: a header line followed by one
# "vehicle_id,charge_energy" row per prediction.
with open('submit-A_张彬城_暨南大学_15521106350_20181009.csv', 'w') as f:
    f.write('vehicle_id,charge_energy\n')
    for _id, _y in zip(ids, y):
        # The predicted energy is a float; formatting it with '%d' would
        # truncate it to an integer (e.g. 15.182 -> 15), so keep the decimals.
        line = '%d,%.6f\n' % (_id, _y)
        f.write(line)