In [2]:
import pandas as pd
import numpy as np

"""
vehicle_id STRING 车辆唯一标志码
charge_start_time INT 充电开始时间
charge_end_time INT 充电结束时间
mileage FLOAT 充电开始时刻车辆仪表里程（km）
charge_start_soc INT 充电开始时刻动力电池 SOC
charge_end_soc INT 充电结束时刻动力电池 SOC
charge_start_U FLOAT 充电开始时刻动力电池总电压（V）
charge_end_U FLOAT 充电结束时刻动力电池总电压（V）
charge_start_I FLOAT 充电开始时刻动力电池总电流（A）
charge_end_I FLOAT 充电结束时刻动力电池总电流（A）
charge_max_temp FLOAT 充电过程中电池系统温度探针最大值（℃）
charge_min_temp FLOAT 充电过程中电池系统温度探针最小值（℃）
charge_energy FLOAT 此充电过程的充电能量（kWh）
"""

data = pd.read_csv('data/predict_data_e_train.csv')
data[data == 0] = np.nan
data.head()

ImportError: dateutil 2.5.0 is the minimum required version

In [2]:
# 找出缺失值的列
data.isnull().any()

vehicle_id           False
charge_start_time    False
charge_end_time      False
mileage              False
charge_start_soc      True
charge_end_soc        True
charge_start_U       False
charge_end_U          True
charge_start_I       False
charge_end_I          True
charge_max_temp       True
charge_min_temp       True
charge_energy         True
dtype: bool

In [3]:
# 填充缺失值的列
data['charge_start_soc'].fillna(data['charge_start_soc'].mean(), inplace=True)
data['charge_end_soc'].fillna(data['charge_end_soc'].mean(), inplace=True)
data['charge_start_U'].fillna(data['charge_start_U'].mean(), inplace=True)
data['charge_end_U'].fillna(data['charge_end_U'].mean(), inplace=True)
data['charge_start_I'].fillna(data['charge_start_I'].mean(), inplace=True)
data['charge_end_I'].fillna(data['charge_end_I'].mean(), inplace=True)
data['charge_max_temp'].fillna(data['charge_max_temp'].mean(), inplace=True)
data['charge_min_temp'].fillna(data['charge_min_temp'].mean(), inplace=True)

data['charge_energy'].fillna(data['charge_energy'].mean(), inplace=True)

In [4]:
# 两列相减
data['time'] = data.apply(lambda row: row.charge_end_time - row.charge_start_time, axis=1)
data['soc'] = data.apply(lambda row: row.charge_end_soc - row.charge_start_soc, axis=1)
data['U'] = data.apply(lambda row: row.charge_end_U - row.charge_start_U, axis=1)
data['I'] = data.apply(lambda row: row.charge_end_I - row.charge_start_I, axis=1)
data['temp'] = data.apply(lambda row: row.charge_max_temp - row.charge_min_temp, axis=1)

In [5]:
num = len(data)
data.sample(frac=1)
train = data[:int(num*0.7)]
test = data[int(num*0.7):]
print(num, len(train), len(test))#charge_energy

2782 1947 835


In [6]:
selected_features = ['vehicle_id',
                     'time',
                     'mileage',
                     'soc',
                     'U',
                     'I',
                     'temp']

X_train = train[selected_features]
X_test = test[selected_features]
y_train = train['charge_energy']
y_test = test['charge_energy']

In [None]:
# 特征向量化
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer(sparse=False)

X_train = dict_vec.fit_transform(X_train.to_dict(orient='record'))
X_test = dict_vec.transform(X_test.to_dict(orient='record'))

In [8]:
# 随机森林回归模型预测
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

![title](0.png)

In [9]:
y_pred = list(y_pred)
y_test = list(y_test)

In [10]:
e = 0
for a, r in zip(y_test, y_pred):
    e = ((r- a) / float(a)) ** 2

e

0.10145291909955811

![title](创新组.png)

In [11]:
test = pd.read_csv('data/testA.csv')
test.head()

Unnamed: 0,vehicle_id,charge_start_time,charge_end_time,mileage,charge_start_soc,charge_end_soc,charge_start_U,charge_end_U,charge_start_I,charge_end_I,charge_max_temp,charge_min_temp
0,1,20171116161033,20171116170903,71710.6,44.0,90.8,370.5,383.9,-29.8,-40.1,21.0,13.0
1,1,20171210190533,20171210193743,75816.9,34.0,60.4,365.1,379.9,-25.9,-40.4,14.0,9.0
2,1,20180104213535,20180105042334,79867.3,39.2,100.0,364.0,399.9,-7.6,-7.8,11.0,2.0
3,1,20180215211935,20180216051456,86867.6,28.8,,362.9,399.9,-7.6,-7.6,8.0,3.0
4,2,20180525192134,20180526025112,67266.3,35.0,99.0,350.8,397.0,-9.4,-5.0,32.0,27.0


In [12]:
# 找出缺失值的列
test.isnull().any()

vehicle_id           False
charge_start_time    False
charge_end_time      False
mileage              False
charge_start_soc     False
charge_end_soc        True
charge_start_U       False
charge_end_U         False
charge_start_I       False
charge_end_I         False
charge_max_temp      False
charge_min_temp      False
dtype: bool

In [13]:
# 填充缺失值的列
test['charge_end_soc'].fillna(test['charge_end_soc'].mean(), inplace=True)

In [14]:
# 两列相减
test['time'] = test.apply(lambda row: row.charge_end_time - row.charge_start_time, axis=1)
test['soc'] = test.apply(lambda row: row.charge_end_soc - row.charge_start_soc, axis=1)
test['U'] = test.apply(lambda row: row.charge_end_U - row.charge_start_U, axis=1)
test['I'] = test.apply(lambda row: row.charge_end_I - row.charge_start_I, axis=1)
test['temp'] = test.apply(lambda row: row.charge_max_temp - row.charge_min_temp, axis=1)

In [15]:
x = dict_vec.transform(test[selected_features].to_dict(orient='record'))
y = model.predict(x)

In [16]:
ids = list(test['vehicle_id'])
len(ids)

27

In [17]:
with open('submit-A_张彬城_暨南大学_15521106350_20181009.csv', 'w') as f:
    f.write('vehicle_id,charge_energy\n')
    for _id, _y in zip(ids, y):
        line = '%d,%d\n' % (_id, _y)
        f.write(line)