In [44]:
import pandas as pd 
import time
import os
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.externals import joblib

In [3]:
# 设置字符集，防止中文乱码
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False

In [9]:
# 1. 数据加载
power_datas = pd.read_csv(filepath_or_buffer='./datas/household_power_consumption_1000.txt',sep=';',low_memory=False)
power_datas.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [8]:
# 2. 数据清洗、处理
# 移除行数据中有空的数据行
power_datas = power_datas.dropna(axis=0,how='any')

In [28]:
# 时间转换方法
def timeToFormate(x):
    t = time.strptime(' '.join(x),'%d/%m/%Y %H:%M:%S')
    return (t.tm_year, t.tm_mon, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec)

In [31]:
# 需求1：时间与功率的关系
# 时间为特征属性（年、月、日、时、分、秒）  功率为目标属性
# 首先取时间相关的列
X_time = power_datas.iloc[:,0:2]
X_time = X_time.apply(lambda x :pd.Series(timeToFormate(x)),axis = 1)
Y = power_datas['Global_active_power']

In [35]:
# 划分训练集和测试集
X_Time_train, X_Time_test, Y_train, Y_test = train_test_split(X_time, Y, test_size = 0.2, random_state = 214)
print(X_Time_train.shape)
print(X_Time_test.shape)

(800, 6)
(200, 6)


In [37]:
# 创建模型对象
# fit_intercept = True 截距项
lr = LinearRegression(fit_intercept = True)

In [38]:
# 模型训练
lr.fit(X_Time_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [41]:
# 模型效果评估
print('训练集上的效果:{}'.format(lr.score(X_Time_train,Y_train)))
print('测试集上的效果:{}'.format(lr.score(X_Time_train,Y_train)))

训练集上的效果:0.24121879952442246
测试集上的效果:0.24121879952442246


In [42]:
# 输出模型相关参数
print('模型系数θ:',end='')
print(lr.coef_)
print('截距项:',end='')
print(lr.intercept_)

模型系数θ:[  0.00000000e+00   2.98372438e-16  -2.76913422e+00  -1.06249822e-01
  -5.10649205e-03   0.00000000e+00]
截距项:49.6661902692


In [46]:
# 模型持久化

filename = 'result/power_time_lr.model'
dirname = os.path.dirname(filename)
if not os.path.exists(dirname):
    os.makedirs(dirname)
joblib.dump(value= lr,filename=filename)    

['result/power_time_lr.model']