In [3]:
from sklearn.model_selection import train_test_split #数据划分的包
from sklearn.linear_model import LinearRegression  #线性回归的包
from sklearn.preprocessing import StandardScaler  #归一化

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import time


In [4]:
## Character-set setup
# Make Chinese text displayable in matplotlib figures: select the SimHei font
# for the sans-serif family and turn off the unicode minus glyph alongside it
# (presumably so the minus sign still renders with that font -- confirm).
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [5]:
# Load the data
# Columns start with date and time, followed by the active/reactive power readings.
path = 'datas/household_power_consumption_1000.txt'
# The file is ';'-separated. '?' is its missing-value marker (the cleanup cell
# further down replaces '?' with NaN), so declare it at parse time: otherwise a
# single '?' turns an otherwise numeric column into object dtype.
df = pd.read_csv(path, sep=';', low_memory=False, na_values='?')

# Inspect column dtypes and non-null counts
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
Date                     1000 non-null object
Time                     1000 non-null object
Global_active_power      1000 non-null float64
Global_reactive_power    1000 non-null float64
Voltage                  1000 non-null float64
Global_intensity         1000 non-null float64
Sub_metering_1           1000 non-null float64
Sub_metering_2           1000 non-null float64
Sub_metering_3           1000 non-null float64
dtypes: float64(7), object(2)
memory usage: 70.4+ KB


In [6]:
# Preview the first 10 rows of the raw frame.
df.head(10)

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0
5,16/12/2006,17:29:00,3.52,0.522,235.02,15.0,0.0,2.0,17.0
6,16/12/2006,17:30:00,3.702,0.52,235.09,15.8,0.0,1.0,17.0
7,16/12/2006,17:31:00,3.7,0.52,235.22,15.8,0.0,1.0,17.0
8,16/12/2006,17:32:00,3.668,0.51,233.99,15.8,0.0,1.0,17.0
9,16/12/2006,17:33:00,3.662,0.51,233.86,15.8,0.0,2.0,16.0


In [7]:
# Turn the '?' placeholder into a real missing value.
new_df = df.replace(to_replace='?', value=np.nan)
# Keep only the rows in which no column is missing.
datas = new_df.dropna(axis='index', how='any')
# Summary statistics, transposed so that each measured column becomes a row.
datas.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Global_active_power,1000.0,2.418772,1.239979,0.206,1.806,2.414,3.308,7.706
Global_reactive_power,1000.0,0.089232,0.088088,0.0,0.0,0.072,0.126,0.528
Voltage,1000.0,240.03579,4.08442,230.98,236.94,240.65,243.295,249.37
Global_intensity,1000.0,10.351,5.122214,0.8,8.4,10.0,14.0,33.2
Sub_metering_1,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Sub_metering_2,1000.0,2.749,8.104053,0.0,0.0,0.0,1.0,38.0
Sub_metering_3,1000.0,5.756,8.066941,0.0,0.0,0.0,17.0,19.0


In [8]:
# Re-check dtypes and non-null counts on the cleaned frame.
datas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 9 columns):
Date                     1000 non-null object
Time                     1000 non-null object
Global_active_power      1000 non-null float64
Global_reactive_power    1000 non-null float64
Voltage                  1000 non-null float64
Global_intensity         1000 non-null float64
Sub_metering_1           1000 non-null float64
Sub_metering_2           1000 non-null float64
Sub_metering_3           1000 non-null float64
dtypes: float64(7), object(2)
memory usage: 78.1+ KB


In [9]:

# Feature source: the first two columns of the cleaned frame
# (Date and Time, going by the column listing printed by info() above).
X = datas.iloc[:,:2]
## Helper that parses a date string and a time string into numeric fields
def date_format(dt):
    """Convert a (date, time) pair of strings into calendar components.

    Args:
        dt: iterable of strings that, joined with a single space, matches
            '%d/%m/%Y %H:%M:%S' (e.g. ('16/12/2006', '17:24:00')).

    Returns:
        Tuple of ints (year, month, day, hour, minute, second).

    Raises:
        ValueError: if the joined string does not match the expected format.
    """
    stamp = ' '.join(dt)
    parsed = time.strptime(stamp, '%d/%m/%Y %H:%M:%S')
    # The first six struct_time fields are exactly
    # tm_year, tm_mon, tm_mday, tm_hour, tm_min, tm_sec.
    return tuple(parsed[:6])
# Expand every (Date, Time) row into six numeric columns labelled 0..5
# (year, month, day, hour, minute, second, in the order date_format returns them).
X = X.apply(lambda row: pd.Series(date_format(row)), axis='columns')
# Target: the third column of the cleaned frame
# (Global_active_power according to the info() listing above).
Y = datas.iloc[:, 2]
X.head(4)

Unnamed: 0,0,1,2,3,4,5
0,2006,12,16,17,24,0
1,2006,12,16,17,25,0
2,2006,12,16,17,26,0
3,2006,12,16,17,27,0


In [10]:
# Split the data set
# X: feature attributes
# Y: target attribute
# test_size: 0.2 holds out 20% of the rows for testing
# random_state: the split is random; passing a fixed seed makes the split, and
#               therefore every score printed below, reproducible across runs
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
print(X_train.shape)
print(Y_train.shape)

# Build and train the model
lr = LinearRegression()
lr.fit(X_train, Y_train)
Y_predict = lr.predict(X_test)

# lr.score on the training split, then on the held-out split
# (for a regressor this is the R^2 coefficient of determination)
print(lr.score(X_train, Y_train))
print(lr.score(X_test, Y_test))
# Root-mean-squared error of the held-out predictions
mse = np.average((Y_predict - Y_test) ** 2)
rmse = np.sqrt(mse)
print('rmse:', rmse)
# Fitted coefficients, one per feature column
print(lr.coef_)

(800, 6)
(800,)
0.227392462282
0.199111207622
rmse: 1.21715398703
[  0.00000000e+00  -3.95516953e-16  -2.51948224e+00  -9.27593422e-02
  -5.79202285e-03   0.00000000e+00]


In [11]:
# Build and train a linear regression on the training split.
# (These are the same steps that close the previous cell.)
lr = LinearRegression().fit(X_train, Y_train)  # fit() hands back the estimator
Y_predict = lr.predict(X_test)

# Score on the training split, then on the held-out split
print(lr.score(X_train, Y_train))
print(lr.score(X_test, Y_test))

# Root-mean-squared error of the held-out predictions
squared_error = (Y_predict - Y_test) ** 2
mse = np.average(squared_error)
rmse = np.sqrt(mse)
print('rmse:', rmse)
print(lr.coef_)

0.227392462282
0.199111207622
rmse: 1.21715398703
[  0.00000000e+00  -3.95516953e-16  -2.51948224e+00  -9.27593422e-02
  -5.79202285e-03   0.00000000e+00]


In [12]:
# Standardize the features with StandardScaler.
# Naming convention of the API used here:
#   "fit" in a method name       -> the object is trained on data
#   "transform" in a method name -> the data is converted
#   "predict" in a method name   -> predictions are produced and returned
ss = StandardScaler()
ss.fit(X_train)                  # learn the scaling from the training rows only
# NOTE: X_train / X_test are overwritten with their scaled versions from here on.
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)    # reuse the training-derived scaling
pd.DataFrame(X_train).describe()

Unnamed: 0,0,1,2,3,4,5
count,800.0,800.0,800.0,800.0,800.0,800.0
mean,0.0,0.0,-1.826872e-15,-5.2180480000000004e-17,5.911938e-17,0.0
std,0.0,0.0,1.000626,1.000626,1.000626,0.0
min,0.0,0.0,-1.267227,-1.310319,-1.714177,0.0
25%,0.0,0.0,-1.267227,-0.8123355,-0.8388899,0.0
50%,0.0,0.0,0.7891249,-0.3143521,0.03639736,0.0
75%,0.0,0.0,0.7891249,1.055102,0.8533321,0.0
max,0.0,0.0,0.7891249,1.553086,1.728619,0.0


In [13]:
# Build and train a linear regression on the current X_train / Y_train
# (the features were standardized in the preceding cell).
lr = LinearRegression().fit(X_train, Y_train)  # fit() hands back the estimator
Y_predict = lr.predict(X_test)

# Score on the training split, then on the held-out split
print(lr.score(X_train, Y_train))
print(lr.score(X_test, Y_test))

# Root-mean-squared error of the held-out predictions
squared_error = (Y_predict - Y_test) ** 2
mse = np.average(squared_error)
rmse = np.sqrt(mse)
print('rmse:', rmse)
print(lr.coef_)

0.227392462282
0.199111207622
rmse: 1.21715398703
[  0.00000000e+00   5.55111512e-17  -1.22521967e+00  -7.45079708e-01
  -9.92592335e-02   0.00000000e+00]


In [15]:
## Model saving / persistence

# joblib used to be reachable as sklearn.externals.joblib; scikit-learn
# deprecated and later removed that vendored copy, so import the standalone
# package first and fall back to the vendored path only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(ss, 'data_ss.model')  # save the fitted standardization model
joblib.dump(lr, 'data_lr.model')  # save the fitted regression model

# Reload both objects from disk.
# NOTE(review): joblib files are pickle-based -- only load files you trust.
ss = joblib.load('data_ss.model')
lr = joblib.load('data_lr.model')