## 用逻辑回归算法训练模型

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
# Build the training and test sets from the iris dataset.
# 20% of the samples are held out for testing; the fixed random_state
# keeps the split identical across runs.
data = load_iris()
features, labels = data.data, data.target
train_X, test_X, train_y, test_y = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=4,
)

In [4]:
# Train the model.
# Hyperparameters are gathered in one mapping so they are easy to read and tweak.
hyperparams = {
    'C': 0.1,
    'max_iter': 1000,
    'fit_intercept': True,
    'n_jobs': 3,
}
model = LogisticRegression(**hyperparams)
# Bare last expression: the notebook displays the fitted estimator's repr.
model.fit(train_X, train_y)

LogisticRegression(C=0.1, max_iter=1000, n_jobs=3)

## Pickle Module

- pickle可以序列化对象并保存到磁盘中，并在需要的时候读取出来。大多数Python对象都可以执行序列化操作，因此可以将训练好的模型对象序列化并存储到本地。
- cPickle是Python 2中用C实现的pickle模块，性能更好；在Python 3中，标准库的pickle会自动使用C加速实现，直接 `import pickle` 即可。

In [10]:
import os
import pickle

# Serialize the trained model and save it to disk.
pickle_path = r'models/pickle_model.pkl'
# Ensure the target directory exists first: open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
with open(pickle_path, 'wb') as file:
    pickle.dump(model, file)

In [15]:
# Load the pickled model back from disk and use it.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(r'models/pickle_model.pkl', 'rb') as file:
    pkl_model = pickle.loads(file.read())

# Score the restored model on the test split and report it as a percentage.
score = pkl_model.score(test_X, test_y)
percent = 100 * score
print('Test score: {0:.2f}%'.format(percent))

# Predict on the test split; the bare last expression displays the result.
predict_y = pkl_model.predict(test_X)
predict_y

Test score: 96.67%


array([2, 0, 2, 2, 2, 1, 2, 0, 0, 2, 0, 0, 0, 1, 2, 0, 1, 0, 0, 2, 0, 2,
       1, 0, 0, 0, 0, 0, 0, 2])

## Joblib Module

- joblib是sklearn所依赖的一个工具库（会随sklearn一同安装），可用于模型的持久化存储，并做了很多优化。
- 在多数场景下，joblib的性能要优于pickle，尤其是当数据量较大时更加明显。

In [17]:
import os

import joblib

# Persist the trained model with joblib.
joblib_path = r'models/joblib_model.pkl'
# Ensure the target directory exists first; writing into a missing directory
# would fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(joblib_path), exist_ok=True)
# Bare last expression: the notebook displays the list of files written.
joblib.dump(model, joblib_path)

['models/joblib_model.pkl']

In [18]:
# Restore the model that was persisted with joblib.
job_model = joblib.load(r'models/joblib_model.pkl')

# Score the restored model on the test split and report it as a percentage.
score = job_model.score(test_X, test_y)
message = 'Test score: {0:.2f}%'.format(100 * score)
print(message)

# Predict on the test split; the bare last expression displays the result.
predict_y = job_model.predict(test_X)
predict_y

Test score: 96.67%


array([2, 0, 2, 2, 2, 1, 2, 0, 0, 2, 0, 0, 0, 1, 2, 0, 1, 0, 0, 2, 0, 2,
       1, 0, 0, 0, 0, 0, 0, 2])