## Machine Learning with scikit-learn 

In [None]:
import numpy as np
import pandas as pd
import datetime as dt
from pylab import mpl, plt
import warnings; warnings.simplefilter('ignore')

In [None]:
plt.style.use('seaborn')
mpl.rcParams['font.family'] = 'serif'
np.random.seed(1000)
np.set_printoptions(suppress=True, precision=4) # 将数组科学计数法转化为浮点数，精度为4
%matplotlib inline

## Supervised Learning

### The Data

In [None]:
from sklearn.datasets import make_classification

In [None]:
n_samples = 100

In [None]:
X, y = make_classification(n_samples=n_samples, n_features=2, n_informative=2,
                           n_redundant=0, n_repeated=0, random_state=250)

# n_samples: number of sample points, i.e. the number of n_features-dimensional
#     vectors (rows of X).
# n_features: dimension of each sample vector (total number of columns of X).
# n_informative: number of informative features, i.e. the features that
#     actually carry the class signal.
# n_redundant: number of redundant features; these are generated as random
#     linear combinations of the informative features.
# n_repeated: number of duplicated features; also redundant information, but
#     each one is a plain copy of an existing informative or redundant feature
#     (one coefficient is 1, all others are 0).
# n_classes: number of classes; the labels y run from 0 to n_classes - 1.
# n_clusters_per_class: number of clusters each class is further divided into.
# random_state: random seed; with an integer the same sample is generated on
#     every call, which makes results reproducible; with None a new random
#     sample is generated each time.
# Constraints:
#     n_informative + n_redundant + n_repeated <= n_features
#     n_classes * n_clusters_per_class <= 2 ** n_informative

In [None]:
X[:5]  

In [None]:
X.shape  

In [None]:
y[:5]  

In [None]:
y.shape  

In [None]:
# Plot the first feature against the second one, colouring every sample by its
# class label, and write the figure to disk.
plt.figure(figsize=(10, 6))
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(x=first_feature, y=second_feature, c=y, cmap='coolwarm')
plt.savefig('ml_plot_03.png')

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [None]:
model = GaussianNB()

In [None]:
model.fit(X, y)

In [None]:
# predict_proba returns, for every sample, the estimated probability of each
# possible class. The values are rounded to 4 decimal places and only the
# first 5 rows are shown.
model.predict_proba(X).round(4)[:5]

In [None]:
pred = model.predict(X)  

In [None]:
pred  

In [None]:
pred == y  

In [None]:
accuracy_score(y, pred)  

In [None]:
Xc = X[y == pred]  
Xf = X[y != pred]  

In [None]:
# Plot the predictions: correctly classified samples (Xc) as circles and
# misclassified samples (Xf) as crosses, both coloured by their true label.
# vmin/vmax pin the colour scale to the full label range. Without them each
# scatter call rescales to the labels it receives, so a subset that happens to
# contain a single class would be drawn in the colour of the lowest class.
plt.figure(figsize=(10, 6))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm', vmin=y.min(), vmax=y.max())
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm', vmin=y.min(), vmax=y.max())
plt.savefig('ml_plot_04.png')

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
model = LogisticRegression(C=1)
# C (float, default=1.0): inverse of the regularization strength; must be a
# positive float. As with support vector machines, smaller values specify
# stronger regularization.

In [None]:
model.fit(X, y)

In [None]:
model.predict_proba(X).round(4)[:5]

In [None]:
pred = model.predict(X)

In [None]:
accuracy_score(y, pred)

In [None]:
Xc = X[y == pred]
Xf = X[y != pred]

In [None]:
# Plot the predictions: correctly classified samples (Xc) as circles and
# misclassified samples (Xf) as crosses, both coloured by their true label.
# vmin/vmax pin the colour scale to the full label range. Without them each
# scatter call rescales to the labels it receives, so a subset that happens to
# contain a single class would be drawn in the colour of the lowest class.
plt.figure(figsize=(10, 6))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm', vmin=y.min(), vmax=y.max())
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm', vmin=y.min(), vmax=y.max());

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier(max_depth=1)
# Main constructor parameters of DecisionTreeClassifier (the values below are
# illustrative; only max_depth=1 is actually set above):
# criterion="entropy"   # impurity measure: "entropy" uses information entropy, "gini" uses the Gini impurity
# splitter="best"       # split strategy: "best" takes the best split at each node; "random" splits more randomly and is often used against overfitting
# max_depth=10          # limits the maximum depth of the tree
# min_samples_split=5   # minimum number of training samples a node must contain to be split
# min_samples_leaf=1    # minimum number of samples a leaf must contain
# min_weight_fraction_leaf=0.0
# max_features=None     # limits the number of features considered for a split
# random_state=None     # any fixed integer makes the model reproducible, so the score no longer changes between runs
# max_leaf_nodes=None
# min_impurity_decrease=0.0   # a split only happens if it decreases the impurity by at least this value
# min_impurity_split=None     # impurity threshold a node must exceed to be split; removed in scikit-learn 1.0, use min_impurity_decrease instead
# class_weight=None     # class weights; useful when the classes are strongly imbalanced and the minority class must be estimated accurately; combine with min_weight_fraction_leaf for pruning

In [None]:
model.fit(X, y)

In [None]:
model.predict_proba(X).round(4)[:5]

In [None]:
pred = model.predict(X)

In [None]:
accuracy_score(y, pred)

In [None]:
Xc = X[y == pred]
Xf = X[y != pred]

In [None]:
# Plot the predictions: correctly classified samples (Xc) as circles and
# misclassified samples (Xf) as crosses, both coloured by their true label.
# vmin/vmax pin the colour scale to the full label range. Without them each
# scatter call rescales to the labels it receives, so a subset that happens to
# contain a single class would be drawn in the colour of the lowest class.
plt.figure(figsize=(10, 6))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm', vmin=y.min(), vmax=y.max())
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm', vmin=y.min(), vmax=y.max())
plt.savefig('ml_plot_05.png')

In [None]:
# Report the training-set accuracy of decision trees whose maximum depth
# ranges from 1 to 6, one table row per depth.
header = '{:>8s} | {:8s}'.format('depth', 'accuracy')
print(header)
print('-' * 20)
for depth in range(1, 7):
    # fit() returns the fitted estimator, so construction and fit can be chained.
    model = DecisionTreeClassifier(max_depth=depth).fit(X, y)
    acc = accuracy_score(y, model.predict(X))
    print('{:8d} | {:8.2f}'.format(depth, acc))

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=2 * [75], random_state=10)

# solver: the optimizer used for the weights, one of {'lbfgs', 'sgd', 'adam'},
#     default 'adam'. 'lbfgs' is an optimizer from the family of quasi-Newton
#     methods; 'sgd' is stochastic gradient descent; 'adam' is a stochastic
#     gradient-based optimizer.
# alpha: float, default 0.0001; the regularization parameter.
# hidden_layer_sizes: 2 * [75] == [75, 75]

In [None]:
%time model.fit(X, y)

In [None]:
pred = model.predict(X)
pred

In [None]:
accuracy_score(y, pred)

## Support Vector Machine (SVM)

In [None]:
from sklearn import svm

In [None]:
xx = [[0, 0], [2, 2]]

In [None]:
yy = [0.5, 2.5]

In [None]:
regr = svm.SVR()
# SVR is the support-vector-machine estimator used for regression.

In [None]:
regr.fit(xx, yy)

In [None]:
regr.predict([[1, 1]])

## Feature Transforms

In [None]:
from sklearn import preprocessing

In [None]:
X[:5]

In [None]:
Xs = preprocessing.StandardScaler().fit_transform(X)  
Xs[:5]
# StandardScaler standardizes every feature (column) to zero mean and unit
# variance. It only shifts and rescales the data; it does not turn the data
# into a normal distribution.

In [None]:
Xm = preprocessing.MinMaxScaler().fit_transform(X)  
Xm[:5]
# MinMaxScaler rescales every feature to a fixed interval, [0, 1] by default.

In [None]:
Xn1 = preprocessing.Normalizer(norm='l1').transform(X)  
Xn1[:5]
# L1 normalization: every sample (row) is rescaled to unit L1 norm, where the
# L1 norm is the sum of the absolute values of the vector's elements.

In [None]:
Xn2 = preprocessing.Normalizer(norm='l2').transform(X)  
Xn2[:5]
# L2 normalization: every sample (row) is rescaled to unit L2 norm, where the
# L2 norm is the square root of the sum of the squared elements.

In [None]:
# Overlay the raw data and its four transformed versions in a single scatter
# plot: one marker style per data set, colours given by the class labels.
plt.figure(figsize=(10, 6))
variants = (
    ('raw', 'o', X),
    ('standard', '.', Xs),
    ('minmax', 'x', Xm),
    ('norm(1)', '^', Xn1),
    ('norm(2)', 'v', Xn2),
)
for name, marker, data in variants:
    points = np.array(data)
    plt.scatter(x=points[:, 0], y=points[:, 1], c=y,
                marker=marker, cmap='coolwarm', label=name)
plt.legend()
plt.savefig('ml_plot_06.png')

In [None]:
X[:5]

In [None]:
Xb = preprocessing.Binarizer().fit_transform(X)  
Xb[:5]
# Binarizer maps every value to 0 or 1: values less than or equal to the
# threshold become 0, values greater than the threshold become 1. The
# threshold defaults to 0.

In [None]:
2 ** 2  

In [None]:
Xd = np.digitize(X, bins=[-1, 0, 1])  
Xd[:5]
# np.digitize assigns every value to a bin and returns the bin index;
# bins lists the edges that delimit the intervals.

In [None]:
4 ** 2  

## Train-Test Splits 

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
X, y = make_classification(n_samples=n_samples, n_features=2, n_informative=2,
                           n_redundant=0, n_repeated=0, random_state=250)
# n_samples=100

In [None]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.33, 
                                                    random_state=0)

In [None]:
model = SVC(C=1, kernel='linear')

In [None]:
model.fit(train_x, train_y)  

In [None]:
pred_train = model.predict(train_x)  

In [None]:
accuracy_score(train_y, pred_train)  

In [None]:
pred_test = model.predict(test_x)  

In [None]:
test_y == pred_test  

In [None]:
accuracy_score(test_y, pred_test)  

In [None]:
test_c = test_x[test_y == pred_test]
test_f = test_x[test_y != pred_test]

In [None]:
# Plot the test-set predictions: correctly classified samples (test_c) as
# circles and misclassified samples (test_f) as crosses, both coloured by
# their true label.
# vmin/vmax pin the colour scale to the full label range. Without them each
# scatter call rescales to the labels it receives, so a subset that happens to
# contain a single class would be drawn in the colour of the lowest class.
plt.figure(figsize=(10, 6))
plt.scatter(x=test_c[:, 0], y=test_c[:, 1], c=test_y[test_y == pred_test],
            marker='o', cmap='coolwarm', vmin=y.min(), vmax=y.max())
plt.scatter(x=test_f[:, 0], y=test_f[:, 1], c=test_y[test_y != pred_test],
            marker='x', cmap='coolwarm', vmin=y.min(), vmax=y.max())
plt.savefig('ml_plot_07.png')

In [None]:
bins = np.linspace(-4.5, 4.5, 50)

In [None]:
Xd = np.digitize(X, bins=bins)
# Map the continuous feature space to discrete values (bin indices).

In [None]:
Xd[:5]

In [None]:
train_x, test_x, train_y, test_y = train_test_split(Xd, y, test_size=0.33,
                                                    random_state=0)

In [None]:
# Report the test-set accuracy of an SVC (C=1) for each kernel type, one table
# row per kernel.
header = '{:>8s} | {:8s}'.format('kernel', 'accuracy')
print(header)
print('-' * 20)
for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
    # fit() returns the fitted estimator, so construction and fit can be chained.
    model = SVC(C=1, kernel=kernel).fit(train_x, train_y)
    acc = accuracy_score(test_y, model.predict(test_x))
    print('{:>8s} | {:8.3f}'.format(kernel, acc))

## Unsupervised Learning

### The Data

In [None]:
# make_blobs is part of the public sklearn.datasets API; the underscore-prefixed
# _samples_generator module is a private implementation detail that may move.
from sklearn.datasets import make_blobs

In [None]:
X, y = make_blobs(n_samples=250, centers=4,
                  random_state=500, cluster_std=1.25)  
# Create 250 data points grouped around 4 centers.
# n_features: number of features per sample (not passed here; make_blobs
#     defaults to 2).
# n_samples: number of samples.
# centers: number of cluster centers, i.e. the number of distinct labels.
# random_state: random seed; fixing it makes the generated data reproducible.
# cluster_std: standard deviation of each cluster.

In [None]:
# Scatter plot of the unlabelled sample (first feature vs. second feature),
# written to disk afterwards.
plt.figure(figsize=(10, 6))
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, s=50)
plt.savefig('ml_plot_01.png')

### K-Means Clustering

In [None]:
from sklearn.cluster import KMeans  

In [None]:
model = KMeans(n_clusters=4, random_state=0)  

In [None]:
model.fit(X)  

In [None]:
y_kmeans = model.predict(X)  

In [None]:
y_kmeans[:12]  

In [None]:
# Scatter plot of the sample with every point coloured by the cluster index
# that k-means assigned to it, written to disk afterwards.
plt.figure(figsize=(10, 6))
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, c=y_kmeans, cmap='coolwarm')
plt.savefig('ml_plot_02.png')

### Gaussian Mixtures

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
model = GaussianMixture(n_components=4, random_state=0)

In [None]:
model.fit(X)

In [None]:
y_gm = model.predict(X)

In [None]:
y_gm[:12]

In [None]:
# Cluster indices are arbitrary: two models can find exactly the same groups
# yet number them differently, so an element-wise comparison of the raw labels
# can report a mismatch for identical clusterings. Compare the partitions
# instead: they coincide exactly when the labels pair up one-to-one, i.e. when
# the number of distinct (y_gm, y_kmeans) pairs equals the number of distinct
# labels on both sides.
len(set(zip(y_gm, y_kmeans))) == len(set(y_gm)) == len(set(y_kmeans))