<img src="./img/dataview_outline.png"> 
[r](#reg)

<br><br><br><br>

## <font size='7'>matplotlib</font>

In [None]:
#引入matplotlib包
import matplotlib.pyplot as plt


<img style="float:left;" src="./img/dataview_figure.png">

* ### figure

In [None]:
# Create an empty Figure; the subplots below are added to it
fig = plt.figure()

<img style="float:left;" src="./img/dataview_add_subplot.png">

* ### subplot

In [None]:
# Add four Axes to `fig`, one in each position (1-4) of a 2x2 grid
ax1 = fig.add_subplot(2,2,1)
ax2 = fig.add_subplot(2,2,2)
ax3 = fig.add_subplot(2,2,3)
ax4 = fig.add_subplot(2,2,4)

In [None]:
# Plot on a subplot
import numpy as np

# 100 samples passed to plt.plot as y-values (x defaults to the sample index)
random_arr = np.random.randn(100)
print(random_arr)

# By default pyplot draws on the most recently used subplot, but that has no effect in Jupyter
plt.plot(random_arr)
plt.show()

In [None]:
# Overlay a theoretical normal density with a histogram of samples drawn from it

import scipy as sp
from scipy import stats

# 50 evenly spaced evaluation points covering mean +/- 5 standard deviations
x = np.linspace(-5, 15, 50)
print(x.shape)

# Gaussian density with mean 5 and standard deviation 2
plt.plot(x, sp.stats.norm.pdf(x=x, loc=5, scale=2))

# Overlay a histogram of 200 random draws from the same distribution.
# `density=True` normalizes the bars to unit area so they are comparable to the pdf;
# it replaces the `normed` keyword, which was removed from matplotlib.
plt.hist(sp.stats.norm.rvs(loc=5, scale=2, size=200), bins=50, density=True, color='r')
plt.show()

<img style="float:left;" src="./img/dataview_hsbi.png">

In [None]:
# Histogram of 100 random samples: 10 bins, blue, 30% opacity
plt.hist(np.random.randn(100), bins=10, color='b', alpha=0.3)
plt.show()

In [None]:
# Scatter plot: y is x plus random noise scaled by 5
x = np.arange(50)
y = x + 5 * np.random.rand(50)
plt.scatter(x, y)

In [None]:
# Grouped bar chart: two series drawn side by side for each of five categories
x = np.arange(5)
y1, y2 = np.random.randint(1, 25, size=(2,5))
width = 0.5
ax = plt.subplot(1,1,1)
ax.bar(x, y1, width, color='r')
ax.bar(x+width, y2, width, color='g')
# Bars are centre-aligned by default, so each red/green pair spans
# [x - width/2, x + 3*width/2]; its midpoint is x + width/2.
ax.set_xticks(x + width / 2)
ax.set_xticklabels(['a', 'b', 'c', 'd', 'e'])
plt.show()

In [None]:
# Render a matrix as an image
m = np.random.rand(10,10)
print(m)
# One flat-coloured cell per matrix entry (no smoothing), using the 'ocean' colormap
plt.imshow(m, interpolation='nearest', cmap=plt.cm.ocean)
# Colour scale showing how values map to colours
plt.colorbar()
plt.show()

<img style="float:left;" src="./img/dataview_subplots.png">

* ## plt.subplots()

In [None]:
# Create a figure and a 2x2 array of Axes in one call; draw only in the top-left one
fig, subplot_arr = plt.subplots(2,2)
subplot_arr[0,0].hist(np.random.randn(100), bins=10, color='b', alpha=0.3)
plt.show()

<img style="float:left;" src="./img/dataview_cml.png">

* ## 颜色、标记、线型

In [None]:
# Two stacked Axes drawn with the same style, specified in two different ways
fig, axes = plt.subplots(2)
# Format-string shorthand: 'r' = red, 'o' = circle marker, '--' = dashed line
axes[0].plot(np.random.randint(0, 100, 50), 'ro--')

# Equivalent, spelled out with keyword arguments
axes[1].plot(np.random.randint(0, 100, 50), color='r',linestyle='dashed', marker='o')

<img style="float:left;" src="./img/dataview_ticks.png">

* ## 刻度、标签、图例

In [None]:
fig, ax = plt.subplots(1)
ax.plot(np.random.randn(1000).cumsum(), label='line0')

# Axis range (pyplot equivalent: plt.xlim([0, 500]))
ax.set_xlim([0,800])

# Positions at which ticks are shown (pyplot equivalent: plt.xticks([0, 500]))
ax.set_xticks(range(0,500,100))

# Tick labels. The tick positions must be pinned with set_yticks first;
# otherwise the three labels are attached to whichever automatic ticks
# happen to come first and the remaining ticks lose their labels.
ax.set_yticks([-20, 0, 20])
ax.set_yticklabels(['Jan', 'Feb', 'Mar'])

# Axis labels
ax.set_xlabel('Number')
ax.set_ylabel('Month')

# Title
ax.set_title('Example')

# Legend: add the remaining labelled lines, then build the legend once
# (pyplot equivalent: plt.legend())
ax.plot(np.random.randn(1000).cumsum(), label='line1')
ax.plot(np.random.randn(1000).cumsum(), label='line2')
ax.legend(loc='best')

<img style="float:left;" src="./img/dataview_seaborn_outline.png">

## <font size='7'>Seaborn</font>

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

* ## 数据集分布可视化

In [None]:
# Univariate distribution of 1000 normally distributed samples
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm the installed version
x1 = np.random.normal(size=1000)
sns.distplot(x1)

In [None]:
# Histogram only: 20 bins, KDE curve disabled, rug marks enabled
sns.distplot(x1, bins=20, kde=False, rug=True)

In [None]:
# 500 random integers from [0, 100), plotted with distplot's default settings
x2 = np.random.randint(0, 100, 500)
sns.distplot(x2)

In [None]:
# Kernel density estimate only: histogram disabled, rug marks enabled
sns.distplot(x2, hist=False, rug=True)

In [None]:
# The same picture built from the two dedicated functions: a shaded KDE plus a rug plot
# NOTE(review): newer seaborn releases spell `shade=True` as `fill=True` - confirm the installed version
sns.kdeplot(x2, shade=True)
sns.rugplot(x2)

In [None]:
# Fit a parametric distribution: histogram (KDE disabled) with a fitted gamma density
sns.distplot(x1, kde=False, fit=stats.gamma)

In [None]:
# Bivariate distributions: two 500-row frames with columns 'x' and 'y'
# df_obj1: both columns drawn with np.random.randn
df_obj1 = pd.DataFrame({'x':np.random.randn(500),
                       'y':np.random.randn(500)})

# df_obj2: 'x' drawn with np.random.randn, 'y' random integers from [0, 100)
df_obj2 = pd.DataFrame({'x':np.random.randn(500),
                       'y':np.random.randint(0, 100, 500)})

In [None]:
# Scatter plot of 'y' against 'x' using jointplot's default kind
sns.jointplot(x='x', y='y', data=df_obj1)

In [None]:
# Two-dimensional histogram drawn with hexagonal bins
sns.jointplot(x='x', y='y', data=df_obj1, kind='hex')

In [None]:
# Two-dimensional kernel density estimate
sns.jointplot(x='x', y='y', data=df_obj1, kind='kde')

In [None]:
# Visualize pairwise relationships between the variables of a dataset
# ('tips' is fetched through seaborn's sample-dataset loader)
dataset = sns.load_dataset('tips')
sns.pairplot(dataset )

<img style="float:left;" src="./img/dataview_seaborn_type.png">

## 类别数据可视化

In [None]:
# Other sample datasets that can be loaded the same way:
# titanic = sns.load_dataset('titanic')
# planets = sns.load_dataset('planets')
# flights = sns.load_dataset('flights')
# iris = sns.load_dataset('iris')
# The categorical plots below all use the 'exercise' dataset
exercise = sns.load_dataset('exercise')

* ### 类别散布图

In [None]:
# Strip plot: 'pulse' observations plotted per 'diet' category
sns.stripplot(x='diet', y='pulse', data=exercise)

In [None]:
# Swarm plot of the same columns, coloured by the 'kind' column
sns.swarmplot(x='diet', y='pulse', data=exercise, hue='kind')

* ### 类别内数据分布

In [None]:
# Box plot of 'pulse' for each 'diet' category
sns.boxplot(x='diet', y='pulse', data=exercise)
# Variant split by 'kind':
# sns.boxplot(x='diet', y='pulse', data=exercise, hue='kind')

In [None]:
# Violin plot of 'pulse' for each 'diet' category, split by 'kind'
# Variant without the split:
# sns.violinplot(x='diet', y='pulse', data=exercise)
sns.violinplot(x='diet', y='pulse', data=exercise, hue='kind')

* ### 类别内统计图

In [None]:
# Bar plot of 'pulse' per 'diet' category, grouped by 'kind'
sns.barplot(x='diet', y='pulse', data=exercise, hue='kind')

In [None]:
# Point plot of 'pulse' per 'diet' category, one series per 'kind'
sns.pointplot(x='diet', y='pulse', data=exercise, hue='kind')

<img style="display:block; margin:0 auto;" src="./img/dataview_boken_outine.png">

In [None]:
from bokeh.io import output_notebook,output_file,show
from bokeh.layouts import row
from bokeh.plotting import figure
import seaborn as sns

In [None]:
# Load the data
exercise = sns.load_dataset('exercise')
# Render Bokeh output inline in the notebook
output_notebook()
# Alternative: write the output to a standalone HTML file instead
# output_file('test.html')

* ## bokeh.charts

In [None]:
# Scatter plot of the 'pulse' column against the 'id' column, drawn with square markers
# NOTE(review): newer Bokeh releases use width/height instead of plot_width/plot_height - confirm the installed version
exercise = sns.load_dataset('exercise')
p = figure(title='exercise dataset', plot_width=600, plot_height=400, x_axis_label='id', y_axis_label='pulse')
p.scatter('id','pulse', marker="square", source=exercise)
show(p)

<img style="float:left;" src="./img/dataview_boken_plotting.png">

* ## bokeh.plotting

In [None]:
from bokeh.plotting import figure
import numpy as np

# 400x400 figure holding two glyph layers, each with 5 random integer points in [1, 10)
# NOTE(review): newer Bokeh releases use width/height instead of plot_width/plot_height - confirm the installed version
p = figure(plot_width = 400, plot_height = 400)
# Square glyphs
p.square(np.random.randint(1, 10, 5), np.random.randint(1, 10, 5), size=20, color="navy")

# Circle glyphs
p.circle(np.random.randint(1, 10, 5), np.random.randint(1, 10, 5), size=10, color="green")
show(p)

<img style="float:left;" src="./img/dataview_logistic.png">
<a id="reg"></a>

* ## Logistic Regression 模型的手工实现

In [None]:
# lr_tools.py

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

class LogisticRegression(object):
    """
    Binary logistic regression trained with L-BFGS-B.

    Parameters
    ----------
    c : float, default 1.
        Forwarded to ``cost_func`` as its ``C`` argument, where it scales
        the data-fit term of the objective.

    Attributes
    ----------
    _beta : ndarray of shape (n_features + 1,)
        Fitted coefficients, intercept first. Only available after ``fit``.
    """

    def __init__(self, c=1.):
        self.c = c

    def fit(self, X, y):
        """
        Train the model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        y : ndarray of shape (n_samples,) holding 0/1 labels

        Returns
        -------
        self
        """
        # L-BFGS-B works on a flat parameter vector, so start from a 1-D
        # zero vector (one slot per feature plus one for the intercept).
        initial_beta = np.zeros(X.shape[1] + 1)

        result = fmin_l_bfgs_b(cost_func,               # objective returning (value, gradient)
                               initial_beta,            # starting point
                               args=(X, y, self.c))     # extra arguments for the objective

        # result is (x_opt, f_opt, info); only the optimum is kept
        self._beta = result[0]
        return self

    def predict(self, X):
        """
        Predict class labels: the index (0 or 1) of the more probable class.
        """
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        """
        Predict class probabilities.

        Returns
        -------
        ndarray of shape (n_samples, 2)
            Column 0 is P(y=0), column 1 is P(y=1).
        """
        # Prepend a column of ones so the intercept is part of the dot product
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        XBeta = np.dot(X, self._beta).reshape((-1, 1))

        # sigmoid(z) = exp(-log(1 + exp(-z))); logaddexp keeps this finite
        # for large |z| where a direct np.exp(-z) would overflow.
        probs = np.exp(-np.logaddexp(0., -XBeta))
        return np.hstack((1 - probs, probs))


def cost_func(beta, X, y, C):
    """
    Objective for L2-regularized logistic regression.

    Computes
        f(beta) = C * sum_i [ log(1 + exp(x_i . beta)) - y_i * (x_i . beta) ]
                  + 0.5 * ||beta||^2
    together with its gradient, so that the value and the gradient handed
    to the optimizer describe the same function.

    Parameters
    ----------
    beta : array-like of shape (n_features + 1,)
        Coefficients, intercept first.
    X : ndarray of shape (n_samples, n_features)
        Feature matrix without the intercept column.
    y : array-like of shape (n_samples,)
        0/1 labels.
    C : float
        Weight of the negative log-likelihood term relative to the penalty.

    Returns
    -------
    (float, ndarray of shape (n_features + 1,))
        Objective value and gradient.
    """
    # Work on a flat vector so the "+ beta" below cannot broadcast into a matrix
    beta = np.asarray(beta, dtype=float).ravel()

    # Prepend a column of ones so the intercept is part of the dot product
    X = np.hstack((np.ones((X.shape[0], 1)), X))
    # Column vector, to broadcast against the rows of X
    y = np.asarray(y).reshape((-1, 1))

    XBeta = np.dot(X, beta).reshape((-1, 1))

    # log(1 + exp(z)) evaluated without overflowing for large z
    log1p_exp_XBeta = np.logaddexp(0., XBeta)

    # Regularized negative log-likelihood (penalty covers the intercept too,
    # matching the "+ beta" term of the gradient)
    neg_ll = C * np.sum(log1p_exp_XBeta - y * XBeta) + 0.5 * np.dot(beta, beta)

    # sigmoid(z) = exp(z - log(1 + exp(z))), also overflow-free
    sigmoid = np.exp(XBeta - log1p_exp_XBeta)

    # Gradient of the objective above
    grad_neg_ll = C * np.sum((sigmoid - y) * X, axis=0) + beta

    return float(neg_ll), grad_neg_ll

def cal_acc(true_labels, pred_labels):
    """
    Compute classification accuracy.

    Parameters
    ----------
    true_labels : sequence
        Ground-truth labels.
    pred_labels : sequence
        Predicted labels, same length as ``true_labels``.

    Returns
    -------
    float
        Fraction of positions where the two sequences agree.

    Raises
    ------
    ValueError
        If the sequences differ in length or are empty.
    """
    n_total = len(true_labels)
    # A longer prediction list used to be silently truncated; reject it instead
    if len(pred_labels) != n_total:
        raise ValueError(
            'true_labels and pred_labels must have the same length, got {} and {}'
            .format(n_total, len(pred_labels)))
    if n_total == 0:
        raise ValueError('cannot compute accuracy of empty label sequences')

    n_correct = sum(1 for truth, pred in zip(true_labels, pred_labels) if truth == pred)
    return n_correct / n_total

In [None]:
# main.py

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# from lr_tools import LogisticRegression, cal_acc

def main():
    """
    End-to-end demo: build a synthetic two-class dataset, train the
    hand-written LogisticRegression on 70% of it, and report the
    accuracy on the held-out 30%.
    """
    # Synthetic binary classification problem with a fixed seed
    features, labels = make_classification(n_samples=2000,
                                           n_features=100,
                                           n_classes=2,
                                           random_state=17)

    # Reproducible 70/30 train/test split
    split = train_test_split(features, labels, test_size=0.3, random_state=17)
    train_features, test_features, train_labels, test_labels = split

    classifier = LogisticRegression()
    classifier.fit(train_features, train_labels)

    predicted_labels = classifier.predict(test_features)

    print('真实值：', test_labels)
    print('预测值：', predicted_labels)
    accuracy = cal_acc(test_labels, predicted_labels)
    print('准确率：{:.2%}'.format(accuracy))


# Run the demo when executed as a script or as a notebook cell
if __name__ == '__main__':
    main()