# In [3]:
# 导入包
import seaborn as sns
import pandas as pd
import plotly.express as px
# Load the iris dataset via seaborn.
df = sns.load_dataset("iris")

# Scatter plot of petal length against sepal length, colored by species,
# with a box plot drawn in each margin.
species_axis_labels = {
    "sepal_length": "Sepal Length (cm)",
    "petal_length": "Petal length (cm)",
}
fig = px.scatter(
    df,
    x="sepal_length",
    y="petal_length",
    color="species",
    marginal_x="box",
    marginal_y="box",
    labels=species_axis_labels,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    template="plotly_white",
    width=600,
    height=500,
)
fig.show()


# Pairwise scatter-plot matrix over the four measurements plus species,
# colored by species; the diagonal panels are hidden.
matrix_dimensions = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "species",
]
fig = px.scatter_matrix(
    df,
    dimensions=matrix_dimensions,
    color="species",
    template="plotly_white",
    width=600,
    height=600,
)
fig.update_traces(diagonal_visible=False)
fig.show()

# Use sepal length * sepal width as a stand-in for the sepal area.
df['area'] = df['sepal_length'] * df['sepal_width']
# Split the samples by area into five quantile bins holding (roughly) the
# same number of samples each, labelled A (smallest) through E (largest).
df['Category'] = pd.qcut(df['area'], 5,
                         labels=['A', 'B', 'C', 'D', 'E'])
# Per-bin summary statistics: min, max, mean, median and standard deviation.
list_stats = ['min', 'max', 'mean', 'median', 'std']
# 'Category' is categorical, so state `observed` explicitly: recent pandas
# versions warn when a categorical grouper relies on the implicit default.
# observed=False keeps the behaviour this script has always had.
stats_by_area = df.groupby('Category', observed=False)['area'].agg(list_stats)
# Range of each bin: maximum - minimum.
stats_by_area['Range'] = stats_by_area['max'] - stats_by_area['min']
# Sample count per bin; the assignment aligns on the Category index.
# (Adding 'count' to list_stats would be an alternative.)
stats_by_area['Number'] = df['Category'].value_counts()
# Save the summary table as CSV in the current working directory.
stats_by_area.to_csv('stats_by_area.csv')

# Plotly Express orders categorical values by first appearance in the data
# unless told otherwise, so pin the bins to A..E here exactly as the later
# Category plots in this script do.
area_category_orders = {"Category": ["A", "B", "C", "D", "E"]}

# Scatter plot of sepal area per Category bin, colored by Category.
fig = px.scatter(
    df,
    x='area',
    y='Category',
    color='Category',
    category_orders=area_category_orders,
    template="plotly_white",
    width=600,
    height=300,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
)
fig.show()

# Box plot of sepal area per Category bin, with every sample drawn as a point.
fig = px.box(
    df,
    x='area',
    y='Category',
    color='Category',
    points="all",
    category_orders=area_category_orders,
    template="plotly_white",
    width=600,
    height=300,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
)
fig.show()

# Box plot of sepal length for each Category bin (bins forced to A..E order),
# with every sample drawn as a point.
pastel_palette = px.colors.qualitative.Pastel1
fig = px.box(
    df,
    x='sepal_length',
    y='Category',
    color='Category',
    points="all",
    category_orders={"Category": ["A", "B", "C", "D", "E"]},
    labels={"sepal_length": "Sepal Length (cm)"},
    color_discrete_sequence=pastel_palette,
    template="plotly_white",
    width=600,
    height=300,
)
fig.show()

# Scatter plot of petal length against sepal length, colored by Category,
# with a box plot drawn in each margin.
fig = px.scatter(
    df,
    x='sepal_length',
    y='petal_length',
    color='Category',
    marginal_x='box',
    marginal_y='box',
    labels={
        "sepal_length": "Sepal Length (cm)",
        "petal_length": "Petal length (cm)",
    },
    color_discrete_sequence=pastel_palette,
    template="plotly_white",
    width=600,
    height=500,
)
fig.show()

# Cross-tabulate the samples by Category (rows) and species (columns).
crosstab_axes = dict(index=df['Category'], columns=df['species'])
# Two-way table of raw counts.
freq_matrix = pd.crosstab(**crosstab_axes)
# Two-way table of joint proportions: each cell divided by the grand total.
prob_matrix = pd.crosstab(normalize='all', **crosstab_axes)

# Show each table as an annotated heatmap: counts as-is, proportions with
# three decimals.
for table, cell_format in ((freq_matrix, True), (prob_matrix, '.3f')):
    fig = px.imshow(table, text_auto=cell_format)
    fig.show()

# Options shared by both 2-D histogram heatmaps of species vs. Category:
# bins in A..E order and a marginal histogram on each axis.
heatmap_options = dict(
    x='species',
    y='Category',
    category_orders={"Category": ["A", "B", "C", "D", "E"]},
    marginal_x="histogram",
    marginal_y="histogram",
    width=400,
    height=500,
)

# Heatmap annotated with sample counts.
fig = px.density_heatmap(df, text_auto=True, **heatmap_options)
fig.show()

# Same heatmap normalised to probabilities, annotated with three decimals.
fig = px.density_heatmap(df, histnorm='probability', text_auto='.3f',
                         **heatmap_options)
fig.show()

# Grid of count heatmaps of sepal width against sepal length: one facet row
# per Category bin and one facet column per species.
# category_orders pins the facet rows to A..E; without it Plotly Express
# orders them by first appearance in the data, unlike the other Category
# plots in this script.
fig = px.density_heatmap(
    df,
    x="sepal_length",
    y="sepal_width",
    facet_row="Category",
    facet_col="species",
    category_orders={"Category": ["A", "B", "C", "D", "E"]},
    width=500,
    height=800,
    text_auto=True,
)
fig.show()


# Share of samples falling in each Category bin, as a two-column table.
category_share = df['Category'].value_counts(normalize=True)
ctg_percent = pd.DataFrame({
    'Category': category_share.index,
    'Percent': category_share.values,
})
category_display_order = {"Category": ["A", "B", "C", "D", "E"]}

# Horizontal bar chart of the shares; bar color encodes the share itself.
fig = px.bar(
    ctg_percent,
    x="Percent",
    y="Category",
    orientation='h',
    color="Percent",
    category_orders=category_display_order,
    text_auto='.3f',
)
fig.show()

# Donut chart (pie with a hole) of the same shares.
fig = px.pie(
    ctg_percent,
    names='Category',
    values='Percent',
    category_orders=category_display_order,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
)
fig.update_traces(hole=.68)
fig.show()

# Share of samples belonging to each species, as a two-column table.
species_share = df['species'].value_counts(normalize=True)
species_percent = pd.DataFrame({
    'species': species_share.index,
    'Ratio': species_share.values,
})
species_display_order = {"species": ["setosa", "versicolor", "virginica"]}

# Horizontal bar chart of the species shares, one color per species.
fig = px.bar(
    species_percent,
    x="Ratio",
    y="species",
    orientation='h',
    color="species",
    category_orders=species_display_order,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    text_auto='.3f',
)
fig.show()

# Donut chart (pie with a hole) of the same shares.
fig = px.pie(
    species_percent,
    names='species',
    values='Ratio',
    category_orders=species_display_order,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
)
fig.update_traces(hole=.68)
fig.show()

# Drill down the joint proportions with horizontal bar charts:
# first one bar per Category split by species, then (using the transposed
# table) one bar per species split by Category.
for drill_table in (prob_matrix, prob_matrix.T):
    fig = px.bar(
        drill_table,
        orientation='h',
        template="plotly_white",
        color_discrete_sequence=px.colors.qualitative.Pastel1,
        text_auto='.2f',
        width=600,
        height=300,
    )
    fig.show()

# Styling shared by both conditional-proportion bar charts below.
ratio_bar_style = dict(
    orientation='h',
    template="plotly_white",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    text_auto='.2f',
    width=600,
    height=300,
)

# Proportion of each species within every Category bin (each row sums to 1).
ratio_species_in_category = pd.crosstab(
    index=df['Category'],
    columns=df['species'],
    normalize='index',
)
fig = px.bar(ratio_species_in_category, **ratio_bar_style)
fig.show()

# Proportion of each Category bin within every species (each row sums to 1).
ratio_category_in_species = pd.crosstab(
    index=df['species'],
    columns=df['Category'],
    normalize='index',
)
fig = px.bar(ratio_category_in_species, **ratio_bar_style)
fig.show()


# Reshape the joint-proportion table to long form:
# one row per (Category, species) pair with its proportion in 'Ratio'.
prob_matrix_stacked = (
    prob_matrix.stack()
    .reset_index()
    .rename(columns={0: "Ratio"})
)
# Drop empty combinations. A zero-ratio leaf draws no sector, and because
# 'Ratio' is also the color, Plotly Express averages colors weighted by the
# values -- a zero weight can make that average undefined. This mirrors the
# zero-count filter applied to count_matrix further down in this script.
prob_matrix_stacked = prob_matrix_stacked[prob_matrix_stacked['Ratio'] != 0]

# Sunburst drill-down, first by species and then by Category.
fig = px.sunburst(
    prob_matrix_stacked,
    path=['species', 'Category'],
    values='Ratio',
    color='Ratio',
    width=600,
    height=600,
)
fig.show()

# Sunburst drill-down, first by Category and then by species.
fig = px.sunburst(
    prob_matrix_stacked,
    path=['Category', 'species'],
    values='Ratio',
    color='Ratio',
    width=600,
    height=600,
)
fig.show()

# Add a third drill-down dimension: sepal length in 1 cm wide bins.
# Bin labels "4 - 5 cm" ... "7 - 8 cm".
labels = [f"{low} - {low + 1} cm" for low in range(4, 8)]
# Left-closed bins [4, 5), [5, 6), [6, 7), [7, 8).
df["sepal_length_bins"] = pd.cut(
    df["sepal_length"],
    bins=range(4, 9),
    right=False,
    labels=labels,
)

# Number of samples per sepal-length bin, as a two-column table.
bin_tally = df["sepal_length_bins"].value_counts()
sepal_length_bins_counts = pd.DataFrame({
    'sepal_length_bins': bin_tally.index,
    'Count': bin_tally.values,
})

# Horizontal bar chart of the sample count per bin.
fig = px.bar(
    sepal_length_bins_counts,
    x='Count',
    y='sepal_length_bins',
    orientation='h',
    text_auto=True,
)
fig.show()

# Donut chart (pie with a hole) of the share of samples per bin.
fig = px.pie(
    sepal_length_bins_counts,
    names='sepal_length_bins',
    values='Count',
    color_discrete_sequence=px.colors.qualitative.Pastel1,
)
fig.update_traces(hole=.68)
fig.show()

# Proportion of all samples in each (species, Category, sepal_length_bins)
# combination.
dims = ['species', 'Category', 'sepal_length_bins']
# Use the built-in count instead of a Python-level lambda per group.
# observed=True keeps only combinations that occur in the data: two of the
# groupers are categorical, and empty combinations would only add leaves
# with no area to the sunburst. It also makes the result independent of
# pandas' implicit `observed` default.
prob_matrix_by_3 = (
    df.groupby(dims, observed=True)['sepal_length']
    .count()
    .div(len(df))
    .rename('Ratio')
    .reset_index()
)

# Sunburst drill-down in the order of `dims`: species first, then Category,
# then sepal-length bin.
fig = px.sunburst(
    prob_matrix_by_3,
    path=dims,
    values='Ratio',
    width=600,
    height=600,
)
fig.show()

# Count samples per (species, Category) row and sepal-length-bin column,
# then reshape to long form with the tally in a 'count' column.
count_matrix = pd.crosstab(
    index=[df['species'], df['Category']],
    columns=df['sepal_length_bins'],
    values=df['petal_length'],
    aggfunc='count',
)
count_matrix = (
    count_matrix.stack()
    .reset_index()
    .rename(columns={0: 'count'})
)
# Remove the rows whose 'count' is 0.
count_matrix = count_matrix.loc[count_matrix['count'] != 0]

# Icicle chart: all -> species -> Category -> sepal-length bin,
# sized and colored by the sample count.
icicle_levels = [px.Constant("all"),
                 'species', 'Category', 'sepal_length_bins']
fig = px.icicle(
    count_matrix,
    path=icicle_levels,
    values='count',
    color='count',
    color_continuous_scale='Blues',
    width=600,
    height=800,
)
fig.show()

# Treemap of the same hierarchy: all -> species -> Category ->
# sepal-length bin, sized and colored by the sample count.
treemap_levels = [px.Constant("all"),
                  'species', 'Category', 'sepal_length_bins']
fig = px.treemap(
    count_matrix,
    path=treemap_levels,
    values='count',
    color='count',
    color_continuous_scale='Blues',
    width=600,
    height=800,
)
fig.show()

# Mean petal length for each species.
species_groups = df.groupby(['species'])
petal_length_mean_by_species = species_groups['petal_length'].mean().reset_index()

# Horizontal bar chart of the per-species means, one color per species.
fig = px.bar(
    petal_length_mean_by_species,
    x='petal_length',
    y='species',
    orientation='h',
    color='species',
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    text_auto='.2f',
    template="plotly_white",
    width=600,
    height=300,
)
fig.show()

# Mean petal length for every (Category, species) combination, long form.
# 'Category' is categorical, so state `observed` explicitly: recent pandas
# versions warn when a categorical grouper relies on the implicit default.
# observed=False keeps the behaviour this script has always had.
petal_length_mean_by_species_ctgr = (
    df.groupby(['Category', 'species'], observed=False)['petal_length']
    .mean()
    .reset_index()
)
# Alternative computation of the same means as a cross table
# (rows: species, columns: Category). The result is bound to a name so it is
# not silently discarded when this runs as a script rather than a notebook.
petal_length_mean_crosstab = pd.crosstab(
    index=df.species,
    columns=df.Category,
    values=df.petal_length,
    aggfunc='mean',
)

# Grouped horizontal bars of the means: one group per species,
# one bar per Category.
fig = px.bar(
    petal_length_mean_by_species_ctgr,
    x='petal_length',
    y='species',
    color='Category',
    barmode='group',
    text_auto='.2f',
    orientation='h',
    width=600,
    height=600,
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    template="plotly_white",
)
fig.show()

# Mean petal length for each Category bin.
# 'Category' is categorical, so state `observed` explicitly: recent pandas
# versions warn when a categorical grouper relies on the implicit default.
# observed=False keeps the behaviour this script has always had.
petal_length_mean_by_ctgr = (
    df.groupby(['Category'], observed=False)['petal_length']
    .mean()
    .reset_index()
)

# Horizontal bar chart of the per-Category means, one color per Category.
fig = px.bar(
    petal_length_mean_by_ctgr,
    x='petal_length',
    y='Category',
    color='Category',
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    width=600,
    height=300,
    text_auto='.2f',
    orientation='h',
    template="plotly_white",
)
fig.show()

# Grouped horizontal bars of mean petal length: one group per Category,
# one bar per species.
grouped_bar_style = dict(
    barmode='group',
    orientation='h',
    text_auto='.2f',
    template="plotly_white",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    width=600,
    height=600,
)
fig = px.bar(
    petal_length_mean_by_species_ctgr,
    x='petal_length',
    y='Category',
    color='species',
    **grouped_bar_style,
)
fig.show()









































