In [1]:
import altair as alt
import numpy as np
import pandas as pd

In [2]:
iris = pd.read_csv('iris.csv')
iris.info()
iris.head(5)
iris.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SepalLength  150 non-null    float64
 1   SepalWidth   150 non-null    float64
 2   PetalLength  150 non-null    float64
 3   PetalWidth   150 non-null    float64
 4   Species      150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [3]:
alt.Chart(iris).mark_point(filled=True,size=50).encode(
    alt.X('SepalLength:Q', scale=alt.Scale(domain= [4,8]), axis=alt.Axis(title='Sepal Length')),
    alt.Y('SepalWidth:Q', scale=alt.Scale(domain= [1.5,5]), axis=alt.Axis(title='Sepal Width')),
    color='Species:N',
    tooltip=['Species', 'SepalLength', 'SepalWidth']
).interactive()

# Design Decision 1: 数据分布展示方法选择

密度图 vs 数据分布图 展示分布情况。密度图无条目记录，但是更直观的展现了每个参数的中位数。

In [4]:
violinplot = alt.Chart(iris).transform_density(
    'SepalLength',
    as_=['SepalLength', 'density'],
    groupby=['Species']
).mark_area(orient='horizontal').encode(
    y = 'SepalLength:Q',
    color='Species:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=True),
    ),
    column=alt.Column(
        'Species:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=0,
        ),
    )
).properties(
    width=100
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
)

violinplot

# Design Decision 2: 筛选与不筛选

For locating information。
筛选掉糅合项，能更便于观测某一品种的分布。需要观测所有属性的分布也可以在集合中把属性添加，比没有这个设计方案更便捷。

In [5]:
first = alt.Chart().mark_point(filled=True,size=20).encode(
    alt.X(alt.repeat('column'), type='quantitative'),
    alt.Y(alt.repeat('row'), type='quantitative'),
    color='Species:N',
).properties(
    width=150,
    height=150
).repeat(
    data = iris,
    row=['SepalLength'],
    column=['SepalWidth', 'PetalLength', 'PetalWidth']
)
#--------------------------------------------------------------------
second = alt.Chart().mark_point(filled=True,size=29).encode(
    alt.X(alt.repeat('column'), type='quantitative'),
    alt.Y(alt.repeat('row'), type='quantitative'),
    color='Species:N',
).properties(
    width=150,
    height=150
).repeat(
    data = iris,
    row=['SepalWidth'],
    column=['PetalLength', 'PetalWidth']
)
#--------------------------------------------------------------------
third = alt.Chart().mark_point(filled=True,size=20).encode(
    alt.X(alt.repeat('column'), type='quantitative'),
    alt.Y(alt.repeat('row'), type='quantitative'),
    color='Species:N',
).properties(
    width=150,
    height=150
).repeat(
    data = iris,
    row=['PetalLength'],
    column=['PetalWidth']
)
#--------------------------------------------------------------------
alt.vconcat(first,second,third)

# Design Decision 3: 

In [6]:
alt.Chart(iris).mark_boxplot().encode(
    y='PetalWidth:Q',
    # x= 'Species:N',
    # y='count(PetalLength):Q',
    # color='Species:N',
).properties(width=200, height=200)

In [7]:
line1 = alt.Chart(iris).mark_bar(interpolate='monotone').encode(
    x='PetalLength:Q', 
    y='count(PetalLength):Q',
    color='Species:N',
    # facet=alt.Facet('Species:N', title=None)
).properties(width=180, height=150)

line2 = alt.Chart(iris).mark_line(interpolate='monotone').encode(
    x='PetalWidth:Q',
    y='count(PetalWidth):Q',
    color='Species:N',
    # facet=alt.Facet('Species:N', title=None)
).properties(width=180, height=150)

line3 = alt.Chart(iris).mark_area(interpolate='monotone').encode(
    x='SepalLength:Q',
    y='count(SepalLength):Q',
    color='Species:N',
    # facet=alt.Facet('Species:N', title=None)
).properties(width=180, height=150)

line4 = alt.Chart(iris).mark_area(interpolate='monotone').encode(
    x='SepalWidth:Q',
    y='count(SepalWidth):Q',
    color='Species:N',
    # facet=alt.Facet('Species:N', title=None)
).properties(width=180, height=150)

alt.hconcat(line1,line2,line3,line4)

In [8]:
alt.Chart(iris).transform_fold(
    ['PetalWidth',
     'PetalLength',
     'SepalWidth',
     'SepalLength'],
    as_ = ['Measurement_type', 'value']
).transform_density(
    density='value',
    bandwidth=0.3,
    groupby=['Measurement_type'],
    extent= [0, 8],
    counts = True,
    steps=200
).mark_area().encode(
    alt.X('value:Q'),
    alt.Y('density:Q', stack='zero'),
    alt.Color('Measurement_type:N')
).properties(width=400, height=100)

In [9]:
alt.Chart(iris).mark_area(interpolate='monotone').encode(
    x='SepalLength:Q',
    y='count(SepalLength):Q',
    color='Species:N',
    facet=alt.Facet('Species:N', title=None)
).transform_filter(
    alt.FieldRangePredicate(field='SepalLength', range=[0, 5.5])
).properties(width=200, height=150)

In [10]:
'Setosa': [iris[iris['Species']=='Iris-setosa']],

SyntaxError: invalid syntax (<ipython-input-10-a948afef8674>, line 1)