In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Ex1：统计未出现的类别

In [2]:
def my_crosstab(s1, s2, dropna=True):
    """Build a frequency table of the value pairs of two Series.

    The two Series are paired positionally (like ``zip``), not aligned on
    their index.

    Parameters
    ----------
    s1 : pd.Series
        Values that become the row labels of the result.
    s2 : pd.Series
        Values that become the column labels of the result.
    dropna : bool, default True
        Only affects categorical inputs. If False, every declared category
        gets a row/column, even when it never occurs in the data. If True,
        only values that actually occur are used as labels.

    Returns
    -------
    pd.DataFrame
        Integer counts; the row axis is named ``s1.name`` and the column
        axis ``s2.name``. Labels follow the order of first appearance, or
        the category order when all categories are kept.

    Notes
    -----
    Pairs in which either value is missing are not counted and NaN never
    becomes a label, mirroring ``pd.crosstab``. Previously a categorical
    input containing NaN raised ``KeyError`` when ``dropna=False``.
    """
    def _labels(s):
        # Keep unobserved categories only when explicitly requested.
        if s.dtype.name == 'category' and not dropna:
            return s.cat.categories
        # NaN is excluded so that it never becomes a row/column label.
        return s.dropna().unique()

    idx1, idx2 = _labels(s1), _labels(s2)
    # Allocate integer counts directly instead of casting from float afterwards.
    res = pd.DataFrame(np.zeros((len(idx1), len(idx2)), dtype='int'), index=idx1, columns=idx2)
    for i, j in zip(s1, s2):
        if pd.isna(i) or pd.isna(j):
            continue  # a pair with a missing side has no cell in the table
        res.at[i, j] += 1
    return res.rename_axis(index=s1.name, columns=s2.name)
# Demo frame: column B becomes categorical with an extra 'sheep' category
# that never occurs in the data.
df = pd.DataFrame({
    'A': ['a', 'b', 'c', 'a'],
    'B': ['cat', 'cat', 'dog', 'cat'],
})
df['B'] = df['B'].astype('category').cat.add_categories('sheep')
# Default call (dropna=True); its result is not rendered because only the
# last expression of a cell is displayed.
my_crosstab(df['A'], df['B'])
# Keep every declared category, so the table gains an all-zero 'sheep' column.
my_crosstab(df['A'], df['B'], dropna=False)

B,cat,dog,sheep
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,0,0
b,1,0,0
c,0,1,0


### Ex2：钻石数据集
#### 1.

In [3]:
# Load the diamonds data and keep the 'cut' column in two forms:
# as read from the CSV, and converted to a categorical.
df = pd.read_csv('./data/diamonds.csv')
s_obj = df['cut']
s_cat = df['cut'].astype('category')

In [4]:
# Time nunique() on the cut column as loaded from the CSV (30 loops per run).
%timeit -n 30 s_obj.nunique()

2.58 ms ± 356 µs per loop (mean ± std. dev. of 7 runs, 30 loops each)


In [5]:
# Time nunique() on the categorical version of the cut column (30 loops per run).
%timeit -n 30 s_cat.nunique()

1.17 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 30 loops each)


#### 2.

In [6]:
# Turn cut and clarity into ordered categoricals whose order is the one
# listed here, then sort by cut descending and clarity ascending.
cut_order = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
clarity_order = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

df['cut'] = (
    df['cut']
    .astype('category')
    .cat.reorder_categories(cut_order, ordered=True)
)
df['clarity'] = (
    df['clarity']
    .astype('category')
    .cat.reorder_categories(clarity_order, ordered=True)
)

res = df.sort_values(by=['cut', 'clarity'], ascending=[False, True])
res.head(3)

Unnamed: 0,carat,cut,clarity,price
315,0.96,Ideal,I1,2801
535,0.96,Ideal,I1,2826
551,0.97,Ideal,I1,2830


In [7]:
# Last rows of the sorted frame: the other end of the cut/clarity ordering.
res.tail(3)

Unnamed: 0,carat,cut,clarity,price
47407,0.52,Fair,IF,1849
49683,0.52,Fair,IF,2144
50126,0.47,Fair,IF,2211


#### 3.

In [8]:
# Reverse the category order of both columns; the values themselves are unchanged.
for col in ('cut', 'clarity'):
    reversed_categories = df[col].cat.categories[::-1]
    df[col] = df[col].cat.reorder_categories(reversed_categories)

In [9]:
# Replace each cut value by the integer position of its category.
# NOTE: this overwrites the categorical column, so the cell cannot be re-run
# without first rebuilding df.cut as a categorical.
df.cut = df.cut.cat.codes # Method 1: use cat.codes

In [10]:
# Method 2: map every clarity category to its position (0 .. n-1) in the
# current category order.
# Series.replace on a categorical column is deprecated (pandas 2.2) and will
# stop renaming categories; cat.rename_categories is the supported way to
# relabel them and yields the same result: a categorical whose categories are
# the integers 0 .. n-1, with the ordered flag preserved.
clarity_cat = df.clarity.cat.categories
clarity_codes = dict(zip(clarity_cat, np.arange(len(clarity_cat))))
df.clarity = df.clarity.cat.rename_categories(clarity_codes)

In [11]:
# Check that cut and clarity now show integers instead of labels.
df.head(3)

Unnamed: 0,carat,cut,clarity,price
0,0.23,0,6,326
1,0.21,1,5,326
2,0.23,3,3,327


#### 4.

In [12]:
# Bin the price per carat in two ways, using the same five labels:
#   avg_cut  - fixed break points (`point`)
#   avg_qcut - quantile break points (`q`, i.e. five equally populated bins)
q = [0, 0.2, 0.4, 0.6, 0.8, 1]
# np.inf instead of np.infty: the `infty` alias was removed in NumPy 2.0.
point = [-np.inf, 1000, 3500, 5500, 18000, np.inf]
avg_labels = ['Very Low', 'Low', 'Mid', 'High', 'Very High']
avg = df.price / df.carat
df['avg_cut'] = pd.cut(avg, bins=point, labels=avg_labels)
df['avg_qcut'] = pd.qcut(avg, q=q, labels=avg_labels)
df.head()

Unnamed: 0,carat,cut,clarity,price,avg_cut,avg_qcut
0,0.23,0,6,326,Low,Very Low
1,0.21,1,5,326,Low,Very Low
2,0.23,3,3,327,Low,Very Low
3,0.29,1,4,334,Low,Very Low
4,0.31,3,6,335,Low,Very Low


#### 5.

In [13]:
# Labels that actually occur in avg_cut.
df.avg_cut.unique()

[Low, Mid, High]
Categories (3, object): [Low < Mid < High]

In [14]:
# All declared categories of avg_cut, for comparison with the observed ones.
df.avg_cut.cat.categories

Index(['Very Low', 'Low', 'Mid', 'High', 'Very High'], dtype='object')

In [15]:
# Drop the categories that never occur in avg_cut.
# remove_unused_categories() derives them from the data instead of relying on
# a hand-written list, so it cannot turn observed values into NaN if the data
# or the bin edges change.
df['avg_cut'] = df['avg_cut'].cat.remove_unused_categories()
df['avg_cut'].head(3)

0    Low
1    Low
2    Low
Name: avg_cut, dtype: category
Categories (3, object): [Low < Mid < High]

#### 6.

In [16]:
# qcut without labels returns the bins themselves; wrapping them in an
# IntervalIndex exposes the endpoints and length of each row's bin.
interval_avg = pd.IntervalIndex(pd.qcut(avg, q=q))
# Right endpoint of each row's bin, re-indexed 0..n-1 for display.
interval_avg.right.to_series().reset_index(drop=True).head(3)

0    2295.0
1    2295.0
2    2295.0
dtype: float64

In [17]:
# Left endpoint of each row's bin, re-indexed 0..n-1 for display.
interval_avg.left.to_series().reset_index(drop=True).head(3)

0    1051.162
1    1051.162
2    1051.162
dtype: float64

In [18]:
# Length of each row's bin, re-indexed 0..n-1 for display.
interval_avg.length.to_series().reset_index(drop=True).head(3)

0    1243.838
1    1243.838
2    1243.838
dtype: float64