In [3]:
import numpy as np
import pandas as pd

Exercise 1: Multi-Level Grouping and Missing-Data Logic

In [4]:
df = pd.DataFrame({ 
    "key1": ["a", "a", None, "b", "b", "a", None], 
    "key2": pd.Series([1,2,1,2,1,None,1], dtype="Int64"), 
    "data1": np.random.standard_normal(7), 
    "data2": np.random.standard_normal(7) 
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,0.205031,-1.971633
1,a,2.0,0.795969,0.140268
2,,1.0,2.057874,-0.742157
3,b,2.0,0.417936,1.267609
4,b,1.0,-0.756648,-0.519717
5,a,,1.937925,1.289718
6,,1.0,-0.764539,-1.394474


1. Create a version of the DataFrame where all missing values in key1 and key2 are 
replaced by the string "missing", and then compute the group mean of data1 for all 
(key1,key2) pairs.

In [5]:
#fill missing for key1/key2
df_filled = df.assign( #use assign to add columns 
    key1=df['key1'].fillna('missing'),
    key2 = df['key2'].astype('string').fillna('missing')
)
df_filled

Unnamed: 0,key1,key2,data1,data2
0,a,1,0.205031,-1.971633
1,a,2,0.795969,0.140268
2,missing,1,2.057874,-0.742157
3,b,2,0.417936,1.267609
4,b,1,-0.756648,-0.519717
5,a,missing,1.937925,1.289718
6,missing,1,-0.764539,-1.394474


In [6]:
#compute the group mean of data1 for all (key1,key2) pairs.
group_mean = df_filled.groupby(['key1', 'key2'])['data1'].mean()
group_mean

key1     key2   
a        1          0.205031
         2          0.795969
         missing    1.937925
b        1         -0.756648
         2          0.417936
missing  1          0.646668
Name: data1, dtype: float64

In [7]:
group_mean_df = group_mean.reset_index(name = 'data1_mean')
group_mean_df

Unnamed: 0,key1,key2,data1_mean
0,a,1,0.205031
1,a,2,0.795969
2,a,missing,1.937925
3,b,1,-0.756648
4,b,2,0.417936
5,missing,1,0.646668


2. For every (key1,key2) group, compute all of the following in one operation: 

In [8]:
#create function
def iqr(s):
    return s.quantile(0.75) - s.quantile(0.25)

def mad_mean(s):
    m = s.mean()
    return (s-m).abs().mean()


stat = (
    df_filled.groupby(['key1', 'key2']).agg(
        n_row = ('data1', 'size'),                 #so dong trong group
        n_data1_nonmissing = ('data1', 'count'),   #so data1 ko missing
        data1_iqr = ('data1', iqr),                #iqr(data1)
        data1_mad = ('data2', mad_mean),           #MAD(data2) theo mean abs dev    
    )
)
stat

Unnamed: 0_level_0,Unnamed: 1_level_0,n_row,n_data1_nonmissing,data1_iqr,data1_mad
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,1,1,1,0.0,0.0
a,2,1,1,0.0,0.0
a,missing,1,1,0.0,0.0
b,1,1,1,0.0,0.0
b,2,1,1,0.0,0.0
missing,1,2,2,1.411206,0.326158


3. Identify the groups (by key1 and key2) whose mean of data1 is above the global 
mean of data1 in the entire DataFrame.

In [9]:
#tu cau 1
group_mean = df_filled['data1'].mean()   #df_filled dung ban dau , compute mean of the data1 on the df_filled dataframe

mean_data1 = df_filled.groupby(['key1', 'key2'], as_index = False)['data1'].mean()

group_above_global = mean_data1[mean_data1['data1'] > group_mean]
group_above_global

Unnamed: 0,key1,key2,data1
1,a,2,0.795969
2,a,missing,1.937925
5,missing,1,0.646668


4. Using the arrays:

In [10]:
df_filled_1 = df_filled.assign(states = np.array(["OH","CA","CA","OH","OH","CA","OH"]), 
    years = [2005,2005,2006,2005,2006,2005,2006] 
)
df_filled_1

Unnamed: 0,key1,key2,data1,data2,states,years
0,a,1,0.205031,-1.971633,OH,2005
1,a,2,0.795969,0.140268,CA,2005
2,missing,1,2.057874,-0.742157,CA,2006
3,b,2,0.417936,1.267609,OH,2005
4,b,1,-0.756648,-0.519717,OH,2006
5,a,missing,1.937925,1.289718,CA,2005
6,missing,1,-0.764539,-1.394474,OH,2006


Compute the groupwise coefficient of variation of data1 (std/mean) for each (state,year) 
pair, and return only the pairs whose coefficient is greater than 0.5.

In [11]:
group_wise = df_filled_1.groupby(['states', 'years'])['data1']
group_wise

cv = group_wise.std() / group_wise.mean()     #coefficient of variance of  the data1
cv

pairs_cv = cv[cv > 0.5]
pairs_cv.reset_index  #them or ko them 'reset_index'

<bound method Series.reset_index of states  years
CA      2005     0.590721
Name: data1, dtype: float64>

Exercise 2: Complex Aggregations With Filtering 

In [12]:
#dataframe
df_filled 

Unnamed: 0,key1,key2,data1,data2
0,a,1,0.205031,-1.971633
1,a,2,0.795969,0.140268
2,missing,1,2.057874,-0.742157
3,b,2,0.417936,1.267609
4,b,1,-0.756648,-0.519717
5,a,missing,1.937925,1.289718
6,missing,1,-0.764539,-1.394474


1. For each unique value in key1, compute:

In [13]:
#The median of the data1
data1_median = df_filled.groupby(['key1'])['data1'].median()
data1_median

key1
a          0.795969
b         -0.169356
missing    0.646668
Name: data1, dtype: float64

In [14]:
#the sum of absolute values of data2
data2_abs_sum = df_filled.groupby(['key1'])['data2'].apply(lambda s: s.abs().sum())
data2_abs_sum

key1
a          3.401619
b          1.787326
missing    2.136631
Name: data2, dtype: float64

In [15]:
# the proportion of rows where data1 exceeds its group median.
prop_data1_gt_median = (
    (df_filled["data1"] > df_filled.groupby("key1")["data1"].transform("median"))
    .groupby(df_filled["key1"])
    .mean()
)
prop_data1_gt_median

key1
a          0.333333
b          0.500000
missing    0.500000
Name: data1, dtype: float64

2. Select only those groups where more than half of the rows have data1 above the group mean. Return a DataFrame containing only those qualifying groups. 

In [16]:
qualify_grp = df_filled.groupby(['key1'], group_keys = False).filter(
    lambda g: (g["data1"] > g["data1"].mean()).mean() > 0.5
)
qualify_grp

# ?

Unnamed: 0,key1,key2,data1,data2


3. For each key1, compute the 3 smallest values of data1, but exclude any values that 
fall below the 10th percentile of data1 within that group.

In [18]:
df_filled

Unnamed: 0,key1,key2,data1,data2
0,a,1,0.205031,-1.971633
1,a,2,0.795969,0.140268
2,missing,1,2.057874,-0.742157
3,b,2,0.417936,1.267609
4,b,1,-0.756648,-0.519717
5,a,missing,1.937925,1.289718
6,missing,1,-0.764539,-1.394474


In [19]:
#  Ngưỡng p10 theo từng key1, rồi lọc
p10 = df_filled.groupby("key1")["data1"].transform(lambda s: s.quantile(0.10))
df_ok = df_filled[df_filled["data1"] >= p10]
df_ok

Unnamed: 0,key1,key2,data1,data2
1,a,2,0.795969,0.140268
2,missing,1,2.057874,-0.742157
3,b,2,0.417936,1.267609
5,a,missing,1.937925,1.289718


In [20]:
smallest3 = (
    df_ok.groupby('key1')['data1']
    .apply(lambda s : s.nsmallest(3)).reset_index(name = 'data1_smallest')
)
smallest3

Unnamed: 0,key1,level_1,data1_smallest
0,a,1,0.795969
1,a,5,1.937925
2,b,3,0.417936
3,missing,2,2.057874


4.  For each (key1,key2) group, compute both: 
o weighted mean of data1 using weights = abs(data2) 
o weighted standard deviation using the same weights.

In [21]:
def weighted_mean(x, w):
    x = np.asarray(x)
    w = np.asarray(w)
    return (w * x).sum() / w.sum()

def weighted_std(x, w):
    # weighted population std: sqrt( sum(w*(x-mu)^2) / sum(w) )
    mu = weighted_mean(x, w)
    return np.sqrt((w * (x - mu) ** 2).sum() / w.sum())

def w_stats(g):
    # an toàn nếu có missing ở data1/data2
    tmp = g[["data1", "data2"]].dropna()
    w = tmp["data2"].abs().to_numpy()
    x = tmp["data1"].to_numpy()
    return pd.Series({
        "wmean_data1": weighted_mean(x, w),
        "wstd_data1": weighted_std(x, w),
    })

result = (
    df_filled.groupby(["key1", "key2"])
    .apply(w_stats)
    .reset_index()
)

result

  .apply(w_stats)


Unnamed: 0,key1,key2,wmean_data1,wstd_data1
0,a,1,0.205031,0.0
1,a,2,0.795969,0.0
2,a,missing,1.937925,0.0
3,b,1,-0.756648,0.0
4,b,2,0.417936,0.0
5,missing,1,0.215824,1.343829


Exercise 3: Advanced Apply Functions

1. Write a function that, given any group, returns a DataFrame containing:

In [22]:
def select_rows(g: pd.DataFrame) -> pd.DataFrame:
    #  row có data1 lớn nhất
    idx_max_data1 = g["data1"].idxmax()

    #  row có data2 nhỏ nhất
    idx_min_data2 = g["data2"].idxmin()

    # row maximize data1 / (|data2| + 1)
    ratio = g["data1"] / (g["data2"].abs() + 1)
    idx_max_ratio = ratio.idxmax()

    out = g.loc[[idx_max_data1, idx_min_data2, idx_max_ratio]].copy()
    out["picked_by"] = ["largest_data1", "smallest_data2", "max_ratio"]
    out["ratio"] = ratio.loc[out.index].to_numpy()   # tuỳ chọn: để kiểm tra
    return out

# (Tuỳ chọn) nếu muốn key1 bị thiếu thành 1 group riêng

result = df_filled.groupby("key1", group_keys=True).apply(select_rows)
result

  result = df_filled.groupby("key1", group_keys=True).apply(select_rows)


Unnamed: 0_level_0,Unnamed: 1_level_0,key1,key2,data1,data2,picked_by,ratio
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,5,a,missing,1.937925,1.289718,largest_data1,0.84636
a,0,a,1,0.205031,-1.971633,smallest_data2,0.068996
a,5,a,missing,1.937925,1.289718,max_ratio,0.84636
b,3,b,2,0.417936,1.267609,largest_data1,0.184307
b,4,b,1,-0.756648,-0.519717,smallest_data2,-0.497887
b,3,b,2,0.417936,1.267609,max_ratio,0.184307
missing,2,missing,1,2.057874,-0.742157,largest_data1,1.181222
missing,6,missing,1,-0.764539,-1.394474,smallest_data2,-0.319293
missing,2,missing,1,2.057874,-0.742157,max_ratio,1.181222


2. Write a function that splits each group into quartiles based on data1 and returns the 
top 25% of rows per group. Apply to (key1,key2) groups.

In [23]:
def top_25pct(g: pd.DataFrame) -> pd.DataFrame:
    # Nếu group đủ lớn và đủ giá trị khác nhau thì dùng qcut để chia đúng 4 quartiles
    if len(g) >= 4 and g["data1"].nunique() >= 4:
        q = pd.qcut(g["data1"], 4, labels=False, duplicates="drop")
        return g[q == q.max()]  # quartile cao nhất
    # Fallback an toàn cho group nhỏ/ít unique: dùng ngưỡng percentile 75%
    thr = g["data1"].quantile(0.75)
    return g[g["data1"] >= thr]

top_rows = df_filled.groupby(["key1", "key2"], group_keys=False).apply(top_25pct)
top_rows


  top_rows = df_filled.groupby(["key1", "key2"], group_keys=False).apply(top_25pct)


Unnamed: 0,key1,key2,data1,data2
0,a,1,0.205031,-1.971633
1,a,2,0.795969,0.140268
5,a,missing,1.937925,1.289718
4,b,1,-0.756648,-0.519717
3,b,2,0.417936,1.267609
2,missing,1,2.057874,-0.742157


3. Write a function that  performs  group-specific  normalization for both data1 and 
data2  simultaneously  and  returns  only  rows  whose  Mahalanobis  distance 
(computed within each group) is above the group’s 90th percentile.

In [24]:
def filter_mahal_q90(g: pd.DataFrame) -> pd.DataFrame:
    X = g[["data1", "data2"]].to_numpy(dtype=float)

    # group-specific normalization (z-score theo từng group)
    mu = np.nanmean(X, axis=0)
    sigma = np.nanstd(X, axis=0, ddof=0)
    sigma[sigma == 0] = np.nan  # tránh chia 0

    Z = (X - mu) / sigma

    # chỉ tính distance cho các row có đủ 2 biến
    ok = np.isfinite(Z).all(axis=1)
    Z = Z[ok]
    idx = g.index[ok]

    if len(Z) < 2:
        return g.iloc[0:0].copy()  # group quá nhỏ -> không đủ để tính covariance ổn định

    # covariance trong không gian đã chuẩn hoá (ddof=0)
    S = np.cov(Z, rowvar=False, ddof=0)
    S_inv = np.linalg.pinv(S)  # pinv để xử lý covariance singular/near-singular

    d2 = np.einsum("ij,jk,ik->i", Z, S_inv, Z)  # squared Mahalanobis
    d = np.sqrt(d2)

    thr = np.quantile(d, 0.90)
    keep = d > thr  # “above the 90th percentile” đúng như đề bài

    out = g.loc[idx[keep]].copy()
    out["mahal_dist"] = d[keep]
    out["mahal_thr_q90"] = thr
    return out

result = df_filled.groupby(["key1", "key2"], group_keys=False).apply(filter_mahal_q90)
result

  result = df_filled.groupby(["key1", "key2"], group_keys=False).apply(filter_mahal_q90)


Unnamed: 0,key1,key2,data1,data2,mahal_dist,mahal_thr_q90


4. Apply a function to each group that: 
o sorts the group by data2, 
o divides it into 3 equal-sized chunks, 
o computes the median of data1 for each chunk, 
o returns the three medians as a Series.

In [25]:
def medians_by_sorted_chunks(g: pd.DataFrame, n_chunks: int = 3) -> pd.Series:
    g = g.sort_values("data2", kind="mergesort")  # sort trong group theo data2

    # chia gần-equal-size theo vị trí sau khi sort: 0..n_chunks-1
    # (nếu group nhỏ/khó chia thì vẫn chạy được với cut)
    chunk_id = pd.cut(
        np.arange(len(g)),
        bins=n_chunks,
        labels=False,
        include_lowest=True,
    )

    meds = g.groupby(chunk_id)["data1"].median()

    # đảm bảo trả đúng 3 median theo thứ tự chunk1..chunk3 (thiếu -> NaN)
    meds = meds.reindex(range(n_chunks))
    meds.index = [f"chunk{i+1}_median" for i in range(n_chunks)]
    return meds

result = df_filled.groupby("key1").apply(medians_by_sorted_chunks)
result

  result = df_filled.groupby("key1").apply(medians_by_sorted_chunks)


data1,chunk1_median,chunk2_median,chunk3_median
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0.205031,0.795969,1.937925
b,-0.756648,,0.417936
missing,-0.764539,,2.057874


Exercise 4: Transform and Multi-Step Group Derivations

In [27]:
df2 = pd.DataFrame({ 
    "key": ["a","b","c"] * 4, 
    "value": np.arange(12.) 
    }
) 
df2

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


1. Create a new column containing the Z-score of each value within its group, but 
compute  the  mean  and  std  manually  using  transform  rather  than  any  built-in 
normalization.

In [28]:
# mean và std tính thủ công bằng transform
mu = df2.groupby("key")["value"].transform("mean")
sigma = df2.groupby("key")["value"].transform(lambda s: s.std(ddof=0))  # ddof=0 (population std)

df2["zscore"] = (df2["value"] - mu) / sigma
df2

Unnamed: 0,key,value,zscore
0,a,0.0,-1.341641
1,b,1.0,-1.341641
2,c,2.0,-1.341641
3,a,3.0,-0.447214
4,b,4.0,-0.447214
5,c,5.0,-0.447214
6,a,6.0,0.447214
7,b,7.0,0.447214
8,c,8.0,0.447214
9,a,9.0,1.341641


2. Create another column containing the rank percentile of each value inside its group 
(0 = smallest, 1 = largest).

In [29]:
# n phần tử trong mỗi group
n = df2.groupby("key")["value"].transform("size")

# rank trong group (1..n), rồi đưa về 0..1
df2["rank_pct_0_1"] = (df2.groupby("key")["value"].rank(method="average") - 1) / (n - 1)
df2


Unnamed: 0,key,value,zscore,rank_pct_0_1
0,a,0.0,-1.341641,0.0
1,b,1.0,-1.341641,0.0
2,c,2.0,-1.341641,0.0
3,a,3.0,-0.447214,0.333333
4,b,4.0,-0.447214,0.333333
5,c,5.0,-0.447214,0.333333
6,a,6.0,0.447214,0.666667
7,b,7.0,0.447214,0.666667
8,c,8.0,0.447214,0.666667
9,a,9.0,1.341641,1.0


3. Add a column that indicates whether each value is within the middle 50% range 
(between 25th and 75th percentiles) of its group.

In [30]:
q25 = df2.groupby(['key'])['value'].transform(lambda s : s.quantile(0.25))

q75 = df2.groupby("key")["value"].transform(lambda s: s.quantile(0.75))

df2['in_middle_50'] = df2['value'].between(q25,q75, inclusive= 'both')
df2

Unnamed: 0,key,value,zscore,rank_pct_0_1,in_middle_50
0,a,0.0,-1.341641,0.0,False
1,b,1.0,-1.341641,0.0,False
2,c,2.0,-1.341641,0.0,False
3,a,3.0,-0.447214,0.333333,True
4,b,4.0,-0.447214,0.333333,True
5,c,5.0,-0.447214,0.333333,True
6,a,6.0,0.447214,0.666667,True
7,b,7.0,0.447214,0.666667,True
8,c,8.0,0.447214,0.666667,True
9,a,9.0,1.341641,1.0,False


4. Using only vectorized operations and group transforms, compute a group-specific 
linear transformation mapping the minimum of each group to 0 and the maximum 
to 1.

In [31]:
gmin = df2.groupby("key")["value"].transform("min")
gmax = df2.groupby("key")["value"].transform("max")

denom = gmax - gmin

# xử lý trường hợp group có max == min (tránh chia 0)
df2["scaled_0_1"] = np.where(denom.eq(0), 0.0, (df2["value"] - gmin) / denom)

df2


Unnamed: 0,key,value,zscore,rank_pct_0_1,in_middle_50,scaled_0_1
0,a,0.0,-1.341641,0.0,False,0.0
1,b,1.0,-1.341641,0.0,False,0.0
2,c,2.0,-1.341641,0.0,False,0.0
3,a,3.0,-0.447214,0.333333,True,0.333333
4,b,4.0,-0.447214,0.333333,True,0.333333
5,c,5.0,-0.447214,0.333333,True,0.333333
6,a,6.0,0.447214,0.666667,True,0.666667
7,b,7.0,0.447214,0.666667,True,0.666667
8,c,8.0,0.447214,0.666667,True,0.666667
9,a,9.0,1.341641,1.0,False,1.0


5. Reconstruct each group’s values so that they become symmetric around zero (i.e., 
transformed values sum to 0 within each group).

In [32]:
s = df2.groupby("key")["value"].transform("sum")
n = df2.groupby("key")["value"].transform("size")

df2["value_sym0"] = df2["value"] - (s / n)

# kiểm tra: tổng trong từng group = 0
df2.groupby("key")["value_sym0"].sum()


key
a    0.0
b    0.0
c    0.0
Name: value_sym0, dtype: float64

Exercise 5: High-Dimensional Pivot-Style Summaries 

Create or use any DataFrame with: 
- at least two categorical columns, 
- at least three numerical columns, 
- at least 200 rows.

In [34]:
rng = np.random.default_rng(42)
n = 240  # >= 200

df3 = pd.DataFrame({
    "region": rng.choice(["North","South","East","West"], size=n),
    "channel": rng.choice(["Retail","Online","Wholesale"], size=n),
    # sales > 0 để geometric mean hợp lệ
    "sales": rng.lognormal(mean=2.0, sigma=0.6, size=n),
    "cost": rng.normal(loc=50, scale=10, size=n),
    "units": rng.integers(1, 200, size=n),
})
df3

Unnamed: 0,region,channel,sales,cost,units
0,North,Retail,4.770970,56.191711,23
1,West,Online,4.990380,46.607415,102
2,East,Retail,2.037302,60.638515,55
3,South,Online,6.701963,38.580618,98
4,South,Online,3.906147,50.063391,6
...,...,...,...,...,...
235,East,Wholesale,4.148494,56.474847,146
236,East,Wholesale,9.103192,49.863532,107
237,South,Retail,5.787805,57.016636,156
238,South,Online,6.230033,39.649219,190


1. Compute a pivot-like table where rows are grouped by one categorical variable, 
columns by another, and each cell contains the geometric  mean of a numerical 
variable (use logs to compute safely).

In [35]:
# Geometric mean: exp(mean(log(x)))  (x phải > 0)
df3["log_sales"] = np.log(df3["sales"])

log_pivot = df3.pivot_table(
    index="region",
    columns="channel",
    values="log_sales",
    aggfunc="mean"
)

gmean_pivot = np.exp(log_pivot)   # back-transform
gmean_pivot

channel,Online,Retail,Wholesale
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,8.87318,6.760989,7.051668
North,7.361969,6.763698,8.156978
South,7.392916,7.906951,7.916565
West,6.42718,8.723991,5.908993


2. Extend the table to aggregate three numerical variables simultaneously using a 
custom aggregation for each variable

In [36]:
def geo_mean_pos(s):
    s = s.dropna()
    return float(np.exp(np.log(s).mean()))  # geometric mean (log-safe, cần s > 0)

def p90(s):
    return float(s.quantile(0.90))

agg_long = (
    df3.groupby(["region", "channel"], as_index=False)
      .agg(
          sales_gmean=("sales", geo_mean_pos),   # custom 1: geometric mean
          cost_median=("cost", "median"),        # custom 2: median
          units_p90=("units", p90),              # custom 3: 90th percentile
      )
)

pivot3 = agg_long.pivot(index="region", columns="channel")
pivot3

Unnamed: 0_level_0,sales_gmean,sales_gmean,sales_gmean,cost_median,cost_median,cost_median,units_p90,units_p90,units_p90
channel,Online,Retail,Wholesale,Online,Retail,Wholesale,Online,Retail,Wholesale
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
East,8.87318,6.760989,7.051668,46.480494,49.161558,47.02552,177.1,169.4,183.2
North,7.361969,6.763698,8.156978,54.650802,48.579968,49.875766,171.0,173.2,174.5
South,7.392916,7.906951,7.916565,50.063391,49.601254,49.824372,160.2,160.9,188.8
West,6.42718,8.723991,5.908993,48.08656,52.416597,47.250797,183.0,180.0,177.0


3.  Add a row and column of grand statistics using your own custom aggregation rule 
rather than simple totals.

In [37]:
def geo_mean_pos(s):
    s = s.dropna()
    return float(np.exp(np.log(s).mean()))  # cần s > 0

def p90(s):
    return float(s.quantile(0.90))

def agg_block(d: pd.DataFrame) -> pd.Series:
    return pd.Series({
        "sales_gmean": geo_mean_pos(d["sales"]),
        "cost_median": float(d["cost"].median()),
        "units_p90": p90(d["units"]),
    })

# 1) Bảng chính: region x channel (mỗi ô là 3 thống kê)
cell = df3.groupby(["region", "channel"]).apply(agg_block).reset_index()
main = cell.pivot(index="region", columns="channel")  # MultiIndex columns: (metric, channel)

# 2) Grand column "All": gộp theo region (bỏ channel) nhưng vẫn dùng custom agg
row_all_df = df3.groupby("region").apply(agg_block).reset_index()
row_all_df["channel"] = "All"
row_all = row_all_df.pivot(index="region", columns="channel")
main_plus = main.join(row_all, how="left")

# 3) Grand row "All": gộp theo channel (bỏ region) + thêm ô (All, All) là toàn bộ df
col_all_df = df3.groupby("channel").apply(agg_block).reset_index()
col_all_df["region"] = "All"
all_row = col_all_df.pivot(index="region", columns="channel")

overall = agg_block(df)  # thống kê toàn bộ dữ liệu
for metric, val in overall.items():
    all_row.loc["All", (metric, "All")] = val

# 4) Ghép ra bảng cuối
out = pd.concat([main_plus, all_row], axis=0)

# (tuỳ chọn) sắp thứ tự cột để 'All' nằm cuối mỗi metric
metrics = out.columns.levels[0]
channels = [c for c in out.columns.levels[1] if c != "All"] + ["All"]
out = out.reindex(columns=pd.MultiIndex.from_product([metrics, channels]))

out


  cell = df3.groupby(["region", "channel"]).apply(agg_block).reset_index()
  row_all_df = df3.groupby("region").apply(agg_block).reset_index()
  col_all_df = df3.groupby("channel").apply(agg_block).reset_index()


Unnamed: 0_level_0,sales_gmean,sales_gmean,sales_gmean,sales_gmean,cost_median,cost_median,cost_median,cost_median,units_p90,units_p90,units_p90,units_p90
Unnamed: 0_level_1,Online,Retail,Wholesale,All,Online,Retail,Wholesale,All,Online,Retail,Wholesale,All
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
East,8.87318,6.760989,7.051668,7.326091,46.480494,49.161558,47.02552,47.343582,177.1,169.4,183.2,176.2
North,7.361969,6.763698,8.156978,7.294578,54.650802,48.579968,49.875766,49.361825,171.0,173.2,174.5,173.8
South,7.392916,7.906951,7.916565,7.716559,50.063391,49.601254,49.824372,49.943881,160.2,160.9,188.8,172.0
West,6.42718,8.723991,5.908993,7.065641,48.08656,52.416597,47.250797,48.766288,183.0,180.0,177.0,182.0
All,7.348204,7.490405,7.213264,7.360818,48.295924,49.238659,48.408151,49.134026,178.0,175.4,185.8,178.1


4. Create  a  cross-tabulation  counting  occurrences  of  each  pair  of  categories,  then 
create a second version where each cell contains the entropy of the distribution of 
numerical values inside that category pair.

In [38]:
# 1) Cross-tab đếm số lần xuất hiện
ct_counts = pd.crosstab(df3["region"], df3["channel"])
ct_counts

#2) Cross-tab entropy của biến số theo từng cặp
def shannon_entropy_binned(x, bins=12, base=2):
    x = pd.Series(x).dropna().to_numpy()
    if x.size == 0:
        return np.nan

    counts, _ = np.histogram(x, bins=bins)   # tạo phân phối theo bins
    total = counts.sum()
    if total == 0:
        return np.nan

    p = counts[counts > 0] / total
    return float(-(p * np.log2(p)).sum())    # base=2 -> entropy tính theo “bits”

entropy_tbl = (
    df3.groupby(["region", "channel"])["units"]   # có thể thay "units" bằng "sales"/"cost"
      .apply(shannon_entropy_binned, bins=12)
      .unstack("channel")
)

entropy_tbl


channel,Online,Retail,Wholesale
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,3.039149,3.277613,3.256867
North,2.845351,3.03327,3.20282
South,3.349275,2.950826,3.141635
West,3.332095,3.099436,3.030639


5. Produce a “pivot” where each cell contains the correlation between two chosen 
numerical columns, computed for the corresponding row/column group pair.

In [39]:
# chọn 2 cột số để lấy correlation
xcol, ycol = "sales", "cost"

def corr_in_group(g):
    tmp = g[[xcol, ycol]].dropna()
    if len(tmp) < 2:          # nhóm quá nhỏ thì corr không xác định
        return np.nan
    return tmp[xcol].corr(tmp[ycol])

corr_pivot = (
    df3.groupby(["region", "channel"])
      .apply(corr_in_group)
      .unstack("channel")     # columns = channel, rows = region
)

corr_pivot


  .apply(corr_in_group)


channel,Online,Retail,Wholesale
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,-0.051822,-0.00576,0.004266
North,0.287902,-0.136407,-0.315774
South,0.333242,-0.078435,0.16231
West,0.32737,-0.634276,0.071511


Exercise 6: Multi-Stage Group Computations

1.  For each group defined by key1, compute: 
o the group mean of data1, 
o the group std of data1, 
o the Z-score of each row, 
o and then return only rows whose Z-score is between 1.0 and 2.5    

In [40]:
df_filled

Unnamed: 0,key1,key2,data1,data2
0,a,1,0.205031,-1.971633
1,a,2,0.795969,0.140268
2,missing,1,2.057874,-0.742157
3,b,2,0.417936,1.267609
4,b,1,-0.756648,-0.519717
5,a,missing,1.937925,1.289718
6,missing,1,-0.764539,-1.394474


In [41]:
# group mean / std của data1
mu = df_filled.groupby("key1")["data1"].transform("mean")
sd = df_filled.groupby("key1")["data1"].transform("std")   # std mẫu (ddof=1) mặc định

# Z-score từng dòng
z = (df_filled["data1"] - mu) / sd

# gắn thêm cột để xem + lọc theo điều kiện
out = df_filled.assign(data1_mean=mu, data1_std=sd, zscore=z)

result = out[out["zscore"].between(1.0, 2.5, inclusive="both")]
result

Unnamed: 0,key1,key2,data1,data2,data1_mean,data1_std,zscore
5,a,missing,1.937925,1.289718,0.979642,0.880927,1.087813


2. Compute for each key1: 
o the total variation of data1 (sum of squared deviations), 
o the number of sign changes in data2, 
o the groupwise correlation between data1 and data2.

In [42]:
def total_variation(s):
    # sum of squared deviations from group mean
    m = s.mean()
    return ((s - m) ** 2).sum()

def sign_changes(s):
    # đếm số lần đổi dấu liên tiếp trong data2 (bỏ qua 0 nếu có)
    x = s.dropna().to_numpy()
    sign = np.sign(x)
    sign = sign[sign != 0]
    if sign.size < 2:
        return 0
    return (sign[1:] != sign[:-1]).sum()

def corr_data1_data2(g):
    tmp = g[["data1", "data2"]].dropna()
    if len(tmp) < 2:
        return np.nan
    return tmp["data1"].corr(tmp["data2"])

result = (
    df_filled.groupby("key1")
       .apply(lambda g: pd.Series({
           "data1_total_variation": total_variation(g["data1"]),
           "data2_sign_changes": sign_changes(g["data2"]),
           "corr_data1_data2": corr_data1_data2(g),
       }))
       .reset_index()
)

result


  .apply(lambda g: pd.Series({


Unnamed: 0,key1,data1_total_variation,data2_sign_changes,corr_data1_data2
0,a,1.552063,1.0,0.939262
1,b,0.689823,1.0,1.0
2,missing,3.983007,0.0,1.0


3.  For each (key1,key2) pair, create a single summary row containing: 
o the determinant of the covariance matrix of (data1,data2), 
o the slope of the best-fit regression line predicting data2 from data1, 
o the ratio of positive to negative values in the group.

In [44]:
def summarize_pair(g: pd.DataFrame, ratio_col="data2") -> pd.Series:
    x = g["data1"].to_numpy(dtype=float)
    y = g["data2"].to_numpy(dtype=float)

    # 1) det(cov([data1, data2]))  (cần >=2 dòng)
    if len(g) >= 2:
        cov = np.cov(np.vstack([x, y]), ddof=1)   # 2x2 covariance matrix
        det_cov = float(np.linalg.det(cov))
    else:
        det_cov = np.nan

    # 2) slope của OLS: data2 ~ a + b*data1  => b = cov(x,y)/var(x)
    if len(g) >= 2:
        vx = np.var(x, ddof=1)
        slope = float(np.cov(x, y, ddof=1)[0, 1] / vx) if vx != 0 else np.nan
    else:
        slope = np.nan

    # 3) ratio positive/negative trong group (chọn cột để tính qua ratio_col)
    v = g[ratio_col].to_numpy(dtype=float)
    pos = np.sum(v > 0)
    neg = np.sum(v < 0)
    pos_neg_ratio = float(pos / neg) if neg != 0 else (np.inf if pos > 0 else np.nan)

    return pd.Series({
        "det_cov_data1_data2": det_cov,
        "slope_data2_on_data1": slope,
        f"pos_neg_ratio_{ratio_col}": pos_neg_ratio,
        "n_rows": len(g),
    })

result = (
    df_filled.groupby(["key1", "key2"])
        .apply(summarize_pair, ratio_col="data2")  # đổi "data2" -> "data1" nếu bạn muốn
        .reset_index()
)

result


  .apply(summarize_pair, ratio_col="data2")  # đổi "data2" -> "data1" nếu bạn muốn


Unnamed: 0,key1,key2,det_cov_data1_data2,slope_data2_on_data1,pos_neg_ratio_data2,n_rows
0,a,1,,,0.0,1.0
1,a,2,,,inf,1.0
2,a,missing,,,inf,1.0
3,b,1,,,0.0,1.0
4,b,2,,,inf,1.0
5,missing,1,-1.105507e-16,0.23112,0.0,2.0


4. Build a final DataFrame (one row per key1) with  at  least  six  custom  metrics, 
including at least: 
o one percentile statistic, 
o one robust statistic (MAD, trimmed mean, etc.), 
o one measure of variability, 
o one measure of asymmetry (skewness, or custom). 

In [45]:
def iqr(s):
    return float(s.quantile(0.75) - s.quantile(0.25))

def mad_mean(s):
    m = s.mean()
    return float((s - m).abs().mean())  # mean absolute deviation from mean

def trimmed_mean(s, trim=0.1):
    s = s.dropna().sort_values()
    n = len(s)
    if n == 0:
        return np.nan
    k = int(np.floor(n * trim))
    s2 = s.iloc[k:n-k] if n - 2*k > 0 else s
    return float(s2.mean())

def skewness(s):
    x = s.dropna().to_numpy(dtype=float)
    n = x.size
    if n < 3:
        return np.nan
    m = x.mean()
    sd = x.std(ddof=0)
    if sd == 0:
        return 0.0
    return float(np.mean(((x - m) / sd) ** 3))  # population skewness

final_df = (
    df_filled.groupby("key1")
       .apply(lambda g: pd.Series({
           # basic
           "n_rows": len(g),

           # percentile statistic (example)
           "data1_p90": float(g["data1"].quantile(0.90)),

           # robust statistics (examples)
           "data2_mad": mad_mean(g["data2"]),
           "data2_trimmed_mean_10pct": trimmed_mean(g["data2"], trim=0.10),

           # variability measures (examples)
           "data1_std": float(g["data1"].std(ddof=1)),
           "data1_iqr": iqr(g["data1"]),

           # asymmetry (example)
           "data1_skew": skewness(g["data1"]),
       }))
       .reset_index()
)

final_df


  .apply(lambda g: pd.Series({


Unnamed: 0,key1,n_rows,data1_p90,data2_mad,data2_trimmed_mean_10pct,data1_std,data1_iqr,data1_skew
0,a,3.0,1.709534,1.194056,-0.180549,0.880927,0.866447,0.366387
1,b,2.0,0.300478,0.893663,0.373946,0.830556,0.587292,
2,missing,2.0,1.775633,0.326158,-1.068315,1.995747,1.411206,
