### Part 1

In [None]:
import pandas as pd
import numpy as np
import math
from scipy.stats import shapiro
from collections import Counter

In [None]:
!wget https://aysuvorov.github.io/docs/pages/private/tasks/df_maga.txt
data = pd.read_csv("df_maga.txt", delim_whitespace=True)

--2023-12-20 18:20:24--  https://aysuvorov.github.io/docs/pages/private/tasks/df_maga.txt
Resolving aysuvorov.github.io (aysuvorov.github.io)... 185.199.110.153, 185.199.108.153, 185.199.111.153, ...
Connecting to aysuvorov.github.io (aysuvorov.github.io)|185.199.110.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 502107 (490K) [text/plain]
Saving to: ‘df_maga.txt’


2023-12-20 18:20:24 (13.1 MB/s) - ‘df_maga.txt’ saved [502107/502107]



In [None]:
# Columns declared categorical up front (space-separated names).
# "svd_anamnesis" is spelled as the column appears in the data; the previous
# spelling "svd_amnesis" matched no column, so that feature was not recognised.
cat_features = "group sex risk_sum_age_pre40 risk_score_category risk_score2_category risk_score2or_category risk_smart_category risk_advance_category smoking svd_anamnesis menopause dm1 dm2 dm3 trt_hypoglik trt_insulin obesity dm_angioprthy ag_angiopathy chd angi angi_fc trt_statins trt_monoclol stroke_tia piks anemia chf gout gastritis asthma copd ag ag_control af aextr vextr atachy vtachy vpv ssu av_block trt_aarithm trt_ccb_nondyhidr trt_sota_meto trt_cordaron trt_allap trt_nitro trt_beta trt_ccb_dyhidr trt_apfinh trt_ara trt_diur trt_alpha_beta trt_iva trt_allopur trt_omnik trt_warf trt_dabi trt_riva trt_apix trt_asp trt_clop trt_tica trt_digo cckd albuminurea cancer pht rythm aort_valve tricus_valve mitral_valve dd artery_age risk_max 373_0698881 372_0831385 371_0825979 357_0574136 356_0598732 355_0581166 331_8339182 330_8312542 329_82565 297_0720535 281_0533825 204_9430066".split(" ")

# Every remaining column that holds at least one value that is not a Python
# float is also treated as categorical. A set gives constant-time membership
# checks; the list itself keeps its order for downstream code.
known_categorical = set(cat_features)
for feat in data.columns:
  if feat in known_categorical:
    continue

  if any(not isinstance(x, float) for x in data[feat]):
    cat_features.append(feat)
    known_categorical.add(feat)

In [None]:
# One row per column of `data`: how many cells are present, how many rows
# there are in total, and the share of missing cells in percent.
present_per_column = data.notna().agg("sum")
skipped_data = (
  present_per_column
  .to_frame()
  .reset_index()
  .rename(columns={"index": "variable", 0: "in_table"})
)
skipped_data["total"] = len(data.index)
present_fraction = skipped_data["in_table"] / skipped_data["total"]
skipped_data["skipped_share"] = (1 - present_fraction) * 100

In [None]:
# Variables with at least 15% of their values missing.
is_rare = skipped_data["skipped_share"] >= 15
rare_features = list(skipped_data.loc[is_rare, "variable"])
print(rare_features)

['risk_sum_age_pre40', 'risk_score_perc', 'risk_score_category', 'risk_score2_perc', 'risk_score2_category', 'risk_score2or_perc', 'risk_score2or_category', 'risk_smart_perc', 'risk_smart_category', 'risk_advance_perc', 'risk_advance_category', 'svd_anamnesis', 'menopause', 'dm_angioprthy', 'ag_angiopathy', 'ag_control', 'sad_max', 'dad_max', 'trt_aarithm', 'creatinkinase', 'crp', 'urea', 'ureic_acid', 'ast', 'lponp', 'lpnp', 'lvp', 'triglec', 'natrium', 'kalium', 'calcium', 'ferrum', 'ttg', 'soe', 'ef', 'tzs', 'tmjp', 'aort_valve', 'tricus_valve', 'mitral_valve', 'sdla', 'dd', 'artery_age', 'r_cavy', 'l_cavy', 'l_abi', 'r_abi', 'r_ai', 's_rb', 'd_rb', 's_lb', 'd_lb', 'risk_max']


In [None]:
# Render the missing share as a percentage string for display.
# The dtype guard keeps the cell re-runnable: once the column holds strings,
# applying the float format to it again would raise.
if skipped_data["skipped_share"].dtype.kind == "f":
  skipped_data["skipped_share"] = skipped_data["skipped_share"].map("{:,.2f}%".format)
print(skipped_data)

               variable  in_table  total skipped_share
0                 group       413    414         0.24%
1                   age       413    414         0.24%
2                   sex       413    414         0.24%
3    risk_sum_age_pre40        58    414        85.99%
4       risk_score_perc        75    414        81.88%
..                  ...       ...    ...           ...
227         371_0825979       414    414         0.00%
228         372_0831385       414    414         0.00%
229         373_0698881       414    414         0.00%
230            risk_max       284    414        31.40%
231            endpoint       413    414         0.24%

[232 rows x 4 columns]


In [None]:
# Keep only the variable name and its missing-value share.
# Rebinding instead of inplace=True, together with errors="ignore", lets the
# cell be re-run after the columns are already gone without a KeyError.
skipped_data = skipped_data.drop(columns=["in_table", "total"], errors="ignore")
print(skipped_data)

               variable skipped_share
0                 group         0.24%
1                   age         0.24%
2                   sex         0.24%
3    risk_sum_age_pre40        85.99%
4       risk_score_perc        81.88%
..                  ...           ...
227         371_0825979         0.00%
228         372_0831385         0.00%
229         373_0698881         0.00%
230            risk_max        31.40%
231            endpoint         0.24%

[232 rows x 2 columns]


### Part 2

In [None]:
def get_row_template():
  """Return a fresh one-row DataFrame (index 0) with every stat column set to NaN."""
  column_names = [
    "name",
    "valid",
    "count",
    "mean",
    "standart_deviation",
    "median",
    "min",
    "max",
    "sw_test",
  ]
  blank_cells = np.full((1, len(column_names)), np.nan)
  return pd.DataFrame(blank_cells, index=[0], columns=column_names)

def get_cat_feature_stat(name, arr):
  """Build the summary rows for one categorical feature.

  The first row carries only the feature name. It is followed by one row per
  distinct non-missing value (in order of first appearance), holding that
  value in "name", the number of non-missing entries in "valid" and the
  number of occurrences of the value in "count". All other cells stay NaN.

  name: label written into the header row.
  arr: column of values; must support `.notna()` and boolean-mask indexing.
  """
  present = list(arr[arr.notna()])
  n_valid = len(present)

  def labelled_row(label, **cells):
    # Start from the all-NaN template and fill in only the given cells.
    row = get_row_template()
    row["name"] = label
    for column, value in cells.items():
      row[column] = value
    return row

  frames = [labelled_row(name)]
  for level, occurrences in Counter(present).items():
    frames.append(labelled_row(level, valid=n_valid, count=occurrences))
  return pd.concat(frames, ignore_index=True)

def get_quant_feture_stat(name, arr):
  """Build the one-row summary for a quantitative feature.

  Fills "name", "valid" (number of non-missing values), "mean",
  "standart_deviation", "median", "min", "max" and "sw_test" (Shapiro-Wilk
  p-value). "count" is left NaN.

  A column with no non-missing values gets valid=0 and NaN for every
  statistic instead of raising ZeroDivisionError. The Shapiro-Wilk p-value is
  left NaN when fewer than 3 values are available, because the test needs at
  least 3 observations.

  name: label written into the "name" cell.
  arr: column of values; must support `.notna()` and boolean-mask indexing.
  """
  row = get_row_template()
  values = list(arr[arr.notna()])
  row["name"] = name
  row["valid"] = len(values)
  if not values:
    # Nothing to summarise: keep the template's NaN in every statistic.
    return row

  row["mean"] = sum(values) / len(values)
  # NOTE(review): np.std defaults to ddof=0 (population SD); pass ddof=1 here
  # if the sample SD is what the report should show.
  row["standart_deviation"] = np.std(values)
  row["median"] = np.median(values)
  row["min"] = min(values)
  row["max"] = max(values)
  if len(values) >= 3:
    row["sw_test"] = shapiro(values).pvalue
  return row

def get_feature_stat(data, feat_name):
  """Summarise one column of `data`, choosing the summary by feature kind.

  Names listed in the module-level `cat_features` go through
  get_cat_feature_stat; every other name goes through get_quant_feture_stat.
  """
  if feat_name in cat_features:
    summarize = get_cat_feature_stat
  else:
    summarize = get_quant_feture_stat
  return summarize(feat_name, data[feat_name])

def get_stats(df):
  """Stack the per-feature summaries of every column of `df` into one table.

  Cells that are still missing after stacking are replaced by "-".
  """
  per_feature = []
  for feat in df.columns:
    per_feature.append(get_feature_stat(df, feat))
  stacked = pd.concat(per_feature, ignore_index=True)
  return stacked.fillna("-")

def format_mean_std_tuple(x):
  """Format a (mean, std) pair as "mean ± std" with one decimal each.

  Returns "-" when the first element is a string (a placeholder cell).
  """
  first = x[0]
  return "-" if isinstance(first, str) else "{:,.1f} \u00B1 {:,.1f}".format(*x)

def format_add_percent(x):
  """Format a (count, total) pair as "count (share%)".

  The count is shown as an integer and the share of the total with one
  decimal. Returns "-" when the first element is a string (a placeholder cell).
  """
  count = x[0]
  if isinstance(count, str):
    return "-"
  whole_count = int(count)
  share = 100 * count / x[1]
  return "{} ({:,.1f}%)".format(whole_count, share)

def format_median_ranges(x):
  """Format a (median, min, max) triple as "median, [-down, +up]".

  `down` is the distance from the median to the minimum and `up` the distance
  from the median to the maximum, each with one decimal. Returns "-" when the
  first element is a string (a placeholder cell).
  """
  median = x[0]
  if isinstance(median, str):
    return "-"
  below = median - x[1]
  above = x[2] - median
  return "{:,.1f}, [-{:,.1f}, +{:,.1f}]".format(median, below, above)

def format_to_int(x):
  """Convert `x` with int(); strings are passed through unchanged."""
  return x if isinstance(x, str) else int(x)

def format(df):
  """Return a display-ready copy of a statistics table.

  Adds a "mean, std" column, rewrites "valid" and "name" through
  format_to_int, turns "count" into "count (share of valid %)", turns
  "median" into "median, [-to min, +to max]", and drops the raw "mean" and
  "standart_deviation" columns.

  The input frame is left untouched: all changes are made on a copy, so the
  function can be called repeatedly on the same table.

  NOTE: this name shadows Python's built-in format() for the rest of the
  notebook; str.format() calls are not affected.
  """
  out = df.copy()
  out["mean, std"] = list(zip(out["mean"], out["standart_deviation"]))
  out["mean, std"] = out["mean, std"].map(format_mean_std_tuple)
  # "valid" must be converted before it is paired with "count" below.
  out["valid"] = out["valid"].map(format_to_int)
  out["count"] = list(zip(out["count"], out["valid"]))
  out["count"] = out["count"].map(format_add_percent)
  out["name"] = out["name"].map(format_to_int)
  out["median"] = list(zip(out["median"], out["min"], out["max"]))
  out["median"] = out["median"].map(format_median_ranges)
  return out.drop(columns=["mean", "standart_deviation"])

In [None]:
# Compute the per-feature statistics for the whole dataset, make them
# display-ready, and print the resulting table.
print(
  format(
    get_stats(data)
  )
)

           name valid        count                median   min   max sw_test  \
0         group     -            -                     -     -     -       -   
1             1   413  321 (77.7%)                     -     -     -       -   
2             0   413   92 (22.3%)                     -     -     -       -   
3           age   413            -  61.0, [-43.0, +30.0]  18.0  91.0     0.0   
4           sex     -            -                     -     -     -       -   
...         ...   ...          ...                   ...   ...   ...     ...   
5350  AG.DM.CHD   413    31 (7.5%)                     -     -     -       -   
5351     AG.CHD   413   77 (18.6%)                     -     -     -       -   
5352      AG.DM   413    32 (7.7%)                     -     -     -       -   
5353        CHD   413     7 (1.7%)                     -     -     -       -   
5354       Ctrl   413   92 (22.3%)                     -     -     -       -   

        mean, std  
0               -  