In [1]:
import pandas as pd
import numpy as np
import polars as pl
import polars.selectors as cs
import sys, os, re
import pickle as pkl
import dproc
from functools import partial

# Record the interpreter and core library versions used for this run.
print(sys.version)

for lib in (pd, pl, np):
    print(lib.__name__, lib.__version__)

3.12.3 (main, May  7 2024, 08:28:12) [GCC 9.4.0]
pandas 2.2.2
polars 0.20.24
numpy 1.26.4


## 다운로드

In [34]:
data_path = 'data'
if not os.path.isdir(data_path):
    os.mkdir(data_path)
files = {
    k: os.path.join(p, f)
    for k, p, f in [
        ('train', data_path, 'train.csv'),
        ('test', data_path, 'test.csv'),
        ('org', data_path, 'data.csv'),
        ('train_pkl', data_path, 'train.pkl'),
        ('org_pkl', data_path, 'org.pkl'),
        ('test_pkl', data_path, 'test.pkl'),
        ('var_pkl', data_path, 'var.pkl')
    ]
}

if not os.path.isfile(files['train']):
    !kaggle competitions download -c playground-series-s4e6
    !unzip playground-series-s4e6.zip -d data
    !rm playground-series-s4e6.zip
    !wget https://archive.ics.uci.edu/static/public/697/predict+students+dropout+and+academic+success.zip
    !unzip predict+students+dropout+and+academic+success.zip -d data
    !rm predict+students+dropout+and+academic+success.zip

## 변수 정보 수집 및 변수 타입 선정

In [35]:
# Variable metadata collected from the UCI repository was saved to variables.tsv.
# Load it, drop the columns that are not used here, and index it by variable name.
df_var = (
    pd.read_csv('variables.tsv', sep='\t')
    .drop(columns=['Units', 'Missing Values'])
    .rename(columns={'Variable Name': 'name'})
    .set_index('name')
)

In [36]:
# Gather per-column type information from every source file and derive the
# dtype mapping that is used when the CSVs are read.
def _scan_type_df(path, **scan_kwargs):
    """Lazily scan one CSV, build its type table, and strip whitespace from the index labels."""
    # NOTE(review): dproc.get_type_df is a project helper; it is assumed to return
    # a pandas-style frame whose index holds the column names - confirm.
    type_df = dproc.get_type_df(pl.scan_csv(path, **scan_kwargs))
    return type_df.set_index(type_df.index.map(lambda label: label.strip()))


df_type = dproc.merge_type_df([
    _scan_type_df(files['train']),
    _scan_type_df(files['test']),
    # The original dataset file is semicolon-separated.
    _scan_type_df(files['org'], separator=';'),
])
# Columns passed to the ordinal encoder further down.
X_cat = [
    'Marital status',
    'Application mode',
    'Course',
    'Nacionality',
    "Mother's occupation",
    "Father's occupation",
]
dt = dproc.get_type_pl(df_type)

In [37]:
# Attach the chosen dtype of each variable to its metadata row and tag every
# row with src='org'.
type_series = pd.Series(dt, name='type')
df_var_type = df_var.join(type_series).assign(src='org')
df_var_type

Unnamed: 0_level_0,Role,Type,Demographic,Description,type,src
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Marital status,Feature,Integer,Marital status,1 – single 2 – married 3 – widower 4 – divorce...,Int8,org
Application mode,Feature,Integer,,1 - 1st phase - general contingent 2 - Ordinan...,Int8,org
Application order,Feature,Integer,,Application order (between 0 - first choice; a...,Int8,org
Course,Feature,Integer,,33 - Biofuel Production Technologies 171 - Ani...,Int16,org
Daytime/evening attendance,Feature,Integer,,1 – daytime 0 - evening,Int8,org
Previous qualification,Feature,Integer,Education Level,1 - Secondary education 2 - Higher education -...,Int8,org
Previous qualification (grade),Feature,Continuous,,Grade of previous qualification (between 0 and...,Float32,org
Nacionality,Feature,Integer,Nationality,1 - Portuguese; 2 - German; 6 - Spanish; 11 - ...,Int8,org
Mother's qualification,Feature,Integer,Education Level,1 - Secondary Education - 12th Year of Schooli...,Int8,org
Father's qualification,Feature,Integer,Education Level,1 - Secondary Education - 12th Year of Schooli...,Int8,org


## 변수 처리

In [38]:
# Read the training CSV with the dtype mapping computed above.
dfl_train = pl.read_csv(
    files['train'],
    dtypes=dt,
)

In [39]:
def get_cat_values(var_name, df_desc=None):
    """Return the integer category codes listed in a variable's description.

    Codes are recognised where the description contains ``"<digits> - "``
    (ASCII hyphen with a single space on each side), for example
    ``"1 - Portuguese; 2 - German"``. They are returned in order of
    appearance. Descriptions that separate code and label with a different
    character (such as an en dash) yield no codes.

    Parameters
    ----------
    var_name : str
        Index label of the variable in the description table.
    df_desc : pandas.DataFrame, optional
        Table indexed by variable name with a ``"Description"`` column.
        Defaults to the notebook-level ``df_var``.

    Returns
    -------
    numpy.ndarray
        Integer array of codes; empty when the description lists none.
    """
    if df_desc is None:
        df_desc = df_var
    description = df_desc.loc[var_name, "Description"]
    # findall with one group returns just the digit runs that precede " - ".
    codes = re.findall(r"([0-9]+) - ", description)
    return np.array([int(code) for code in codes], dtype=int)


# Variables whose values are checked against the codes found in their description.
cat_to_arrage = ['Application mode', 'Course', 'Previous qualification', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", "Nacionality"]

In [40]:
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()
# Ordered list of processing steps; each entry is a partially applied dproc helper.
dprocs = []


def filter_cat_val(x):
    """Select the `cat_to_arrage` columns, replacing values not listed in each description with -1."""
    return x.select(
        *[pl.when(pl.col(i).is_in(get_cat_values(i))).then(i).otherwise(-1) for i in cat_to_arrage]
    )


# (name, description) pairs handed to dproc.select_opr. The description now
# starts with the actual variable name instead of a literal "i".
desc = [
    ('filter_{}'.format(i), '{}: 설명에 등장하지 않은 변수를 -1로 치환합니다'.format(i))
    for i in cat_to_arrage
]
# NOTE(review): the semantics of dproc.select_opr / dproc.apply_processor /
# dproc.ord_prov are defined in the project module and are not visible here.
dprocs.append(
    partial(dproc.select_opr, select_proc=filter_cat_val, src='filtering_cat_val', desc=desc, df_feat=df_var_type)
)
dprocs.append(
    partial(dproc.apply_processor, processor=ord_enc, X_val=X_cat,
            info_prov=partial(dproc.ord_prov, suffix='_o'), df_feat=df_var_type),
)

In [41]:
# Run the registered steps on the training frame; the second return value
# replaces df_var_type.
dfl_train, df_var_type = dproc.apply_procs(
    dfl_train,
    dprocs,
    df_feat=df_var_type,
)

## 데이터 저장

In [42]:
# Write the processed training set as a pandas pickle indexed by 'id',
# then drop the polars frame from the namespace.
(
    dfl_train
    .to_pandas()
    .set_index('id')
    .to_pickle(files['train_pkl'])
)
del dfl_train

In [43]:
# Read the original (semicolon-separated) dataset with the same dtype mapping.
dfl_org = pl.read_csv(
    files['org'],
    dtypes=dt,
    separator=';',
)

In [44]:
# Apply the same steps to the original dataset; the second return value is discarded.
dfl_org, _ = dproc.apply_procs(dfl_org, dprocs)
# Add an 'id' column running -1, -2, ..., -len before pickling
# (presumably to keep these rows distinct from competition ids - confirm).
(
    dfl_org
    .with_columns(id=-pl.int_range(0, pl.len()) - 1)
    .to_pandas()
    .set_index('id')
    .to_pickle(files['org_pkl'])
)
del dfl_org

In [45]:
# Read the test CSV with the dtype mapping computed above.
dfl_test = pl.read_csv(
    files['test'],
    dtypes=dt,
)

In [46]:
# Apply the same steps to the test set; the second return value is discarded.
dfl_test, _ = dproc.apply_procs(dfl_test, dprocs)
# Write the result as a pandas pickle indexed by 'id', then drop the polars frame.
(
    dfl_test
    .to_pandas()
    .set_index('id')
    .to_pickle(files['test_pkl'])
)
del dfl_test

In [47]:
# Store the variable table together with a string rendering of its 'type' column.
var_type_as_text = df_var_type['type'].astype(str)
df_var_type.assign(var_type=var_type_as_text).to_pickle(files['var_pkl'])