In [1]:
import os

In [2]:
# tmp 디렉토리가 없으면 생성합니다.
if not os.path.isdir('tmp'):
    os.mkdir('tmp')
    # Abalone Dataset을 다운로드 받습니다.
    !wget https://archive.ics.uci.edu/static/public/1/abalone.zip -P tmp
    !unzip tmp/abalone.zip

In [3]:
import dproc
import pandas as pd
import polars as pl

In [12]:
# Feature metadata table: one row per feature name (the index), holding a short
# description and the measurement unit of that feature.
# Each mapping entry is  feature name -> (description, unit).
df_feature = pd.DataFrame.from_dict(
    {
        'Sex': ("M, F, and I (infant)", ''),
        'Length': ("Longest shell measurement", 'mm'),
        'Diameter': ("perpendicular to length", 'mm'),
        'Height': ("with meat in shell", 'mm'),
        'Whole weight': ("whole abalone", 'grams'),
        'Whole weight.1': ("weight of meat", 'grams'),
        'Whole weight.2': ("gut weight (after bleeding)", 'grams'),
        'Shell weight': ("after being dried", 'grams'),
        'Rings': ("+1.5 gives the age in years", ''),
    },
    orient='index',
    columns=["Description", "Units"],
)
df_feature

Unnamed: 0,Description,Units
Sex,"M, F, and I (infant)",
Length,Longest shell measurement,mm
Diameter,perpendicular to length,mm
Height,with meat in shell,mm
Whole weight,whole abalone,grams
Whole weight.1,weight of meat,grams
Whole weight.2,gut weight (after bleeding),grams
Shell weight,after being dried,grams
Rings,+1.5 gives the age in years,


In [9]:
# Collect the information needed to pick suitable data types for the Abalone dataset.
# The data file is read without a header row, so the column names are supplied from
# the df_feature index; the resulting frame is handed to dproc.get_type_df.
df_type = dproc.get_type_df(
    pl.read_csv(
        'tmp/abalone.data',
        has_header=False,
        new_columns=df_feature.index.tolist(),
    )
)
df_type

Unnamed: 0_level_0,min,max,na,count,n_unique,dtype,f32,i32,i16,i8
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sex,,,0.0,4177.0,3.0,String,,,,
Length,0.075,0.815,0.0,4177.0,134.0,Float64,True,True,True,True
Diameter,0.055,0.65,0.0,4177.0,111.0,Float64,True,True,True,True
Height,0.0,1.13,0.0,4177.0,51.0,Float64,True,True,True,True
Whole weight,0.002,2.8255,0.0,4177.0,2429.0,Float64,True,True,True,True
Whole weight.1,0.001,1.488,0.0,4177.0,1515.0,Float64,True,True,True,True
Whole weight.2,0.0005,0.76,0.0,4177.0,880.0,Float64,True,True,True,True
Shell weight,0.0015,1.005,0.0,4177.0,926.0,Float64,True,True,True,True
Rings,1.0,29.0,0.0,4177.0,28.0,Int64,True,True,True,True


In [11]:
# Preprocessing is done with Polars, which supports parallel execution.
# Derive from df_type the data types to apply when the file is loaded with Polars;
# the displayed result is a dict mapping column names to Polars dtypes.
pl_dtypes = dproc.get_type_pl(df_type)
pl_dtypes

{'Length': Float32,
 'Diameter': Float32,
 'Height': Float32,
 'Whole weight': Float32,
 'Whole weight.1': Float32,
 'Whole weight.2': Float32,
 'Shell weight': Float32,
 'Rings': Int8,
 'Sex': Categorical}

In [16]:
# Naming convention: a variable whose name starts with dfl_ is a pl.DataFrame.
# Read the header-less data file again, naming the columns from the df_feature
# index and applying the dtypes held in pl_dtypes.
dfl_abalone = pl.read_csv(
    'tmp/abalone.data',
    has_header=False,
    new_columns=df_feature.index.tolist(),
    dtypes=pl_dtypes,
)
dfl_abalone

Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
cat,f32,f32,f32,f32,f32,f32,f32,i8
"""M""",0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
"""M""",0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
"""F""",0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
"""M""",0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
"""I""",0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
…,…,…,…,…,…,…,…,…
"""F""",0.565,0.45,0.165,0.887,0.37,0.239,0.249,11
"""M""",0.59,0.44,0.135,0.966,0.439,0.2145,0.2605,10
"""M""",0.6,0.475,0.205,1.176,0.5255,0.2875,0.308,9
"""F""",0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10


Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
cat,f32,f32,f32,f32,f32,f32,f32,i8
"""M""",0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
"""M""",0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
"""F""",0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
"""M""",0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
"""I""",0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
…,…,…,…,…,…,…,…,…
"""F""",0.565,0.45,0.165,0.887,0.37,0.239,0.249,11
"""M""",0.59,0.44,0.135,0.966,0.439,0.2145,0.2605,10
"""M""",0.6,0.475,0.205,1.176,0.5255,0.2875,0.308,9
"""F""",0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10
