In [28]:
import sys
import os

# Print the Python interpreter version so the runtime environment is recorded with the notebook.
print(sys.version)

3.12.3 (main, May  7 2024, 08:28:12) [GCC 9.4.0]


In [2]:
# tmp 디렉토리가 없으면 생성합니다.
if not os.path.isdir('tmp'):
    os.mkdir('tmp')
    # Abalone Dataset을 다운로드 받습니다.
    !wget https://archive.ics.uci.edu/static/public/1/abalone.zip -P tmp
    !unzip tmp/abalone.zip

In [27]:
# Print the file that contains the description of the Abalone dataset.
!cat tmp/abalone.names

1. Title of Database: Abalone data

2. Sources:

   (a) Original owners of database:
	Marine Resources Division
	Marine Research Laboratories - Taroona
	Department of Primary Industry and Fisheries, Tasmania
	GPO Box 619F, Hobart, Tasmania 7001, Australia
	(contact: Warwick Nash +61 02 277277, wnash@dpi.tas.gov.au)

   (b) Donor of database:
	Sam Waugh (Sam.Waugh@cs.utas.edu.au)
	Department of Computer Science, University of Tasmania
	GPO Box 252C, Hobart, Tasmania 7001, Australia

   (c) Date received: December 1995


3. Past Usage:

   Sam Waugh (1995) "Extending and benchmarking Cascade-Correlation", PhD
   thesis, Computer Science Department, University of Tasmania.

   -- Test set performance (final 1044 examples, first 3133 used for training):
	24.86% Cascade-Correlation (no hidden nodes)
	26.25% Cascade-Correlation (5 hidden nodes)
	21.5%  C4.5
	 0.0%  Linear Discriminate Analysis
	 3.57% k=5 Nearest Neighbour
      (Problem encoded as a classification task)

   -- Data set samp

# 분석의 목표

Rings를 타겟 변수로 하여, 이를 예측하는 머신러닝 모델을 만듭니다.

모델의 성능 지표는 RMSLE(Root Mean Squared Logarithmic Error)로 합니다.

$\sqrt{ \frac{1}{n} \sum_{i=1}^n \left(\log (1 + \hat{y}_i) - \log (1 + y_i)\right)^2}$

이 주제는 Kaggle의 Playground 대회에서도 같은 지표(RMSLE)로 다루어진 바 있습니다.

[Kaggle: Regression with an Abalone Dataset](https://www.kaggle.com/competitions/playground-series-s4e4)


## 실험의 설정

UCI의 abalone 데이터셋의 70%는 학습용, 30%는 평가용 데이터셋으로 나눕니다. 이때 두 데이터셋에서 Rings 값의 비율이 동일하게 유지되도록 Rings를 기준으로 층화하여 나눕니다.

In [36]:
import dproc
import pandas as pd
import polars as pl
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Report the name and version of each core library used in this notebook.
for lib in (pd, pl, np, mpl, sns):
    print(f"{lib.__name__} {lib.__version__}")

pandas 2.2.2
polars 0.20.24
numpy 1.26.4
matplotlib 3.8.4
seaborn 0.13.2


In [12]:
# Data frame describing the variables of the dataset.
# The index holds the variable name; each row is (Description, Units).
df_feature = pd.DataFrame(
    data=[
        ("M, F, and I (infant)", ''),
        ("Longest shell measurement", 'mm'),
        ("perpendicular to length", 'mm'),
        ("with meat in shell", 'mm'),
        ("whole abalone", 'grams'),
        ("weight of meat", 'grams'),
        ("gut weight (after bleeding)", 'grams'),
        ("after being dried", 'grams'),
        ("+1.5 gives the age in years", ''),
    ],
    columns=["Description", "Units"],
    index=[
        'Sex',
        'Length',
        'Diameter',
        'Height',
        'Whole weight',
        'Whole weight.1',
        'Whole weight.2',
        'Shell weight',
        'Rings',
    ],
)
df_feature

Unnamed: 0,Description,Units
Sex,"M, F, and I (infant)",
Length,Longest shell measurement,mm
Diameter,perpendicular to length,mm
Height,with meat in shell,mm
Whole weight,whole abalone,grams
Whole weight.1,weight of meat,grams
Whole weight.2,gut weight (after bleeding),grams
Shell weight,after being dried,grams
Rings,+1.5 gives the age in years,


In [9]:
# Gather the per-column information used to pick suitable data types for the
# Abalone dataset: read the raw file with the feature names as column names
# and hand the frame to the project helper dproc.get_type_df.
df_type = dproc.get_type_df(
    pl.read_csv(
        'tmp/abalone.data',
        has_header=False,
        new_columns=df_feature.index.tolist(),
    )
)
df_type

Unnamed: 0_level_0,min,max,na,count,n_unique,dtype,f32,i32,i16,i8
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Sex,,,0.0,4177.0,3.0,String,,,,
Length,0.075,0.815,0.0,4177.0,134.0,Float64,True,True,True,True
Diameter,0.055,0.65,0.0,4177.0,111.0,Float64,True,True,True,True
Height,0.0,1.13,0.0,4177.0,51.0,Float64,True,True,True,True
Whole weight,0.002,2.8255,0.0,4177.0,2429.0,Float64,True,True,True,True
Whole weight.1,0.001,1.488,0.0,4177.0,1515.0,Float64,True,True,True,True
Whole weight.2,0.0005,0.76,0.0,4177.0,880.0,Float64,True,True,True,True
Shell weight,0.0015,1.005,0.0,4177.0,926.0,Float64,True,True,True,True
Rings,1.0,29.0,0.0,4177.0,28.0,Int64,True,True,True,True


In [19]:
# Preprocessing is done with polars, which supports parallel execution.
# Get the data types to use when loading the data with polars; they are
# derived from the type summary in df_type by the project helper dproc.get_type_pl.
pl_dtypes = dproc.get_type_pl(df_type)
pl_dtypes

{'Length': Float32,
 'Diameter': Float32,
 'Height': Float32,
 'Whole weight': Float32,
 'Whole weight.1': Float32,
 'Whole weight.2': Float32,
 'Shell weight': Float32,
 'Rings': Int8,
 'Sex': Categorical}

In [20]:
# Naming convention: variables whose name starts with dfl_ are pl.DataFrame objects.
# Load the raw file again, this time with the column names from df_feature and
# the dtypes chosen in pl_dtypes.
# NOTE(review): later polars releases rename `dtypes` to `schema_overrides` — confirm before upgrading polars.
dfl_abalone = pl.read_csv('tmp/abalone.data', has_header=False, new_columns=df_feature.index.tolist(), dtypes=pl_dtypes)
dfl_abalone

Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
cat,f32,f32,f32,f32,f32,f32,f32,i8
"""M""",0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
"""M""",0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
"""F""",0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
"""M""",0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
"""I""",0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
…,…,…,…,…,…,…,…,…
"""F""",0.565,0.45,0.165,0.887,0.37,0.239,0.249,11
"""M""",0.59,0.44,0.135,0.966,0.439,0.2145,0.2605,10
"""M""",0.6,0.475,0.205,1.176,0.5255,0.2875,0.308,9
"""F""",0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10


In [25]:
# Add the polars dtype chosen for each column to the feature table as a `type` column.
# assign() aligns the Series on df_feature's index (same result as the previous
# left join) and overwrites an existing `type` column, so re-running this cell
# no longer fails with an overlapping-column error.
df_feature = df_feature.assign(type=pd.Series(pl_dtypes).map(str))
df_feature

Unnamed: 0,Description,Units,type
Sex,"M, F, and I (infant)",,Categorical
Length,Longest shell measurement,mm,Float32
Diameter,perpendicular to length,mm,Float32
Height,with meat in shell,mm,Float32
Whole weight,whole abalone,grams,Float32
Whole weight.1,weight of meat,grams,Float32
Whole weight.2,gut weight (after bleeding),grams,Float32
Shell weight,after being dried,grams,Float32
Rings,+1.5 gives the age in years,,Int8


In [103]:
# The polars frame is split into train and test sets manually (the original
# author notes that sklearn.model_selection.train_test_split does not support
# pl.DataFrame). The split is stratified on Rings.
SPLIT_SEED = 42    # fixed seed so that the split (and every downstream score) is reproducible
TRAIN_RATIO = 0.7  # share of each Rings group that goes to the training set

rng = np.random.default_rng(SPLIT_SEED)

# Attach a 0-based row-number column `no`, collect the row numbers of each
# Rings value into a list, and shuffle every list.
# maintain_order=True keeps the group order deterministic; without it the
# seeded generator would be consumed in a different order from run to run.
idx = [
    rng.permutation(np.asarray(nos))
    for nos in (
        dfl_abalone
        .with_columns(pl.int_range(pl.len()).alias('no'))
        .group_by('Rings', maintain_order=True)
        .agg(pl.col('no'))['no']
    )
]

# Per group, the first ceil(70%) shuffled rows go to train and the rest to test
# (so a group with a single row ends up in train only).
n_train = [int(np.ceil(len(nos) * TRAIN_RATIO)) for nos in idx]
train_idx = np.hstack([nos[:n] for nos, n in zip(idx, n_train)])
test_idx = np.hstack([nos[n:] for nos, n in zip(idx, n_train)])

# Sanity check: the two index sets partition the full dataset.
assert len(train_idx) + len(test_idx) == dfl_abalone.height
assert len(np.intersect1d(train_idx, test_idx)) == 0

dfl_train = dfl_abalone[train_idx]
dfl_test = dfl_abalone[test_idx]