In [11]:
import os, sys

import sgpp
import dproc
import sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]


**Idea**
- 선형회귀 모델의 예측값을 cross-fitting을 하여 만들어냅니다.
- 이 때에 Episode_Length_minutes을 빼고 전체 데이터셋으로 만들고, 다른 하나는 Episode_Length_minutes 포함하여 미결측인 것에 대해 만듭니다.
- 잔차를 구하고, 잔차를 예측하는 GBM 모델들을 수치형 변수는 모두 범주형 변수로 취급한 상태로 만들어 냅니다.
- 수치형 변수를 범주형 변수로 만들 때에는 최소 등장 빈도 이상인 것에 대해서 범주화를 합니다. 
- 최적의 등장 빈도는 실험을 통해서 탐색해봅니다.

In [3]:
sc = sgutil.SGCache('img', 'result', 'model')
p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor({
        'ELm_num': (pl.col('Episode_Length_minutes').clip(5, 120) - 5) / 115,
        'GP': (pl.col('Guest_Popularity_percentage').fill_null(pl.col('Guest_Popularity_percentage').clip(0,100).mean()).clip(0, 100) / 50),
        'HP': pl.col('Host_Popularity_percentage').clip(0, 100) / 50,
        'NAd': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0.0) /3 ,
        'Number_of_Ads': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0).cast(pl.Int8),
        'ELm_na': pl.col('Episode_Length_minutes').is_null(),
        'GPp_na': pl.col('Guest_Popularity_percentage').is_null(),
        'ELm_sqrt': ((pl.col('Episode_Length_minutes').clip(5, 120) - 5) / 115)  ** 0.5
    }),
    sgpp.PandasConverter(index_col = 'id')
)
p.fit(['data/train.csv'])

In [5]:
df_train = p.transform(['data/train.csv'])
df_train_1 = df_train.loc[df_train['Episode_Length_minutes'].notna()]
df_test = p.transform(['data/test.csv'])

# 선형 회귀 모델 만들기

## ALL

In [6]:
from sklearn.linear_model import LinearRegression

X_cat_lr = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
X_num_lr = ['GP', 'HP', 'NAd']

reg_lr = make_pipeline(
    ColumnTransformer([
        ('pt', 'passthrogh', X_num_lr),
        ('ohe', OneHotEncoder(drop = 'first'), X_cat_lr)
    ]), LinearRegression()
)

X_lr = X_cat_lr + X_num_lr
reg_lr.fit(df_train[X_lr], df_train['target']

Index(['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'Listening_Time_minutes', 'ELm_num', 'GP', 'HP', 'NAd', 'ELm_na',
       'GPp_na', 'ELm_sqrt'],
      dtype='object')

## Episode_Length_minutes not NA

In [16]:
from sklearn.linear_model import LinearRegression

X_cat_lr = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
X_num_lr = ['GP', 'HP', 'NAd', 'ELm_num', 'ELm_sqrt']

reg_lr = make_pipeline(
    ColumnTransformer([
        ('pt', 'passthrogh', X_num_lr),
        ('ohe', OneHotEncoder(drop = 'first'), X_cat_lr)
    ]), LinearRegression()
)

X_lr = X_cat_lr + X_num_lr
reg_lr.fit(df_train[X_lr], df_train['target']