In [1]:
import os, sys

import sgpp
import dproc
import sgutil
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print(sys.version)

from sklearn.pipeline import make_pipeline

3.12.6 (main, Sep 30 2024, 02:19:13) [GCC 9.4.0]


- Host_Popularity_percentage 범주형 변수화
  
- Guest_Popularity_percentage 범주형 변수화

- Host_Popularity_percentage 를 -1 ~ 1 사이로 MinMaxScaling 

- Number_of_Ads에 이상치가 존재합니다. 이를 최빈값으로 정리하고 0 ~ 1 사이로 MinMaxScaling

- Guest_Popularity_percentage 를 -1 ~ 1 사이로 MinMaxScaling

- Episode_Length_minutes 는 범주형 변수화

- Episode_Length_minutes를 5에서 120 사이로

In [2]:
sc = sgutil.SGCache('img', 'result', 'model')

In [3]:
p = make_pipeline(
    sgpp.PolarsProcessor(),
    sgpp.ExprProcessor({
        'ELm_num': (pl.col('Episode_Length_minutes').clip(5, 120) - 5) / 115,
        'GP': pl.col('Guest_Popularity_percentage').clip(0, 100) / 50 ,
        'HP': pl.col('Host_Popularity_percentage').clip(0, 100) / 50,
        'NAd': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0.0) /3 ,
        'Number_of_Ads': (pl.when(pl.col('Number_of_Ads') > 4).then(0).otherwise(pl.col('Number_of_Ads'))).fill_null(0).cast(pl.Int8),
        'ELm_na': pl.col('Episode_Length_minutes').is_null(),
        'GPp_na': pl.col('Guest_Popularity_percentage').is_null(),
        'ELm_sqrt': ((pl.col('Episode_Length_minutes').clip(5, 120) - 5) / 115)  ** 0.5
    }),
    sgpp.PandasConverter(index_col = 'id')
)
p.fit(['data/train.csv'])

In [4]:
df_train = p.transform(['data/train.csv'])
df_train_1 = df_train.loc[df_train['Episode_Length_minutes'].notna()]
target = 'Listening_Time_minutes'

- Target Encoding 실험 재설계,
  > Episode_Length_minutes 제외한 모든 변수로 모든 데이터셋으로 Combination 1 ~ 4
  >
  > Episode_Length_minutes 포함 모든 변수로 Episode_Length_minutes 미결측 데이터셋으로 Combination 0 ~ 3

In [5]:
from itertools import combinations
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm 
from cuml.preprocessing import TargetEncoder

kf = KFold(4, shuffle = True, random_state = 123)
def get_tgt_rmse(df, X_tgt, smooth_space = np.linspace(1, 10, 10)):
    rmse_fold = list()
    best_prd = None
    best_rmse = np.inf
    for i in smooth_space:
        tgt = TargetEncoder(smooth = i, split_method ='continuous')
        prds = list()
        rmses = list()
        for train_idx, test_idx in kf.split(df[X_tgt], df[target]):
            df.iloc[train_idx].pipe(lambda x: tgt.fit(x[X_tgt], x[target]))
            df_valid = df.iloc[test_idx]
            prds.append(
                pd.Series(tgt.transform(df_valid[X_tgt]), index = df_valid.index)
            )
            rmses.append(
                root_mean_squared_error(df_valid[target], prds[-1])
            )
        rmse_fold.append(np.mean(rmses))
        if best_rmse > rmse_fold[-1]:
            best_rmse = rmse_fold[-1]
            best_prd = pd.concat(prds)
    return np.min(rmse_fold), smooth_space[np.argmin(rmse_fold)], best_prd

def get_tgt_rmse_list(df, X_values, smooth_space = [0.01, 0.1, 1, 10, 100, 1000]):
    results = list()
    best_prd_list = list()
    for i in tqdm(X_values):
        rmse, best_smooth, best_prd = get_tgt_rmse(df, i, smooth_space = smooth_space)
        results.append(
            pd.Series(
                [i, rmse, best_smooth], index = ['X_tgt', 'RMSE', 'smooth']
            )
        )
        best_prd_list.append(best_prd.rename('__'.join(i)))
    return pd.DataFrame(results).sort_values('RMSE'), pd.concat(best_prd_list, axis=1).sort_index()

In [6]:
X_val = [
    'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads',
    'Episode_Sentiment', 'Episode_Title', 'Genre', 'Podcast_Name', 'Publication_Day', 'Publication_Time'
]

# ALL

In [7]:
df_c1, df_prd_1 = sc.cache_result(
    'tgt_rmse_c1_all', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 1)]), rerun = 0
)
df_c1

Unnamed: 0,X_tgt,RMSE,smooth
0,[Host_Popularity_percentage],26.851036,10
2,[Number_of_Ads],26.923613,100
1,[Guest_Popularity_percentage],26.991587,100
4,[Episode_Title],27.064105,100
6,[Podcast_Name],27.104318,100
3,[Episode_Sentiment],27.117177,100
8,[Publication_Time],27.130197,1000
5,[Genre],27.131325,1000
7,[Publication_Day],27.13408,1000


In [8]:
df_c2, df_prd_2 = sc.cache_result(
    'tgt_rmse_c2_all', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 2)]), rerun = 0
)
df_c2.iloc[:25]

Unnamed: 0,X_tgt,RMSE,smooth
0,"[Host_Popularity_percentage, Guest_Popularity_...",26.6142,0.1
1,"[Host_Popularity_percentage, Number_of_Ads]",26.720847,10.0
3,"[Host_Popularity_percentage, Episode_Title]",26.815162,10.0
16,"[Number_of_Ads, Episode_Title]",26.848316,10.0
8,"[Guest_Popularity_percentage, Number_of_Ads]",26.882301,10.0
2,"[Host_Popularity_percentage, Episode_Sentiment]",26.884671,10.0
18,"[Number_of_Ads, Podcast_Name]",26.885627,10.0
15,"[Number_of_Ads, Episode_Sentiment]",26.904024,100.0
10,"[Guest_Popularity_percentage, Episode_Title]",26.90986,10.0
17,"[Number_of_Ads, Genre]",26.914936,100.0


In [9]:
df_c3, df_prd_3 = sc.cache_result(
    'tgt_rmse_c3_all', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 3)]), rerun = 0
)
df_c3.iloc[:25]

Unnamed: 0,X_tgt,RMSE,smooth
1,"[Host_Popularity_percentage, Guest_Popularity_...",26.624471,0.1
6,"[Host_Popularity_percentage, Guest_Popularity_...",26.636031,0.1
0,"[Host_Popularity_percentage, Guest_Popularity_...",26.639306,0.1
19,"[Host_Popularity_percentage, Episode_Title, Po...",26.654809,1.0
5,"[Host_Popularity_percentage, Guest_Popularity_...",26.683332,0.01
20,"[Host_Popularity_percentage, Episode_Title, Pu...",26.697376,1.0
41,"[Guest_Popularity_percentage, Episode_Title, P...",26.732151,1.0
40,"[Guest_Popularity_percentage, Episode_Title, P...",26.749837,0.1
7,"[Host_Popularity_percentage, Number_of_Ads, Ep...",26.76205,10.0
12,"[Host_Popularity_percentage, Number_of_Ads, Pu...",26.780514,10.0


In [10]:
df_c4, df_prd_4 = sc.cache_result(
    'tgt_rmse_c4_all', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 4)]), rerun = 0
)
df_c4.iloc[:25]

Unnamed: 0,X_tgt,RMSE,smooth
51,"[Host_Popularity_percentage, Episode_Title, Pu...",26.536244,1.0
29,"[Host_Popularity_percentage, Number_of_Ads, Ep...",26.54089,1.0
38,"[Host_Popularity_percentage, Episode_Sentiment...",26.542964,1.0
28,"[Host_Popularity_percentage, Number_of_Ads, Ep...",26.543013,1.0
21,"[Host_Popularity_percentage, Number_of_Ads, Ep...",26.569461,1.0
39,"[Host_Popularity_percentage, Episode_Sentiment...",26.576025,1.0
50,"[Host_Popularity_percentage, Episode_Title, Po...",26.616268,0.1
37,"[Host_Popularity_percentage, Episode_Sentiment...",26.617641,0.1
27,"[Host_Popularity_percentage, Number_of_Ads, Ep...",26.632993,0.1
64,"[Guest_Popularity_percentage, Number_of_Ads, E...",26.63366,1.0


In [11]:
df_c5, df_prd_5 = sc.cache_result(
    'tgt_rmse_c5_all', lambda : get_tgt_rmse_list(df_train, [list(i) for i in combinations(X_val, 5)]), rerun = 0
)
df_c5.iloc[:25]

Unnamed: 0,X_tgt,RMSE,smooth
60,"[Host_Popularity_percentage, Episode_Sentiment...",26.493768,0.1
37,"[Host_Popularity_percentage, Number_of_Ads, Ep...",26.503238,0.1
50,"[Host_Popularity_percentage, Number_of_Ads, Ep...",26.50654,0.1
38,"[Host_Popularity_percentage, Number_of_Ads, Ep...",26.526353,1.0
73,"[Guest_Popularity_percentage, Number_of_Ads, E...",26.601522,0.1
95,"[Guest_Popularity_percentage, Episode_Sentimen...",26.610477,0.1
85,"[Guest_Popularity_percentage, Number_of_Ads, E...",26.613909,0.1
72,"[Guest_Popularity_percentage, Number_of_Ads, E...",26.61981,0.1
66,"[Host_Popularity_percentage, Episode_Title, Ge...",26.624103,0.1
55,"[Host_Popularity_percentage, Episode_Sentiment...",26.625182,0.1


# Episode_Length_minutes Not NA

In [12]:
df_c0_2, df_prd_0_2 = sc.cache_result(
    'tgt_rmse_c0_2', lambda : get_tgt_rmse_list(df_train, [['Episode_Length_minutes']]), rerun = 0
)
df_c0_2

Unnamed: 0,X_tgt,RMSE,smooth
0,[Episode_Length_minutes],13.478717,0.1


In [13]:
df_c1_2, df_prd_1_2 = sc.cache_result(
    'tgt_rmse_c1_2', 
    lambda : get_tgt_rmse_list(df_train, [['Episode_Length_minutes'] + list(i) for i in combinations(X_val, 1)]), rerun = 0
)
df_c1_2

Unnamed: 0,X_tgt,RMSE,smooth
2,"[Episode_Length_minutes, Number_of_Ads]",13.596797,0.1
3,"[Episode_Length_minutes, Episode_Sentiment]",13.658922,0.1
8,"[Episode_Length_minutes, Publication_Time]",13.801916,0.1
7,"[Episode_Length_minutes, Publication_Day]",14.249536,0.1
5,"[Episode_Length_minutes, Genre]",14.790121,0.1
6,"[Episode_Length_minutes, Podcast_Name]",19.202019,0.1
4,"[Episode_Length_minutes, Episode_Title]",21.356184,0.1
0,"[Episode_Length_minutes, Host_Popularity_perce...",26.004004,0.01
1,"[Episode_Length_minutes, Guest_Popularity_perc...",26.308444,0.01


In [14]:
df_c2_2, df_prd_2_2 = sc.cache_result(
    'tgt_rmse_c2_2', 
    lambda : get_tgt_rmse_list(df_train, [['Episode_Length_minutes'] + list(i) for i in combinations(X_val, 2)]), rerun = 0
)
df_c2_2

Unnamed: 0,X_tgt,RMSE,smooth
15,"[Episode_Length_minutes, Number_of_Ads, Episod...",14.821768,0.1
25,"[Episode_Length_minutes, Episode_Sentiment, Pu...",15.034009,0.1
20,"[Episode_Length_minutes, Number_of_Ads, Public...",15.450538,0.1
24,"[Episode_Length_minutes, Episode_Sentiment, Pu...",16.40736,0.1
19,"[Episode_Length_minutes, Number_of_Ads, Public...",17.046865,0.1
35,"[Episode_Length_minutes, Publication_Day, Publ...",17.318822,0.1
22,"[Episode_Length_minutes, Episode_Sentiment, Ge...",17.588273,0.1
17,"[Episode_Length_minutes, Number_of_Ads, Genre]",18.331327,0.1
32,"[Episode_Length_minutes, Genre, Publication_Time]",18.614968,0.1
30,"[Episode_Length_minutes, Genre, Podcast_Name]",19.223229,0.1


In [15]:
df_c3_2, df_prd_3_2 = sc.cache_result(
    'tgt_rmse_c3_2', 
    lambda : get_tgt_rmse_list(df_train, [['Episode_Length_minutes'] + list(i) for i in combinations(X_val, 3)]), rerun = 0
)
df_c3_2

Unnamed: 0,X_tgt,RMSE,smooth
53,"[Episode_Length_minutes, Number_of_Ads, Episod...",18.849478,0.10
52,"[Episode_Length_minutes, Number_of_Ads, Episod...",20.778507,0.10
73,"[Episode_Length_minutes, Episode_Sentiment, Pu...",21.032670,0.10
63,"[Episode_Length_minutes, Number_of_Ads, Public...",21.674427,0.10
50,"[Episode_Length_minutes, Number_of_Ads, Episod...",21.966997,0.10
...,...,...,...
5,"[Episode_Length_minutes, Host_Popularity_perce...",26.967607,0.01
40,"[Episode_Length_minutes, Guest_Popularity_perc...",26.969568,0.01
2,"[Episode_Length_minutes, Host_Popularity_perce...",27.019363,0.01
3,"[Episode_Length_minutes, Host_Popularity_perce...",27.020800,0.01


In [16]:
df_c4_2, df_prd_4_2 = sc.cache_result(
    'tgt_rmse_c4_2', 
    lambda : get_tgt_rmse_list(df_train, [['Episode_Length_minutes'] + list(i) for i in combinations(X_val, 4)]), rerun = 0
)
df_c4_2

Unnamed: 0,X_tgt,RMSE,smooth
100,"[Episode_Length_minutes, Number_of_Ads, Episod...",24.114509,0.10
97,"[Episode_Length_minutes, Number_of_Ads, Episod...",24.788553,0.10
95,"[Episode_Length_minutes, Number_of_Ads, Episod...",24.976019,0.10
118,"[Episode_Length_minutes, Episode_Sentiment, Ge...",25.091079,0.10
108,"[Episode_Length_minutes, Number_of_Ads, Genre,...",25.287839,0.10
...,...,...,...
19,"[Episode_Length_minutes, Host_Popularity_perce...",27.064177,0.01
3,"[Episode_Length_minutes, Host_Popularity_perce...",27.064631,0.01
18,"[Episode_Length_minutes, Host_Popularity_perce...",27.074869,0.01
11,"[Episode_Length_minutes, Host_Popularity_perce...",27.087706,0.01


# Linear Regression

In [19]:
from cuml.linear_model import LinearRegression
from sklearn.model_selection import cross_validate, cross_val_score, ShuffleSplit
from mlxtend.feature_selection import SequentialFeatureSelector
ss = ShuffleSplit(1, random_state = 123)

## ALL

In [17]:
df_tgt = pd.concat([df_prd_1, df_prd_2, df_prd_3, df_prd_4, df_prd_5], axis=1)

In [29]:
scores_1 = cross_validate(
    LinearRegression(), df_tgt, df_train[target], cv = kf, scoring = 'neg_root_mean_squared_error', 
    return_train_score=True, return_estimator=True
)
scores_1

  return init_func(self, *args, **filtered_kwargs)


{'fit_time': array([2.25223279, 2.86336946, 2.25639129, 2.27023983]),
 'score_time': array([0.17719531, 0.17213297, 0.17238498, 0.17192268]),
 'estimator': [LinearRegression(),
  LinearRegression(),
  LinearRegression(),
  LinearRegression()],
 'test_score': array([-24.88267431, -24.90083827, -24.88951463, -24.95506685]),
 'train_score': array([-24.90247222, -24.89649926, -24.90004019, -24.87804744])}

In [19]:
np.mean(scores_1['test_score'])

np.float64(-24.907023515387714)

### Step-wise Forward Feature Selection

지나치게 오래걸립니다.

In [26]:
"""
sfs = SequentialFeatureSelector(
    LinearRegression(), k_features = 'best', forward = True, floating = True, scoring = 'neg_root_mean_squared_error', cv = kf, 
    fixed_features = ['Host_Popularity_percentage__Guest_Popularity_percentage']
)
"""

### PCA + Sequential Addition

In [33]:
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
pca = PCA()
pca.fit(df_tgt)

In [38]:
pca.explained_variance_ratio_.cumsum()
X_tgt_pca = pca.transform(df_tgt)

In [41]:
scores = list()
for i in tqdm(list(range(2, X_tgt_pca.shape[1] + 1))):
    scores.append(
        np.mean(cross_val_score(LinearRegression(), X_tgt_pca[:, :i], df_train[target], cv = kf, scoring = 'neg_root_mean_squared_error'))
    )

  0%|          | 0/254 [00:00<?, ?it/s]

In [48]:
np.argmax(scores), np.max(scores), scores[-1], pca.explained_variance_ratio_[:(np.argmax(scores) - 1)].sum()

(231, -24.969706520957885, -24.96997581945375, 0.9999670700570195)

- 거의 모든 속성이 유의미 하다고 보여집니다.

# Episode_Length_minutes Not NA

In [28]:
scores_2 = cross_validate(
    LinearRegression(), pd.concat([df_prd_0_2, df_prd_1_2, df_prd_2_2, df_prd_3_2, df_prd_1, df_prd_2, df_prd_3, df_prd_4, df_prd_5], axis=1).loc[df_train['Episode_Length_minutes'].notna()], 
    df_train.loc[df_train['Episode_Length_minutes'].notna(), target], cv = kf, scoring = 'neg_root_mean_squared_error', 
    return_train_score=True, return_estimator=True
)
scores_2

  return init_func(self, *args, **filtered_kwargs)


{'fit_time': array([3.77009082, 3.36784077, 3.38008928, 3.37175846]),
 'score_time': array([0.21692801, 0.20029593, 0.20255375, 0.20389414]),
 'estimator': [LinearRegression(),
  LinearRegression(),
  LinearRegression(),
  LinearRegression()],
 'test_score': array([-10.34654615, -10.37086607, -10.36675097, -10.37383063]),
 'train_score': array([-10.35946928, -10.35194536, -10.35285819, -10.35029731])}

In [25]:
scores_2 = cross_validate(
    LinearRegression(), 
    pd.concat([
        df_prd_0_2, df_prd_1_2, df_prd_2_2, df_prd_3_2, df_prd_4_2, df_prd_1, df_prd_2, df_prd_3, df_prd_4, df_prd_5
    ], axis=1).loc[
        df_train['Episode_Length_minutes'].notna()
    ], 
    df_train.loc[df_train['Episode_Length_minutes'].notna(), target], cv = kf, scoring = 'neg_root_mean_squared_error', 
    return_train_score=True, return_estimator=True
)
scores_2

  return init_func(self, *args, **filtered_kwargs)


{'fit_time': array([ 4.55789065, 46.85648894, 29.04466891, 13.68449473]),
 'score_time': array([0.25466108, 0.28060365, 0.87333012, 0.40946722]),
 'estimator': [LinearRegression(),
  LinearRegression(),
  LinearRegression(),
  LinearRegression()],
 'test_score': array([-10.34569726, -10.37092722, -10.36544579, -10.37223789]),
 'train_score': array([-10.35610562, -10.34835871, -10.34965034, -10.34709095])}

In [35]:
(
    (
        (scores_1['test_score'].mean() ** 2) * df_train['ELm_na'].sum() + (scores_2['test_score'].mean() ** 2) * (~df_train['ELm_na']).sum()
    ) / df_train.shape[0]
) ** 0.5

np.float64(12.922349767650793)

In [22]:
scores_3 = cross_validate(
    LinearRegression(), 
    pd.concat([
        df_prd_0_2, df_prd_1_2, df_prd_2_2, df_prd_3_2, df_prd_4_2, df_prd_1, df_prd_2, df_prd_3, df_prd_4, df_prd_5
    ], axis=1), 
    df_train[target], cv = kf, scoring = 'neg_root_mean_squared_error', 
    return_train_score=True, return_estimator=True
)
scores_3

  return init_func(self, *args, **filtered_kwargs)


{'fit_time': array([ 5.04833889,  5.03152657,  4.16533256, 62.87113905]),
 'score_time': array([0.28913307, 0.29059243, 0.28520823, 0.53673291]),
 'estimator': [LinearRegression(),
  LinearRegression(),
  LinearRegression(),
  LinearRegression()],
 'test_score': array([-12.92041104, -12.94770163, -12.98073131, -13.00904559]),
 'train_score': array([-12.96708273, -12.95782664, -12.946687  , -12.9373174 ])}

# Generating Target Encoded Data for test set

유의한 Target Encoding 결과를 가려내는 것은 지금 상태로썬 소모적으로 보입니다.

또한 Episode_Length_minutes 와 Combination 4는 들어가는 리소스 대비 성능향상이 너무 미약하여 제외 합니다.

In [18]:
df_test = p.transform(['data/test.csv'])

In [19]:
X_tgt_list = list()
for i in range(1, 6):
    X_tgt_list.extend([list(i) for i in combinations(X_val, i)])
X_tgt_list.append(['Episode_Length_minutes'])
for i in range(1, 4):
    X_tgt_list.extend([['Episode_Length_minutes'] + list(i) for i in combinations(X_val, i)])

In [20]:
def get_tgt_fit_transform(df_train, df_test, X_tgt, smooth):
    results = list()
    best_prd_list = list()
    tgt = TargetEncoder(smooth = i, split_method ='continuous')
    return pd.Series(
        tgt.fit(df_train[X_tgt], df_train[target]).transform(df_test[X_tgt]),
        index = df_test.index, name = '__'.join(X_tgt)
    )

In [21]:
s_smooth = pd.concat([
    df_c0_2, df_c1_2, df_c2_2, df_c3_2, df_c4_2, df_c1, df_c2, df_c3, df_c4, df_c5
], axis=0).assign(
    X_tgt = lambda x: x['X_tgt'].apply(lambda x: '__'.join(x) if type(x) == list else x)
).set_index('X_tgt')['smooth']

In [22]:
df_tgt_test = sc.cache_result(
    'tgt_test',
    lambda : pd.concat([
        get_tgt_fit_transform(df_train, df_test, i, s_smooth['__'.join(i)])
        for i in X_tgt_list
    ], axis= 1), rerun = 1
)

In [23]:
df_tgt = pd.concat([df_prd_1, df_prd_2, df_prd_3, df_prd_4, df_prd_5], axis=1)
df_tgt.head()

Unnamed: 0_level_0,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Episode_Title,Genre,Podcast_Name,Publication_Day,Publication_Time,Host_Popularity_percentage__Guest_Popularity_percentage,...,Number_of_Ads__Episode_Title__Genre__Podcast_Name__Publication_Time,Number_of_Ads__Episode_Title__Genre__Publication_Day__Publication_Time,Number_of_Ads__Episode_Title__Podcast_Name__Publication_Day__Publication_Time,Number_of_Ads__Genre__Podcast_Name__Publication_Day__Publication_Time,Episode_Sentiment__Episode_Title__Genre__Podcast_Name__Publication_Day,Episode_Sentiment__Episode_Title__Genre__Podcast_Name__Publication_Time,Episode_Sentiment__Episode_Title__Genre__Publication_Day__Publication_Time,Episode_Sentiment__Episode_Title__Podcast_Name__Publication_Day__Publication_Time,Episode_Sentiment__Genre__Podcast_Name__Publication_Day__Publication_Time,Episode_Title__Genre__Podcast_Name__Publication_Day__Publication_Time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,45.528527,45.452848,49.164262,46.713282,41.069154,45.992674,45.977349,44.877671,46.500598,45.452848,...,44.587512,43.909609,44.936909,48.807995,44.608894,45.78887,43.939541,42.784407,47.810233,42.533833
1,42.395656,46.499231,43.267578,44.068982,46.293466,44.392425,42.611624,45.355999,45.494455,45.431718,...,44.880209,42.151091,45.431718,43.549875,45.736214,45.764764,45.957461,47.504482,45.266086,50.25054
2,39.21455,47.665749,49.164262,44.141302,44.673185,45.719842,46.724965,46.172203,44.813361,45.452848,...,56.665385,45.735722,45.452848,47.299284,45.344471,45.876128,44.303632,44.531648,43.203534,44.531648
3,45.311334,44.992782,43.267578,46.710741,50.170953,45.607912,44.75121,45.959877,44.982866,45.431718,...,53.745453,38.550992,45.33727,45.471254,46.605399,45.965047,45.142086,47.384289,45.56959,48.656152
4,46.799378,45.510912,40.275518,45.523761,45.365815,45.781732,46.674087,45.978264,45.508629,45.443938,...,45.117797,48.263352,45.443938,45.82337,44.552437,45.494986,45.113844,41.749225,47.485344,40.788997


# TGT406 + LR

In [24]:
X_lr = df_tgt.columns.tolist()
reg_lr = LinearRegression().fit(df_tgt, df_train[target])

  return init_func(self, *args, **filtered_kwargs)


In [25]:
s_prd1 = df_tgt_test.loc[df_test['Episode_Length_minutes'].isna(), X_lr].pipe(
    lambda x: pd.Series(reg_lr.predict(x), index = x.index, name = target)
)
s_prd1.head()

id
750013    43.090339
750016    40.349254
750025    42.599306
750028    33.927858
750029    51.073081
Name: Listening_Time_minutes, dtype: float64

In [32]:
df_tgt = pd.concat([df_prd_0_2, df_prd_1_2, df_prd_2_2, df_prd_3_2, df_prd_1, df_prd_2, df_prd_3, df_prd_4, df_prd_5], axis=1).loc[
    df_train_1.index
]
df_tgt.head()

Unnamed: 0_level_0,Episode_Length_minutes,Episode_Length_minutes__Host_Popularity_percentage,Episode_Length_minutes__Guest_Popularity_percentage,Episode_Length_minutes__Number_of_Ads,Episode_Length_minutes__Episode_Sentiment,Episode_Length_minutes__Episode_Title,Episode_Length_minutes__Genre,Episode_Length_minutes__Podcast_Name,Episode_Length_minutes__Publication_Day,Episode_Length_minutes__Publication_Time,...,Number_of_Ads__Episode_Title__Genre__Podcast_Name__Publication_Time,Number_of_Ads__Episode_Title__Genre__Publication_Day__Publication_Time,Number_of_Ads__Episode_Title__Podcast_Name__Publication_Day__Publication_Time,Number_of_Ads__Genre__Podcast_Name__Publication_Day__Publication_Time,Episode_Sentiment__Episode_Title__Genre__Podcast_Name__Publication_Day,Episode_Sentiment__Episode_Title__Genre__Podcast_Name__Publication_Time,Episode_Sentiment__Episode_Title__Genre__Publication_Day__Publication_Time,Episode_Sentiment__Episode_Title__Podcast_Name__Publication_Day__Publication_Time,Episode_Sentiment__Genre__Podcast_Name__Publication_Day__Publication_Time,Episode_Title__Genre__Podcast_Name__Publication_Day__Publication_Time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,90.501628,45.431718,45.431718,97.542581,89.243015,45.431718,85.238236,45.431718,85.217378,90.180447,...,44.880209,42.151091,45.431718,43.549875,45.736214,45.764764,45.957461,47.504482,45.266086,50.25054
2,60.092867,44.930531,45.452848,63.857971,59.762401,44.950429,54.204573,44.973266,57.764394,63.221972,...,56.665385,45.735722,45.452848,47.299284,45.344471,45.876128,44.303632,44.531648,43.203534,44.531648
3,46.891796,45.431718,45.431718,50.175724,47.202526,45.431718,48.646514,46.056669,46.972954,50.063228,...,53.745453,38.550992,45.33727,45.471254,46.605399,45.965047,45.142086,47.384289,45.56959,48.656152
4,86.022025,45.443938,45.443938,74.082692,81.057203,45.443938,86.936465,45.443938,88.224154,79.385296,...,45.117797,48.263352,45.443938,45.82337,44.552437,45.494986,45.113844,41.749225,47.485344,40.788997
5,17.978931,45.443938,45.443938,19.344292,17.935752,45.443938,20.770887,21.736885,18.484191,16.738409,...,46.965593,45.004005,44.518584,44.423171,45.268944,45.70147,44.97698,44.516965,46.813403,45.207556


In [34]:
X_lr = df_tgt.columns.tolist()
reg_lr = LinearRegression().fit(df_tgt, df_train_1[target])

  return init_func(self, *args, **filtered_kwargs)


In [28]:
s_prd2 = df_tgt_test.loc[df_test['Episode_Length_minutes'].notna(), X_lr].pipe(
    lambda x: pd.Series(reg_lr.predict(x), index = x.index, name = target)
)
s_prd2.head()

id
750000    54.550794
750001    22.144942
750002    46.947827
750003    79.335009
750004    51.590329
Name: Listening_Time_minutes, dtype: float64

In [36]:
pd.concat([s_prd1, s_prd2]).sort_index().to_csv('result/submission1.csv')