### Настройки

Общие

In [2]:
from IPython.display import display, display_html, HTML, Image
# display(HTML("<style>.container { width:90% !important; }</style>"))

import warnings
warnings.filterwarnings("ignore")

GLOBAL_RANDOM_STATE=202404
GLOBAL_SAMPLE_SIZE=4

np / pd

In [3]:
# !pip install numpy pandas
import numpy as np
print(f'numpy: {np.__version__}')
np.set_printoptions(linewidth=10000, precision=4, edgeitems=20, suppress=True)

import pandas as pd
print(f'pandas: {pd.__version__}')
pd.set_option('display.max_rows', None,
              'display.max_columns', 500,
              'display.max_colwidth', 1,
              'display.precision', 2)

from scipy.stats import zscore

numpy: 1.24.1
pandas: 2.2.1


Утилиты

In [4]:
import os
import time
import tqdm

def getfilesize(strName):
    '''размер файла'''
    file_size = os.path.getsize(strName)
    return(file_size / (1024 * 1024))


Графика

In [5]:
# !pip install matplotlib seaborn plotly
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import plotly.graph_objs as go
import plotly.subplots as sp
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import plotly.express as px

### Загрузка данных

In [6]:
strFileName='data\IBM Watson Marketing Customer Value Data.zip'
strFileName=strFileName.replace("\\","/")
print(f"Размер zip файла: {getfilesize(strFileName):.2f} МБ")

df=pd.read_csv(strFileName, compression='zip')

Размер zip файла: 0.34 МБ


### Часть 1. EDA & Часть 2. Preprocessing & Feature Engineering
обьединим обе части: знакомство с данными, вывод, реализация

#### Вспомогательные функции

In [7]:
def short_describe(df, target_name=None, short=False):
    '''
    сводные данные по датафрейму
    возвращает 2 списка с категориальными и количественными признаками
    '''
    display(HTML(f'''
    <b>Shape:</b> {df.shape[0]} rows x {df.shape[1]} columns
    <br><b>Duplicates:</b> {df.duplicated().sum()}
    <br><b>Rows with null values:</b> {df.isna().any(axis=1).sum()}<hr>'''))

    lst_num = df.select_dtypes(include='number').columns.tolist()
    lst_str = df.select_dtypes(include='object').columns.tolist()
    df_desc=df[lst_num]
    display(df_desc.describe(include='all').T.assign(dtypes=df_desc.dtypes).assign(skew=df_desc.skew()).assign(kurt=df_desc.kurt()).assign(kurt=df_desc.kurt()))
    # Коэффициент асимметрии (skew): насколько распределение данных асимметрично.
    # Эксцесс (kurt): насколько распределение данных имеет более острые или плоские вершины, чем нормальное распределение.
    
    if not short:
        if len(lst_num):
            display(HTML(f'<hr><b>Numeric columns ({len(lst_num)}):</b>'))
            print(lst_num)
        if len(lst_str):
            display(HTML(f'<hr><b>String columns ({len(lst_str)}):</b>'))
            print(lst_str)
        
        if target_name:
            display(HTML("<hr>\n<b>Balance</b>"))
            display(df[target_name].value_counts(normalize=True))

        return lst_num, lst_str

def color_text(val):
    '''для форматирования датафреймов'''
    color = 'red' if val == 0 else 'black'
    return f'color: {color}'

def get_redundant_pairs(df):
    '''Треугольная матрица корреляций'''
    pairs_to_drop = set()
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.add((cols[i], cols[j]))
    return pairs_to_drop

def get_top_abs_correlations(df=10, crit=0.85, method='spearman'): 
    '''ТОП высоко скоррелированных признаков'''   
    au_corr = df.corr(method=method).abs().unstack()
    labels_to_drop = get_redundant_pairs(df)
    au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return au_corr[au_corr>crit]

def facet_hist(df, cols=1, height=800):
    '''
    гистограммы распределения
    '''
    lst_num=df.columns
    rows=len(lst_num)// cols + 1
    fig = sp.make_subplots(rows=rows, cols=cols, subplot_titles=lst_num)

    # Нарисовать гистограммы
    for i, col in enumerate(lst_num):
        fig.add_trace(
            go.Histogram(x=df[col], name=col, histnorm='probability'),
            row=i // cols + 1, col=i % cols + 1
        )

    # Обновить макет
    fig.update_layout(
        title='Распределение переменных',
        height=height,
        margin=dict(l=20, r=20, t=80, b=20,),
        paper_bgcolor='lightgray',
        showlegend=False
    )

    # Вывести графики
    fig.show()


def cutNsigma(df, inField, sigma=3, verbose=False):
    outField=inField + f'_cut{sigma}sigma'

    df[outField]=round(zscore(df[inField]).abs(),0)
    upper_bound=df[df[outField]==sigma][inField].max()
    count_values=df[df[inField]>upper_bound][inField].count()

    if verbose: 
        display(df.value_counts(outField,normalize=True).to_frame().T)
        display(pd.pivot_table(df, values=inField, columns=[outField], aggfunc={"mean","count","min"}))
        print(f'upper_bound: {upper_bound}, rows modified: {count_values}')

    df.drop(columns=[outField], inplace=True)
    df[inField] = df[inField].where(df[inField]<upper_bound, upper_bound)

    if verbose:
        print(f'Result for \'{inField}\':')
        display(df[inField].describe().to_frame().T)


#### Первичное исследование данных

In [8]:
display(df.columns)
df.head(GLOBAL_SAMPLE_SIZE)
df.dtypes.value_counts()

Index(['Customer', 'State', 'Customer Lifetime Value', 'Response', 'Coverage',
       'Education', 'Effective To Date', 'EmploymentStatus', 'Gender',
       'Income', 'Location Code', 'Marital Status', 'Monthly Premium Auto',
       'Months Since Last Claim', 'Months Since Policy Inception',
       'Number of Open Complaints', 'Number of Policies', 'Policy Type',
       'Policy', 'Renew Offer Type', 'Sales Channel', 'Total Claim Amount',
       'Vehicle Class', 'Vehicle Size'],
      dtype='object')

object     16
int64      6 
float64    2 
Name: count, dtype: int64

In [9]:
lst_num, lst_str = short_describe(df)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,dtypes,skew,kurt
Customer Lifetime Value,9134.0,8004.94,6870.97,1898.01,3994.25,5780.18,8962.17,83325.38,float64,3.03,13.82
Income,9134.0,37657.38,30379.9,0.0,0.0,33889.5,62320.0,99981.0,int64,0.29,-1.09
Monthly Premium Auto,9134.0,93.22,34.41,61.0,68.0,83.0,109.0,298.0,int64,2.12,6.19
Months Since Last Claim,9134.0,15.1,10.07,0.0,6.0,14.0,23.0,35.0,int64,0.28,-1.07
Months Since Policy Inception,9134.0,48.06,27.91,0.0,24.0,48.0,71.0,99.0,int64,0.04,-1.13
Number of Open Complaints,9134.0,0.38,0.91,0.0,0.0,0.0,0.0,5.0,int64,2.78,7.75
Number of Policies,9134.0,2.97,2.39,1.0,1.0,2.0,4.0,9.0,int64,1.25,0.36
Total Claim Amount,9134.0,434.09,290.5,0.1,272.26,383.95,547.51,2893.24,float64,1.71,5.98


['Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Months Since Last Claim', 'Months Since Policy Inception', 'Number of Open Complaints', 'Number of Policies', 'Total Claim Amount']


['Customer', 'State', 'Response', 'Coverage', 'Education', 'Effective To Date', 'EmploymentStatus', 'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy', 'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size']


<b>Вывод по общему виду данных:</b><hr>
Целевая переменная - <b>Customer Lifetime Value</b>

Все данные без пустот. Категориальных - 16, количественных - 8.

Среди категориальных признаков немало с небольшим кол-вом значений. Часть удалим.

#### Категориальные признаки

In [10]:
MAX_VALUE_COUNT=10

for col in lst_str:
    nun=df[col].nunique()
    print(f'{col}: {nun}')
    if nun<=MAX_VALUE_COUNT:
        print(df[col].value_counts().head(MAX_VALUE_COUNT))
    print()

Customer: 9134

State: 5
State
California    3150
Oregon        2601
Arizona       1703
Nevada        882 
Washington    798 
Name: count, dtype: int64

Response: 2
Response
No     7826
Yes    1308
Name: count, dtype: int64

Coverage: 3
Coverage
Basic       5568
Extended    2742
Premium     824 
Name: count, dtype: int64

Education: 5
Education
Bachelor                2748
College                 2681
High School or Below    2622
Master                  741 
Doctor                  342 
Name: count, dtype: int64

Effective To Date: 59

EmploymentStatus: 5
EmploymentStatus
Employed         5698
Unemployed       2317
Medical Leave    432 
Disabled         405 
Retired          282 
Name: count, dtype: int64

Gender: 2
Gender
F    4658
M    4476
Name: count, dtype: int64

Location Code: 3
Location Code
Suburban    5779
Rural       1773
Urban       1582
Name: count, dtype: int64

Marital Status: 3
Marital Status
Married     5298
Single      2467
Divorced    1369
Name: count, dtype: int64



<b>Вывод по категориальным признакам:</b><hr>
- Customer - содержит UID пользователя и нам не нужен
- Effective To Date - дата завершения, видимо, обслуживания. В контексте рассмотрения, пренебрежем
- остальные вполне можно перекодировать с помощью <b>LabelEncoder()</b>

Drop columns & LabelEncoder

In [11]:
df.drop(columns=['Customer', 'Effective To Date'], inplace=True)
lst_str = df.select_dtypes(include='object').columns.tolist()
print(f"Кол-во категориальных после удаления:{len(lst_str)}\n",lst_str)

Кол-во категориальных после удаления:14
 ['State', 'Response', 'Coverage', 'Education', 'EmploymentStatus', 'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy', 'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size']


In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


print("До перекодировки:")
display(df[lst_str].sample(n=GLOBAL_SAMPLE_SIZE,random_state=GLOBAL_RANDOM_STATE))
for f in lst_str:
    df[f] = le.fit_transform(df[f])
print("После перекодировки:")
display(df[lst_str].sample(n=GLOBAL_SAMPLE_SIZE,random_state=GLOBAL_RANDOM_STATE))


До перекодировки:


Unnamed: 0,State,Response,Coverage,Education,EmploymentStatus,Gender,Location Code,Marital Status,Policy Type,Policy,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size
5422,Arizona,No,Basic,High School or Below,Employed,M,Rural,Married,Personal Auto,Personal L3,Offer4,Call Center,Four-Door Car,Medsize
6101,Washington,No,Basic,High School or Below,Unemployed,F,Suburban,Married,Corporate Auto,Corporate L3,Offer1,Call Center,Four-Door Car,Medsize
4942,Arizona,No,Extended,College,Employed,F,Suburban,Divorced,Corporate Auto,Corporate L3,Offer1,Call Center,Four-Door Car,Small
4294,Oregon,No,Premium,Bachelor,Employed,F,Rural,Married,Corporate Auto,Corporate L2,Offer3,Agent,Four-Door Car,Medsize


После перекодировки:


Unnamed: 0,State,Response,Coverage,Education,EmploymentStatus,Gender,Location Code,Marital Status,Policy Type,Policy,Renew Offer Type,Sales Channel,Vehicle Class,Vehicle Size
5422,0,0,0,3,1,1,0,1,1,5,3,2,0,1
6101,4,0,0,3,4,0,1,1,0,2,0,2,0,1
4942,0,0,1,1,1,0,1,0,0,2,0,2,0,2
4294,3,0,2,0,1,0,0,1,0,1,2,0,0,1


#### Количественные признаки

In [13]:
short_describe(df[lst_num], short=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,dtypes,skew,kurt
Customer Lifetime Value,9134.0,8004.94,6870.97,1898.01,3994.25,5780.18,8962.17,83325.38,float64,3.03,13.82
Income,9134.0,37657.38,30379.9,0.0,0.0,33889.5,62320.0,99981.0,int64,0.29,-1.09
Monthly Premium Auto,9134.0,93.22,34.41,61.0,68.0,83.0,109.0,298.0,int64,2.12,6.19
Months Since Last Claim,9134.0,15.1,10.07,0.0,6.0,14.0,23.0,35.0,int64,0.28,-1.07
Months Since Policy Inception,9134.0,48.06,27.91,0.0,24.0,48.0,71.0,99.0,int64,0.04,-1.13
Number of Open Complaints,9134.0,0.38,0.91,0.0,0.0,0.0,0.0,5.0,int64,2.78,7.75
Number of Policies,9134.0,2.97,2.39,1.0,1.0,2.0,4.0,9.0,int64,1.25,0.36
Total Claim Amount,9134.0,434.09,290.5,0.1,272.26,383.95,547.51,2893.24,float64,1.71,5.98


In [14]:
facet_hist(df[lst_num],cols=3)

<b>Вывод по количественным признакам:</b><hr>
- Monthly Premium Auto - перевод в 4 категории (<=75, <100, <120, >=120)
- Number of Open Complaints, Number of Policies - сократить кол-во вариантов
- Total Claim Amount, <b><span style="color:blue">Customer Lifetime Value</span></b> - срезать по сигмам (см ниже)

<b><span style="color:red">идея отброшена:</span></b><span style="color:gray">- Income доработать пустые (нулевые) значения</span>

<img src="src/Normal_distribution_and_scales.gif" alt="Alternative text" height="300"/>

In [15]:
fname='Monthly Premium Auto'
df[fname] = pd.cut(df[fname], [0,75,100,120,1000], right=False, labels=[1,2,3,4])

fname='Number of Open Complaints'
df[fname] = df[fname].where(df[fname]<1, 1)

fname='Number of Policies'
df[fname] = df[fname].where(df[fname]<4, 4)

# df['Income'] = df['Income'].replace(0, df['Income'].mean())

Распределение "длинное", оставим $2\sigma$ остальное приравняем верхней границе

In [16]:
cutNsigma(df,'Total Claim Amount'     ,sigma=2,verbose=True)
cutNsigma(df,'Customer Lifetime Value',sigma=2,verbose=True)

Total Claim Amount_cut2sigma,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0
proportion,0.52,0.41,0.04,0.02,0.00394,0.00252,0.00131,0.000438,0.000219


Total Claim Amount_cut2sigma,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0
count,4746.0,3746.0,385.0,180.0,36.0,23.0,12.0,4.0,2.0
mean,412.92,334.53,981.48,1296.22,1561.88,1887.41,2163.79,2419.45,2826.52
min,289.04,0.1,870.63,1163.62,1452.21,1742.4,2042.57,2327.17,2759.79


upper_bound: 1159.2, rows modified: 257
Result for 'Total Claim Amount':


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Total Claim Amount,9134.0,425.73,258.92,0.1,272.26,383.95,547.51,1159.2


Customer Lifetime Value_cut2sigma,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
proportion,0.52,0.41,0.04,0.02,0.00952,0.00405,0.00131,0.000766,0.000547,0.000219,0.000109,0.000109


Customer Lifetime Value_cut2sigma,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
count,4742.0,3716.0,362.0,162.0,87.0,37.0,12.0,7.0,5.0,2.0,1.0,1.0
mean,7008.63,5762.67,21412.79,28034.69,34817.57,41909.52,48749.7,56773.28,62837.11,70566.61,74228.52,83325.38
min,4569.88,1898.01,18321.24,25253.1,32069.88,39033.08,46302.08,52811.49,60556.19,67907.27,74228.52,83325.38


upper_bound: 25169.18817, rows modified: 314
Result for 'Customer Lifetime Value':


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Customer Lifetime Value,9134.0,7696.91,5557.39,1898.01,3994.25,5780.18,8962.17,25169.19


In [17]:
lst_num, lst_str = short_describe(df)
facet_hist(df,3, height=1200)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,dtypes,skew,kurt
State,9134.0,1.74,1.29,0.00,1.00,1.00,3.00,4.00,int32,0.21,-1.25
Customer Lifetime Value,9134.0,7696.91,5557.39,1898.01,3994.25,5780.18,8962.17,25169.19,float64,1.70,2.45
Response,9134.0,0.14,0.35,0.00,0.00,0.00,0.00,1.00,int32,2.04,2.15
Coverage,9134.0,0.48,0.66,0.00,0.00,0.00,1.00,2.00,int32,1.03,-0.11
Education,9134.0,1.55,1.38,0.00,0.00,1.00,3.00,4.00,int32,0.33,-1.37
...,...,...,...,...,...,...,...,...,...,...,...
Renew Offer Type,9134.0,0.97,1.01,0.00,0.00,1.00,2.00,3.00,int32,0.72,-0.63
Sales Channel,9134.0,1.10,1.07,0.00,0.00,1.00,2.00,3.00,int32,0.51,-1.04
Total Claim Amount,9134.0,425.73,258.92,0.10,272.26,383.95,547.51,1159.20,float64,0.84,0.69
Vehicle Class,9134.0,1.89,2.07,0.00,0.00,0.00,4.00,5.00,int32,0.40,-1.53


['State', 'Customer Lifetime Value', 'Response', 'Coverage', 'Education', 'EmploymentStatus', 'Gender', 'Income', 'Location Code', 'Marital Status', 'Months Since Last Claim', 'Months Since Policy Inception', 'Number of Open Complaints', 'Number of Policies', 'Policy Type', 'Policy', 'Renew Offer Type', 'Sales Channel', 'Total Claim Amount', 'Vehicle Class', 'Vehicle Size']


#### Корреляция

In [18]:
crit=0.70
corr_df=df.sample(n=1000,random_state=GLOBAL_RANDOM_STATE)
result=get_top_abs_correlations(df=corr_df, crit=crit, method='pearson')
display(HTML(f'<b>Strong correlation (>{crit}): {result.shape[0]} pairs</b><hr>'))
print(result)

Policy Type       Policy    0.88
EmploymentStatus  Income    0.72
dtype: float64


In [19]:
corr_df= df.corr(method='pearson').abs()
df_lt = corr_df.where(np.tril(np.ones(corr_df.shape)).astype(np.bool_))
df_lt.style.background_gradient().highlight_null(color="white").format("{:.2f}")

Unnamed: 0,State,Customer Lifetime Value,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
State,1.0,,,,,,,,,,,,,,,,,,,,,
Customer Lifetime Value,0.01,1.0,,,,,,,,,,,,,,,,,,,,
Response,0.0,0.0,1.0,,,,,,,,,,,,,,,,,,,
Coverage,0.0,0.18,0.0,1.0,,,,,,,,,,,,,,,,,,
Education,0.02,0.03,0.0,0.02,1.0,,,,,,,,,,,,,,,,,
EmploymentStatus,0.0,0.04,0.02,0.0,0.03,1.0,,,,,,,,,,,,,,,,
Gender,0.0,0.01,0.0,0.01,0.0,0.03,1.0,,,,,,,,,,,,,,,
Income,0.0,0.03,0.01,0.01,0.02,0.73,0.02,1.0,,,,,,,,,,,,,,
Location Code,0.01,0.0,0.0,0.02,0.01,0.02,0.0,0.03,1.0,,,,,,,,,,,,,
Marital Status,0.02,0.03,0.09,0.0,0.01,0.31,0.03,0.23,0.02,1.0,,,,,,,,,,,,


<b>Вывод по корреляции:</b><hr>
- Policy Type - удалить, Policy - выше 5 свести в одну группу 6
- EmploymentStatus  Income - не трогать, все оставить как есть

In [20]:
df.drop(columns=['Policy Type'], inplace=True)

fname='Policy'
df[fname] = df[fname].where(df[fname]<6, 6)

StandardScaler

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
lst_num=['Customer Lifetime Value', 'Income', 'Monthly Premium Auto',
         'Months Since Last Claim', 'Months Since Policy Inception',
         'Number of Open Complaints', 'Number of Policies', 'Total Claim Amount']

ss=StandardScaler()

print("До нормализации:")
display(df[lst_num].sample(n=GLOBAL_SAMPLE_SIZE,random_state=GLOBAL_RANDOM_STATE))

for f in lst_num:
    df[f] = ss.fit_transform(df[[f]])

print("После нормализации:")
display(df[lst_num].sample(n=GLOBAL_SAMPLE_SIZE,random_state=GLOBAL_RANDOM_STATE))

До нормализации:


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
5422,8139.53,86876,1,9,3,0,2,118.81
6101,2305.24,0,1,14,81,0,1,297.6
4942,7122.27,64571,2,34,91,0,4,452.12
4294,8679.84,52717,3,5,60,0,4,60.15


После нормализации:


Unnamed: 0,Customer Lifetime Value,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount
5422,0.08,1.62,-0.96,-0.61,-1.61,-0.51,-0.25,-1.19
6101,-0.97,-1.24,-0.96,-0.11,1.18,-0.51,-1.08,-0.49
4942,-0.1,0.89,-0.05,1.88,1.54,-0.51,1.41,0.1
4294,0.18,0.5,0.87,-1.0,0.43,-0.51,1.41,-1.41


In [23]:
short_describe(df, short=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,dtypes,skew,kurt
State,9134.0,1.74e+00,1.29,0.00,1.00,1.00,3.00,4.00,int32,0.21,-1.25
Customer Lifetime Value,9134.0,1.80e-16,1.00,-1.04,-0.67,-0.34,0.23,3.14,float64,1.70,2.45
Response,9134.0,1.43e-01,0.35,0.00,0.00,0.00,0.00,1.00,int32,2.04,2.15
Coverage,9134.0,4.81e-01,0.66,0.00,0.00,0.00,1.00,2.00,int32,1.03,-0.11
Education,9134.0,1.55e+00,1.38,0.00,0.00,1.00,3.00,4.00,int32,0.33,-1.37
...,...,...,...,...,...,...,...,...,...,...,...
Renew Offer Type,9134.0,9.70e-01,1.01,0.00,0.00,1.00,2.00,3.00,int32,0.72,-0.63
Sales Channel,9134.0,1.10e+00,1.07,0.00,0.00,1.00,2.00,3.00,int32,0.51,-1.04
Total Claim Amount,9134.0,4.82e-17,1.00,-1.64,-0.59,-0.16,0.47,2.83,float64,0.84,0.69
Vehicle Class,9134.0,1.89e+00,2.07,0.00,0.00,0.00,4.00,5.00,int32,0.40,-1.53


### Часть 3. Who's the mightiest of them all?

In [24]:
# !pip install xgboost catboost lightgbm
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import train_test_split


In [25]:
targetField='Customer Lifetime Value'
X = df.drop(targetField, axis=1)
y = df[targetField]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=GLOBAL_RANDOM_STATE)

In [26]:
import joblib
from datetime import datetime

In [31]:
models = [
    BaggingRegressor(),
    GradientBoostingRegressor(),
    XGBRegressor(),
    CatBoostRegressor(),
    LGBMRegressor(),
]

results=pd.DataFrame(columns=['model', 'score', 'timestamp'])

for model in models:    
    model.fit(X_train, y_train)
    filename = f"models/{model.__class__.__name__}_simple.joblib"
    joblib.dump(model, filename)
    d={
        'model'     : [type(model).__name__ + '_simple'],
        'score'     : model.score(X_test, y_test),
        'timestamp' : [datetime.now()],
    }

    results=pd.concat([results, pd.DataFrame(data=d)],ignore_index=True)

Learning rate set to 0.05606
0:	learn: 0.9544053	total: 2.75ms	remaining: 2.75s
1:	learn: 0.9179497	total: 5.37ms	remaining: 2.68s
2:	learn: 0.8834239	total: 7.78ms	remaining: 2.59s
3:	learn: 0.8527812	total: 10.1ms	remaining: 2.51s
4:	learn: 0.8241126	total: 12.5ms	remaining: 2.48s
5:	learn: 0.7962467	total: 15.1ms	remaining: 2.5s
6:	learn: 0.7711075	total: 17.5ms	remaining: 2.48s
7:	learn: 0.7492893	total: 19.4ms	remaining: 2.41s
8:	learn: 0.7277246	total: 21.9ms	remaining: 2.41s
9:	learn: 0.7084904	total: 24.1ms	remaining: 2.39s
10:	learn: 0.6891327	total: 26.5ms	remaining: 2.38s
11:	learn: 0.6721300	total: 28.5ms	remaining: 2.34s
12:	learn: 0.6573123	total: 31ms	remaining: 2.35s
13:	learn: 0.6426511	total: 33.2ms	remaining: 2.34s
14:	learn: 0.6296443	total: 35.7ms	remaining: 2.34s
15:	learn: 0.6164396	total: 38.1ms	remaining: 2.34s
16:	learn: 0.6053410	total: 40.4ms	remaining: 2.34s
17:	learn: 0.5955456	total: 42.8ms	remaining: 2.33s
18:	learn: 0.5866430	total: 45.1ms	remaining: 2.

In [32]:
results.sort_values(by=['score'],ascending=False,inplace=True)

display(HTML('<hr>Результаты без настройки гиперпараметров'))
display(results[['model', 'score']].style.background_gradient().format({"score" : "{:,.3f}",}))

Unnamed: 0,model,score
3,CatBoostRegressor_simple,0.787
4,LGBMRegressor_simple,0.783
2,XGBRegressor_simple,0.769
1,GradientBoostingRegressor_simple,0.767
0,BaggingRegressor_simple,0.756


#### Подбор гиперпараметров

<table style="text-align: left;">
  <thead>
    <tr>
      <th></th>
      <th>BaggingRegressor</th>
      <th>GradientBoostingRegressor</th>
      <th>XGBRegressor</th>
      <th>CatBoostRegressor</th>
      <th>LGBMRegressor</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Количество базовых оценщиков</th>
      <td>n_estimators</td>
      <td>n_estimators</td>
      <td>n_estimators</td>
      <td>iterations</td>
      <td>num_boost_round</td>
    </tr>
    <tr>
      <th>Скорость обучения</th>
      <td>-</td>
      <td>learning_rate</td>
      <td>learning_rate</td>
      <td>learning_rate</td>
      <td>learning_rate</td>
    </tr>
    <tr>
      <th>Min вес узла</th>
      <td>-</td>
      <td>-</td>
      <td>min_child_weight</td>
      <td>-</td>
      <td>-</td>
    </tr>
    <tr>
      <th>Параметр регуляризации</th>
      <td>-</td>
      <td>-</td>
      <td>gamma</td>
      <td>l2_leaf_reg</td>
      <td>-</td>
    </tr>
    <tr>
      <th>Число образцов</th>
      <td>max_samples</td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
    </tr>
    <tr>
      <th>Число признаков</th>
      <td>max_features</td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
    </tr>
    <tr>
      <th>Max глубина деревьев</th>
      <td>-</td>
      <td>max_depth</td>
      <td>max_depth</td>
      <td>depth</td>
      <td>max_depth</td>
    </tr>
    <tr>
      <th>Max количество листьев</th>
      <td>-</td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
      <td>num_leaves</td>
    </tr>
    <tr>
      <th>Min количество данных в листе</th>
      <td>-</td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
      <td>min_data_in_leaf</td>
    </tr>
  </tbody>
</table>

<br>

<table style="border: 0px">
  <thead>
    <tr style="border-bottom: 1px solid black">
      <th>№</th>
      <th>Гиперпараметр</th>
      <th>Описание</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>1</td>
      <td>n_estimators</td>
      <td>Количество базовых оценщиков в ансамбле.</td>
    </tr>
    <tr>
      <td>2</td>
      <td>max_samples</td>
      <td>Число образцов, используемых для обучения каждого базового оценщика.</td>
    </tr>
    <tr>
      <td>3</td>
      <td>max_features</td>
      <td>Число признаков, используемых для обучения каждого базового оценщика.</td>
    </tr>
    <tr>
      <td>4</td>
      <td>learning_rate</td>
      <td>Скорость обучения.</td>
    </tr>
    <tr>
      <td>5</td>
      <td>max_depth</td>
      <td>Максимальная глубина деревьев.</td>
    </tr>
    <tr>
      <td>6</td>
      <td>min_child_weight</td>
      <td>Минимальный вес узла в дереве.</td>
    </tr>
    <tr>
      <td>7</td>
      <td>gamma</td>
      <td>Параметр регуляризации.</td>
    </tr>
    <tr>
      <td>8</td>
      <td>l2_leaf_reg</td>
      <td>Параметр регуляризации.</td>
    </tr>
    <tr>
      <td>9</td>
      <td>num_leaves</td>
      <td>Максимальное количество листьев в дереве.</td>
    </tr>
    <tr>
      <td>10</td>
      <td>min_data_in_leaf</td>
      <td>Минимальное количество данных в листе.</td>
    </tr>
  </tbody>
</table>