In [1]:
# pip install category_encoders

In [2]:
# pip install numba

In [3]:
# conda install cudatoolkit

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from sklearn.model_selection import train_test_split, GridSearchCV, \
                                    ShuffleSplit, KFold

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.preprocessing import MinMaxScaler, StandardScaler, \
                                  RobustScaler, PolynomialFeatures, \
                                  OrdinalEncoder, LabelEncoder, \
                                  OneHotEncoder, TargetEncoder, \
                                  QuantileTransformer, PowerTransformer, \
                                  KBinsDiscretizer, FunctionTransformer

from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression, RANSACRegressor, \
                                 Ridge, Lasso, LinearRegression

from sklearn.metrics import recall_score, precision_score, \
                            f1_score, ConfusionMatrixDisplay, \
                            confusion_matrix, roc_auc_score, \
                            RocCurveDisplay, PrecisionRecallDisplay, \
                            roc_curve, precision_recall_curve, \
                            PrecisionRecallDisplay, make_scorer, \
                            mean_squared_error

from sklearn.compose import ColumnTransformer, make_column_selector,\
                            make_column_transformer, TransformedTargetRegressor
# import category_encoders as ce
from sklearn.metrics import fbeta_score
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_validate
from sklearn.ensemble import VotingClassifier
import pickle
import numba
import category_encoders as ce

In [8]:
import lightgbm as lgbm
print(lgbm.__version__)

4.5.0


In [9]:
import catboost
print(catboost.__version__)

1.2.7


In [10]:
import math

In [11]:
data_raw = pd.read_csv('train.csv')

In [12]:
display(data_raw.shape)
display(data_raw.info)
display(data_raw.head(5))

(1200000, 21)

<bound method DataFrame.info of               id   Age  Gender  Annual Income Marital Status  \
0              0  19.0  Female        10049.0        Married   
1              1  39.0  Female        31678.0       Divorced   
2              2  23.0    Male        25602.0       Divorced   
3              3  21.0    Male       141855.0        Married   
4              4  21.0    Male        39651.0         Single   
...          ...   ...     ...            ...            ...   
1199995  1199995  36.0  Female        27316.0        Married   
1199996  1199996  54.0    Male        35786.0       Divorced   
1199997  1199997  19.0    Male        51884.0       Divorced   
1199998  1199998  55.0    Male            NaN         Single   
1199999  1199999  21.0  Female            NaN       Divorced   

         Number of Dependents Education Level     Occupation  Health Score  \
0                         1.0      Bachelor's  Self-Employed     22.598761   
1                         3.0        Master

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [13]:
display(data_raw.dtypes)

id                        int64
Age                     float64
Gender                   object
Annual Income           float64
Marital Status           object
Number of Dependents    float64
Education Level          object
Occupation               object
Health Score            float64
Location                 object
Policy Type              object
Previous Claims         float64
Vehicle Age             float64
Credit Score            float64
Insurance Duration      float64
Policy Start Date        object
Customer Feedback        object
Smoking Status           object
Exercise Frequency       object
Property Type            object
Premium Amount          float64
dtype: object

In [14]:
data_raw['Marital Status'].value_counts()

Marital Status
Single      395391
Married     394316
Divorced    391764
Name: count, dtype: int64

In [15]:
display(data_raw.isna().sum())

id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64

In [16]:
data = data_raw.drop_duplicates(keep='first', subset=['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Customer Feedback', 'Smoking Status', 'Exercise Frequency',
       'Property Type'])
display(data.shape)

(1200000, 21)

In [17]:
y = np.log1p(data['Premium Amount'])
X = data.drop(columns='Premium Amount')

In [18]:
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))
def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

def transform_time(tmp='hour'):
    """Универсальеый конвектор datetime  в числовой формат"""
    """код пишет сам себя :)"""
    return eval('FunctionTransformer(lambda X: pd.DataFrame(X).apply(lambda x: \
                pd.DatetimeIndex(x).'+tmp+'))')

In [19]:
class RMSLE(object):
    def calc_ders_range(self, approxes, targets, weights):
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            val = max(approxes[index], 0)
            der1 = math.log1p(targets[index]) - math.log1p(max(0, approxes[index]))
            der2 = -1 / (max(0, approxes[index]) + 1)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result
class RMSLE_val(object):
    def get_final_error(self, error, weight):
        return np.sqrt(error / (weight + 1e-38))

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            error_sum += w * ((math.log1p(max(0, approx[i])) - math.log1p(max(0, target[i])))**2)

        return error_sum, weight_sum

In [20]:
SEED = 13

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=SEED)

In [22]:
data_train, data_valid, target_train, target_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=SEED+1)

In [23]:
ct_time = ColumnTransformer([
     ("hour",  transform_time('hour'),['Policy Start Date']),
     ("week",  transform_time('weekday'),['Policy Start Date']),
     ("month", transform_time('month'),['Policy Start Date']),
     ("year", transform_time('year'),['Policy Start Date'])
    ],remainder='drop').set_output(transform='pandas')


CT_sin_cos = ColumnTransformer([
    # ('month_sin',sin_transformer(12),['month__Policy Start Date']),
    # ('month_cos',cos_transformer(12),['month__Policy Start Date']),
    # ('week_sin',sin_transformer(7),['week__Policy Start Date']),
    # ('week_cos',cos_transformer(7),['week__Policy Start Date']),
    # ('hour_sin',sin_transformer(24),['hour__Policy Start Date']),
    # ('hour_cos',cos_transformer(24),['hour__Policy Start Date']),
    ('year_encoder',OneHotEncoder(sparse_output=False),['year__Policy Start Date'])
]).set_output(transform='pandas')


time_pipe=Pipeline(steps=[
    ('time',ct_time),
    ('sin_cos',CT_sin_cos)
])

In [24]:
# тереются некоторые колонки, надо доделать
category_features = ['Gender', 'Smoking Status']
numerical_features = ['id', 'Age', 'Annual Income', 'Number of Dependents', 'Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
spec_m = ['Education Level', 'Occupation', 'Location', 'Policy Type', 'Customer Feedback', 'Exercise Frequency', 'Property Type', 'Marital Status']

In [25]:
educ_mapping = { 
    'High School': 3,
    "Bachelor's": 1,
    'PhD': 0,
    "Master's": 2
}

occup_mapping = {#
    'Unemployed': 0,
    'Employed': 2,
    'Self-Employed': 4
}

loc_mapping = {
    'Rural': 0,
    'Suburban': 1,
    'Urban': 2
}

pt_mapping = {
    'Basic': 2,
    'Comprehensive': 1,
    'Premium': 0
}

cf_mapping = {
    'Poor': 0,
    'Average': 1,
    'Good': 2
}

ef_mapping = {
    'Monthly': 12,
    'Weekly': 54,
    'Rarely': 5,
    'Daily': 365
}

pt_mapping = {
    'condo': 0,
    'apartment': 0,
    'House': 1
}

ms_mapping = {
    'Single': 0,
    'Divorced': 1,
    'Married': 2
}

In [26]:
data_test = pd.read_csv('test.csv')
data_test.head(5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [27]:
nums = Pipeline(steps=[
   # ('missing_nums',IterativeImputer(max_iter=20)), #проверить, дропнул ли na в eda, если да, то бесполезный код
    #('bins', KBinsDiscretizer(n_bins=10, strategy='uniform', encode='ordinal')), #делаем бинаризацию
    #('polynom', PolynomialFeatures(3, include_bias=False)), #делаем полиномы, если надо
    ('scaler', StandardScaler())])

annual_income_pipe= Pipeline(steps=[
    ('missing_num',IterativeImputer(max_iter=20)),
    ('transformer',PowerTransformer())
])
credit_score_pipe = Pipeline(steps=[
    ('missing_num',IterativeImputer(max_iter=20)),
    ('FU', FeatureUnion([
        ('origin', 'passthrough'),
        ('kbin', KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense'))])),
    ('poly',PolynomialFeatures((1,2),interaction_only=True,include_bias=False))
     ])

clarity_map = [{
    'col':'Education Level',
    'mapping': educ_mapping
    },
    {'col': 'Occupation',
    'mapping':occup_mapping
    },
    {'col': 'Location',
    'mapping':loc_mapping
    },
    {'col': 'Policy Type',
    'mapping':pt_mapping
    },
    {'col': 'Customer Feedback',
    'mapping':cf_mapping
    },
    {'col': 'Exercise Frequency',
    'mapping':ef_mapping
    },
    {'col': 'Property Type',
    'mapping':pt_mapping
    },
    {'col': 'Marital Status',
    'mapping':ms_mapping
    }]

special_transformer = Pipeline(steps=[
    ('missing_nums', SimpleImputer(strategy = 'most_frequent')),
    ('ce',ce.OrdinalEncoder(mapping=clarity_map)),
    ('scaler', MinMaxScaler())
    ])    
    
CT = ColumnTransformer([
        ("with_nums", nums, numerical_features),
        ("with_category", OneHotEncoder(sparse_output=False, handle_unknown='error'), category_features), #если не одна категория, то лучше прописать параметр drop
        ("special_map", special_transformer, spec_m),
        ('time',time_pipe,['Policy Start Date']),
        ("credit_score", credit_score_pipe, ['Credit Score']), 
        ('annual_income',annual_income_pipe,['Annual Income'])
        ]).set_output(transform='pandas')


display(CT)

ct_train = CT.fit_transform(data_train)
ct_valid = CT.transform(data_valid)
pd.DataFrame(ct_train).head(10).T




Unnamed: 0,613903,767459,745421,869637,955803,948781,365848,1197096,370159,473187
with_nums__id,0.040656,0.483923,0.420306,0.778878,1.027611,1.007341,-0.675400,1.724147,-0.662955,-0.365546
with_nums__Age,-0.750040,1.022798,0.579589,-0.602303,1.687612,-0.306830,0.210247,0.875062,1.022798,0.875062
with_nums__Annual Income,-0.119117,-0.805076,0.088216,0.737111,0.573309,0.893357,3.614431,-0.481856,0.167412,-0.458785
with_nums__Number of Dependents,-0.712189,1.404437,1.404437,-1.417731,-1.417731,1.404437,1.404437,1.404437,1.404437,-1.417731
with_nums__Health Score,,-0.283391,-0.618613,2.249404,0.543443,1.011538,,-0.998167,0.715495,0.352722
...,...,...,...,...,...,...,...,...,...,...
credit_score__kbin__Credit Score_6.0 kbin__Credit Score_9.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
credit_score__kbin__Credit Score_7.0 kbin__Credit Score_8.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
credit_score__kbin__Credit Score_7.0 kbin__Credit Score_9.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
credit_score__kbin__Credit Score_8.0 kbin__Credit Score_9.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [28]:
ct_test = CT.fit_transform(data_test)



In [29]:
ct_train.head(5)

Unnamed: 0,with_nums__id,with_nums__Age,with_nums__Annual Income,with_nums__Number of Dependents,with_nums__Health Score,with_nums__Previous Claims,with_nums__Vehicle Age,with_nums__Credit Score,with_nums__Insurance Duration,with_category__Gender_Female,...,credit_score__kbin__Credit Score_5.0 kbin__Credit Score_7.0,credit_score__kbin__Credit Score_5.0 kbin__Credit Score_8.0,credit_score__kbin__Credit Score_5.0 kbin__Credit Score_9.0,credit_score__kbin__Credit Score_6.0 kbin__Credit Score_7.0,credit_score__kbin__Credit Score_6.0 kbin__Credit Score_8.0,credit_score__kbin__Credit Score_6.0 kbin__Credit Score_9.0,credit_score__kbin__Credit Score_7.0 kbin__Credit Score_8.0,credit_score__kbin__Credit Score_7.0 kbin__Credit Score_9.0,credit_score__kbin__Credit Score_8.0 kbin__Credit Score_9.0,annual_income__Annual Income
613903,0.040656,-0.75004,-0.119117,-0.712189,,-0.002185,1.286696,0.694324,1.149055,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233461
767459,0.483923,1.022798,-0.805076,1.404437,-0.283391,,0.24739,1.234686,-0.392383,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.883087
745421,0.420306,0.579589,0.088216,1.404437,-0.618613,-0.002185,1.286696,-1.787338,-0.007024,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.435806
869637,0.778878,-0.602303,0.737111,-1.417731,2.249404,-0.002185,-0.099045,0.560901,-1.548462,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.931469
955803,1.027611,1.687612,0.573309,-1.417731,0.543443,-0.002185,1.113478,-1.607217,1.534415,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.820701


In [30]:
ct_train.shape

(765000, 94)

In [31]:
display(ct_train.dtypes)

with_nums__id                                                  float64
with_nums__Age                                                 float64
with_nums__Annual Income                                       float64
with_nums__Number of Dependents                                float64
with_nums__Health Score                                        float64
                                                                ...   
credit_score__kbin__Credit Score_6.0 kbin__Credit Score_9.0    float64
credit_score__kbin__Credit Score_7.0 kbin__Credit Score_8.0    float64
credit_score__kbin__Credit Score_7.0 kbin__Credit Score_9.0    float64
credit_score__kbin__Credit Score_8.0 kbin__Credit Score_9.0    float64
annual_income__Annual Income                                   float64
Length: 94, dtype: object

In [32]:
display(ct_valid.head(4))
display(ct_valid.shape)
display(ct_valid.dtypes)

Unnamed: 0,with_nums__id,with_nums__Age,with_nums__Annual Income,with_nums__Number of Dependents,with_nums__Health Score,with_nums__Previous Claims,with_nums__Vehicle Age,with_nums__Credit Score,with_nums__Insurance Duration,with_category__Gender_Female,...,credit_score__kbin__Credit Score_5.0 kbin__Credit Score_7.0,credit_score__kbin__Credit Score_5.0 kbin__Credit Score_8.0,credit_score__kbin__Credit Score_5.0 kbin__Credit Score_9.0,credit_score__kbin__Credit Score_6.0 kbin__Credit Score_7.0,credit_score__kbin__Credit Score_6.0 kbin__Credit Score_8.0,credit_score__kbin__Credit Score_6.0 kbin__Credit Score_9.0,credit_score__kbin__Credit Score_7.0 kbin__Credit Score_8.0,credit_score__kbin__Credit Score_7.0 kbin__Credit Score_9.0,credit_score__kbin__Credit Score_8.0 kbin__Credit Score_9.0,annual_income__Annual Income
738079,0.399112,-1.710327,0.455029,-0.712189,-1.241538,-1.019663,-1.484786,-0.866721,-0.392383,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.73555
1030416,1.242995,1.022798,0.08265,-0.006647,-0.229571,-1.019663,0.24739,0.607599,1.534415,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.430744
1061123,1.331636,-0.528435,-0.708934,1.404437,0.394734,-1.019663,1.633131,-1.347043,-1.163103,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.636758
762716,0.470231,-0.159094,0.835834,,0.490154,-0.002185,0.24739,-1.373727,1.534415,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.994708


(255000, 94)

with_nums__id                                                  float64
with_nums__Age                                                 float64
with_nums__Annual Income                                       float64
with_nums__Number of Dependents                                float64
with_nums__Health Score                                        float64
                                                                ...   
credit_score__kbin__Credit Score_6.0 kbin__Credit Score_9.0    float64
credit_score__kbin__Credit Score_7.0 kbin__Credit Score_8.0    float64
credit_score__kbin__Credit Score_7.0 kbin__Credit Score_9.0    float64
credit_score__kbin__Credit Score_8.0 kbin__Credit Score_9.0    float64
annual_income__Annual Income                                   float64
Length: 94, dtype: object

In [33]:
# %%time

# params = {'loss_function':, # objective function
#           'eval_metric':'AUC', # metric
#           'verbose': 200, # output to stdout info about training process every 200 iterations
#           'random_seed': SEED
#          }
# cbc_1 = CatBoostClassifier(**params)
# cbc_1.fit(X_train, y_train, # data to train on (required parameters, unless we provide X as a pool object, will be shown below)
#           eval_set=(X_valid, y_valid), # data to validate on
#           use_best_model=True, # True if we don't want to save trees created after iteration with the best validation score
#           plot=True # True for visualization of the training process (it is not shown in a published kernel - try executing this code)
#          );

In [34]:
from catboost import CatBoostRegressor

In [35]:
catboost_pipeline = Pipeline(steps=[
    ('preprocessing', CT), #забиваем свой препроцессор, если надо, можно написать в feature engineering universal_preprocessor
    ('model', CatBoostRegressor())]) #забиваем свою модель, если надо
display(catboost_pipeline)

In [36]:
# def rmsle(y_pred, data):
#     y_true = data.get_label()

#     grad = []
#     hess = []

#     for idx in range(len(y_true)):
#         grad_ = -math.log1p(y_true[idx]) + math.log1p(max(0.0, y_pred[idx]))
#         hess_ = 1.0 / (max(0, y_pred[idx]) + 1.0)

#         grad.append(grad_)
#         hess.append(hess_)

#     return np.array(grad), np.array(hess)
    

# def rmsle_eval(y_pred, data):
#     y_true = data.get_label()

#     err = 0.0
#     for i in range(len(y_pred)):
#         err += (-math.log1p(max(0.0, y_true[i])) + math.log1p(max(0.0, y_pred[i])))**2

#     return 'rmsle', err, False

In [37]:
%%time

params = {'loss_function':RMSLE(),#'Quantile:alpha=0.41',
          'eval_metric':RMSLE_val(),
          #'task_type': 'GPU',
          'verbose': 200,
          'random_seed': SEED,
          'grow_policy': 'Lossguide',
          'l2_leaf_reg': 0.7,
          'depth': 8,
          'learning_rate': 0.015,
          'iterations': 800
         }
cbr_1 = CatBoostRegressor(**params)
cbr_1.fit(ct_train, target_train,
          eval_set=(ct_valid, target_valid), 
          use_best_model=True,
          plot=True
         );

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.9919073	test: 1.9919679	best: 1.9919679 (0)	total: 448ms	remaining: 5m 57s
200:	learn: 0.1887760	test: 0.1887365	best: 0.1887365 (200)	total: 25.7s	remaining: 1m 16s
400:	learn: 0.1588893	test: 0.1588732	best: 0.1588732 (400)	total: 51.3s	remaining: 51.1s
600:	learn: 0.1584935	test: 0.1585675	best: 0.1585675 (600)	total: 1m 18s	remaining: 26s
799:	learn: 0.1583753	test: 0.1585281	best: 0.1585281 (799)	total: 1m 45s	remaining: 0us

bestTest = 0.1585280906
bestIteration = 799

CPU times: user 4min 21s, sys: 15.9 s, total: 4min 37s
Wall time: 1min 46s


In [38]:
# for i in ['SymmetricTree', 'Lossguide', 'Depthwise', 'Region']:
#     params = {'loss_function':RMSLE(),#'Quantile:alpha=0.41',
#               'eval_metric':RMSLE_val(),
#               #'task_type': 'GPU',
#               'verbose': 200,
#               'random_seed': SEED,
#               'grow_policy': i,
#               'l2_leaf_reg': 3,
#               'depth': 8,
#               'learning_rate': 0.1,
#               'iterations': 600
#              }
#     print(f'For {i}')
#     cbr_1 = CatBoostRegressor(**params)
#     cbr_1.fit(ct_train, target_train,
#               eval_set=(ct_valid, target_valid), 
#               use_best_model=True,
#               plot=True
#              );

In [39]:
cbr_1.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'PythonUserDefinedPerObject',
 'iterations': 800,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'random_score_type': 'NormalWithModelSizeDecrease',
 'grow_policy': 'Lossguide',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 0.699999988079071,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.800000011920929,
 'use_best_model': True,
 'random_seed': 13,
 'depth': 8,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'No',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_fun

In [40]:
rmsle(target_valid, cbr_1.predict(ct_valid))

0.1585280909118505

In [41]:
1.05554
#     params = {'loss_function':RMSLE(),#'Quantile:alpha=0.41',
#               'eval_metric':RMSLE_val(),
#               #'task_type': 'GPU',
#               'verbose': 200,
#               'random_seed': SEED,
#               # 'grow_policy': 'SymmetricTree',
#               # 'l2_leaf_reg': 3,
#               'depth': 8,
#               'learning_rate': 0.1,
#               'iterations': 600
#              }
#     print(f'For {i}')
#     cbr_1 = CatBoostRegressor(**params)
#     cbr_1.fit(ct_train, target_train,
#               eval_set=(ct_valid, target_valid), 
#               use_best_model=True,
#               plot=True
#              );

1.05554

rmsle 1.05623
1.05596, 1.05583 (0.2 lr)

In [56]:
y_full = np.log1p(data['Premium Amount'])
X_full = data.drop(columns='Premium Amount')

In [57]:
ct_train_full = CT.fit_transform(X_full)
ct_test_full = CT.transform(data_test)



In [58]:
%%time

params = {'loss_function':RMSLE(),#'Quantile:alpha=0.41',
          'eval_metric':RMSLE_val(),
          #'task_type': 'GPU',
          'verbose': 200,
          'random_seed': SEED,
          'grow_policy': 'Lossguide',
          'l2_leaf_reg': 3,
          'depth': 8,
          'learning_rate': 0.015,
          'iterations': 2500
         }
cbr_1 = CatBoostRegressor(**params)
cbr_1.fit(ct_train_full, y_full,
          #eval_set=(ct_valid, target_valid), 
          use_best_model=True,
          plot=True
         );

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 1.9920281	total: 317ms	remaining: 13m 11s
200:	learn: 0.1884568	total: 39.7s	remaining: 7m 33s
400:	learn: 0.1587129	total: 1m 19s	remaining: 6m 57s
600:	learn: 0.1583805	total: 2m 2s	remaining: 6m 26s
800:	learn: 0.1582976	total: 2m 44s	remaining: 5m 48s
1000:	learn: 0.1582087	total: 3m 27s	remaining: 5m 10s
1200:	learn: 0.1581233	total: 4m 10s	remaining: 4m 30s
1400:	learn: 0.1580513	total: 4m 50s	remaining: 3m 47s
1600:	learn: 0.1579818	total: 5m 29s	remaining: 3m 4s
1800:	learn: 0.1579189	total: 6m 6s	remaining: 2m 22s
2000:	learn: 0.1578554	total: 6m 44s	remaining: 1m 40s
2200:	learn: 0.1577913	total: 7m 21s	remaining: 60s
2400:	learn: 0.1577317	total: 7m 58s	remaining: 19.7s
2499:	learn: 0.1577039	total: 8m 16s	remaining: 0us
CPU times: user 20min 5s, sys: 1min, total: 21min 5s
Wall time: 8min 17s


In [75]:
def get_preds(cbr_1, ct_test, data_test):
    test_predict = pd.DataFrame(np.expm1(cbr_1.predict(ct_test)), columns=['Premium Amount'], index=data_test.index + 1200000)
    test_predict.index.names = ['id']
    display(test_predict.head(5))
    display(test_predict.index.names)
    display(test_predict.shape)
    test_predict.to_csv('test_preds_4.csv')

In [76]:
get_preds(cbr_1, ct_test, data_test)

Unnamed: 0_level_0,Premium Amount
id,Unnamed: 1_level_1
1200000,542.924861
1200001,735.637437
1200002,736.137841
1200003,730.530013
1200004,706.739348


FrozenList(['id'])

(800000, 1)

In [74]:
test_pred_1 = pd.read_csv('test_preds_2.csv')
test_pred_1.head(5)

Unnamed: 0,id,Premium Amount
0,1200000,720.616309
1,1200001,783.274251
2,1200002,792.894976
3,1200003,807.933864
4,1200004,759.300066


In [47]:
#  model = CatBoostRegressor(iterations=3000,
#                           early_stopping_rounds=100,
#                           grow_policy = 'Depthwise',
#                           depth=8,
#                           loss_function=RMSLE(),
#                           cat_features= CAT_COLS,
#                           random_state=RS,
#                           l2_leaf_reg = 1,
#                           learning_rate=0.03,
#                           verbose=10,
#                           eval_metric=RMSLE_val())
# params = {'l2_leaf_reg':[1,4,8],
#           'learning_rate': [0.03,0.5,0.1]
#           'depth':[6,8,10]
#          }
# grid_search_res = model.grid_search(params, full_features['items'][FTS_COLS], full_features['items'].target, train_size=0.8)

### LGBM

In [48]:
import lightgbm as lgb
print(lgb.__version__)

4.5.0


In [49]:
def rmsle_lgbm(y_pred, data):
    y_true = data.get_label()

    grad = []
    hess = []

    for idx in range(len(y_true)):
        grad_ = -math.log1p(y_true[idx]) + math.log1p(max(0.0, y_pred[idx]))
        hess_ = 1.0 / (max(0, y_pred[idx]) + 1.0)

        grad.append(grad_)
        hess.append(hess_)

    return np.array(grad), np.array(hess)
    

def rmsle_lgbm_eval(y_pred, data):
    y_true = data.get_label()

    err = 0.0
    for i in range(len(y_pred)):
        err += (-math.log1p(max(0.0, y_true[i])) + math.log1p(max(0.0, y_pred[i])))**2

    return 'rmsle', err, False

In [50]:
lgb_train = lgb.Dataset(ct_train, target_train)
lgb_val = lgb.Dataset(ct_valid, target_valid)

In [51]:

lgbm_params = {
    'random_seed': 13,
    # 'max_depth': 7,
    # 'boosting_type': 'gbdt',
    # 'num_leaves': 31,
    # 'subsample': 0.208,
    # 'colsample_bytree': 0.578,
    # 'learning_rate': 0.2,
    # 'min_child_samples': 20,
    # 'reg_alpha': 0.262,
    # 'reg_lambda': 0.287e-6,
    'n_jobs': -1
}


In [52]:
# %%time

# params = {
#     'verbose':200, 
#     'device':'gpu',
#     'objective':rmsle_lgbm()
# }

# lgbm_1 = LGBMRegressor(**lgbm_params)
# lgbm_1.fit(ct_train, target_train.values.ravel(),
#           eval_set=(ct_valid, target_valid), eval_metric=rmsle_lgbm_eval()
#          );

In [53]:
lightgbm_reg = lgb.train(lgbm_params, 
                         lgb_train,
                         num_boost_round=1000,
                         # fobj=rmsle_lgbm,
                         feval=rmsle_lgbm_eval,
                         valid_sets=[lgb_val],
                         callbacks=[lgb.callback.log_evaluation(period=100),
                                    lgb.early_stopping(200)]
                        )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2247
[LightGBM] [Info] Number of data points in the train set: 765000, number of used features: 48
[LightGBM] [Info] Start training from score 6.593028
Training until validation scores don't improve for 200 rounds
[100]	valid_0's l2: 1.10534	valid_0's rmsle: 6442.7
[200]	valid_0's l2: 1.1054	valid_0's rmsle: 6442.38
[300]	valid_0's l2: 1.1059	valid_0's rmsle: 6444.25
Early stopping, best iteration is:
[143]	valid_0's l2: 1.10521	valid_0's rmsle: 6441.74


In [54]:
def rmsle_1(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

In [55]:
rmsle_1(target_valid, lightgbm_reg.predict(ct_valid))

0.15893938034770377