In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from user_agents import parse

In [2]:
data = pd.read_csv('data/final_data.csv', sep=",", header = 0)

In [3]:
data_columns = [
    "Bid ID", "Timestamp", "Log type", "iPinYou ID", "User-Agent", "IP",
    "Region", "City", "Ad exchange", "Domain", "URL", "Anonymous URL ID",
    "Ad slot ID", "Ad slot width", "Ad slot height", "Ad slot visibility",
    "Ad slot format", "Ad slot floor price", "Creative ID", "Bidding price",
    "Paying price", "Key page URL", "Advertiser ID", "User Tags", 'All paying price',
]
columns_to_drop = [
    "Bid ID", "iPinYou ID", "User-Agent", "IP", "URL", "Log type", "Timestamp",
    "Anonymous URL ID", "Creative ID", "Key page URL", "Ad slot ID", "Advertiser ID", 'All paying price', "User Tags",
]
categorical_features = [
    'City', 'Region', 'Ad exchange', 'Ad slot visibility', 'Ad slot format', "Hour", "Weekday", "Domain", "os", "device", "device_type", "browser"]


In [4]:
#data = data.drop_duplicates()
data = data[data['Advertiser ID'] == 3476]
data = data[data["Paying price"] > 0]

In [5]:
def parse_timestamp(ts):
    ts_str = str(ts)
    year = int(ts_str[:4])
    month = int(ts_str[4:6])
    day = int(ts_str[6:8])
    hour = int(ts_str[8:10])
    minute = int(ts_str[10:12])
    return pd.Timestamp(year=year, month=month, day=day, hour=hour, minute=minute)


In [6]:
data['Timestamp'] = data['Timestamp'].apply(parse_timestamp)
data['Hour'] = data['Timestamp'].dt.hour
data['Weekday'] = data['Timestamp'].dt.weekday

In [8]:
"""data["s"] = data['Ad slot width'] * data['Ad slot height']
data['weekend_flag'] = data['Weekday'].isin([5, 6]).astype(int)
data['aspect_ratio'] = data['Ad slot width'] / (data['Ad slot height'] + 1e-6)
data['domain_hour_interaction'] = data['Domain'].astype(str) + '_' + data['Hour'].astype(str)
data['floor_bid_ratio'] = data['Bidding price'] / (data['Ad slot floor price'] + 1e-6)

categorical_features+=[ 'weekend_flag', 'floor_bid_ratio', 'domain_hour_interaction']"""

In [7]:
tag_names = {
    '10006': 'Long-term interest/news',
    '10024': 'Long-term interest/education',
    '10031': 'Long-term interest/automobile',
    '10048': 'Long-term interest/real estate',
    '10052': 'Long-term interest/IT',
    '10057': 'Long-term interest/electronic game',
    '10059': 'Long-term interest/fashion',
    '10063': 'Long-term interest/entertainment',
    '10067': 'Long-term interest/luxury',
    '10074': 'Long-term interest/home and lifestyle',
    '10075': 'Long-term interest/health',
    '10076': 'Long-term interest/food',
    '10077': 'Long-term interest/divine',
    '10079': 'Long-term interest/motherhood&parenting',
    '10083': 'Long-term interest/sports',
    '10093': 'Long-term interest/travel&outdoors',
    '10102': 'Long-term interest/social',
    '10684': 'In-market/3c product',
    '11092': 'In-market/appliances',
    '11278': 'In-market/clothing, shoes&bags',
    '11379': 'In-market/Beauty & Personal Care',
    '11423': 'In-market/household & home improvement',
    '11512': 'In-market/infant & mom products',
    '11576': 'In-market/sports item',
    '11632': 'In-market/outdoor',
    '11680': 'In-market/health care products',
    '11724': 'In-market/luxury',
    '11944': 'In-market/real estate',
    '13042': 'In-market/automobile',
    '13403': 'In-market/finance',
    '13496': 'In-market/travel',
    '13678': 'In-market/education',
    '13776': 'In-market/service',
    '13800': 'Long-term interest/art & photography & design',
    '13866': 'Long-term interest/online literature',
    '13874': 'In-market/electronic game',
    '14273': 'Long-term interest/3c',
    '16593': 'In-market/book',
    '16617': 'In-market/medicine',
    '16661': 'In-market/food & drink',
    '16706': 'Long-term interest/culture',
    '10110': 'Demographic/gender/male',
    '10111': 'Demographic/gender/female'
}
data['User Tags'] = data['User Tags'].fillna('').astype(str)

for tag, name in tag_names.items():
    data[name] = data['User Tags'].apply(lambda x: 1 if tag in x.split(',') else 0)

In [83]:
print(data.nunique())

Bid ID                        1727541
Timestamp                        1058
Log type                            1
iPinYou ID                    1518882
IP                             445851
                               ...   
In-market/medicine                  2
In-market/food & drink              2
Long-term interest/culture          2
Demographic/gender/male             2
Demographic/gender/female           2
Length: 73, dtype: int64


In [8]:
data = data.drop(columns=columns_to_drop, errors='ignore')

In [85]:
print(data.nunique())

Region                                              35
City                                               370
Ad exchange                                          3
Domain                                           11326
Ad slot width                                       11
Ad slot height                                       6
Ad slot visibility                                   4
Ad slot format                                       3
Ad slot floor price                                139
Bidding price                                        3
Paying price                                       266
device                                            2797
os                                                  24
browser                                             68
device_type                                          5
Hour                                                24
Weekday                                              7
Long-term interest/news                              2
Long-term 

In [9]:
for col in categorical_features:
    data[col] = data[col].fillna("missing").astype(str)
X = data.drop(columns=['Paying price'], errors='ignore')
y = data['Paying price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 )
print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 1576172
Test size: 394044


In [87]:
print(data.duplicated().sum())

208149


In [88]:
print(data.isnull().sum())

Region                                           0
City                                             0
Ad exchange                                      0
Domain                                           0
Ad slot width                                    0
Ad slot height                                   0
Ad slot visibility                               0
Ad slot format                                   0
Ad slot floor price                              0
Bidding price                                    0
Paying price                                     0
device                                           0
os                                               0
browser                                          0
device_type                                      0
Hour                                             0
Weekday                                          0
Long-term interest/news                          0
Long-term interest/education                     0
Long-term interest/automobile  

In [11]:
missing_cols = set(categorical_features) - set(X_train.columns)
if missing_cols:
    raise ValueError(f"Следующие категориальные признаки отсутствуют в данных: {missing_cols}")

In [10]:
train_pool = Pool(X_train, y_train, cat_features=[X_train.columns.get_loc(col) for col in categorical_features])
test_pool = Pool(X_test, y_test, cat_features = [X_train.columns.get_loc(col) for col in categorical_features])

In [91]:
print(data['Paying price'].describe())

count    1.742103e+06
mean     9.238437e+01
std      6.392125e+01
min      1.000000e+00
25%      4.900000e+01
50%      7.700000e+01
75%      1.240000e+02
max      2.670000e+02
Name: Paying price, dtype: float64


In [None]:
model = CatBoostRegressor(
    iterations=1000, 
    learning_rate=0.9, 
    depth=8, 
    l2_leaf_reg=2, 
    bagging_temperature=1, 
    random_strength=1, 
    one_hot_max_size=10,   
    loss_function='RMSE',  
    od_type='Iter',   
    od_wait=50, 
    use_best_model=True, 
    random_seed=42,  
    verbose=200,
    task_type='GPU',
)
model.fit(train_pool, eval_set=test_pool)
print(model.is_fitted())
print(model.get_params())

0:	learn: 48.8518276	test: 48.7408625	best: 48.7408625 (0)	total: 430ms	remaining: 7m 9s
200:	learn: 40.2389422	test: 40.6268735	best: 40.6268735 (200)	total: 1m 38s	remaining: 6m 31s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 40.55626337
bestIteration = 308

Shrink model to first 309 iterations.
True
{'iterations': 1000, 'learning_rate': 0.9, 'depth': 8, 'l2_leaf_reg': 2, 'loss_function': 'RMSE', 'od_wait': 50, 'od_type': 'Iter', 'random_seed': 42, 'use_best_model': True, 'verbose': 200, 'one_hot_max_size': 10, 'random_strength': 1, 'bagging_temperature': 1}


In [13]:
predictions = model.predict(X_test)
feature_importances = model.get_feature_importance(prettified = True)
print("Важность факторов:")
print(feature_importances)

Важность факторов:
                                       Feature Id  Importances
0                                          Domain    37.446906
1                                            Hour    12.245412
2                             Ad slot floor price    10.001555
3                                         Weekday     9.345423
4                              Ad slot visibility     8.362022
5                                   Ad slot width     7.104684
6                                  Ad slot height     2.700202
7                                     Ad exchange     1.569149
8                                          Region     1.500383
9                                  Ad slot format     1.470233
10                                  Bidding price     1.136090
11                                           City     1.039313
12                                        browser     0.812695
13                                             os     0.759653
14                        Long-term 

In [94]:
high_error_mask = (np.abs(y_test - predictions) > 100)
high_error_data = X_test[high_error_mask].copy()
high_error_data['True'] = y_test[high_error_mask]
high_error_data['Predicted'] = predictions[high_error_mask]
print(high_error_data[['Ad slot width', 'Ad slot visibility', 'Ad slot format', 'True', 'Predicted']])

          Ad slot width Ad slot visibility Ad slot format  True   Predicted
5581422             950                  2              1   266  124.374945
1557844             950                  1              1   253  116.786941
9505726             300                  2              1   194   82.969219
10201031            250                  1              0   216   90.185949
8010677             120                  0              0   213   58.060432
...                 ...                ...            ...   ...         ...
3702197             950                255              1   223   32.774175
5515874             300                  2              1   190   85.892050
1525020             300                  2              1   253  128.309661
1534569             950                255              1   265   37.420698
1504497             950                  2              1   247  128.497709

[13160 rows x 5 columns]


In [95]:
print(data['Paying price'].describe())

count    1.742103e+06
mean     9.238437e+01
std      6.392125e+01
min      1.000000e+00
25%      4.900000e+01
50%      7.700000e+01
75%      1.240000e+02
max      2.670000e+02
Name: Paying price, dtype: float64


In [14]:
data['Paying price log'] = np.log1p(data['Paying price'])

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, data['Paying price log'], test_size=0.2, random_state=42 )
train_pool = Pool(X_train, y_train, cat_features=[X_train.columns.get_loc(col) for col in categorical_features])
test_pool = Pool(X_test, y_test, cat_features = [X_train.columns.get_loc(col) for col in categorical_features])

In [None]:
model = CatBoostRegressor(
    iterations=1000,        
    learning_rate=0.6,   
    depth=8,             
    l2_leaf_reg=2,          
    bagging_temperature=1,  
    random_strength=1,     
    one_hot_max_size=10,    
    loss_function='RMSE',     
    od_type='Iter',  
    od_wait=50,         
    use_best_model=True,     
    random_seed=42, 
    verbose=200,
    task_type='GPU',
)
model.fit(train_pool, eval_set=test_pool)
print(model.is_fitted())
print(model.get_params())

0:	learn: 0.6864138	test: 0.6848371	best: 0.6848371 (0)	total: 508ms	remaining: 8m 27s
200:	learn: 0.5274983	test: 0.5308463	best: 0.5308463 (200)	total: 3m 25s	remaining: 13m 35s
400:	learn: 0.5177044	test: 0.5278208	best: 0.5278105 (388)	total: 5m 43s	remaining: 8m 33s
600:	learn: 0.5108804	test: 0.5268659	best: 0.5268524 (599)	total: 8m 6s	remaining: 5m 22s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.5265752866
bestIteration = 660

Shrink model to first 661 iterations.
True
{'iterations': 1000, 'learning_rate': 0.6, 'depth': 8, 'l2_leaf_reg': 2, 'loss_function': 'RMSE', 'od_wait': 50, 'od_type': 'Iter', 'random_seed': 42, 'use_best_model': True, 'verbose': 200, 'one_hot_max_size': 10, 'random_strength': 1, 'bagging_temperature': 1}


In [18]:
y_pred_log = model.predict(test_pool)
y_true_log = test_pool.get_label()
predictions = model.predict(X_test)

print("Важность факторов:")
print(y_pred_log)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_true_log)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

print(f"\nRMSE in original scale: {rmse:.2f}")
print(f"MAE in original scale: {mae:.2f}")


RMSE in original scale: 42.05
MAE in original scale: 25.38


In [19]:
feature_importances = model.get_feature_importance(prettified=True)
print("Важность факторов:")
print(feature_importances)

Важность факторов:
                                       Feature Id  Importances
0                                          Domain    35.206093
1                             Ad slot floor price    12.984861
2                                            Hour    10.348052
3                                         Weekday     7.925542
4                                   Ad slot width     6.256045
5                              Ad slot visibility     5.043103
6                                   Bidding price     4.828204
7                                  Ad slot height     2.713761
8                                         browser     2.629652
9                                          Region     1.948415
10                                           City     1.801594
11                                    Ad exchange     1.229013
12                                             os     0.991607
13                                 Ad slot format     0.819486
14           Long-term interest/onli

In [24]:
# Пример: ошибки для дорогих аукционов
high_price_mask = y_true > 150
mae_high = mean_absolute_error(y_true[high_price_mask], y_pred[high_price_mask])
print(f"MAE для аукционов >150$: {mae_high:.2f}")



MAE для аукционов >150$: 59.38


In [26]:
top_domains = data['Domain'].value_counts().nlargest(10).index.tolist()
# Ошибки по доменам
for domain in top_domains:
    mask = X_test['Domain'] == domain
    rmse_domain = np.sqrt(mean_squared_error(y_true[mask], y_pred[mask]))
    print(f"RMSE для {domain}: {rmse_domain:.2f}")

RMSE для missing: 48.83
RMSE для trqRTuMvjTN7X9KbuKz: 16.67
RMSE для 5F1RQS9rg5scFsf: 11.44
RMSE для 31xSTvprdN1RFt: 4.53
RMSE для ersbQv1RdoTy1m58uG: 8.19
RMSE для DFpETuxoGQdcFNKbuKz: 40.42
RMSE для eA1XTupSMZq81YT9UDaWvpdh: 12.68
RMSE для trqRTvpogNlyDok4JKTI: 38.63
RMSE для trqRTuNoGTlyDok4JKTI: 36.94
RMSE для DSmreF9aBTN7gqKbuKz: 25.59


In [20]:
data["s"] = data['Ad slot width'] * data['Ad slot height']
data['weekend_flag'] = data['Weekday'].isin([5, 6]).astype(int)
data['aspect_ratio'] = data['Ad slot width'] / (data['Ad slot height'] + 1e-6)
data['domain_hour_interaction'] = data['Domain'].astype(str) + '_' + data['Hour'].astype(str)
data['floor_bid_ratio'] = data['Bidding price'] / (data['Ad slot floor price'] + 1e-6)

In [21]:
categorical_features = [
    'City', 'Region', 'Ad exchange', 'Ad slot visibility', 'Ad slot format', "Hour", "Weekday", "Domain", "os", "device", "device_type", "browser"]

categorical_features+=[ 'weekend_flag', 'floor_bid_ratio', 'domain_hour_interaction']
for col in categorical_features:
    # Заменяем NaN на строку 'missing'
    data[col] = data[col].fillna('missing')
    # Преобразуем все значения в строки
    data[col] = data[col].astype(str)

In [22]:
X = data.drop(columns=['Paying price', 'Paying price log'], errors='ignore')
y = data['Paying price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 )
train_pool = Pool(X_train, y_train, cat_features= categorical_features)
test_pool = Pool(X_test, y_test, cat_features =categorical_features)

In [13]:

model = CatBoostRegressor(
    iterations=1000,            # Максимальное число итераций
    learning_rate=0.9,         # Скорость обучения
    depth=6,                    # Глубина деревьев
    l2_leaf_reg=5,              # Регуляризация
    bagging_temperature=1,      # Рандомизация бутстрэпа
    random_strength=1,          # Случайность разбиений
    one_hot_max_size=10,        # Признаки с уникальными значениями <= 10 будут one-hot кодироваться
    loss_function='MAE',       # Потеря (можно попробовать 'Quantile:alpha=0.5' для устойчивости к выбросам)
    od_type='Iter',             # Тип ранней остановки
    od_wait=50,                 # Количество итераций для early stopping
    use_best_model=True,        # Использовать лучшую модель по валидационной выборке
    random_seed=42,             # Фиксированный seed для воспроизводимости
    verbose=200,
    #task_type='GPU',# Вывод прогресса обучения
)
model.fit(train_pool, eval_set=test_pool)
print(model.is_fitted())
print(model.get_params())

0:	learn: 35.5586240	test: 35.5211772	best: 35.5211772 (0)	total: 1.23s	remaining: 20m 25s
200:	learn: 27.6397532	test: 27.5700772	best: 27.5700772 (200)	total: 3m 2s	remaining: 12m 4s
400:	learn: 26.8216493	test: 26.7214553	best: 26.7214553 (400)	total: 4m 57s	remaining: 7m 24s
600:	learn: 26.5229441	test: 26.4196573	best: 26.4196573 (600)	total: 6m 56s	remaining: 4m 36s
800:	learn: 26.2859676	test: 26.1790860	best: 26.1790860 (800)	total: 8m 48s	remaining: 2m 11s
999:	learn: 26.0148865	test: 25.9057820	best: 25.9057705 (995)	total: 10m 30s	remaining: 0us

bestTest = 25.9057705
bestIteration = 995

Shrink model to first 996 iterations.
True
{'iterations': 1000, 'learning_rate': 0.9, 'depth': 6, 'l2_leaf_reg': 5, 'loss_function': 'MAE', 'od_wait': 50, 'od_type': 'Iter', 'random_seed': 42, 'use_best_model': True, 'verbose': 200, 'one_hot_max_size': 10, 'random_strength': 1, 'bagging_temperature': 1}


In [14]:
model = CatBoostRegressor(
    iterations=1000,            # Максимальное число итераций
    learning_rate=0.9,         # Скорость обучения
    depth=6,                    # Глубина деревьев
    l2_leaf_reg=5,              # Регуляризация
    bagging_temperature=1,      # Рандомизация бутстрэпа
    random_strength=1,          # Случайность разбиений
    one_hot_max_size=10,        # Признаки с уникальными значениями <= 10 будут one-hot кодироваться
    loss_function='Quantile:alpha=0.5', # Потеря (можно попробовать 'Quantile:alpha=0.5' для устойчивости к выбросам)
    od_type='Iter',             # Тип ранней остановки
    od_wait=50,                 # Количество итераций для early stopping
    use_best_model=True,        # Использовать лучшую модель по валидационной выборке
    random_seed=42,             # Фиксированный seed для воспроизводимости
    verbose=200,
    #task_type='GPU',# Вывод прогресса обучения
)
model.fit(train_pool, eval_set=test_pool)
print(model.is_fitted())
print(model.get_params())

0:	learn: 17.7793120	test: 17.7605886	best: 17.7605886 (0)	total: 986ms	remaining: 16m 25s
200:	learn: 13.8198766	test: 13.7850386	best: 13.7850386 (200)	total: 3m 27s	remaining: 13m 42s
400:	learn: 13.4108246	test: 13.3607277	best: 13.3607277 (400)	total: 5m 50s	remaining: 8m 43s
600:	learn: 13.2614720	test: 13.2098286	best: 13.2098286 (600)	total: 7m 56s	remaining: 5m 16s
800:	learn: 13.1429838	test: 13.0895430	best: 13.0895430 (800)	total: 9m 52s	remaining: 2m 27s
999:	learn: 13.0074433	test: 12.9528910	best: 12.9528852 (995)	total: 11m 48s	remaining: 0us

bestTest = 12.95288525
bestIteration = 995

Shrink model to first 996 iterations.
True
{'iterations': 1000, 'learning_rate': 0.9, 'depth': 6, 'l2_leaf_reg': 5, 'loss_function': 'Quantile:alpha=0.5', 'od_wait': 50, 'od_type': 'Iter', 'random_seed': 42, 'use_best_model': True, 'verbose': 200, 'one_hot_max_size': 10, 'random_strength': 1, 'bagging_temperature': 1}


In [None]:
predictions = model.predict(test_pool)
feature_importances = model.get_feature_importance(prettified = True)
print("Важность факторов:")
print(feature_importances)