In [14]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from user_agents import parse

In [15]:
# Load the bid log; the file is comma-separated and its first row is the header.
data = pd.read_csv(
    'data/final_data.csv',
    sep=",",
    header=0,
)

In [16]:
# Column names of the bid log.
data_columns = [
    "Bid ID",
    "Timestamp",
    "Log type",
    "iPinYou ID",
    "User-Agent",
    "IP",
    "Region",
    "City",
    "Ad exchange",
    "Domain",
    "URL",
    "Anonymous URL ID",
    "Ad slot ID",
    "Ad slot width",
    "Ad slot height",
    "Ad slot visibility",
    "Ad slot format",
    "Ad slot floor price",
    "Creative ID",
    "Bidding price",
    "Paying price",
    "Key page URL",
    "Advertiser ID",
    "User Tags",
    "All paying price",
]

# Columns removed before modelling (identifiers, raw strings and fields that
# have already been expanded into derived features).
columns_to_drop = [
    "Bid ID",
    "iPinYou ID",
    "User-Agent",
    "IP",
    "URL",
    "Log type",
    "Timestamp",
    "Anonymous URL ID",
    "Creative ID",
    "Key page URL",
    "Ad slot ID",
    "Advertiser ID",
    "All paying price",
    "User Tags",
]

# Features handed to CatBoost as categorical.
categorical_features = [
    "City",
    "Region",
    "Ad exchange",
    "Ad slot visibility",
    "Ad slot format",
    "Hour",
    "Weekday",
    "Domain",
    "os",
    "device",
    "device_type",
    "browser",
]


In [17]:
# Keep only advertiser 3386 and rows with a strictly positive paying price.
# Note: duplicate rows are not removed here (a drop_duplicates() call was left
# disabled in this cell).
is_target_advertiser = data['Advertiser ID'] == 3386
has_positive_price = data["Paying price"] > 0
data = data[is_target_advertiser & has_positive_price]

In [18]:
def parse_timestamp(ts):
    """Convert a digit timestamp laid out as YYYYMMDDHHMM... into a pd.Timestamp.

    Only the first 12 characters of ``str(ts)`` are used (year, month, day,
    hour, minute); anything after the minute field is ignored.
    """
    digits = str(ts)
    # Character span occupied by each field inside the digit string.
    field_spans = {
        'year': (0, 4),
        'month': (4, 6),
        'day': (6, 8),
        'hour': (8, 10),
        'minute': (10, 12),
    }
    fields = {
        field: int(digits[start:stop])
        for field, (start, stop) in field_spans.items()
    }
    return pd.Timestamp(**fields)


In [19]:
# Parse the raw timestamps (first 12 digits are YYYYMMDDHHMM) with a single
# vectorized call instead of a per-row Python function.
# The dtype guard keeps the cell re-runnable: once the column has been
# converted to datetimes, parsing it again as a digit string would fail.
if not pd.api.types.is_datetime64_any_dtype(data['Timestamp']):
    data['Timestamp'] = pd.to_datetime(
        data['Timestamp'].astype(str).str[:12],
        format='%Y%m%d%H%M',
    )

# Calendar features derived from the parsed timestamp.
data['Hour'] = data['Timestamp'].dt.hour
data['Weekday'] = data['Timestamp'].dt.weekday

In [20]:
# Mapping from user-tag id (as it appears in the comma-separated
# 'User Tags' column) to the name of the indicator column created for it.
tag_names = {
    '10006': 'Long-term interest/news',
    '10024': 'Long-term interest/education',
    '10031': 'Long-term interest/automobile',
    '10048': 'Long-term interest/real estate',
    '10052': 'Long-term interest/IT',
    '10057': 'Long-term interest/electronic game',
    '10059': 'Long-term interest/fashion',
    '10063': 'Long-term interest/entertainment',
    '10067': 'Long-term interest/luxury',
    '10074': 'Long-term interest/home and lifestyle',
    '10075': 'Long-term interest/health',
    '10076': 'Long-term interest/food',
    '10077': 'Long-term interest/divine',
    '10079': 'Long-term interest/motherhood&parenting',
    '10083': 'Long-term interest/sports',
    '10093': 'Long-term interest/travel&outdoors',
    '10102': 'Long-term interest/social',
    '10684': 'In-market/3c product',
    '11092': 'In-market/appliances',
    '11278': 'In-market/clothing, shoes&bags',
    '11379': 'In-market/Beauty & Personal Care',
    '11423': 'In-market/household & home improvement',
    '11512': 'In-market/infant & mom products',
    '11576': 'In-market/sports item',
    '11632': 'In-market/outdoor',
    '11680': 'In-market/health care products',
    '11724': 'In-market/luxury',
    '11944': 'In-market/real estate',
    '13042': 'In-market/automobile',
    '13403': 'In-market/finance',
    '13496': 'In-market/travel',
    '13678': 'In-market/education',
    '13776': 'In-market/service',
    '13800': 'Long-term interest/art & photography & design',
    '13866': 'Long-term interest/online literature',
    '13874': 'In-market/electronic game',
    '14273': 'Long-term interest/3c',
    '16593': 'In-market/book',
    '16617': 'In-market/medicine',
    '16661': 'In-market/food & drink',
    '16706': 'Long-term interest/culture',
    '10110': 'Demographic/gender/male',
    '10111': 'Demographic/gender/female'
}
# Missing tag lists become the empty string so every row is a plain str.
data['User Tags'] = data['User Tags'].fillna('').astype(str)

# Surround each tag list with commas once, so that "tag is one of the
# comma-separated tokens" becomes a literal substring test for ",tag,".
# This gives the same 0/1 values as splitting every row for every tag, but
# the per-row work is done by pandas instead of a Python lambda per tag.
padded_tags = ',' + data['User Tags'] + ','

for tag, name in tag_names.items():
    has_tag = padded_tags.str.contains(',' + tag + ',', regex=False)
    data[name] = has_tag.astype('int64')

In [21]:
# Number of distinct non-null values in every column.
print(data.nunique(axis=0, dropna=True))

Bid ID                        2835896
Timestamp                        4312
Log type                            1
iPinYou ID                    2641288
IP                             489689
                               ...   
In-market/medicine                  2
In-market/food & drink              2
Long-term interest/culture          2
Demographic/gender/male             2
Demographic/gender/female           2
Length: 73, dtype: int64


In [22]:
# Remove the columns listed in columns_to_drop; names that are not present
# in the frame are skipped silently because of errors='ignore'.
data = data.drop(labels=columns_to_drop, axis="columns", errors='ignore')

In [29]:
# Distinct-value counts per column after the column drop.
print(data.nunique(axis=0, dropna=True))

Region                                              35
City                                               370
Ad exchange                                          3
Domain                                           13462
Ad slot width                                        8
Ad slot height                                       5
Ad slot visibility                                   4
Ad slot format                                       2
Ad slot floor price                                184
Bidding price                                        1
Paying price                                       300
device                                            2774
os                                                  29
browser                                             77
device_type                                          5
Hour                                                24
Weekday                                              7
Long-term interest/news                              2
Long-term 

In [23]:
# Every categorical feature becomes a string column, with NaN replaced by
# the literal "missing".
for col in categorical_features:
    filled = data[col].fillna("missing")
    data[col] = filled.astype(str)

# Features are everything except the target; the target is the paying price.
X = data.drop(labels=['Paying price'], axis="columns", errors='ignore')
y = data['Paying price']

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 2278238
Test size: 569560


In [31]:
# Count rows that repeat an earlier row across all remaining columns.
print(data.duplicated(keep='first').sum())

95573


In [24]:
# Number of missing values per column.
print(data.isna().sum())

Region                                           0
City                                             0
Ad exchange                                      0
Domain                                           0
Ad slot width                                    0
Ad slot height                                   0
Ad slot visibility                               0
Ad slot format                                   0
Ad slot floor price                              0
Bidding price                                    0
Paying price                                     0
device                                           0
os                                               0
browser                                          0
device_type                                      0
Hour                                             0
Weekday                                          0
Long-term interest/news                          0
Long-term interest/education                     0
Long-term interest/automobile  

In [25]:
# Fail early if any declared categorical feature is absent from the
# training frame.
declared_categoricals = set(categorical_features)
available_columns = set(X_train.columns)
missing_cols = declared_categoricals - available_columns
if len(missing_cols) > 0:
    raise ValueError(f"Следующие категориальные признаки отсутствуют в данных: {missing_cols}")

In [26]:
# Positional indices of the categorical columns; X_train and X_test come from
# the same frame, so the indices computed on X_train are used for both pools.
cat_feature_indices = [
    X_train.columns.get_loc(col) for col in categorical_features
]
train_pool = Pool(X_train, y_train, cat_features=cat_feature_indices)
test_pool = Pool(X_test, y_test, cat_features=list(cat_feature_indices))

In [27]:
# Summary statistics of the target column.
print(data.loc[:, 'Paying price'].describe())

count    2.847798e+06
mean     7.692501e+01
std      6.124739e+01
min      1.000000e+00
25%      3.200000e+01
50%      6.700000e+01
75%      9.000000e+01
max      3.000000e+02
Name: Paying price, dtype: float64


In [None]:
# Hyper-parameters of the regressor trained on the raw paying price.
# NOTE(review): learning_rate=0.9 is unusually high for gradient boosting;
# confirm it is intentional.
raw_price_params = {
    'iterations': 1000,
    'learning_rate': 0.9,
    'depth': 8,
    'l2_leaf_reg': 2,
    'bagging_temperature': 1,
    'random_strength': 1,
    'one_hot_max_size': 10,
    'loss_function': 'RMSE',
    'od_type': 'Iter',
    'od_wait': 50,
    'use_best_model': True,
    'random_seed': 42,
    'verbose': 200,
}
model = CatBoostRegressor(**raw_price_params)

# The test pool is passed as eval_set, so the overfitting detector and
# use_best_model both choose the final iteration using test data.
model.fit(train_pool, eval_set=test_pool)
print(model.is_fitted())
print(model.get_params())

0:	learn: 51.4189012	test: 51.3675637	best: 51.3675637 (0)	total: 1.83s	remaining: 30m 25s
200:	learn: 41.2740765	test: 41.3702541	best: 41.3695875 (197)	total: 8m 51s	remaining: 35m 11s
400:	learn: 40.6534751	test: 41.2383998	best: 41.2381669 (398)	total: 13m 51s	remaining: 20m 41s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 41.23249798
bestIteration = 413

Shrink model to first 414 iterations.
True
{'iterations': 1000, 'learning_rate': 0.9, 'depth': 8, 'l2_leaf_reg': 2, 'loss_function': 'RMSE', 'od_wait': 50, 'od_type': 'Iter', 'random_seed': 42, 'use_best_model': True, 'verbose': 200, 'one_hot_max_size': 10, 'random_strength': 1, 'bagging_temperature': 1}


In [29]:
# Predictions for the held-out rows and the fitted model's feature ranking.
predictions = model.predict(X_test)
feature_importances = model.get_feature_importance(prettified=True)
print("Важность факторов:", feature_importances, sep="\n")

Важность факторов:
                                       Feature Id  Importances
0                                          Domain    18.785257
1                             Ad slot floor price    18.209544
2                                     Ad exchange    15.922640
3                              Ad slot visibility    12.798975
4                                   Ad slot width    10.043139
5                                         Weekday     6.416483
6                                            Hour     5.865288
7                                  Ad slot height     3.100707
8                                            City     1.286073
9                                         browser     1.262956
10                                 Ad slot format     1.100893
11                                         Region     0.946379
12                        Long-term interest/food     0.497242
13                                             os     0.445591
14                        Long-term 

In [None]:
# Test rows whose absolute prediction error exceeds 100.
absolute_error = np.abs(y_test - predictions)
high_error_mask = absolute_error > 100

# Copy the offending rows and attach the true and predicted values.
high_error_data = X_test[high_error_mask].copy()
high_error_data['True'] = y_test[high_error_mask]
high_error_data['Predicted'] = predictions[high_error_mask]

columns_to_show = ['Ad slot width', 'Ad slot visibility', 'Ad slot format', 'True', 'Predicted']
print(high_error_data[columns_to_show])

In [None]:
# Summary statistics of the target column.
print(data.loc[:, 'Paying price'].describe())

In [None]:
# log(1 + price) version of the target, stored as its own column.
log_price = np.log1p(data['Paying price'])
data['Paying price log'] = log_price

In [None]:
# Same features and seed as the earlier split, but with the log-transformed
# price as the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    data['Paying price log'],
    test_size=0.2,
    random_state=42,
)

# Rebuild the pools so their labels are the log-scale target.
cat_feature_indices = [
    X_train.columns.get_loc(col) for col in categorical_features
]
train_pool = Pool(X_train, y_train, cat_features=cat_feature_indices)
test_pool = Pool(X_test, y_test, cat_features=list(cat_feature_indices))

In [None]:
# `model` was fitted on the raw paying price, so its predictions are not in
# log space and must not be passed through expm1. Fit a separate regressor
# with the same hyper-parameters on the current pools (whose labels are the
# log-transformed price) and evaluate that one instead.
# As in the other training cells, the test pool doubles as the eval set.
log_model = CatBoostRegressor(**model.get_params())
log_model.fit(train_pool, eval_set=test_pool)

y_pred_log = log_model.predict(test_pool)
y_true_log = test_pool.get_label()

# Undo log1p so the metrics are reported in the original price units.
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_true_log)

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)

print(f"\nRMSE in original scale: {rmse:.2f}")
print(f"MAE in original scale: {mae:.2f}")

In [None]:
# Feature ranking of the model currently bound to `model`.
feature_importances = model.get_feature_importance(prettified=True)
print("Важность факторов:", feature_importances, sep="\n")

In [None]:
# Example: error restricted to expensive auctions (true price above 150).
high_price_mask = y_true > 150
expensive_true = y_true[high_price_mask]
expensive_pred = y_pred[high_price_mask]
mae_high = mean_absolute_error(expensive_true, expensive_pred)
print(f"MAE для аукционов >150$: {mae_high:.2f}")



In [None]:
# The ten most frequent domains in the full data set.
domain_counts = data['Domain'].value_counts()
top_domains = domain_counts.nlargest(10).index.tolist()

# RMSE on the test rows of each of those domains.
for domain in top_domains:
    mask = X_test['Domain'] == domain
    mse_domain = mean_squared_error(y_true[mask], y_pred[mask])
    rmse_domain = np.sqrt(mse_domain)
    print(f"RMSE для {domain}: {rmse_domain:.2f}")

In [None]:
# Ad slot area.
data["s"] = data['Ad slot width'] * data['Ad slot height']

# Weekend indicator (weekday 5 or 6). 'Weekday' has already been cast to
# strings by the categorical preprocessing, so comparing against the integers
# 5 and 6 never matches and the flag would be constant 0. Comparing the string
# form works whether the column holds ints or strings.
data['weekend_flag'] = data['Weekday'].astype(str).isin(['5', '6']).astype(int)

# Width / height; the small epsilon avoids division by zero.
data['aspect_ratio'] = data['Ad slot width'] / (data['Ad slot height'] + 1e-6)

# Combined "<domain>_<hour>" key.
data['domain_hour_interaction'] = data['Domain'].astype(str) + '_' + data['Hour'].astype(str)

# Bidding price relative to the slot's floor price (epsilon as above).
data['floor_bid_ratio'] = data['Bidding price'] / (data['Ad slot floor price'] + 1e-6)

In [None]:
# Original categorical features followed by the engineered ones.
# NOTE(review): 'floor_bid_ratio' is a numeric ratio that gets stringified
# below and treated as a category; confirm this is intended.
categorical_features = [
    'City',
    'Region',
    'Ad exchange',
    'Ad slot visibility',
    'Ad slot format',
    "Hour",
    "Weekday",
    "Domain",
    "os",
    "device",
    "device_type",
    "browser",
    'weekend_flag',
    'floor_bid_ratio',
    'domain_hour_interaction',
]

for col in categorical_features:
    # Replace NaN with the string 'missing', then convert every value to str.
    data[col] = data[col].fillna('missing').astype(str)

In [None]:
# Features: everything except the raw and log-transformed targets.
target_columns = ['Paying price', 'Paying price log']
X = data.drop(labels=target_columns, axis="columns", errors='ignore')
y = data['Paying price']

# Same 80/20 split and seed as before, now including the engineered columns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Categorical features are passed by column name here.
train_pool = Pool(X_train, y_train, cat_features=categorical_features)
test_pool = Pool(X_test, y_test, cat_features=categorical_features)

In [None]:

# Hyper-parameters of the regressor trained on the extended feature set.
engineered_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'l2_leaf_reg': 5,
    'bagging_temperature': 1,
    'random_strength': 1,
    'one_hot_max_size': 10,
    'loss_function': 'RMSE',
    'od_type': 'Iter',
    'od_wait': 50,
    'use_best_model': True,
    'random_seed': 42,
    'verbose': 200,
}
model = CatBoostRegressor(**engineered_params)

# The test pool is passed as eval_set, so the overfitting detector and
# use_best_model both choose the final iteration using test data.
model.fit(train_pool, eval_set=test_pool)
print(model.is_fitted())
print(model.get_params())