In [1]:
import ipywidgets
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from user_agents import parse

In [2]:
# Load the bid log; a comma delimiter and a header on the first row are the read_csv defaults.
data = pd.read_csv("data/final_data.csv")

In [7]:
# Names of the raw log columns, in the order they are listed for the dataset.
data_columns = [
    "Bid ID",
    "Timestamp",
    "Log type",
    "iPinYou ID",
    "User-Agent",
    "IP",
    "Region",
    "City",
    "Ad exchange",
    "Domain",
    "URL",
    "Anonymous URL ID",
    "Ad slot ID",
    "Ad slot width",
    "Ad slot height",
    "Ad slot visibility",
    "Ad slot format",
    "Ad slot floor price",
    "Creative ID",
    "Bidding price",
    "Paying price",
    "Key page URL",
    "Advertiser ID",
    "User Tags",
    "All paying price",
]

# Columns removed before modelling (identifiers, raw strings and fields that
# have already been expanded into derived features).
columns_to_drop = [
    "Bid ID",
    "iPinYou ID",
    "User-Agent",
    "IP",
    "URL",
    "Log type",
    "Timestamp",
    "Anonymous URL ID",
    "Creative ID",
    "Key page URL",
    "Ad slot ID",
    "Advertiser ID",
    "All paying price",
    "User Tags",
]


In [4]:
# Keep only the rows of one advertiser that have a strictly positive paying price.
# Duplicate rows are kept as-is (no de-duplication is performed).
ADVERTISER_ID = 3476

is_target_advertiser = data['Advertiser ID'] == ADVERTISER_ID
has_positive_price = data["Paying price"] > 0

# A single combined mask plus an explicit copy makes `data` an independent
# frame, so the column assignments in later cells do not operate on a slice
# of the originally loaded frame (avoids SettingWithCopyWarning).
data = data.loc[is_target_advertiser & has_positive_price].copy()

In [5]:
def parse_timestamp(ts):
    """Convert a ``yyyyMMddHHmm...`` timestamp into a minute-resolution ``pd.Timestamp``.

    Only the first twelve digits (year, month, day, hour, minute) are used;
    any characters after them are ignored.

    Parameters
    ----------
    ts : int, float or str
        Value whose decimal representation starts with ``yyyyMMddHHmm``.
        Integral floats are accepted because an integer column is loaded as
        float as soon as it contains a missing value.

    Returns
    -------
    pd.Timestamp
        The parsed moment, or ``pd.NaT`` when ``ts`` is missing (None / NaN).

    Raises
    ------
    ValueError
        If ``ts`` does not start with twelve digits, or the fields do not
        form a valid calendar date / time.
    """
    if pd.isna(ts):
        return pd.NaT

    if isinstance(ts, float) and ts.is_integer():
        # str() of a large float uses scientific notation ('2.01e+16');
        # going through int yields the plain digit string instead.
        ts = int(ts)

    digits = str(ts).strip()[:12]
    if len(digits) < 12 or not digits.isdigit():
        raise ValueError(
            f"Timestamp must start with 12 digits (yyyyMMddHHmm), got {ts!r}"
        )

    return pd.Timestamp(
        year=int(digits[0:4]),
        month=int(digits[4:6]),
        day=int(digits[6:8]),
        hour=int(digits[8:10]),
        minute=int(digits[10:12]),
    )


In [6]:
# Parse the raw timestamps only once. After the first run the column already
# holds datetimes, and feeding those back into parse_timestamp raises, so the
# guard keeps this cell safe to re-run.
if not pd.api.types.is_datetime64_any_dtype(data['Timestamp']):
    data['Timestamp'] = data['Timestamp'].apply(parse_timestamp)

# Calendar features derived from the parsed timestamp.
data['Hour'] = data['Timestamp'].dt.hour
data['Weekday'] = data['Timestamp'].dt.weekday  # Monday=0 ... Sunday=6

In [8]:
# --- Engineered features -----------------------------------------------------
slot_width = data['Ad slot width']
slot_height = data['Ad slot height']
ratio_eps = 1e-6  # added to denominators so a zero value cannot divide by zero

data["s"] = slot_width * slot_height  # ad slot area
data['weekend_flag'] = data['Weekday'].isin([5, 6]).astype(int)
data['aspect_ratio'] = slot_width / (slot_height + ratio_eps)
data['domain_hour_interaction'] = (
    data['Domain'].astype(str) + '_' + data['Hour'].astype(str)
)
# Despite the name this is bidding price divided by floor price.
data['floor_bid_ratio'] = data['Bidding price'] / (data['Ad slot floor price'] + ratio_eps)

# --- Columns handed to CatBoost as categorical -------------------------------
# NOTE(review): "os", "device", "device_type" and "browser" are not created in
# this cell; presumably they already exist in the loaded CSV - verify.
categorical_features = [
    'City',
    'Region',
    'Ad exchange',
    'Ad slot visibility',
    'Ad slot format',
    "Hour",
    "Weekday",
    "Domain",
    "os",
    "device",
    "device_type",
    "browser",
    # engineered columns
    # NOTE(review): 'floor_bid_ratio' is a numeric ratio yet is listed as
    # categorical - confirm this is intended.
    'weekend_flag',
    'floor_bid_ratio',
    'domain_hour_interaction',
]

In [82]:
# User-tag id -> human-readable label. The label doubles as the name of the
# indicator column created for that tag, and the insertion order determines
# the order in which those columns are added.
tag_names = {
    "10006": "Long-term interest/news",
    "10024": "Long-term interest/education",
    "10031": "Long-term interest/automobile",
    "10048": "Long-term interest/real estate",
    "10052": "Long-term interest/IT",
    "10057": "Long-term interest/electronic game",
    "10059": "Long-term interest/fashion",
    "10063": "Long-term interest/entertainment",
    "10067": "Long-term interest/luxury",
    "10074": "Long-term interest/home and lifestyle",
    "10075": "Long-term interest/health",
    "10076": "Long-term interest/food",
    "10077": "Long-term interest/divine",
    "10079": "Long-term interest/motherhood&parenting",
    "10083": "Long-term interest/sports",
    "10093": "Long-term interest/travel&outdoors",
    "10102": "Long-term interest/social",
    "10684": "In-market/3c product",
    "11092": "In-market/appliances",
    "11278": "In-market/clothing, shoes&bags",
    "11379": "In-market/Beauty & Personal Care",
    "11423": "In-market/household & home improvement",
    "11512": "In-market/infant & mom products",
    "11576": "In-market/sports item",
    "11632": "In-market/outdoor",
    "11680": "In-market/health care products",
    "11724": "In-market/luxury",
    "11944": "In-market/real estate",
    "13042": "In-market/automobile",
    "13403": "In-market/finance",
    "13496": "In-market/travel",
    "13678": "In-market/education",
    "13776": "In-market/service",
    "13800": "Long-term interest/art & photography & design",
    "13866": "Long-term interest/online literature",
    "13874": "In-market/electronic game",
    "14273": "Long-term interest/3c",
    "16593": "In-market/book",
    "16617": "In-market/medicine",
    "16661": "In-market/food & drink",
    "16706": "Long-term interest/culture",
    "10110": "Demographic/gender/male",
    "10111": "Demographic/gender/female",
}
# Missing tag lists become the empty string so every row can be split.
data['User Tags'] = data['User Tags'].fillna('').astype(str)

# Split each row's comma-separated tag list once and keep it as a set, rather
# than re-splitting the same string for every one of the tags below.
user_tag_sets = data['User Tags'].str.split(',').map(set)

# One 0/1 indicator column per known tag, named after its readable label.
for tag, name in tag_names.items():
    data[name] = [int(tag in row_tags) for row_tags in user_tag_sets]

Bid ID                        1727541
Timestamp                        1058
Log type                            1
iPinYou ID                    1518882
IP                             445851
                               ...   
In-market/medicine                  2
In-market/food & drink              2
Long-term interest/culture          2
Demographic/gender/male             2
Demographic/gender/female           2
Length: 73, dtype: int64


In [9]:
# Remove the columns listed in columns_to_drop; names that are not present in
# the frame are skipped silently.
data = data.drop(labels=columns_to_drop, axis="columns", errors="ignore")

In [23]:
# Fill gaps in every categorical column with the "missing" label and cast the
# whole column to str.
for col in categorical_features:
    filled = data[col].fillna("missing")
    data[col] = filled.astype(str)

# Discretise the target: n_bins equal-width intervals spanning the observed
# [min, max] range of the paying price; price_bin holds the interval index.
n_bins = 25
paying_price = data['Paying price']
bins = np.linspace(paying_price.min(), paying_price.max(), num=n_bins + 1)
data['price_bin'] = pd.cut(paying_price, bins=bins, labels=False, include_lowest=True)

# Target is the bin index; features are everything except the price and its bin.
y = data['price_bin']
X = data.drop(columns=['Paying price', 'price_bin'], errors='ignore')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Sanity check: every declared categorical feature must be a column of the
# training frame; otherwise stop with an error that lists the missing ones.
available_cols = set(X_train.columns)
missing_cols = set(categorical_features) - available_cols
if missing_cols:
    raise ValueError(
        f"Следующие категориальные признаки отсутствуют в данных: {missing_cols}"
    )

In [27]:
# CatBoost classifier over the price-bin labels.
#   iterations / depth / learning_rate - size and step of the boosted ensemble
#   od_wait                           - overfitting-detector patience, in iterations
#   verbose=100                       - log every 100th iteration
# NOTE(review): confirm the overfitting detector is actually active with the
# default od_type / od_pval when only od_wait is given.
# NOTE(review): the hold-out test split is also passed as eval_set, so it is
# seen during training-time evaluation - consider a separate validation split
# if test metrics must stay unbiased.
model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.5,
    od_wait=100,
    cat_features=categorical_features,
    verbose=100,
)
model.fit(X_train, y_train, eval_set=(X_test, y_test))

KeyboardInterrupt: 