In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/russian-car-plates-prices-prediction/sample_submission.csv
/kaggle/input/russian-car-plates-prices-prediction/supplemental_english.py
/kaggle/input/russian-car-plates-prices-prediction/supplemental_russian.py
/kaggle/input/russian-car-plates-prices-prediction/train.csv
/kaggle/input/russian-car-plates-prices-prediction/test.csv


In [2]:
# Read the competition's train and test tables (in that order).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/russian-car-plates-prices-prediction/train.csv',
        '/kaggle/input/russian-car-plates-prices-prediction/test.csv',
    )
)

In [3]:
# Load the sample submission and preview its layout.
sample = pd.read_csv(
    '/kaggle/input/russian-car-plates-prices-prediction/sample_submission.csv'
)
sample.head(n=5)

Unnamed: 0,id,price
0,51636,100000
1,51637,100000
2,51638,100000
3,51639,100000
4,51640,100000


In [4]:
# Preview the first five rows of the raw training data.
train.head(n=5)

Unnamed: 0,id,plate,date,price
0,1,X059CP797,2024-12-26 00:00:00,65000
1,2,Y800MH790,2024-07-12 21:31:37,100000
2,3,A212TX77,2024-04-18 00:00:00,290000
3,4,P001AY199,2025-01-03 00:27:15,680000
4,5,P001AY199,2025-01-10 09:32:41,750000


In [5]:
# Preview the first five rows of the raw test data.
test.head(n=5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,plate,date,price
0,51636,P700TT790,2025-01-27 00:00:00,
1,51637,M081TX797,2025-02-10 00:00:00,
2,51638,T333HX777,2025-02-11 00:00:00,
3,51639,H744BH977,2025-02-03 00:00:00,
4,51640,X066EM777,2025-02-12 00:00:00,


In [6]:
# Remove columns that are not used as features. Rebinding the names (instead of
# inplace=True) together with errors='ignore' keeps this cell re-runnable: a
# second execution no longer raises KeyError for columns that are already gone.
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id', 'price'], errors='ignore')
train.shape, test.shape

((51635, 3), (7695, 2))

In [7]:
# Only the last expression of a cell is displayed, so the price summary has to
# be printed explicitly; otherwise its result is computed and thrown away.
print(train['price'].describe())

# Number of distinct plates in the training data.
train['plate'].nunique()

43605

In [8]:
from datetime import datetime

In [9]:
# Date features
def preprocess_dates(df, reference_date=None):
    """Parse the ``date`` column and derive calendar features from it.

    The frame is modified in place and also returned. Added columns:
    ``year``, ``month``, ``day``, ``weekday`` (Monday=0) and ``days_since``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column convertible by ``pd.to_datetime``.
    reference_date : datetime-like, optional
        Origin used for ``days_since``. When omitted, the earliest date found
        in ``df`` itself is used (the original behaviour). Pass the same value
        for every frame (e.g. the training minimum) so that ``days_since`` is
        measured on one common scale; otherwise each frame gets its own origin.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns.
    """
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date']

    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df['day'] = dates.dt.day
    df['weekday'] = dates.dt.weekday

    # Fall back to the frame's own minimum only when no shared origin is given.
    origin = dates.min() if reference_date is None else pd.Timestamp(reference_date)
    df['days_since'] = (dates - origin).dt.days
    return df

#Plate Features
def preprocess_plate(df):
    """Split ``plate`` into its four components and store them on ``df``.

    A plate is matched as: one series letter, three digits, two series
    letters, then a 2-3 digit region code. The captured groups are written to
    the columns ``letter1``, ``digits``, ``letters2`` and ``region``; plates
    that do not match the pattern get missing values in all four columns.
    The frame is modified in place and returned.
    """
    plate_pattern = r"^([ABEKMHOPCTYX])(\d{3})([ABEKMHOPCTYX]{2})(\d{2,3})$"
    component_names = ['letter1', 'digits', 'letters2', 'region']

    components = df['plate'].str.extract(plate_pattern)
    # Unnamed groups come back as columns 0..3, in pattern order.
    for position, column in enumerate(component_names):
        df[column] = components[position]
    return df

In [10]:
def extract_digit_features(df):
    """Derive pattern features from the three-digit part of the plate.

    ``digits`` is converted to a string left-padded with zeros to width 3, and
    three columns are added:

    * ``is_palindrome`` - the digit string reads the same in both directions
    * ``is_repeated_digits`` - all digits are identical
    * ``unique_digit_count`` - number of distinct digits

    Rows whose ``digits`` value is missing (e.g. the plate did not match the
    expected pattern) get ``False`` / ``False`` / ``0``. Previously the
    stringified missing value ``'nan'`` was analysed as if it were digits and
    was reported as a palindrome with two unique "digits".

    The frame is modified in place and returned.
    """
    raw_digits = df['digits']
    has_digits = raw_digits.notna()  # capture before missing values are stringified

    # Same conversion as before, so the stored column is unchanged for callers.
    df['digits'] = raw_digits.astype(str).str.zfill(3)

    # Blank out rows without real digits so they cannot produce spurious hits.
    digits = df['digits'].where(has_digits, '')
    distinct_count = digits.map(lambda text: len(set(text)))

    df['is_palindrome'] = has_digits & (digits == digits.str[::-1])
    df['is_repeated_digits'] = has_digits & (distinct_count == 1)
    df['unique_digit_count'] = distinct_count
    return df

In [11]:
# Run the same feature pipeline on both frames:
# date features -> plate components -> digit-pattern flags.
train = (
    train
    .pipe(preprocess_dates)
    .pipe(preprocess_plate)
    .pipe(extract_digit_features)
)

test = (
    test
    .pipe(preprocess_dates)
    .pipe(preprocess_plate)
    .pipe(extract_digit_features)
)

In [12]:
# Mean sale price per region code, most expensive regions first.
region_price_stats = (
    train['price']
    .groupby(train['region'])
    .mean()
    .sort_values(ascending=False)
)
region_price_stats.head(n=10)

region
88     5.500050e+07
08     3.076363e+06
17     2.528571e+06
121    2.266667e+06
24     2.132473e+06
66     1.842525e+06
39     1.555650e+06
173    1.500000e+06
63     1.452456e+06
77     1.345005e+06
Name: price, dtype: float64

In [13]:
import sys

# supplemental_english.py is listed among the competition input files above.
# NOTE(review): that directory is presumably not on sys.path in a fresh kernel;
# appending it is harmless if the module is already importable - confirm.
SUPPLEMENTAL_DIR = '/kaggle/input/russian-car-plates-prices-prediction'
if SUPPLEMENTAL_DIR not in sys.path:
    sys.path.append(SUPPLEMENTAL_DIR)

from supplemental_english import REGION_CODES, GOVERNMENT_CODES

# Invert REGION_CODES (region name -> code or collection of codes) into a
# lookup from the stringified code to the region name.
REGION_CODES_INVERTED = {}
for name, codes in REGION_CODES.items():
    # Treat any plain collection as several codes (the original handled only
    # lists, so a tuple/set would have been stringified as a whole).
    if not isinstance(codes, (list, tuple, set, frozenset)):
        codes = [codes]
    for code in codes:
        REGION_CODES_INVERTED[str(code)] = name


In [14]:
def map_region_features(df, reference_df, price_threshold=1_000_000, region_names=None):
    """Add a region name and a "prestigious region" flag to ``df``.

    A region counts as prestigious when its mean ``price`` in ``reference_df``
    is strictly greater than ``price_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; needs a ``region`` column. Modified in place:
        ``region`` is cast to ``str`` and the columns ``region_name`` and
        ``is_prestigious_region`` are added.
    reference_df : pandas.DataFrame
        Frame with ``region`` and ``price`` columns used to compute the
        per-region mean price. It is no longer modified by this function
        (unless it is the same object as ``df``).
    price_threshold : number, default 1_000_000
        Mean-price cut-off for the prestige flag.
    region_names : mapping, optional
        Region code (as string) -> region name. Defaults to the notebook-level
        ``REGION_CODES_INVERTED``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df['region'] = df['region'].astype(str)

    # Work on a stringified copy of the reference regions rather than writing
    # the cast back into the caller's reference frame.
    reference_regions = reference_df['region'].astype(str)

    # Average price per region in the reference data.
    region_price_stats = reference_df['price'].groupby(reference_regions).mean()

    # Regions whose average price exceeds the threshold.
    prestige_regions = set(region_price_stats[region_price_stats > price_threshold].index)

    if region_names is None:
        region_names = REGION_CODES_INVERTED

    # Codes missing from the mapping yield a missing region name.
    df['region_name'] = df['region'].map(region_names)
    df['is_prestigious_region'] = df['region'].isin(prestige_regions)

    return df

In [15]:
# Annotate the training frame, using its own prices as the reference.
train = map_region_features(train, train, 1_000_000)

# Distinct region codes that ended up flagged as prestigious.
prestige_mask = train['is_prestigious_region']
prestigious_regions = train['region'][prestige_mask].unique()
print("Number of prestigious regions:", len(prestigious_regions))
print("Prestigious regions:", prestigious_regions)


Number of prestigious regions: 13
Prestigious regions: ['77' '99' '39' '63' '42' '24' '08' '01' '66' '17' '173' '88' '121']


In [16]:
# Prestige is always derived from training prices, for both frames.
# (The train call repeats the one made in the previous cell.)
train = map_region_features(
    train,
    reference_df=train,
    price_threshold=1_000_000,
)
test = map_region_features(
    test,
    reference_df=train,
    price_threshold=1_000_000,
)


In [17]:
def flag_govt_plate(df, govt_dict):
    """Add a boolean ``is_govt_plate`` column to ``df``.

    Two kinds of entries in ``govt_dict`` are recognised:

    * non-tuple keys are compared against the full ``plate`` value, exactly as
      the original ``isin`` check did;
    * keys shaped ``(letters, (low, high), region)`` are treated as rules: a
      plate matches when its three series letters (first letter + last two),
      its number (inclusive range) and its region code all agree.

    NOTE(review): the rule-key shape is an assumption about GOVERNMENT_CODES in
    supplemental_english.py, which is not visible here - confirm. If the keys
    are such tuples, a plain ``isin`` against plate strings never matches.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``plate`` column. Modified in place.
    govt_dict : mapping or iterable
        Only the keys / items are used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    plates = df['plate']
    keys = list(govt_dict)

    range_rules = [
        key for key in keys
        if isinstance(key, tuple) and len(key) == 3
        and isinstance(key[1], (tuple, list)) and len(key[1]) == 2
    ]
    exact_keys = [key for key in keys if not isinstance(key, tuple)]

    # Full-plate matches (original behaviour).
    flagged = plates.isin(exact_keys)

    if range_rules:
        # Same plate layout as used elsewhere in this notebook.
        parts = plates.astype(str).str.extract(
            r"^([ABEKMHOPCTYX])(\d{3})([ABEKMHOPCTYX]{2})(\d{2,3})$"
        )
        series_letters = parts[0] + parts[2]
        number = pd.to_numeric(parts[1], errors='coerce')
        region = parts[3]

        for rule_letters, (low, high), rule_region in range_rules:
            # Unparseable plates hold missing values and compare as False.
            flagged = flagged | (
                (series_letters == rule_letters)
                & number.between(low, high)
                & (region == str(rule_region))
            )

    df['is_govt_plate'] = flagged
    return df

In [18]:
# Flag government plates in both frames.
train = train.pipe(flag_govt_plate, GOVERNMENT_CODES)
test = test.pipe(flag_govt_plate, GOVERNMENT_CODES)


In [19]:
# Preview the engineered training features.
train.head(n=5)

Unnamed: 0,plate,date,price,year,month,day,weekday,days_since,letter1,digits,letters2,region,is_palindrome,is_repeated_digits,unique_digit_count,region_name,is_prestigious_region,is_govt_plate
0,X059CP797,2024-12-26 00:00:00,65000,2024,12,26,3,1407,X,59,CP,797,False,False,3,Moscow,False,False
1,Y800MH790,2024-07-12 21:31:37,100000,2024,7,12,4,1241,Y,800,MH,790,False,False,2,Moscow Oblast,False,False
2,A212TX77,2024-04-18 00:00:00,290000,2024,4,18,3,1155,A,212,TX,77,True,False,2,Moscow,True,False
3,P001AY199,2025-01-03 00:27:15,680000,2025,1,3,4,1415,P,1,AY,199,False,False,2,Moscow,False,False
4,P001AY199,2025-01-10 09:32:41,750000,2025,1,10,4,1422,P,1,AY,199,False,False,2,Moscow,False,False


In [20]:
# Preview the engineered test features.
test.head(n=5)

Unnamed: 0,plate,date,year,month,day,weekday,days_since,letter1,digits,letters2,region,is_palindrome,is_repeated_digits,unique_digit_count,region_name,is_prestigious_region,is_govt_plate
0,P700TT790,2025-01-27,2025,1,27,0,1439,P,700,TT,790,False,False,2,Moscow Oblast,False,False
1,M081TX797,2025-02-10,2025,2,10,0,1453,M,81,TX,797,False,False,3,Moscow,False,False
2,T333HX777,2025-02-11,2025,2,11,1,1454,T,333,HX,777,True,True,1,Moscow,False,False
3,H744BH977,2025-02-03,2025,2,3,0,1446,H,744,BH,977,False,False,2,Moscow,False,False
4,X066EM777,2025-02-12,2025,2,12,2,1455,X,66,EM,777,False,False,2,Moscow,False,False


In [21]:
# Columns fed to the model, grouped by origin.
feature_cols = [
    # calendar features
    'year', 'month', 'day', 'weekday', 'days_since',
    # raw plate components
    'letter1', 'digits', 'letters2', 'region',
    # digit-pattern features
    'is_palindrome', 'is_repeated_digits', 'unique_digit_count',
    # region / government flags
    'is_prestigious_region', 'is_govt_plate',
]

# Feature matrix and target.
X, y = train[feature_cols], train['price']


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, chosen at random with a fixed seed, for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [23]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'letter1',
    'digits',
    'letters2',
    'region',
]


In [24]:
from catboost import CatBoostRegressor, Pool

# Wrap the splits in CatBoost pools so the categorical columns are declared once.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

# MAE-optimised regressor on the raw price; training stops early when the
# validation metric has not improved for 50 rounds.
model = CatBoostRegressor(
    loss_function='MAE',
    eval_metric='MAE',
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    early_stopping_rounds=50,
    random_seed=42,
    verbose=100,
)

model.fit(train_pool, eval_set=val_pool)

0:	learn: 367707.2317665	test: 367661.1279351	best: 367661.1279351 (0)	total: 94ms	remaining: 1m 33s
100:	learn: 248009.0177039	test: 243940.8323257	best: 243940.8323257 (100)	total: 2.13s	remaining: 19s
200:	learn: 235354.8827096	test: 232167.6136106	best: 232167.6136106 (200)	total: 4.53s	remaining: 18s
300:	learn: 228949.1689642	test: 227129.4515905	best: 227129.4515905 (300)	total: 6.75s	remaining: 15.7s
400:	learn: 222532.1738033	test: 222098.8770603	best: 222098.8770603 (400)	total: 8.91s	remaining: 13.3s
500:	learn: 219339.4925655	test: 220171.0210906	best: 220148.3620291 (497)	total: 11.1s	remaining: 11.1s
600:	learn: 215623.6617374	test: 217585.1547438	best: 217585.1547438 (600)	total: 13.3s	remaining: 8.86s
700:	learn: 213229.2242661	test: 216366.9781962	best: 216366.9781962 (700)	total: 15.6s	remaining: 6.66s
800:	learn: 211353.6867350	test: 215345.3956349	best: 215345.3956349 (800)	total: 18s	remaining: 4.47s
900:	learn: 210054.6461707	test: 214745.0912915	best: 214745.0912

<catboost.core.CatBoostRegressor at 0x7d9ffa273b50>

In [25]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``
    and the mean of those terms is multiplied by 100.

    Elements where truth and prediction are both zero have a zero denominator
    and zero error; they contribute 0 instead of turning the result into NaN.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Same length; compared position by position.

    Returns
    -------
    float
        SMAPE in the range 0-200.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    abs_error = np.abs(actual - predicted)

    # Divide only where the denominator is non-zero; the pre-filled zeros are
    # the result for the 0/0 positions.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# Score the raw-price model on the hold-out split.
val_preds = model.predict(data=val_pool)
print("SMAPE on validation:", smape(y_val, val_preds))


SMAPE on validation: 38.59209297677174


In [26]:
# Predict raw prices for the test set with the same feature columns.
X_test = test.loc[:, feature_cols]
test_pool = Pool(data=X_test, cat_features=cat_features)
test_preds = model.predict(data=test_pool)


In [27]:
# Fill the sample submission with the predictions (floored at 0) and save it.
sample = pd.read_csv(
    '/kaggle/input/russian-car-plates-prices-prediction/sample_submission.csv'
)
sample['price'] = np.clip(test_preds, 0, None)
sample.to_csv('submission.csv', index=False)


In [28]:
# Refit the same regressor on log1p(price).
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

# Rebuild the pools around the transformed target.
train_pool = Pool(data=X_train, label=y_train_log, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val_log, cat_features=cat_features)

model.fit(train_pool, eval_set=val_pool)


0:	learn: 0.8994074	test: 0.9048130	best: 0.9048130 (0)	total: 29.5ms	remaining: 29.5s
100:	learn: 0.4650027	test: 0.4636064	best: 0.4636064 (100)	total: 1.88s	remaining: 16.7s
200:	learn: 0.4395449	test: 0.4401824	best: 0.4401824 (200)	total: 4.31s	remaining: 17.1s
300:	learn: 0.4267986	test: 0.4299595	best: 0.4299595 (300)	total: 6.81s	remaining: 15.8s
400:	learn: 0.4181857	test: 0.4238685	best: 0.4238685 (400)	total: 8.9s	remaining: 13.3s
500:	learn: 0.4120265	test: 0.4198181	best: 0.4198181 (500)	total: 11s	remaining: 10.9s
600:	learn: 0.4073537	test: 0.4172774	best: 0.4172774 (600)	total: 13.1s	remaining: 8.71s
700:	learn: 0.4033505	test: 0.4153610	best: 0.4153610 (700)	total: 15.4s	remaining: 6.55s
800:	learn: 0.4000283	test: 0.4139181	best: 0.4139181 (800)	total: 17.7s	remaining: 4.4s
900:	learn: 0.3969498	test: 0.4125221	best: 0.4125221 (900)	total: 19.9s	remaining: 2.18s
999:	learn: 0.3944679	test: 0.4114254	best: 0.4114254 (999)	total: 22.1s	remaining: 0us

bestTest = 0.41142

<catboost.core.CatBoostRegressor at 0x7d9ffa273b50>

In [29]:
# Predict in log space, then map back to prices with expm1 before scoring.
val_preds_log = model.predict(data=val_pool)
val_preds = np.expm1(val_preds_log)

print("SMAPE:", smape(y_val, val_preds))


SMAPE: 37.99419141326699


In [30]:
# Test-set predictions from the log-target model, converted back to prices.
X_test = test.loc[:, feature_cols]
test_pool = Pool(data=X_test, cat_features=cat_features)
test_preds_log = model.predict(data=test_pool)
test_preds = np.expm1(test_preds_log)

In [31]:
# Overwrite submission.csv with the log-model predictions (floored at 0).
sample = pd.read_csv(
    '/kaggle/input/russian-car-plates-prices-prediction/sample_submission.csv'
)
sample['price'] = np.clip(test_preds, 0, None)
sample.to_csv('submission.csv', index=False)
