In [1]:
# Importing essential libraries
import pandas as pd # For data manipulation and analysis
import numpy as np  # For numerical operations
import re           # For regular expressions (used in plate parsing)
import sys          # For system-specific parameters and functions (like path manipulation)
import os           # For interacting with the operating system (e.g., file paths)

# Scikit-learn for preprocessing and model selection utilities
from sklearn.preprocessing import StandardScaler, MinMaxScaler # For numerical scaling
from sklearn.impute import SimpleImputer # For handling missing values
from sklearn.model_selection import KFold, StratifiedKFold # For cross-validation strategies
from sklearn.ensemble import GradientBoostingRegressor # An example of a tree-based regressor
from sklearn.linear_model import Ridge # An example of a linear model
from sklearn.metrics import mean_squared_error # Metric for regression evaluation

In [2]:
# Load the training and test datasets
print("Loading datasets...")
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Combine train and test datasets for preprocessing
print("Combining datasets for preprocessing...")
train['is_train'] = 1
test['is_train'] = 0
df = pd.concat([train, test], ignore_index=True)
print("Combined shape:", df.shape)

Loading datasets...
Train shape: (51635, 4)
Test shape: (7695, 4)
Combining datasets for preprocessing...
Combined shape: (59330, 5)


### Plate Component Extraction and Basic Feature Engineering

In [3]:
def extract_components(plate):
    match = re.match(r'^([A-Z])(\d{3})([A-Z]{2})(\d{2,3})$', plate)
    if match:
        first_letter = match.group(1)
        number = int(match.group(2))
        middle_letters = match.group(3)
        region_code = match.group(4)
        letters = first_letter + middle_letters
        return first_letter, number, middle_letters, region_code, letters
    return None, None, None, None, None

# Apply the plate parsing function to the 'plate' column
print("Parsing plate numbers...")
df[['first_letter', 'number', 'middle_letters', 'region_code', 'letters']] = df['plate'].apply(
    lambda x: pd.Series(extract_components(x))
)

# Convert date to datetime format
df['date'] = pd.to_datetime(df['date'])
print("Date conversion complete.")

df.head()

Parsing plate numbers...
Date conversion complete.


Unnamed: 0,id,plate,date,price,is_train,first_letter,number,middle_letters,region_code,letters
0,1,X059CP797,2024-12-26 00:00:00,65000.0,1,X,59,CP,797,XCP
1,2,Y800MH790,2024-07-12 21:31:37,100000.0,1,Y,800,MH,790,YMH
2,3,A212TX77,2024-04-18 00:00:00,290000.0,1,A,212,TX,77,ATX
3,4,P001AY199,2025-01-03 00:27:15,680000.0,1,P,1,AY,199,PAY
4,5,P001AY199,2025-01-10 09:32:41,750000.0,1,P,1,AY,199,PAY


In [4]:
# get temporal features from date
def get_temporal_features(df):
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(int)
    df['quarter'] = df['date'].dt.quarter
    df['total_days'] = (df['date'] - df['date'].min()).dt.days
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['day_name'] = df['date'].dt.day_name()

    # Cyclical Features: Sine and cosine transformations for periodic features
    # These help models understand the cyclical nature without implying a linear relationship.
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)
    df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)
    df['weekday_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['weekday_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    return df

# Apply the temporal feature extraction
print("Extracting temporal features...")
df = get_temporal_features(df)
df.head()

Extracting temporal features...


Unnamed: 0,id,plate,date,price,is_train,first_letter,number,middle_letters,region_code,letters,...,quarter,total_days,is_weekend,day_name,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos
0,1,X059CP797,2024-12-26 00:00:00,65000.0,1,X,59,CP,797,XCP,...,4,1407,0,Thursday,-2.449294e-16,1.0,-0.848644,0.528964,0.433884,-0.900969
1,2,Y800MH790,2024-07-12 21:31:37,100000.0,1,Y,800,MH,790,YMH,...,3,1241,0,Friday,-0.5,-0.866025,0.651372,-0.758758,-0.433884,-0.900969
2,3,A212TX77,2024-04-18 00:00:00,290000.0,1,A,212,TX,77,ATX,...,2,1155,0,Thursday,0.8660254,-0.5,-0.485302,-0.874347,0.433884,-0.900969
3,4,P001AY199,2025-01-03 00:27:15,680000.0,1,P,1,AY,199,PAY,...,1,1415,0,Friday,0.5,0.866025,0.571268,0.820763,-0.433884,-0.900969
4,5,P001AY199,2025-01-10 09:32:41,750000.0,1,P,1,AY,199,PAY,...,1,1422,0,Friday,0.5,0.866025,0.897805,-0.440394,-0.433884,-0.900969


#### Extracting Information from Supplemental Data

In [5]:
REGION_CODES = {
    "Republic of Adygea": ["01"],
    "Altai Republic": ["04"],
    "Republic of Bashkortostan": ["02", "102", "702"],
    "Republic of Buryatia": ["03"],
    "Republic of Dagestan": ["05"],
    "Donetsk People's Republic": ["80", "180"],
    "Republic of Ingushetia": ["06"],
    "Kabardino-Balkarian Republic": ["07"],
    "Republic of Kalmykia": ["08"],
    "Karachay-Cherkess Republic": ["09"],
    "Republic of Karelia": ["10"],
    "Komi Republic": ["11"],
    "Republic of Crimea": ["82"],
    "Luhansk People's Republic": ["81", "181"],
    "Republic of Mari El": ["12"],
    "Republic of Mordovia": ["13", "113"],
    "Sakha Republic": ["14"],
    "Republic of North Ossetia": ["15"],
    "Republic of Tatarstan": ["16", "116", "716"],
    "Republic of Tyva (Tuva)": ["17"],
    "Udmurt Republic": ["18"],
    "Republic of Khakassia": ["19"],
    "Chechen Republic": ["20", "95"],
    "Chuvash Republic": ["21", "121"],
    "Altai Krai": ["22", "122"],
    "Zabaykalsky Krai": ["75"],
    "Kamchatka Krai": ["41"],
    "Krasnodar Krai": ["23", "93", "123", "193", "323"],
    "Krasnoyarsk Krai": ["24", "84", "88", "124"],
    "Perm Krai": ["59", "81", "159"],
    "Primorsky Krai": ["25", "125"],
    "Stavropol Krai": ["26", "126"],
    "Khabarovsk Krai": ["27"],
    "Amur Oblast": ["28"],
    "Arkhangelsk Oblast": ["29"],
    "Astrakhan Oblast": ["30", "130"],
    "Belgorod Oblast": ["31"],
    "Bryansk Oblast": ["32"],
    "Vladimir Oblast": ["33"],
    "Volgograd Oblast": ["34", "134"],
    "Vologda Oblast": ["35"],
    "Voronezh Oblast": ["36", "136"],
    "Zaporizhzhia Oblast": ["85", "185"],
    "Ivanovo Oblast": ["37"],
    "Irkutsk Oblast": ["38", "85", "138"],
    "Kaliningrad Oblast": ["39", "91"],
    "Kaluga Oblast": ["40"],
    "Kemerovo Oblast": ["42", "142"],
    "Kirov Oblast": ["43"],
    "Kostroma Oblast": ["44"],
    "Kurgan Oblast": ["45"],
    "Kursk Oblast": ["46"],
    "Leningrad Oblast": ["47", "147"],
    "Lipetsk Oblast": ["48"],
    "Magadan Oblast": ["49"],
    "Moscow Oblast": ["50", "90", "150", "190", "250", "550", "750", "790"],
    "Murmansk Oblast": ["51"],
    "Nizhny Novgorod Oblast": ["52", "152", "252"],
    "Novgorod Oblast": ["53"],
    "Novosibirsk Oblast": ["54", "154", "754"],
    "Omsk Oblast": ["55", "155"],
    "Orenburg Oblast": ["56", "156"],
    "Oryol Oblast": ["57"],
    "Penza Oblast": ["58", "158"],
    "Pskov Oblast": ["60"],
    "Rostov Oblast": ["61", "161", "761"],
    "Ryazan Oblast": ["62"],
    "Samara Oblast": ["63", "163", "763"],
    "Saratov Oblast": ["64", "164"],
    "Sakhalin Oblast": ["65"],
    "Sverdlovsk Oblast": ["66", "96", "196"],
    "Smolensk Oblast": ["67"],
    "Tambov Oblast": ["68"],
    "Tver Oblast": ["69"],
    "Tomsk Oblast": ["70"],
    "Tula Oblast": ["71"],
    "Tyumen Oblast": ["72", "172"],
    "Ulyanovsk Oblast": ["73", "173"],
    "Kherson Oblast": ["84", "184"],
    "Chelyabinsk Oblast": ["74", "174", "774"],
    "Yaroslavl Oblast": ["76"],
    "Moscow": ["77", "97", "99", "177", "197", "199", "777", "797", "799", "977"],
    "Saint Petersburg": ["78", "98", "178", "198"],
    "Sevastopol": ["92"],
    "Jewish Autonomous Oblast": ["79"],
    "Nenets Autonomous Okrug": ["83"],
    "Khanty-Mansi Autonomous Okrug": ["86", "186"],
    "Chukotka Autonomous Okrug": ["87"],
    "Yamalo-Nenets Autonomous Okrug": ["89"],
    "Baikonur": ["94"],
    "Occupational Administration of Kharkiv Oblast": ["188"],
}

In [6]:
# ((letters, numbers range (from, to), region code), is it forbidden to buy (bool), do they have an advantage on the road (bool), level of significance (author's opinion))
GOVERNMENT_CODES = {
    # Moscow
    ("AMP", (0, 999), "97"): ("Government of Russia", 1, 1, 10),
    ("AMP", (0, 999), "77"): ("Partially Government of Russia", 0, 1, 8),
    ("EKX", (0, 999), "77"): ("Partially Federal Protective Service (Federal Protective Service)", 0, 1, 6),
    ("EKX", (0, 999), "97"): ("Partially Federal Protective Service (Federal Protective Service)", 0, 1, 6),
    ("EKX", (0, 999), "99"): ("Partially Federal Protective Service (Federal Protective Service)", 0, 1, 6),
    ("KKX", (0, 999), "77"): ("Partially used on vehicles of Ministry of Security/Federal Counterintelligence Service /Federal Security Service of Russia", 0, 0, 1),
    ("CAC", (500, 999), "77"): ("Former officially 'open' plates of Ministry of Security/Federal Counterintelligence Service /Federal Security Service of Russia", 0, 0, 1),
    ("CAC", (500, 999), "77"): ("Former officially 'open' plates of Ministry of Security/Federal Counterintelligence Service /Federal Security Service of Russia", 0, 0, 1),
    ("AOO", (0, 999), "77"): ("Partially Presidential Administrative Directorate plates", 0, 1, 6),
    ("BOO", (0, 999), "77"): ("Partially Presidential Administrative Directorate plates", 0, 1, 6),
    ("MOO", (0, 999), "77"): ("Partially Presidential Administrative Directorate plates", 0, 1, 6),
    ("COO", (0, 999), "77"): ("Partially Administrative Directorate, Federation Council plates", 0, 1, 6),
    ("AMM", (0, 999), "99"): ("Partially plates of Moscow City Duma deputies, police", 0, 1, 4),
    ("CCC", (0, 999), "77"): ("Partially Central Special Communication, Courier Service, Ministry of Communications", 0, 1, 3),
    ("CCC", (0, 999), "99"): ("Partially Tax Police, Customs, Special Communications", 0, 1, 3),
    ("CCC", (0, 999), "97"): ("Partially Central Special Communication, Courier Service, Ministry of Communications", 0, 1, 3),
    ("KKK", (0, 999), "99"): ("Initially belonged to Courier Service, now used among private individuals", 0, 0, 1),
    ("OOO", (0, 999), "77"): ("Initially intended for Federal Security Service", 0, 0, 1),
    ("KMM", (0, 999), "77"): ("Partially Fire Department plates", 0, 1, 3),
    ("MMP", (300, 320), "77"): ("Partially Federal Security Service plates", 0, 1, 4),
    ("MMP", (0, 299), "77"): ("Partially Government of Russia, Federal Security Service, banks, and private individuals with connections in the traffic police", 0, 1, 2),
    ("MMP", (321, 999), "77"): ("Partially Government of Russia, Federal Security Service, banks, and private individuals with connections in the traffic police", 0, 1, 2),
    ("PMP", (0, 999), "77"): ("Partially Ministry of Justice plates", 0, 1, 3),
    ("AMO", (0, 999), "77"): ("Partially Moscow City Hall plates", 0, 1, 5),
    ("KOO", (0, 999), "77"): ("Partially Constitutional Court plates", 0, 1, 3),
    ("EPE", (0, 999), "77"): ("Partially State Duma plates", 0, 1, 3),
    ("AAA", (0, 999), "77"): ("Partially Administration of the President plates", 0, 1, 6),
    ("KMP", (0, 999), "77"): ("Partially Government of Russia plates", 0, 1, 3),
    ("TMP", (0, 999), "77"): ("Partially Government of Russia plates, as well as private individuals with connections in the traffic police", 0, 1, 2),
    ("YMP", (0, 999), "77"): ("Partially Government of Russia plates, as well as private individuals with connections in the traffic police", 0, 1, 2),
    ("XXX", (0, 999), "77"): ("Private individuals with connections in the traffic police", 0, 1, 2),
    ("YYY", (0, 999), "77"): ("Private individuals with connections in the traffic police", 0, 1, 2),
    ("XKX", (0, 999), "77"): ("Partially Federal Security Service and Federal Protective Service plates", 0, 1, 2),
    ("OMP", (0, 999), "77"): ("Partially Government of Russia, banks, and private individuals with connections in the traffic police", 0, 1, 2),
    ("EEE", (0, 999), "77"): ("Private individuals with connections in the traffic police", 0, 1, 2),

    # Moscow Oblast
    ("AMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("BMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("KMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("CMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("OMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("MMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("TMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("HMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("YMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("XMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("AMM", (0, 999), "50"): ("Partially plates of the regional administration", 0, 1, 5),
    ("AMM", (0, 999), "90"): ("Partially plates of the regional administration", 0, 1, 5),
    ("MMM", (0, 999), "50"): ("Partially plates of law enforcement in the region (prosecutor's office, EMERCOM, traffic police, etc.)", 0, 1, 5),
    ("MMM", (0, 999), "90"): ("Partially plates of law enforcement in the region (prosecutor's office, EMERCOM, traffic police, etc.)", 0, 1, 5),

    # Saint Petersburg
    ("OBO", (0, 999), "78"): ("Partially Departmental Security Service plates", 0, 1, 4),
    ("OBO", (0, 999), "98"): ("Partially Departmental Security Service plates", 0, 1, 4),
    ("OTT", (0, 999), "78"): ("Partially former traffic police plates (now replaced by 98)", 0, 0, 1),
    ("OTT", (0, 999), "98"): ("Partially traffic police plates", 0, 1, 4),
    ("OMM", (0, 999), "78"): ("Partially city district police plates", 0, 1, 3),
    ("OMM", (0, 999), "98"): ("Partially city district police plates", 0, 1, 3),
    ("OOM", (0, 999), "78"): ("Partially plates of the Main Department of Internal Affairs", 0, 1, 3),
    ("OOM", (0, 999), "98"): ("Partially plates of the Main Department of Internal Affairs", 0, 1, 3),
    ("OKO", (0, 100), "78"): ("Partially former plates of the prosecutor's office and judicial department (now replaced by 98)", 0, 0, 1),
    ("OKO", (0, 100), "98"): ("Partially plates of the prosecutor's office and judicial department", 0, 1, 3),
    ("OKO", (700, 999), "78"): ("Partially former Federal Security Service plates (now replaced by 98)", 0, 0, 1),
    ("OKO", (700, 999), "98"): ("Partially Federal Security Service plates", 0, 1, 3),
    ("OPP", (0, 999), "78"): ("Partially former plates of the Main Department of Internal Affairs (now replaced by 98)", 0, 0, 1),
    ("OPP", (0, 999), "98"): ("Partially plates of the Main Department of Internal Affairs", 0, 1, 3),
    ("OOH", (0, 999), "78"): ("Partially Federal Drug Control Service and Federal Tax Service plates", 0, 1, 3),
    ("OOH", (0, 999), "98"): ("Partially Federal Drug Control Service and Federal Tax Service plates", 0, 1, 3),
    ("OAO", (0, 999), "78"): ("Partially plates of the city and regional administration", 0, 1, 5),
    ("OAO", (0, 999), "98"): ("Partially plates of the city and regional administration", 0, 1, 5),
    ("AAA", (0, 100), "78"): ("Partially plates of the city and regional administration", 0, 1, 6),
    ("AAA", (0, 100), "98"): ("Partially plates of the city and regional administration", 0, 1, 6),
    ("OOO", (0, 899), "78"): ("Commercial plates", 0, 0, 2),
    ("OOO", (0, 899), "98"): ("Commercial plates", 0, 0, 2),
    ("OOO", (900, 999), "78"): ("Partially Federal Protective Service plates", 0, 1, 3),
    ("OOO", (900, 999), "98"): ("Partially Federal Protective Service plates", 0, 1, 3),
    ("OKC", (0, 999), "98"): ("Partially Constitutional Court of the Russian Federation plates", 0, 1, 3),
    ("OOC", (0, 999), "78"): ("Partially plates of heads of enterprises and organizations", 0, 0, 2),
    ("OOC", (0, 999), "98"): ("Partially plates of heads of enterprises and organizations", 0, 0, 2),
    ("MMM", (0, 999), "78"): ("Commercial plates", 0, 0, 2),
    ("MMM", (0, 999), "98"): ("Commercial plates", 0, 0, 2),

    # Altai Republic
    ("XXX", (0, 999), "04"): ("Widespread 'special' plates", 0, 0, 2),
    ("TTT", (0, 999), "04"): ("Rare 'special' plates", 0, 0, 2),
    ("PPP", (0, 999), "04"): ("Partially prosecutor's office of the republic", 0, 1, 3),
    ("PPA", (0, 999), "04"): ("Partially prosecutor's office of the republic", 0, 1, 3),
    ("MPA", (0, 999), "04"): ("Partially Ministry of Internal Affairs of the republic", 0, 1, 3),
    ("OOO", (0, 999), "04"): ("Partially plates of the government of the republic", 0, 1, 5),
    ("HHH", (0, 999), "04"): ("Partially the republic's tax service plates", 0, 1, 3),
    ("CCC", (0, 999), "04"): ("Partially plates belonging to the republic's judges", 0, 1, 3),

    # Republic of Bashkortostan
    ("PKC", (0, 999), "02"): ("Partially State Assembly (Kurultai) plates", 0, 1, 5),
    ("KKC", (0, 999), "02"): ("Partially State Assembly (Kurultai) plates", 0, 1, 5),
    ("OOO", (0, 999), "02"): ("Partially plates of leaders of large enterprises and ministries", 0, 1, 3),
    ("AAA", (0, 999), "02"): ("Partially plates of the republic's government", 0, 1, 5),

    # Republic of Karelia
    ("TTT", (0, 999), "10"): ("Partially government of the republic and Federal Security Service plates", 0, 1, 5),
    ("HHH", (0, 999), "10"): ("Partially plates of city and district administrations of the republic", 0, 1, 4),
    ("MMM", (0, 999), "10"): ("Partially plates of the Ministry of Internal Affairs of the republic", 0, 1, 3),
    ("EMP", (0, 999), "10"): ("Partially plates of the Ministry of Internal Affairs of the republic", 0, 1, 3),
    ("CCC", (0, 999), "10"): ("Partially plates of the prosecutor's office and judges' vehicles", 0, 1, 3),

    # Komi Republic
    ("TTT", (0, 999), "11"): ("Partially government of the republic and Federal Security Service plates", 0, 1, 5),
    ("OOO", (0, 999), "11"): ("Widespread semi-special plates, leaders of large industrial companies", 0, 1, 3),

    # Sakha Republic
    ("PPP", (0, 999), "14"): ("Partially plates of the republic's prosecutor's office", 0, 1, 3),
    ("AAA", (0, 999), "14"): ("Motor pool of the President, Government, Parliament of the republic, as well as heads of state enterprises", 0, 1, 5),

    # Republic of Tatarstan
    ("OAA", (0, 999), "16"): ("Partially plates of heads of district administrations", 0, 1, 5),
    ("OAA", (0, 999), "116"): ("Partially plates of heads of district administrations", 0, 1, 5),
    ("OAA", (0, 999), "716"): ("Partially plates of heads of district administrations", 0, 1, 5),

    # Krasnodar Krai
    ("PPP", (0, 999), "23"): ("Partially plates of the Krai and city administrations", 0, 1, 5),
    ("HHH", (0, 999), "23"): ("Partially plates of the tax authorities", 0, 1, 3),
    ("OOO", (0, 999), "23"): ("Partially plates of the Krai and city administrations", 0, 1, 5),
    ("KKK", (0, 999), "23"): ("Partially plates of the Krai administration", 0, 1, 5),

    # Krasnoyarsk Krai
    ("KPK", (0, 999), "24"): ("Partially plates of the Krai administration", 0, 1, 5),
    ("OOO", (0, 999), "24"): ("Partially Federal Security Service plates of the Krai", 0, 1, 3),
    ("MKK", (0, 999), "24"): ("Partially former plates of the Ministry of Internal Affairs of the Krai", 0, 0, 1),

    # Primorsky Krai
    ("BOO", (0, 999), "25"): ("Partially military plates", 0, 1, 3),
    ("BOO", (0, 999), "125"): ("Partially city services plates in Vladivostok and districts", 0, 1, 2),
    ("AAA", (0, 999), "25"): ("Issued first in Vladivostok", 0, 0, 2),
    ("AAA", (0, 999), "125"): ("One of the most 'special' series, prosecutor's office", 1, 1, 5),
    ("HHH", (0, 999), "25"): ("Partially plates of the administration and vehicles of City Duma deputies", 0, 1, 3),
    ("MMM", (0, 999), "25"): ("Partially plates of the deputies of the Krai Legislative Assembly", 0, 1, 3),
    ("CCC", (0, 999), "25"): ("Partially plates of the Krai administration", 0, 1, 5),
    ("XXX", (0, 999), "25"): ("Partially plates of the prosecutor's office and the Department of Internal Affairs", 0, 1, 2),
    ("OOO", (0, 999), "25"): ("Partially former plates of the Krai administration (during Governor Evgeny Nazdratenko)", 0, 0, 1),
    ("TTT", (0, 999), "25"): ("Partially former plates of the Vladivostok administration and federal agencies in the Krai (during Mayor Yuri Kopylov)", 0, 0, 1),
    ("MBK", (0, 999), "25"): ("Partially plates for employees of the Department of Internal Affairs", 0, 1, 3),
    ("MBK", (0, 999), "125"): ("Partially plates for employees of the Department of Internal Affairs", 0, 1, 3),
    ("MOO", (0, 999), "25"): ("Partially plates for Krai agencies of the Department of Internal Affairs, EMERCOM, firefighters, etc.", 0, 1, 2),
    ("MOO", (0, 999), "125"): ("Partially plates for Krai agencies of the Department of Internal Affairs, EMERCOM, firefighters, etc.", 0, 1, 2),
    ("HOO", (0, 999), "25"): ("Partially plates of the Department of Internal Affairs, traffic police in the southeastern region of the Krai (Nakhodka)", 0, 1, 3),
    ("HOO", (0, 999), "125"): ("Partially plates of the Department of Internal Affairs, traffic police in the southeastern region of the Krai (Nakhodka)", 0, 1, 3),
    ("YOO", (0, 999), "25"): ("Partially plates of the Department of Internal Affairs, traffic police in the central region of the Krai (Ussuriysk)", 0, 1, 3),
    ("YOO", (0, 999), "125"): ("Partially plates of the Department of Internal Affairs, traffic police in the central region of the Krai (Ussuriysk)", 0, 1, 3),
    ("COO", (0, 999), "25"): ("Partially plates of the Department of Internal Affairs, traffic police in the northern region of the Krai (Spassk-Dalny)", 0, 1, 3),
    ("COO", (0, 999), "125"): ("Partially plates of the Department of Internal Affairs, traffic police in the northern region of the Krai (Spassk-Dalny)", 0, 1, 3),

    # Vologda Oblast
    ("AAA", (0, 999), "35"): ("Partially plates of the regional government and Vologda city administration", 0, 1, 5),

    # Volgograd Oblast
    ("AAM", (0, 999), "34"): ("Partially plates of the Oblast Duma", 0, 1, 3),
    ("PAA", (0, 999), "34"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("AAA", (0, 999), "34"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),
    ("ACK", (0, 999), "34"): ("Partially plates of the Investigative Committee, Main Department of Internal Affairs", 0, 1, 3),
    ("YYY", (0, 999), "34"): ("Partially Federal Security Service plates", 0, 1, 3),
    ("AAK", (0, 999), "34"): ("Partially plates of the Federal Bailiff Service, Ministry of Justice, and Judicial Department", 0, 1, 3),

    # Voronezh Oblast
    ("ААА", (0, 999), "36"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("BOA", (0, 999), "36"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("MMM", (0, 999), "36"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),

    # Kaliningrad Oblast
    ("AAK", (0, 999), "39"): ("Partially plates of the Oblast Administration, Federal Security Service, and Prosecutor's Office", 0, 1, 5),
    ("KKK", (0, 999), "39"): ("Partially plates of the Oblast Administration, Federal Security Service, and Prosecutor's Office", 0, 1, 5),
    ("PPP", (0, 999), "39"): ("Partially former plates of the Oblast Administration (during Governor Boos)", 0, 0, 1),

    # Kaluga Oblast
    ("OOO", (0, 999), "40"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("TTT", (0, 999), "40"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("PPP", (0, 999), "40"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),

    # Kurgan Oblast
    ("OOO", (0, 999), "45"): ("Partially former plates of the Oblast Administration", 0, 0, 1),
    ("TTT", (0, 999), "45"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("OKO", (0, 999), "45"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),

    # Novosibirsk Oblast
    ("AAA", (0, 199), "54"): ("Plates for Presidential Plenipotentiaries", 1, 1, 7),
    ("AAA", (200, 999), "54"): ("'Special' plates", 0, 1, 4),
    ("HHH", (0, 999), "54"): ("Partially plates of the Novosibirsk mayor's office, Oblast Administration, and Oblast Council", 0, 1, 5),
    ("ACK", (0, 999), "54"): ("Partially Federal Security Service plates of the Oblast", 0, 1, 3),
    ("AHO", (0, 999), "54"): ("Partially former plates of the Oblast Administration", 0, 0, 1),
    ("AAO", (0, 999), "54"): ("Partially plates of various government agencies, including district administrations of Novosibirsk", 0, 1, 3),
    ("PPP", (0, 999), "54"): ("'Morozov' plates (introduced by former head of traffic police Pyotr Morozov)", 0, 1, 2),
    ("MOP", (0, 999), "54"): ("'Morozov' plates (introduced by former head of traffic police Pyotr Morozov)", 0, 1, 2),

    # Oryol Oblast
    ("AAA", (0, 999), "57"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("AOO", (0, 999), "57"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("OAO", (0, 999), "57"): ("Partially plates of directors of public joint-stock companies", 0, 1, 2),

    # Rostov Oblast
    ("APO", (0, 999), "61"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("AAA", (0, 999), "61"): ("Partially plates of district heads of Rostov-on-Don, mayors of Oblast cities", 0, 1, 5),
    ("APY", (0, 999), "61"): ("Partially plates of the Rostov-on-Don administration", 0, 1, 5),
    ("KKK", (0, 999), "61"): ("Partially former plates for Presidential Plenipotentiaries (during Viktor Kazantsev)", 0, 0, 1),
    ("HHH", (0, 999), "61"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),
    ("MMM", (0, 999), "61"): ("Partially plates of the Oblast Department of Internal Affairs", 0, 1, 3),
    ("OOO", (0, 999), "61"): ("Partially plates of the Oblast Legislative Assembly", 0, 1, 4),
    ("BBK", (0, 999), "61"): ("Partially plates of insurance companies in Rostov-on-Don", 0, 1, 1),

    # Saratov Oblast
    ("AAA", (0, 999), "164"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("PPP", (0, 999), "164"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("XXX", (0, 999), "64"): ("Partially plates of the Oblast courts", 0, 1, 3),
    ("MMM", (0, 999), "64"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),
    ("OAA", (0, 999), "64"): ("Partially Federal Security Service plates of the Oblast", 0, 1, 3),

    # Tomsk Oblast
    ("ATO", (0, 999), "70"): ("Partially plates of the Oblast Administration", 0, 1, 5),

    # Tyumen Oblast
    ("ATO", (0, 999), "72"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("PTO", (0, 999), "72"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("MTO", (0, 999), "72"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),
    ("HTO", (0, 999), "72"): ("Partially plates of the Tax Service", 0, 1, 3),
    ("CTO", (0, 999), "72"): ("Partially plates of the Oblast courts", 0, 1, 3),
    ("YTO", (0, 999), "72"): ("Partially plates of the bailiff service", 0, 1, 3),
    ("BAA", (0, 999), "72"): ("Partially plates of the Oblast Ministry of Internal Affairs", 0, 1, 3),
    ("KKK", (0, 999), "72"): ("'Gangster' plates", 0, 1, 1),

    # Arkhangelsk Oblast
    ("TTT", (0, 999), "29"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("PPP", (0, 999), "29"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("MAO", (0, 999), "29"): ("Partially plates of the Oblast Ministry of Internal Affairs", 0, 1, 3),

    # Ryazan Oblast
    ("APO", (0, 999), "62"): ("Partially plates of the Oblast Administration", 0, 1, 5),

    # Samara Oblast
    ("PAA", (0, 999), "63"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("AAP", (0, 999), "63"): ("Partially plates of the Oblast Administration", 0, 1, 5),
}

In [7]:
REGION_CODES_DF = pd.DataFrame(REGION_CODES.items(), columns=['region_name', 'region_code'])
REGION_CODES_DF = REGION_CODES_DF.explode('region_code').reset_index(drop=True)

REGION_CODES_DF

Unnamed: 0,region_name,region_code
0,Republic of Adygea,01
1,Altai Republic,04
2,Republic of Bashkortostan,02
3,Republic of Bashkortostan,102
4,Republic of Bashkortostan,702
...,...,...
155,Khanty-Mansi Autonomous Okrug,186
156,Chukotka Autonomous Okrug,87
157,Yamalo-Nenets Autonomous Okrug,89
158,Baikonur,94


In [8]:
# merge main df with REGION_CODES_DF
df = df.merge(REGION_CODES_DF, left_on='region_code', right_on='region_code', how='left')
df['region_code'] = df['region_code'].astype(int)

df.head()

Unnamed: 0,id,plate,date,price,is_train,first_letter,number,middle_letters,region_code,letters,...,total_days,is_weekend,day_name,month_sin,month_cos,day_sin,day_cos,weekday_sin,weekday_cos,region_name
0,1,X059CP797,2024-12-26 00:00:00,65000.0,1,X,59,CP,797,XCP,...,1407,0,Thursday,-2.449294e-16,1.0,-0.848644,0.528964,0.433884,-0.900969,Moscow
1,2,Y800MH790,2024-07-12 21:31:37,100000.0,1,Y,800,MH,790,YMH,...,1241,0,Friday,-0.5,-0.866025,0.651372,-0.758758,-0.433884,-0.900969,Moscow Oblast
2,3,A212TX77,2024-04-18 00:00:00,290000.0,1,A,212,TX,77,ATX,...,1155,0,Thursday,0.8660254,-0.5,-0.485302,-0.874347,0.433884,-0.900969,Moscow
3,4,P001AY199,2025-01-03 00:27:15,680000.0,1,P,1,AY,199,PAY,...,1415,0,Friday,0.5,0.866025,0.571268,0.820763,-0.433884,-0.900969,Moscow
4,5,P001AY199,2025-01-10 09:32:41,750000.0,1,P,1,AY,199,PAY,...,1422,0,Friday,0.5,0.866025,0.897805,-0.440394,-0.433884,-0.900969,Moscow


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59335 entries, 0 to 59334
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              59335 non-null  int64         
 1   plate           59335 non-null  object        
 2   date            59335 non-null  datetime64[ns]
 3   price           51640 non-null  float64       
 4   is_train        59335 non-null  int64         
 5   first_letter    59335 non-null  object        
 6   number          59335 non-null  int64         
 7   middle_letters  59335 non-null  object        
 8   region_code     59335 non-null  int64         
 9   letters         59335 non-null  object        
 10  year            59335 non-null  int32         
 11  month           59335 non-null  int32         
 12  day             59335 non-null  int32         
 13  day_of_week     59335 non-null  int32         
 14  week_of_year    59335 non-null  int64         
 15  qu

In [10]:
# Initialize governmental information columns with default values
df['is_government'] = 0
df['government_agency'] = None
df['forbidden_to_buy'] = False
df['road_advantage'] = False
df['significance_level'] = 0

# Improved function to extract governmental plate information using 'GOVERNMENT_CODES'
def get_government_info(row):
    """
    Retrieves information about governmental plates based on plate components
    and the 'GOVERNMENT_CODES' dictionary.
    """
    # Handle missing plate components gracefully
    if pd.isna(row['first_letter']) or pd.isna(row['number']) or pd.isna(row['region_code']):
        return 0, None, False, False, 0
    
    row_letters = row['letters']
    numbers = int(row['number']) if pd.notna(row['number']) else -1
    region_code = row['region_code']
    
    # Iterate through the defined governmental codes
    for (letters, (start, end), code), (agency, forbidden, advantage, significance) in GOVERNMENT_CODES.items():
        # Check if the plate matches any governmental pattern
        if row_letters == letters and region_code == int(code) and start <= numbers <= end:
            return 1, agency, bool(forbidden), bool(advantage), significance
    
    return 0, None, False, False, 0 # Default if not governmental

# Apply the function to each row to populate governmental features
print("Extracting governmental plate information...")
govt_info = df.apply(get_government_info, axis=1)
df['is_government'] = [info[0] for info in govt_info]
df['government_agency'] = [info[1] for info in govt_info]
df['forbidden_to_buy'] = [info[2] for info in govt_info]
df['road_advantage'] = [info[3] for info in govt_info]
df['significance_level'] = [info[4] for info in govt_info]
print("Governmental plate information extracted and new features created.")

# Display the first few rows of the updated DataFrame
df.head()

Extracting governmental plate information...
Governmental plate information extracted and new features created.


Unnamed: 0,id,plate,date,price,is_train,first_letter,number,middle_letters,region_code,letters,...,day_sin,day_cos,weekday_sin,weekday_cos,region_name,is_government,government_agency,forbidden_to_buy,road_advantage,significance_level
0,1,X059CP797,2024-12-26 00:00:00,65000.0,1,X,59,CP,797,XCP,...,-0.848644,0.528964,0.433884,-0.900969,Moscow,0,,False,False,0
1,2,Y800MH790,2024-07-12 21:31:37,100000.0,1,Y,800,MH,790,YMH,...,0.651372,-0.758758,-0.433884,-0.900969,Moscow Oblast,0,,False,False,0
2,3,A212TX77,2024-04-18 00:00:00,290000.0,1,A,212,TX,77,ATX,...,-0.485302,-0.874347,0.433884,-0.900969,Moscow,0,,False,False,0
3,4,P001AY199,2025-01-03 00:27:15,680000.0,1,P,1,AY,199,PAY,...,0.571268,0.820763,-0.433884,-0.900969,Moscow,0,,False,False,0
4,5,P001AY199,2025-01-10 09:32:41,750000.0,1,P,1,AY,199,PAY,...,0.897805,-0.440394,-0.433884,-0.900969,Moscow,0,,False,False,0


In [11]:
# Replace missing 'government_agency' values with "Non-governmental"
df['government_agency'] = df['government_agency'].fillna('Non-governmental')
print("Missing 'government_agency' values filled with 'Non-governmental'.")

# Function to categorize agencies into more general groups
# This reduces the high cardinality of the 'government_agency' column.
def categorize_agency(agency):
    """
    Categorizes specific government agencies into broader, more general groups
    to simplify the feature.
    """
    if agency == 'Non-governmental':
        return 'Non-governmental'
    elif 'President' in agency:
        return 'Presidential'
    elif 'Police' in agency.lower() or 'Internal Affairs' in agency:
        return 'Police/Security'
    elif 'Government' in agency:
        return 'Government'
    elif 'Military' in agency or 'Army' in agency or 'Defense' in agency:
        return 'Military'
    elif 'Federal' in agency:
        return 'Federal Services'
    elif 'Judge' in agency or 'Court' in agency or 'Justice' in agency or 'prosecutor' in agency.lower():
        return 'Judicial'
    elif 'Administration' in agency:
        return 'Administration'
    else:
        return 'Other Governmental'

# Apply the categorization to create a new 'agency_category' feature
df['agency_category'] = df['government_agency'].apply(categorize_agency)
print("Government agencies categorized into broader groups.")

df.head()

Missing 'government_agency' values filled with 'Non-governmental'.
Government agencies categorized into broader groups.


Unnamed: 0,id,plate,date,price,is_train,first_letter,number,middle_letters,region_code,letters,...,day_cos,weekday_sin,weekday_cos,region_name,is_government,government_agency,forbidden_to_buy,road_advantage,significance_level,agency_category
0,1,X059CP797,2024-12-26 00:00:00,65000.0,1,X,59,CP,797,XCP,...,0.528964,0.433884,-0.900969,Moscow,0,Non-governmental,False,False,0,Non-governmental
1,2,Y800MH790,2024-07-12 21:31:37,100000.0,1,Y,800,MH,790,YMH,...,-0.758758,-0.433884,-0.900969,Moscow Oblast,0,Non-governmental,False,False,0,Non-governmental
2,3,A212TX77,2024-04-18 00:00:00,290000.0,1,A,212,TX,77,ATX,...,-0.874347,0.433884,-0.900969,Moscow,0,Non-governmental,False,False,0,Non-governmental
3,4,P001AY199,2025-01-03 00:27:15,680000.0,1,P,1,AY,199,PAY,...,0.820763,-0.433884,-0.900969,Moscow,0,Non-governmental,False,False,0,Non-governmental
4,5,P001AY199,2025-01-10 09:32:41,750000.0,1,P,1,AY,199,PAY,...,-0.440394,-0.433884,-0.900969,Moscow,0,Non-governmental,False,False,0,Non-governmental


In [12]:
# Calculate average price per agency category for insights (only for training data)
if 'price' in df.columns: # Ensure 'price' exists before calculating
    agency_price = df[df['is_train'] == 1].groupby('agency_category')['price'].agg(['mean', 'count']).sort_values('mean', ascending=False)
    print("\nAverage price per agency category (Training Data):")
    print(agency_price)


Average price per agency category (Training Data):
                            mean  count
agency_category                        
Government          5.659560e+06    112
Presidential        4.125739e+06     46
Federal Services    2.016712e+06     52
Judicial            1.969929e+06     91
Other Governmental  1.693679e+06    336
Police/Security     9.111429e+05      7
Administration      5.118333e+05     18
Non-governmental    4.258965e+05  50978


In [13]:
agency_dummies = pd.get_dummies(df['agency_category'], prefix='agency')
df = pd.concat([df, agency_dummies], axis=1)
print("One-hot encoded 'agency_category' into binary features.")

df.head()

One-hot encoded 'agency_category' into binary features.


Unnamed: 0,id,plate,date,price,is_train,first_letter,number,middle_letters,region_code,letters,...,significance_level,agency_category,agency_Administration,agency_Federal Services,agency_Government,agency_Judicial,agency_Non-governmental,agency_Other Governmental,agency_Police/Security,agency_Presidential
0,1,X059CP797,2024-12-26 00:00:00,65000.0,1,X,59,CP,797,XCP,...,0,Non-governmental,False,False,False,False,True,False,False,False
1,2,Y800MH790,2024-07-12 21:31:37,100000.0,1,Y,800,MH,790,YMH,...,0,Non-governmental,False,False,False,False,True,False,False,False
2,3,A212TX77,2024-04-18 00:00:00,290000.0,1,A,212,TX,77,ATX,...,0,Non-governmental,False,False,False,False,True,False,False,False
3,4,P001AY199,2025-01-03 00:27:15,680000.0,1,P,1,AY,199,PAY,...,0,Non-governmental,False,False,False,False,True,False,False,False
4,5,P001AY199,2025-01-10 09:32:41,750000.0,1,P,1,AY,199,PAY,...,0,Non-governmental,False,False,False,False,True,False,False,False


#### Feature Creation from Plate

In [14]:
# Check for repeated letters in the 'full_letters' (e.g., 'AAA', 'XXX')
# This feature indicates patterns that might be considered desirable.
df['has_repeated_letters'] = df['letters'].str.replace(r'(.)(?=.*\1)', '', regex=True).str.len() < df['letters'].str.len()
print("Created 'has_repeated_letters' feature.")

# Check for repeated digits in the 'numbers' (e.g., '111', '777')
# These are often considered "beautiful" or "prestigious" numbers.
df['has_repeated_numbers'] = df['number'].apply(
    lambda n: bool(re.search(r'(\d)\1', f"{int(n):03d}")) # Format to 3 digits (e.g., 7 -> 007)
)
print("Created 'has_repeated_numbers' feature.")

# Check for sequential digits (e.g., '123', '987')
# Another pattern that can indicate prestige.
df['has_sequential_numbers'] = df['number'].apply(
    lambda n: bool(re.search(r'123|234|345|456|567|678|789|987|876|765|654|543|432|321', f"{int(n):03d}"))
)
print("Created 'has_sequential_numbers' feature.")

# Check for mirror digits (e.g., '121', '303') or palindromic numbers (e.g., '111')
# These are also considered special patterns.
df['has_mirror_numbers'] = df['number'].apply(
    lambda n: (str(int(n))[0] == str(int(n))[-1]) or (str(int(n)) == str(int(n))[::-1])
)
print("Created 'has_mirror_numbers' feature.")

Created 'has_repeated_letters' feature.
Created 'has_repeated_numbers' feature.
Created 'has_sequential_numbers' feature.
Created 'has_mirror_numbers' feature.


In [15]:
# Define a list of prestigious letter series (e.g., specific combinations like 'AAA', 'XXX')
prestigious_letter_series = ["AAA", "MMM", "EEE", "KKK", "OOO", "PPP", "CCC", "TTT", "XXX"]
df['is_beautiful_series'] = df['letters'].isin(prestigious_letter_series)
print("Created 'is_beautiful_series' feature based on prestigious letter combinations.")

# Define a list of prestigious number combinations (e.g., single digits, triple digits, hundreds)
prestigious_numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 111, 222, 333, 444, 555, 666, 777, 888, 999,
                       100, 200, 300, 400, 500, 600, 700, 800, 900, 7] # 7 is often considered lucky
df['is_prestigious_number'] = df['number'].isin(prestigious_numbers)
print("Created 'is_prestigious_number' feature based on specific prestigious number patterns.")

# Calculate the complexity of letters based on the number of unique characters
# A lower complexity (e.g., 'AAA') might indicate simplicity and prestige.
df['letter_complexity'] = df['letters'].apply(
    lambda x: len(set(x)) if pd.notnull(x) else 0
)
print("Calculated 'letter_complexity' feature.")

Created 'is_beautiful_series' feature based on prestigious letter combinations.
Created 'is_prestigious_number' feature based on specific prestigious number patterns.
Calculated 'letter_complexity' feature.


In [16]:
# Create an overall prestige score by weighting different prestige-related features
# This combines multiple signals into a single numeric score.
df['prestige_score'] = (
    (df['is_beautiful_series'].astype(int) * 3) + # Higher weight for beautiful letter series
    (df['is_prestigious_number'].astype(int) * 2) + # Moderate weight for prestigious numbers
    (df['has_repeated_letters'].astype(int) * 1) +
    (df['has_repeated_numbers'].astype(int) * 1) +
    (df['has_sequential_numbers'].astype(int) * 1) +
    (df['has_mirror_numbers'].astype(int) * 1) +
    (df['significance_level'].fillna(0)) # Include governmental significance level
)
print("Calculated 'prestige_score' by combining various prestige indicators.")

# Convert 'prestige_score' to a categorical type for potential use in models/visualizations
df['prestige_score'] = df['prestige_score'].astype('category')
print("Converted 'prestige_score' to categorical type.")

Calculated 'prestige_score' by combining various prestige indicators.
Converted 'prestige_score' to categorical type.


In [17]:
df.head()

Unnamed: 0,id,plate,date,price,is_train,first_letter,number,middle_letters,region_code,letters,...,agency_Police/Security,agency_Presidential,has_repeated_letters,has_repeated_numbers,has_sequential_numbers,has_mirror_numbers,is_beautiful_series,is_prestigious_number,letter_complexity,prestige_score
0,1,X059CP797,2024-12-26 00:00:00,65000.0,1,X,59,CP,797,XCP,...,False,False,False,False,False,False,False,False,3,0
1,2,Y800MH790,2024-07-12 21:31:37,100000.0,1,Y,800,MH,790,YMH,...,False,False,False,True,False,False,False,True,3,3
2,3,A212TX77,2024-04-18 00:00:00,290000.0,1,A,212,TX,77,ATX,...,False,False,False,False,False,True,False,False,3,1
3,4,P001AY199,2025-01-03 00:27:15,680000.0,1,P,1,AY,199,PAY,...,False,False,False,True,False,True,False,True,3,4
4,5,P001AY199,2025-01-10 09:32:41,750000.0,1,P,1,AY,199,PAY,...,False,False,False,True,False,True,False,True,3,4


#### Encoding Categorical Variables and Advanced Feature Interactions

In [18]:
# Frequency Encoding for the 'numbers' feature
# This replaces the number with its frequency of occurrence in the dataset.
freq_table = df['number'].value_counts().reset_index()
freq_table.columns = ['number', 'n']
freq_table['freq_enc'] = freq_table['n'] / freq_table['n'].sum()
freq_table['log_freq_enc'] = np.log1p(freq_table['freq_enc']) # Log transform for potential skewed distribution

# Merge frequency encodings back to the main DataFrame
df = df.merge(freq_table[['number', 'freq_enc', 'log_freq_enc']], 
              on='number', how='left')
df.rename(columns={'freq_enc': 'numbers_freq_enc', 
                  'log_freq_enc': 'numbers_log_freq_enc'}, inplace=True)
print("Applied Frequency Encoding to 'number' feature.")

Applied Frequency Encoding to 'number' feature.


In [19]:
# Logarithmic transformation of the target variable 'price'
# This is a common practice in regression to make the target distribution more normal
# and reduce the impact of outliers, improving model performance.
df['log_price'] = np.log1p(df['price'])
print("Applied logarithmic transformation (log1p) to 'price' to create 'log_price'.")

Applied logarithmic transformation (log1p) to 'price' to create 'log_price'.


In [20]:
# --- Newly Added Features for enhanced modeling ---

# Number Length and Uniqueness:
df['number_length'] = df['number'].apply(lambda x: len(str(x))) # Length of the numeric part
df['is_single_digit'] = (df['number_length'] == 1).astype(int) # Binary flag for single-digit number
print("Added 'number_length' and 'is_single_digit' features.")

# Frequency of letter + region combinations:
# This captures the popularity or rarity of specific plate patterns within regions.
df['letters_region'] = df['letters'] + "_" + df['region_code'].astype(str)
freq_lr = df['letters_region'].value_counts(normalize=True).to_dict()
df['letters_region_freq'] = df['letters_region'].map(freq_lr)
print("Calculated 'letters_region_freq' for letter-region combinations.")

Added 'number_length' and 'is_single_digit' features.
Calculated 'letters_region_freq' for letter-region combinations.


In [21]:
# Relative Prestige Ranking:
# Convert prestige score to a rank, normalized between 0 and 1.
# This gives a relative measure of prestige across all plates.
from scipy.stats import rankdata
df['prestige_rank'] = rankdata(df['prestige_score'].astype(int), method='average') / len(df)
print("Created 'prestige_rank' based on 'prestige_score'.")

# Interaction Features:
df['letter_number_combo'] = df['letters'] + "_" + df['number'].astype(str)
# Interaction between 'is_government' and 'prestige_score'
df['is_gov_and_prestige'] = df['is_government'] * df['prestige_score'].astype(int)
print("Added 'letter_number_combo' and 'is_gov_and_prestige' interaction features.")

Created 'prestige_rank' based on 'prestige_score'.


Added 'letter_number_combo' and 'is_gov_and_prestige' interaction features.


In [22]:
# Similarity with Known Plates (Textual Embedding using CountVectorizer):
# This attempts to capture patterns in letter sequences.
from sklearn.feature_extraction.text import CountVectorizer

# Using character n-grams to capture patterns like 'AA', 'AB', 'BA'
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1,2))
# Apply to 'letters' (e.g., 'XAA', 'TMM')
letter_features = vectorizer.fit_transform(df['letters'].fillna(''))
# Note: 'letter_features' is a sparse matrix and needs to be integrated into the
# modeling pipeline if directly used. For now, it's generated for demonstration.
print(f"Generated textual features for 'letters' using CountVectorizer. Shape: {letter_features.shape}")

# Finer Geography:
# Flag common premium regions (e.g., major cities/oblasts) as a binary feature.
premium_regions = ['Moscow', 'Saint Petersburg', 'Moscow Oblast']
df['is_premium_region'] = df['region_name'].isin(premium_regions).astype(int)
print("Created 'is_premium_region' feature for major economic centers.")

# End of Feature Engineering section
print("\nFeature engineering complete. DataFrame is ready for model training.")
print(f"Final DataFrame shape after feature engineering: {df.shape}")

Generated textual features for 'letters' using CountVectorizer. Shape: (59335, 156)
Created 'is_premium_region' feature for major economic centers.

Feature engineering complete. DataFrame is ready for model training.
Final DataFrame shape after feature engineering: (59335, 59)


In [28]:
train = df[df['is_train'] == 1].copy()
test = df[df['is_train'] == 0].copy()
# Drop the 'is_train' column as it's no longer needed
train.drop(columns=['is_train'], inplace=True)
test.drop(columns=['is_train'], inplace=True)

# save the dataframe to a pickle file
train.to_pickle('data/processed_train.pkl')
test.to_pickle('data/processed_test.pkl')