In [1]:
import pandas as pd
import numpy as np

# Load the training set from the project-relative data directory.
train = pd.read_csv(filepath_or_buffer='data/train.csv')

# Preview the first rows (rich display) and print the column/dtype summary.
display(train.head(n=5))
train.info()

Unnamed: 0,id,plate,date,price
0,1,X059CP797,2024-12-26 00:00:00,65000
1,2,Y800MH790,2024-07-12 21:31:37,100000
2,3,A212TX77,2024-04-18 00:00:00,290000
3,4,P001AY199,2025-01-03 00:27:15,680000
4,5,P001AY199,2025-01-10 09:32:41,750000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51635 entries, 0 to 51634
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      51635 non-null  int64 
 1   plate   51635 non-null  object
 2   date    51635 non-null  object
 3   price   51635 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.6+ MB


In [2]:
# Sanity check: total number of missing cells across every column of `train`
# (per-column null counts summed into a single figure).
print("Missing values in train:", train.isna().sum().sum())

Missing values in train: 0


In [3]:
# Keep only year and month of each date: parse the strings, convert to a
# monthly period, then back to a timestamp (the start of that month).
train['date'] = (
    pd.to_datetime(train['date'])
    .dt.to_period('M')
    .dt.to_timestamp()
)

In [4]:
# Report the earliest and latest values currently held in the `date` column.
print(
    "Train date range:",
    train["date"].min(),
    "to",
    train["date"].max(),
)

Train date range: 2021-02-01 00:00:00 to 2025-02-01 00:00:00


In [5]:
# Region name -> list of plate region codes used by that region.
# Codes are kept as strings so that leading zeros (e.g. "01") are preserved.
# NOTE(review): the codes "81", "84" and "85" each appear under two different
# regions below, so a reverse lookup (code -> region) built from this table is
# ambiguous for those codes — confirm the intended mapping before inverting.
REGION_CODES = {
    "Republic of Adygea": ["01"],
    "Altai Republic": ["04"],
    "Republic of Bashkortostan": ["02", "102", "702"],
    "Republic of Buryatia": ["03"],
    "Republic of Dagestan": ["05"],
    "Donetsk People's Republic": ["80", "180"],
    "Republic of Ingushetia": ["06"],
    "Kabardino-Balkarian Republic": ["07"],
    "Republic of Kalmykia": ["08"],
    "Karachay-Cherkess Republic": ["09"],
    "Republic of Karelia": ["10"],
    "Komi Republic": ["11"],
    "Republic of Crimea": ["82"],
    "Luhansk People's Republic": ["81", "181"],  # "81" is also listed under Perm Krai
    "Republic of Mari El": ["12"],
    "Republic of Mordovia": ["13", "113"],
    "Sakha Republic": ["14"],
    "Republic of North Ossetia": ["15"],
    "Republic of Tatarstan": ["16", "116", "716"],
    "Republic of Tyva (Tuva)": ["17"],
    "Udmurt Republic": ["18"],
    "Republic of Khakassia": ["19"],
    "Chechen Republic": ["20", "95"],
    "Chuvash Republic": ["21", "121"],
    "Altai Krai": ["22", "122"],
    "Zabaykalsky Krai": ["75"],
    "Kamchatka Krai": ["41"],
    "Krasnodar Krai": ["23", "93", "123", "193", "323"],
    "Krasnoyarsk Krai": ["24", "84", "88", "124"],  # "84" is also listed under Kherson Oblast
    "Perm Krai": ["59", "81", "159"],  # "81" is also listed under Luhansk People's Republic
    "Primorsky Krai": ["25", "125"],
    "Stavropol Krai": ["26", "126"],
    "Khabarovsk Krai": ["27"],
    "Amur Oblast": ["28"],
    "Arkhangelsk Oblast": ["29"],
    "Astrakhan Oblast": ["30", "130"],
    "Belgorod Oblast": ["31"],
    "Bryansk Oblast": ["32"],
    "Vladimir Oblast": ["33"],
    "Volgograd Oblast": ["34", "134"],
    "Vologda Oblast": ["35"],
    "Voronezh Oblast": ["36", "136"],
    "Zaporizhzhia Oblast": ["85", "185"],  # "85" is also listed under Irkutsk Oblast
    "Ivanovo Oblast": ["37"],
    "Irkutsk Oblast": ["38", "85", "138"],  # "85" is also listed under Zaporizhzhia Oblast
    "Kaliningrad Oblast": ["39", "91"],
    "Kaluga Oblast": ["40"],
    "Kemerovo Oblast": ["42", "142"],
    "Kirov Oblast": ["43"],
    "Kostroma Oblast": ["44"],
    "Kurgan Oblast": ["45"],
    "Kursk Oblast": ["46"],
    "Leningrad Oblast": ["47", "147"],
    "Lipetsk Oblast": ["48"],
    "Magadan Oblast": ["49"],
    "Moscow Oblast": ["50", "90", "150", "190", "250", "550", "750", "790"],
    "Murmansk Oblast": ["51"],
    "Nizhny Novgorod Oblast": ["52", "152", "252"],
    "Novgorod Oblast": ["53"],
    "Novosibirsk Oblast": ["54", "154", "754"],
    "Omsk Oblast": ["55", "155"],
    "Orenburg Oblast": ["56", "156"],
    "Oryol Oblast": ["57"],
    "Penza Oblast": ["58", "158"],
    "Pskov Oblast": ["60"],
    "Rostov Oblast": ["61", "161", "761"],
    "Ryazan Oblast": ["62"],
    "Samara Oblast": ["63", "163", "763"],
    "Saratov Oblast": ["64", "164"],
    "Sakhalin Oblast": ["65"],
    "Sverdlovsk Oblast": ["66", "96", "196"],
    "Smolensk Oblast": ["67"],
    "Tambov Oblast": ["68"],
    "Tver Oblast": ["69"],
    "Tomsk Oblast": ["70"],
    "Tula Oblast": ["71"],
    "Tyumen Oblast": ["72", "172"],
    "Ulyanovsk Oblast": ["73", "173"],
    "Kherson Oblast": ["84", "184"],  # "84" is also listed under Krasnoyarsk Krai
    "Chelyabinsk Oblast": ["74", "174", "774"],
    "Yaroslavl Oblast": ["76"],
    "Moscow": ["77", "97", "99", "177", "197", "199", "777", "797", "799", "977"],
    "Saint Petersburg": ["78", "98", "178", "198"],
    "Sevastopol": ["92"],
    "Jewish Autonomous Oblast": ["79"],
    "Nenets Autonomous Okrug": ["83"],
    "Khanty-Mansi Autonomous Okrug": ["86", "186"],
    "Chukotka Autonomous Okrug": ["87"],
    "Yamalo-Nenets Autonomous Okrug": ["89"],
    "Baikonur": ["94"],
    "Occupational Administration of Kharkiv Oblast": ["188"],
}

In [6]:
# Special ("government") plate series.
#   key:   (letters, (number_from, number_to), region_code)
#   value: (description,
#           is it forbidden to buy (0/1),
#           do they have an advantage on the road (0/1),
#           level of significance (author's opinion))
# Letters must be Latin (ASCII) characters so they compare equal to the letters
# in the `plate` column; region codes are strings to keep leading zeros.
# NOTE(review): number bounds look inclusive (e.g. 0-299 / 300-320 / 321-999
# for "MMP" in region "77") — confirm in the lookup code that consumes this.
GOVERNMENT_CODES = {
    # Moscow
    ("AMP", (0, 999), "97"): ("Government of Russia", 1, 1, 10),
    ("AMP", (0, 999), "77"): ("Partially Government of Russia", 0, 1, 8),
    ("EKX", (0, 999), "77"): ("Partially Federal Protective Service (Federal Protective Service)", 0, 1, 6),
    ("EKX", (0, 999), "97"): ("Partially Federal Protective Service (Federal Protective Service)", 0, 1, 6),
    ("EKX", (0, 999), "99"): ("Partially Federal Protective Service (Federal Protective Service)", 0, 1, 6),
    ("KKX", (0, 999), "77"): ("Partially used on vehicles of Ministry of Security/Federal Counterintelligence Service /Federal Security Service of Russia", 0, 0, 1),
    ("CAC", (500, 999), "77"): ("Former officially 'open' plates of Ministry of Security/Federal Counterintelligence Service /Federal Security Service of Russia", 0, 0, 1),
    ("AOO", (0, 999), "77"): ("Partially Presidential Administrative Directorate plates", 0, 1, 6),
    ("BOO", (0, 999), "77"): ("Partially Presidential Administrative Directorate plates", 0, 1, 6),
    ("MOO", (0, 999), "77"): ("Partially Presidential Administrative Directorate plates", 0, 1, 6),
    ("COO", (0, 999), "77"): ("Partially Administrative Directorate, Federation Council plates", 0, 1, 6),
    ("AMM", (0, 999), "99"): ("Partially plates of Moscow City Duma deputies, police", 0, 1, 4),
    ("CCC", (0, 999), "77"): ("Partially Central Special Communication, Courier Service, Ministry of Communications", 0, 1, 3),
    ("CCC", (0, 999), "99"): ("Partially Tax Police, Customs, Special Communications", 0, 1, 3),
    ("CCC", (0, 999), "97"): ("Partially Central Special Communication, Courier Service, Ministry of Communications", 0, 1, 3),
    ("KKK", (0, 999), "99"): ("Initially belonged to Courier Service, now used among private individuals", 0, 0, 1),
    ("OOO", (0, 999), "77"): ("Initially intended for Federal Security Service", 0, 0, 1),
    ("KMM", (0, 999), "77"): ("Partially Fire Department plates", 0, 1, 3),
    ("MMP", (300, 320), "77"): ("Partially Federal Security Service plates", 0, 1, 4),
    ("MMP", (0, 299), "77"): ("Partially Government of Russia, Federal Security Service, banks, and private individuals with connections in the traffic police", 0, 1, 2),
    ("MMP", (321, 999), "77"): ("Partially Government of Russia, Federal Security Service, banks, and private individuals with connections in the traffic police", 0, 1, 2),
    ("PMP", (0, 999), "77"): ("Partially Ministry of Justice plates", 0, 1, 3),
    ("AMO", (0, 999), "77"): ("Partially Moscow City Hall plates", 0, 1, 5),
    ("KOO", (0, 999), "77"): ("Partially Constitutional Court plates", 0, 1, 3),
    ("EPE", (0, 999), "77"): ("Partially State Duma plates", 0, 1, 3),
    ("AAA", (0, 999), "77"): ("Partially Administration of the President plates", 0, 1, 6),
    ("KMP", (0, 999), "77"): ("Partially Government of Russia plates", 0, 1, 3),
    ("TMP", (0, 999), "77"): ("Partially Government of Russia plates, as well as private individuals with connections in the traffic police", 0, 1, 2),
    ("YMP", (0, 999), "77"): ("Partially Government of Russia plates, as well as private individuals with connections in the traffic police", 0, 1, 2),
    ("XXX", (0, 999), "77"): ("Private individuals with connections in the traffic police", 0, 1, 2),
    ("YYY", (0, 999), "77"): ("Private individuals with connections in the traffic police", 0, 1, 2),
    ("XKX", (0, 999), "77"): ("Partially Federal Security Service and Federal Protective Service plates", 0, 1, 2),
    ("OMP", (0, 999), "77"): ("Partially Government of Russia, banks, and private individuals with connections in the traffic police", 0, 1, 2),
    ("EEE", (0, 999), "77"): ("Private individuals with connections in the traffic police", 0, 1, 2),

    # Moscow Oblast
    ("AMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("BMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("KMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("CMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("OMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("MMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("TMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("HMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("YMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("XMO", (0, 999), "50"): ("Partially various government agencies (administration, ambulance, traffic police, etc.)", 0, 1, 3),
    ("AMM", (0, 999), "50"): ("Partially plates of the regional administration", 0, 1, 5),
    ("AMM", (0, 999), "90"): ("Partially plates of the regional administration", 0, 1, 5),
    ("MMM", (0, 999), "50"): ("Partially plates of law enforcement in the region (prosecutor's office, EMERCOM, traffic police, etc.)", 0, 1, 5),
    ("MMM", (0, 999), "90"): ("Partially plates of law enforcement in the region (prosecutor's office, EMERCOM, traffic police, etc.)", 0, 1, 5),

    # Saint Petersburg
    ("OBO", (0, 999), "78"): ("Partially Departmental Security Service plates", 0, 1, 4),
    ("OBO", (0, 999), "98"): ("Partially Departmental Security Service plates", 0, 1, 4),
    ("OTT", (0, 999), "78"): ("Partially former traffic police plates (now replaced by 98)", 0, 0, 1),
    ("OTT", (0, 999), "98"): ("Partially traffic police plates", 0, 1, 4),
    ("OMM", (0, 999), "78"): ("Partially city district police plates", 0, 1, 3),
    ("OMM", (0, 999), "98"): ("Partially city district police plates", 0, 1, 3),
    ("OOM", (0, 999), "78"): ("Partially plates of the Main Department of Internal Affairs", 0, 1, 3),
    ("OOM", (0, 999), "98"): ("Partially plates of the Main Department of Internal Affairs", 0, 1, 3),
    ("OKO", (0, 100), "78"): ("Partially former plates of the prosecutor's office and judicial department (now replaced by 98)", 0, 0, 1),
    ("OKO", (0, 100), "98"): ("Partially plates of the prosecutor's office and judicial department", 0, 1, 3),
    ("OKO", (700, 999), "78"): ("Partially former Federal Security Service plates (now replaced by 98)", 0, 0, 1),
    ("OKO", (700, 999), "98"): ("Partially Federal Security Service plates", 0, 1, 3),
    ("OPP", (0, 999), "78"): ("Partially former plates of the Main Department of Internal Affairs (now replaced by 98)", 0, 0, 1),
    ("OPP", (0, 999), "98"): ("Partially plates of the Main Department of Internal Affairs", 0, 1, 3),
    ("OOH", (0, 999), "78"): ("Partially Federal Drug Control Service and Federal Tax Service plates", 0, 1, 3),
    ("OOH", (0, 999), "98"): ("Partially Federal Drug Control Service and Federal Tax Service plates", 0, 1, 3),
    ("OAO", (0, 999), "78"): ("Partially plates of the city and regional administration", 0, 1, 5),
    ("OAO", (0, 999), "98"): ("Partially plates of the city and regional administration", 0, 1, 5),
    ("AAA", (0, 100), "78"): ("Partially plates of the city and regional administration", 0, 1, 6),
    ("AAA", (0, 100), "98"): ("Partially plates of the city and regional administration", 0, 1, 6),
    ("OOO", (0, 899), "78"): ("Commercial plates", 0, 0, 2),
    ("OOO", (0, 899), "98"): ("Commercial plates", 0, 0, 2),
    ("OOO", (900, 999), "78"): ("Partially Federal Protective Service plates", 0, 1, 3),
    ("OOO", (900, 999), "98"): ("Partially Federal Protective Service plates", 0, 1, 3),
    ("OKC", (0, 999), "98"): ("Partially Constitutional Court of the Russian Federation plates", 0, 1, 3),
    ("OOC", (0, 999), "78"): ("Partially plates of heads of enterprises and organizations", 0, 0, 2),
    ("OOC", (0, 999), "98"): ("Partially plates of heads of enterprises and organizations", 0, 0, 2),
    ("MMM", (0, 999), "78"): ("Commercial plates", 0, 0, 2),
    ("MMM", (0, 999), "98"): ("Commercial plates", 0, 0, 2),

    # Altai Republic
    ("XXX", (0, 999), "04"): ("Widespread 'special' plates", 0, 0, 2),
    ("TTT", (0, 999), "04"): ("Rare 'special' plates", 0, 0, 2),
    ("PPP", (0, 999), "04"): ("Partially prosecutor's office of the republic", 0, 1, 3),
    ("PPA", (0, 999), "04"): ("Partially prosecutor's office of the republic", 0, 1, 3),
    ("MPA", (0, 999), "04"): ("Partially Ministry of Internal Affairs of the republic", 0, 1, 3),
    ("OOO", (0, 999), "04"): ("Partially plates of the government of the republic", 0, 1, 5),
    ("HHH", (0, 999), "04"): ("Partially the republic's tax service plates", 0, 1, 3),
    ("CCC", (0, 999), "04"): ("Partially plates belonging to the republic's judges", 0, 1, 3),

    # Republic of Bashkortostan
    ("PKC", (0, 999), "02"): ("Partially State Assembly (Kurultai) plates", 0, 1, 5),
    ("KKC", (0, 999), "02"): ("Partially State Assembly (Kurultai) plates", 0, 1, 5),
    ("OOO", (0, 999), "02"): ("Partially plates of leaders of large enterprises and ministries", 0, 1, 3),
    ("AAA", (0, 999), "02"): ("Partially plates of the republic's government", 0, 1, 5),

    # Republic of Karelia
    ("TTT", (0, 999), "10"): ("Partially government of the republic and Federal Security Service plates", 0, 1, 5),
    ("HHH", (0, 999), "10"): ("Partially plates of city and district administrations of the republic", 0, 1, 4),
    ("MMM", (0, 999), "10"): ("Partially plates of the Ministry of Internal Affairs of the republic", 0, 1, 3),
    ("EMP", (0, 999), "10"): ("Partially plates of the Ministry of Internal Affairs of the republic", 0, 1, 3),
    ("CCC", (0, 999), "10"): ("Partially plates of the prosecutor's office and judges' vehicles", 0, 1, 3),

    # Komi Republic
    ("TTT", (0, 999), "11"): ("Partially government of the republic and Federal Security Service plates", 0, 1, 5),
    ("OOO", (0, 999), "11"): ("Widespread semi-special plates, leaders of large industrial companies", 0, 1, 3),

    # Sakha Republic
    ("PPP", (0, 999), "14"): ("Partially plates of the republic's prosecutor's office", 0, 1, 3),
    ("AAA", (0, 999), "14"): ("Motor pool of the President, Government, Parliament of the republic, as well as heads of state enterprises", 0, 1, 5),

    # Republic of Tatarstan
    ("OAA", (0, 999), "16"): ("Partially plates of heads of district administrations", 0, 1, 5),
    ("OAA", (0, 999), "116"): ("Partially plates of heads of district administrations", 0, 1, 5),
    ("OAA", (0, 999), "716"): ("Partially plates of heads of district administrations", 0, 1, 5),

    # Krasnodar Krai
    ("PPP", (0, 999), "23"): ("Partially plates of the Krai and city administrations", 0, 1, 5),
    ("HHH", (0, 999), "23"): ("Partially plates of the tax authorities", 0, 1, 3),
    ("OOO", (0, 999), "23"): ("Partially plates of the Krai and city administrations", 0, 1, 5),
    ("KKK", (0, 999), "23"): ("Partially plates of the Krai administration", 0, 1, 5),

    # Krasnoyarsk Krai
    ("KPK", (0, 999), "24"): ("Partially plates of the Krai administration", 0, 1, 5),
    ("OOO", (0, 999), "24"): ("Partially Federal Security Service plates of the Krai", 0, 1, 3),
    ("MKK", (0, 999), "24"): ("Partially former plates of the Ministry of Internal Affairs of the Krai", 0, 0, 1),

    # Primorsky Krai
    ("BOO", (0, 999), "25"): ("Partially military plates", 0, 1, 3),
    ("BOO", (0, 999), "125"): ("Partially city services plates in Vladivostok and districts", 0, 1, 2),
    ("AAA", (0, 999), "25"): ("Issued first in Vladivostok", 0, 0, 2),
    ("AAA", (0, 999), "125"): ("One of the most 'special' series, prosecutor's office", 1, 1, 5),
    ("HHH", (0, 999), "25"): ("Partially plates of the administration and vehicles of City Duma deputies", 0, 1, 3),
    ("MMM", (0, 999), "25"): ("Partially plates of the deputies of the Krai Legislative Assembly", 0, 1, 3),
    ("CCC", (0, 999), "25"): ("Partially plates of the Krai administration", 0, 1, 5),
    ("XXX", (0, 999), "25"): ("Partially plates of the prosecutor's office and the Department of Internal Affairs", 0, 1, 2),
    ("OOO", (0, 999), "25"): ("Partially former plates of the Krai administration (during Governor Evgeny Nazdratenko)", 0, 0, 1),
    ("TTT", (0, 999), "25"): ("Partially former plates of the Vladivostok administration and federal agencies in the Krai (during Mayor Yuri Kopylov)", 0, 0, 1),
    ("MBK", (0, 999), "25"): ("Partially plates for employees of the Department of Internal Affairs", 0, 1, 3),
    ("MBK", (0, 999), "125"): ("Partially plates for employees of the Department of Internal Affairs", 0, 1, 3),
    ("MOO", (0, 999), "25"): ("Partially plates for Krai agencies of the Department of Internal Affairs, EMERCOM, firefighters, etc.", 0, 1, 2),
    ("MOO", (0, 999), "125"): ("Partially plates for Krai agencies of the Department of Internal Affairs, EMERCOM, firefighters, etc.", 0, 1, 2),
    ("HOO", (0, 999), "25"): ("Partially plates of the Department of Internal Affairs, traffic police in the southeastern region of the Krai (Nakhodka)", 0, 1, 3),
    ("HOO", (0, 999), "125"): ("Partially plates of the Department of Internal Affairs, traffic police in the southeastern region of the Krai (Nakhodka)", 0, 1, 3),
    ("YOO", (0, 999), "25"): ("Partially plates of the Department of Internal Affairs, traffic police in the central region of the Krai (Ussuriysk)", 0, 1, 3),
    ("YOO", (0, 999), "125"): ("Partially plates of the Department of Internal Affairs, traffic police in the central region of the Krai (Ussuriysk)", 0, 1, 3),
    ("COO", (0, 999), "25"): ("Partially plates of the Department of Internal Affairs, traffic police in the northern region of the Krai (Spassk-Dalny)", 0, 1, 3),
    ("COO", (0, 999), "125"): ("Partially plates of the Department of Internal Affairs, traffic police in the northern region of the Krai (Spassk-Dalny)", 0, 1, 3),

    # Vologda Oblast
    ("AAA", (0, 999), "35"): ("Partially plates of the regional government and Vologda city administration", 0, 1, 5),

    # Volgograd Oblast
    ("AAM", (0, 999), "34"): ("Partially plates of the Oblast Duma", 0, 1, 3),
    ("PAA", (0, 999), "34"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("AAA", (0, 999), "34"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),
    ("ACK", (0, 999), "34"): ("Partially plates of the Investigative Committee, Main Department of Internal Affairs", 0, 1, 3),
    ("YYY", (0, 999), "34"): ("Partially Federal Security Service plates", 0, 1, 3),
    ("AAK", (0, 999), "34"): ("Partially plates of the Federal Bailiff Service, Ministry of Justice, and Judicial Department", 0, 1, 3),

    # Voronezh Oblast
    ("AAA", (0, 999), "36"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("BOA", (0, 999), "36"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("MMM", (0, 999), "36"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),

    # Kaliningrad Oblast
    ("AAK", (0, 999), "39"): ("Partially plates of the Oblast Administration, Federal Security Service, and Prosecutor's Office", 0, 1, 5),
    ("KKK", (0, 999), "39"): ("Partially plates of the Oblast Administration, Federal Security Service, and Prosecutor's Office", 0, 1, 5),
    ("PPP", (0, 999), "39"): ("Partially former plates of the Oblast Administration (during Governor Boos)", 0, 0, 1),

    # Kaluga Oblast
    ("OOO", (0, 999), "40"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("TTT", (0, 999), "40"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("PPP", (0, 999), "40"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),

    # Kurgan Oblast
    ("OOO", (0, 999), "45"): ("Partially former plates of the Oblast Administration", 0, 0, 1),
    ("TTT", (0, 999), "45"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("OKO", (0, 999), "45"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),

    # Novosibirsk Oblast
    ("AAA", (0, 199), "54"): ("Plates for Presidential Plenipotentiaries", 1, 1, 7),
    ("AAA", (200, 999), "54"): ("'Special' plates", 0, 1, 4),
    ("HHH", (0, 999), "54"): ("Partially plates of the Novosibirsk mayor's office, Oblast Administration, and Oblast Council", 0, 1, 5),
    ("ACK", (0, 999), "54"): ("Partially Federal Security Service plates of the Oblast", 0, 1, 3),
    ("AHO", (0, 999), "54"): ("Partially former plates of the Oblast Administration", 0, 0, 1),
    ("AAO", (0, 999), "54"): ("Partially plates of various government agencies, including district administrations of Novosibirsk", 0, 1, 3),
    ("PPP", (0, 999), "54"): ("'Morozov' plates (introduced by former head of traffic police Pyotr Morozov)", 0, 1, 2),
    ("MOP", (0, 999), "54"): ("'Morozov' plates (introduced by former head of traffic police Pyotr Morozov)", 0, 1, 2),

    # Oryol Oblast
    ("AAA", (0, 999), "57"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("AOO", (0, 999), "57"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("OAO", (0, 999), "57"): ("Partially plates of directors of public joint-stock companies", 0, 1, 2),

    # Rostov Oblast
    ("APO", (0, 999), "61"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("AAA", (0, 999), "61"): ("Partially plates of district heads of Rostov-on-Don, mayors of Oblast cities", 0, 1, 5),
    ("APY", (0, 999), "61"): ("Partially plates of the Rostov-on-Don administration", 0, 1, 5),
    ("KKK", (0, 999), "61"): ("Partially former plates for Presidential Plenipotentiaries (during Viktor Kazantsev)", 0, 0, 1),
    ("HHH", (0, 999), "61"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),
    ("MMM", (0, 999), "61"): ("Partially plates of the Oblast Department of Internal Affairs", 0, 1, 3),
    ("OOO", (0, 999), "61"): ("Partially plates of the Oblast Legislative Assembly", 0, 1, 4),
    ("BBK", (0, 999), "61"): ("Partially plates of insurance companies in Rostov-on-Don", 0, 1, 1),

    # Saratov Oblast
    ("AAA", (0, 999), "164"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("PPP", (0, 999), "164"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("XXX", (0, 999), "64"): ("Partially plates of the Oblast courts", 0, 1, 3),
    ("MMM", (0, 999), "64"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),
    ("OAA", (0, 999), "64"): ("Partially Federal Security Service plates of the Oblast", 0, 1, 3),

    # Tomsk Oblast
    ("ATO", (0, 999), "70"): ("Partially plates of the Oblast Administration", 0, 1, 5),

    # Tyumen Oblast
    ("ATO", (0, 999), "72"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("PTO", (0, 999), "72"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("MTO", (0, 999), "72"): ("Partially plates of the Oblast Prosecutor's Office", 0, 1, 3),
    ("HTO", (0, 999), "72"): ("Partially plates of the Tax Service", 0, 1, 3),
    ("CTO", (0, 999), "72"): ("Partially plates of the Oblast courts", 0, 1, 3),
    ("YTO", (0, 999), "72"): ("Partially plates of the bailiff service", 0, 1, 3),
    ("BAA", (0, 999), "72"): ("Partially plates of the Oblast Ministry of Internal Affairs", 0, 1, 3),
    ("KKK", (0, 999), "72"): ("'Gangster' plates", 0, 1, 1),

    # Arkhangelsk Oblast
    ("TTT", (0, 999), "29"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("PPP", (0, 999), "29"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("MAO", (0, 999), "29"): ("Partially plates of the Oblast Ministry of Internal Affairs", 0, 1, 3),

    # Ryazan Oblast
    ("APO", (0, 999), "62"): ("Partially plates of the Oblast Administration", 0, 1, 5),

    # Samara Oblast
    ("PAA", (0, 999), "63"): ("Partially plates of the Oblast Administration", 0, 1, 5),
    ("AAP", (0, 999), "63"): ("Partially plates of the Oblast Administration", 0, 1, 5),
}

In [7]:
RENAME_MAP = {
    # — Moscow central government —
    "Government of Russia":                                        "GOV_RUSSIA",
    "Partially Government of Russia":                              "GOV_RUSSIA",
    "Partially Government of Russia plates":                       "GOV_RUSSIA",
    "Partially Government of Russia plates, as well as private individuals with connections in the traffic police":  
                                                                   "GOV_RUSSIA",
    "Partially Government of Russia, Federal Security Service, banks, and private individuals with connections in the traffic police":  
                                                                   "GOV_RUSSIA",

    # — Federal protective & security services —
    "Partially Federal Protective Service (Federal Protective Service)":  
                                                                   "FED_PROT_SERV",
    "Partially Federal Protective Service plates":                  "FED_PROT_SERV",
    "Partially Federal Security Service plates":                    "FED_SEC_SERV",
    "Initially intended for Federal Security Service":              "FED_SEC_SERV",
    "Partially former Federal Security Service plates (now replaced by 98)":  
                                                                   "FED_SEC_SERV",
    "Partially Federal Security Service and Federal Protective Service plates":  
                                                                   "FED_SERVICES",

    # — Counterintelligence / Ministry of Security —
    "Partially used on vehicles of Ministry of Security/Federal Counterintelligence Service /Federal Security Service of Russia":  
                                                                   "COUNTERINTEL_SERVICE",
    "Former officially 'open' plates of Ministry of Security/Federal Counterintelligence Service /Federal Security Service of Russia":  
                                                                   "COUNTERINTEL_SERVICE",

    # — Presidential & Administrative Directorates —
    "Partially Presidential Administrative Directorate plates":     "PRES_ADMIN_DIR",
    "Partially Administrative Directorate, Federation Council plates":  
                                                                   "ADMIN_DIR_FED_COUNCIL",

    # — Moscow legislative & comms —
    "Partially plates of Moscow City Duma deputies, police":       "MOSCOW_DUMA",
    "Partially Central Special Communication, Courier Service, Ministry of Communications":  
                                                                   "COMMUNICATIONS",
    "Partially Tax Police, Customs, Special Communications":        "COMMUNICATIONS",
    "Initially belonged to Courier Service, now used among private individuals":  
                                                                   "COURIER_SERVICE",
    "Partially Fire Department plates":                             "FIRE_DEPT",
    "Partially Ministry of Justice plates":                         "MIN_JUSTICE",
    "Partially Moscow City Hall plates":                            "MOSCOW_CITY_HALL",
    "Partially Constitutional Court plates":                        "CONST_COURT",
    "Partially Constitutional Court of the Russian Federation plates":  
                                                                   "CONST_COURT",
    "Partially State Duma plates":                                  "STATE_DUMA",
    "Partially Administration of the President plates":             "ADMIN_PRES",
    "Private individuals with connections in the traffic police":   "PRIVATE_TP",

    # — Moscow Oblast —
    "Partially various government agencies (administration, ambulance, traffic police, etc.)":  
                                                                   "VAR_GOV_AGENCIES",
    "Partially plates of the regional administration":             "REGIONAL_ADMIN",
    "Partially plates of law enforcement in the region (prosecutor's office, EMERCOM, traffic police, etc.)":  
                                                                   "REG_LAW_ENF",

    # — Saint Petersburg —
    "Partially Departmental Security Service plates":               "DEP_SECURITY_SERV",
    "Partially former traffic police plates (now replaced by 98)":  "TRAFFIC_POLICE",
    "Partially traffic police plates":                              "TRAFFIC_POLICE",
    "Partially city district police plates":                        "DISTRICT_POLICE",
    "Partially plates of the Main Department of Internal Affairs":  "INT_AFFAIRS_DEPT",
    "Partially former plates of the Main Department of Internal Affairs (now replaced by 98)":  
                                                                   "INT_AFFAIRS_DEPT",
    "Partially former plates of the prosecutor's office and judicial department (now replaced by 98)":  
                                                                   "PROSECUTOR_JUD",
    "Partially plates of the prosecutor's office and judicial department":  
                                                                   "PROSECUTOR_JUD",
    "Partially Federal Drug Control Service and Federal Tax Service plates":  
                                                                   "DRUG_TAX_SERVICE",
    "Partially plates of the city and regional administration":     "CITY_REG_ADMIN",
    "Partially plates of heads of enterprises and organizations":   "HEADS_ENTERPRISES",
    "Commercial plates":                                            "COMMERCIAL",

    # — Altai Republic (“special” plates + small admin roles) —
    "Widespread 'special' plates":                                 "SPECIAL_PLATES",
    "Rare 'special' plates":                                       "SPECIAL_PLATES",
    "Partially prosecutor's office of the republic":               "PROSECUTOR_REP",
    "Partially plates of the republic's prosecutor's office":      "PROSECUTOR_REP",
    "Partially Ministry of Internal Affairs of the republic":      "MIN_INT_REP",
    "Partially plates of the Ministry of Internal Affairs of the republic":  
                                                                   "MIN_INT_REP",
    "Partially plates of the government of the republic":          "REP_GOV",
    "Partially plates of the republic's government":               "REP_GOV",
    "Partially the republic's tax service plates":                 "REP_TAX_SERVICE",
    "Partially plates belonging to the republic's judges":         "REP_JUDGES",

    # — Bashkortostan / Karelia / Komi republics —
    "Partially State Assembly (Kurultai) plates":                  "STATE_ASSEMBLY",
    "Partially plates of leaders of large enterprises and ministries":  
                                                                   "LEAD_ENTERPRISES",
    "Partially government of the republic and Federal Security Service plates":  
                                                                   "REP_FSS",
    "Partially plates of city and district administrations of the republic":  
                                                                   "REP_CITY_DISTRICT",
    "Partially plates of the prosecutor's office and judges' vehicles":  
                                                                   "PROSECUTOR_JUD_VEH",
    "Widespread semi-special plates, leaders of large industrial companies":  
                                                                   "SEMI_SPECIAL",

    # — Sakha Republic —
    "Partially plates of the republic's prosecutor's office":      "PROSECUTOR_REP",
    "Motor pool of the President, Government, Parliament of the republic, as well as heads of state enterprises":  
                                                                   "REP_MOTOR_POOL",

    # — Tatarstan republic —
    "Partially plates of heads of district administrations":       "DISTRICT_ADMIN",

    # — Krasnodar & Krasnoyarsk Krai —
    "Partially plates of the Krai and city administrations":       "KRAI_CITY_ADMIN",
    "Partially plates of the tax authorities":                     "KRAI_TAX_AUTH",
    "Partially plates of the Krai administration":                 "KRAI_ADMIN",
    "Partially Federal Security Service plates of the Krai":       "KRAI_FSS",
    "Partially former plates of the Ministry of Internal Affairs of the Krai":  
                                                                   "KRAI_OLD_INT_AFFAIRS",

    # — Primorsky Krai —
    "Partially military plates":                                   "MILITARY",
    "Partially city services plates in Vladivostok and districts":  
                                                                   "CITY_SERVICES_VLAD",
    "Issued first in Vladivostok":                                 "VLAD_FIRST",
    "One of the most 'special' series, prosecutor's office":       "SPECIAL_PROSECUTOR",
    "Partially plates of the administration and vehicles of City Duma deputies":  
                                                                   "CITY_DUMA_DEP",
    "Partially plates of the deputies of the Krai Legislative Assembly":  
                                                                   "KRAI_LEG_ASSEMBLY",
    "Partially plates of the prosecutor's office and the Department of Internal Affairs":  
                                                                   "KRAI_PROSEC_INT",
    "Partially former plates of the Krai administration (during Governor Evgeny Nazdratenko)":  
                                                                   "KRAI_OLD_ADMIN1",
    "Partially former plates of the Vladivostok administration and federal agencies in the Krai (during Mayor Yuri Kopylov)":  
                                                                   "KRAI_OLD_ADMIN2",
    "Partially plates for employees of the Department of Internal Affairs":  
                                                                   "KRAI_INT_EMP",
    "Partially plates for Krai agencies of the Department of Internal Affairs, EMERCOM, firefighters, etc.":  
                                                                   "KRAI_EMERCOM",
    "Partially plates of the Department of Internal Affairs, traffic police in the southeastern region of the Krai (Nakhodka)":  
                                                                   "KRAI_DI_TP_REGION",
    "Partially plates of the Department of Internal Affairs, traffic police in the central region of the Krai (Ussuriysk)":  
                                                                   "KRAI_DI_TP_REGION",
    "Partially plates of the Department of Internal Affairs, traffic police in the northern region of the Krai (Spassk-Dalny)":  
                                                                   "KRAI_DI_TP_REGION",

    # — Vologda Oblast —
    "Partially plates of the regional government and Vologda city administration":  
                                                                   "VOLOGDA_ADMIN",

    # — Volgograd & other Oblasts (bulk-grouped) —
    "Partially plates of the Oblast Duma":                         "OBLAST_DUMA",
    "Partially plates of the Oblast Administration":               "OBLAST_ADMIN",
    "Partially plates of the Oblast Prosecutor's Office":          "OBLAST_PROSECUTOR",
    "Partially plates of the Investigative Committee, Main Department of Internal Affairs":  
                                                                   "INVESTIGATIVE_COMMITTEE",
    "Partially Federal Security Service plates":                   "FED_SEC_SERV",
    "Partially plates of the Federal Bailiff Service, Ministry of Justice, and Judicial Department":  
                                                                   "OBLAST_BAILIFF",
    "Partially plates of the Oblast Department of Internal Affairs":  
                                                                   "OBLAST_INT_AFFAIRS",
    "Partially plates of the Oblast Legislative Assembly":         "OBLAST_LEG_ASSEMBLY",
    "Partially plates of the Oblast courts":                       "OBLAST_COURTS",
    "Commercial plates":                                           "COMMERCIAL",
    "Partially plates of directors of public joint-stock companies":  
                                                                   "OBLAST_DIR_COMPANIES",
    "Partially plates of insurance companies in Rostov-on-Don":    "OBLAST_INSURANCE",
    "Partially plates of district heads of Rostov-on-Don, mayors of Oblast cities":  
                                                                   "ROSTOV_DISTRICT_HEADS",
    "Partially plates of the Rostov-on-Don administration":        "ROSTOV_ADMIN",
    "Partially former plates for Presidential Plenipotentiaries (during Viktor Kazantsev)":  
                                                                   "ROSTOV_PREZ_PLENIPOTENTIARIES",
    "Plates for Presidential Plenipotentiaries":                   "PRES_PLENIPOTENTIARIES",
    "'Special' plates":                                           "SPECIAL",
    "Partially plates of the Novosibirsk mayor's office, Oblast Administration, and Oblast Council":  
                                                                   "NOVOSIB_ADMIN",
    "Partially plates of various government agencies, including district administrations of Novosibirsk":  
                                                                   "NOVOSIB_AGENCIES",
    "'Morozov' plates (introduced by former head of traffic police Pyotr Morozov)":  
                                                                   "MOROZOV",
    "Partially Government of Russia, banks, and private individuals with connections in the traffic police":
        "GOV_RUSSIA",
    "Partially plates of the Oblast Administration, Federal Security Service, and Prosecutor's Office":
        "OBLAST_ADMIN_FSS_PROSECUTOR",
    "Partially former plates of the Oblast Administration (during Governor Boos)":
        "OBLAST_OLD_ADMIN",
    "'Gangster' plates":
        "GANGSTER_PLATES",
    # any unmapped description will simply pass through unchanged
}


In [8]:
# Function to translate a plate's region code into its region name.
def get_region_name(region_code):
    """Return the name of the region that owns ``region_code``.

    The code is converted with ``str()`` and compared against the code list of
    every ``REGION_CODES`` entry, in dictionary order; the first entry whose
    list contains it wins.

    Parameters
    ----------
    region_code : Any
        Region code to look up; compared as ``str(region_code)``.

    Returns
    -------
    str
        The matching ``REGION_CODES`` key, or ``"Unknown"`` if no entry lists
        the code.
    """
    wanted = str(region_code)
    return next(
        (name for name, code_list in REGION_CODES.items() if wanted in code_list),
        "Unknown",
    )

# Single "master" classifier for one plate string. It:
#   - splits the plate into letters / numbers / region code
#   - looks the plate up in GOVERNMENT_CODES
#   - renames the matched description through RENAME_MAP
#   - resolves the region code to a region name
def classify_plate(plate):
    """Break a plate into its parts and attach government-series metadata.

    The plate is sliced positionally: character 0 plus characters 4-5 form the
    letter series, characters 1-3 are parsed as the integer number, and
    everything from character 6 onward is the region code.

    ``GOVERNMENT_CODES`` is scanned in dictionary order. Its keys are
    ``(letters, (lo, hi), region_code)`` and its values are
    ``(description, forbidden, advantage, significance)``. The first entry
    whose letters and region match and whose inclusive range contains the
    number supplies the metadata; with no match the defaults
    ``("NON_GOVERNMENT", False, False, 0)`` are used. The description is then
    passed through ``RENAME_MAP`` (unmapped descriptions stay unchanged).

    Parameters
    ----------
    plate : str or missing value
        Plate text. A value for which ``pd.isna`` is true yields a row of
        defaults with ``None`` for the three plate parts.

    Returns
    -------
    pandas.Series
        Indexed by ``letters``, ``numbers``, ``region_code``, ``description``,
        ``forbidden``, ``advantage``, ``significance``, ``region_name``.

    Raises
    ------
    ValueError
        If characters 1-3 of the plate cannot be parsed by ``int``.
    """
    columns = [
        "letters", "numbers", "region_code",
        "description", "forbidden", "advantage", "significance",
        "region_name",
    ]
    # Metadata used both for missing plates and for plates with no government match.
    fallback = ("NON_GOVERNMENT", False, False, 0)

    if pd.isna(plate):
        return pd.Series([None, None, None, *fallback, "Unknown"], index=columns)

    # Positional split of the plate text.
    series_letters = plate[0] + plate[4:6]
    serial = int(plate[1:4])
    region = plate[6:]

    # First GOVERNMENT_CODES entry that matches on letters, region and number range.
    matched = next(
        (
            attrs
            for (key_letters, (low, high), key_region), attrs in GOVERNMENT_CODES.items()
            if series_letters == key_letters
            and region == key_region
            and low <= serial <= high
        ),
        fallback,
    )
    description, is_forbidden, has_advantage, significance = matched

    # Collapse the verbose description to its short label when one is defined.
    description = RENAME_MAP.get(description, description)

    region_label = get_region_name(region)

    return pd.Series(
        [
            series_letters, serial, region,
            description, is_forbidden, has_advantage, significance,
            region_label,
        ],
        index=columns,
    )

In [9]:
def preprocess_data(df):
    """Return ``df`` extended with the plate-derived feature columns.

    ``classify_plate`` is applied once to every value of ``df["plate"]`` and
    the columns it produces (``letters``, ``numbers``, ``region_code``,
    ``description``, ``forbidden``, ``advantage``, ``significance``,
    ``region_name``) are joined onto the frame. The caller's frame is not
    modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``plate`` column. It must not already contain any of the
        generated column names, because ``DataFrame.join`` raises on
        overlapping columns.

    Returns
    -------
    pandas.DataFrame
        The joined frame with ``forbidden`` / ``advantage`` as bool,
        ``significance`` as int and ``region_name`` as a categorical whose
        categories are the ``REGION_CODES`` keys plus ``"Unknown"``.
    """
    # Apply the single classifier once; each plate yields one row of features.
    plate_features = df["plate"].apply(classify_plate)
    df = df.join(plate_features)

    # Fix dtypes
    df["forbidden"] = df["forbidden"].astype(bool)
    df["advantage"] = df["advantage"].astype(bool)
    df["significance"] = df["significance"].astype(int)

    # classify_plate labels plates without a known region as "Unknown". That
    # label has to be a declared category, otherwise pd.Categorical silently
    # turns those rows into NaN.
    region_categories = list(REGION_CODES.keys())
    if "Unknown" not in region_categories:
        region_categories.append("Unknown")
    df["region_name"] = pd.Categorical(df["region_name"], categories=region_categories)
    return df


In [10]:
# Derive the plate features for the training set, persist the enriched frame
# as CSV, then display it.
train_merged = train.pipe(preprocess_data)
train_merged.to_csv('data/train_merged.csv', index=False)

train_merged

Unnamed: 0,id,plate,date,price,letters,numbers,region_code,description,forbidden,advantage,significance,region_name
0,1,X059CP797,2024-12-01,65000,XCP,59,797,NON_GOVERNMENT,False,False,0,Moscow
1,2,Y800MH790,2024-07-01,100000,YMH,800,790,NON_GOVERNMENT,False,False,0,Moscow Oblast
2,3,A212TX77,2024-04-01,290000,ATX,212,77,NON_GOVERNMENT,False,False,0,Moscow
3,4,P001AY199,2025-01-01,680000,PAY,1,199,NON_GOVERNMENT,False,False,0,Moscow
4,5,P001AY199,2025-01-01,750000,PAY,1,199,NON_GOVERNMENT,False,False,0,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...
51630,51631,X023PP797,2025-01-01,70000,XPP,23,797,NON_GOVERNMENT,False,False,0,Moscow
51631,51632,M004KA161,2025-01-01,1600000,MKA,4,161,NON_GOVERNMENT,False,False,0,Rostov Oblast
51632,51633,E888EB199,2025-02-01,850000,EEB,888,199,NON_GOVERNMENT,False,False,0,Moscow
51633,51634,X023XK77,2024-04-01,150000,XXK,23,77,NON_GOVERNMENT,False,False,0,Moscow


In [11]:
# Reload the enriched training set from the CSV written by the previous cell;
# this rebinds `train`, replacing the frame loaded from data/train.csv.
# NOTE(review): read_csv re-infers dtypes, so digit-only text columns such as
# region_code may come back as integers (dropping any leading zeros) and date
# comes back as plain strings -- confirm the following cells expect that.
train = pd.read_csv('data/train_merged.csv')
train

Unnamed: 0,id,plate,date,price,letters,numbers,region_code,description,forbidden,advantage,significance,region_name
0,1,X059CP797,2024-12-01,65000,XCP,59,797,NON_GOVERNMENT,False,False,0,Moscow
1,2,Y800MH790,2024-07-01,100000,YMH,800,790,NON_GOVERNMENT,False,False,0,Moscow Oblast
2,3,A212TX77,2024-04-01,290000,ATX,212,77,NON_GOVERNMENT,False,False,0,Moscow
3,4,P001AY199,2025-01-01,680000,PAY,1,199,NON_GOVERNMENT,False,False,0,Moscow
4,5,P001AY199,2025-01-01,750000,PAY,1,199,NON_GOVERNMENT,False,False,0,Moscow
...,...,...,...,...,...,...,...,...,...,...,...,...
51630,51631,X023PP797,2025-01-01,70000,XPP,23,797,NON_GOVERNMENT,False,False,0,Moscow
51631,51632,M004KA161,2025-01-01,1600000,MKA,4,161,NON_GOVERNMENT,False,False,0,Rostov Oblast
51632,51633,E888EB199,2025-02-01,850000,EEB,888,199,NON_GOVERNMENT,False,False,0,Moscow
51633,51634,X023XK77,2024-04-01,150000,XXK,23,77,NON_GOVERNMENT,False,False,0,Moscow


In [12]:
# Number of distinct values in each column of `train` (missing values are not counted).
train.nunique()

id              51635
plate           43605
date               49
price             798
letters          1728
numbers           999
region_code       145
description        53
forbidden           2
advantage           2
significance        8
region_name        88
dtype: int64

In [13]:
# Check for plates that are listed more than once and view their rows
# together (ordered by plate, then date) to see the differences.
duplicate_plates = train.loc[
    lambda frame: frame.duplicated(subset=['plate'], keep=False)
]
duplicate_plates.sort_values(['plate', 'date'])

Unnamed: 0,id,plate,date,price,letters,numbers,region_code,description,forbidden,advantage,significance,region_name
9265,9266,A001BE150,2024-05-01,450000,ABE,1,150,NON_GOVERNMENT,False,False,0,Moscow Oblast
9266,9267,A001BE150,2024-06-01,550000,ABE,1,150,NON_GOVERNMENT,False,False,0,Moscow Oblast
9267,9268,A001BE150,2025-02-01,425000,ABE,1,150,NON_GOVERNMENT,False,False,0,Moscow Oblast
32745,32746,A001BE750,2024-05-01,450000,ABE,1,750,NON_GOVERNMENT,False,False,0,Moscow Oblast
32746,32747,A001BE750,2024-06-01,550000,ABE,1,750,NON_GOVERNMENT,False,False,0,Moscow Oblast
...,...,...,...,...,...,...,...,...,...,...,...,...
11555,11556,Y999XX199,2023-06-01,1500000,YXX,999,199,NON_GOVERNMENT,False,False,0,Moscow
37354,37355,Y999YE790,2025-01-01,400000,YYE,999,790,NON_GOVERNMENT,False,False,0,Moscow Oblast
37355,37356,Y999YE790,2025-02-01,470000,YYE,999,790,NON_GOVERNMENT,False,False,0,Moscow Oblast
29842,29843,Y999YM90,2025-01-01,1200000,YYM,999,90,NON_GOVERNMENT,False,False,0,Moscow Oblast


In [14]:
# Run the test set through the same plate-feature pipeline as the training
# set and save the enriched frame as CSV.
test = pd.read_csv('data/test.csv')
test_merged = test.pipe(preprocess_data)
test_merged.to_csv('data/test_merged.csv', index=False)