In [34]:
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [35]:
# Configuration: locations of the training and test CSV files.
# Both values are placeholders - replace them with real paths before running
# the cells that read these constants.

# Paths to training and test files
# (raw strings, so backslashes in a pasted path are not treated as escapes)
TRAIN_PATH = r"INSERT_YOUR_PATH TO train_base.csv"
TEST_PATH  = r"INSERT_YOUR_PATH TO test_base.csv"

In [36]:
# Read the training CSV, then the test CSV, into Polars DataFrames
df_train, df_test = (
    pl.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

In [37]:
# Print the shape of each dataset, then preview the first training rows
for shape_label, frame in (("Train shape:", df_train), ("Test shape:", df_test)):
    print(shape_label, frame.shape)

df_train.head()

Train shape: (1526659, 5)
Test shape: (10, 4)


case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1


In [38]:
# Measure the share of missing values in every column of train and test

# Switch from Polars to pandas frames; later cells reuse train_pd / test_pd
train_pd = df_train.to_pandas()
test_pd = df_test.to_pandas()


def _null_share(frame):
    """Return the fraction of null entries per column, largest first."""
    return frame.isnull().mean().sort_values(ascending=False)


def _as_percent(share):
    """Render fractions as percentage strings rounded to two decimals."""
    return (share * 100).round(2).astype(str) + "%"


missing_train = _null_share(train_pd)
missing_test = _null_share(test_pd)

print("Top 10 missing columns in TRAIN:")
print(_as_percent(missing_train.head(10)))

print("\nTop 10 missing columns in TEST:")
print(_as_percent(missing_test.head(10)))

Top 10 missing columns in TRAIN:
case_id          0.0%
date_decision    0.0%
MONTH            0.0%
WEEK_NUM         0.0%
target           0.0%
dtype: object

Top 10 missing columns in TEST:
case_id          0.0%
date_decision    0.0%
MONTH            0.0%
WEEK_NUM         0.0%
dtype: object


In [39]:
# Compare train vs. test means for every numeric column the two sets share.
#
# NOTE(review): in the base table the shared numeric columns are case_id, MONTH
# and WEEK_NUM, which look like identifier / calendar fields - a large gap there
# probably reflects how the split was made rather than feature drift. Confirm
# before acting on these numbers.


def _is_mean_comparable(series):
    """Return True when `series` has a numeric, non-boolean dtype.

    Uses the pandas dtype helpers instead of ``np.issubdtype`` because the
    latter raises ``TypeError`` for pandas extension dtypes (e.g. categorical
    or nullable integer columns), which would abort the whole loop.
    Boolean columns are excluded to match the previous ``np.number`` check.
    """
    dtype = series.dtype
    return pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)


# Columns present in both frames, kept in train column order
shared_columns = [c for c in train_pd.columns if c in test_pd.columns]

drift_list = []
for column in shared_columns:
    # Require a numeric dtype on BOTH sides: the test file may be typed
    # differently from train, and .mean() on a non-numeric column is not
    # meaningful.
    if not (_is_mean_comparable(train_pd[column]) and _is_mean_comparable(test_pd[column])):
        continue

    train_mean = train_pd[column].mean()
    test_mean = test_pd[column].mean()
    # Absolute gap between the two means; not scale-normalised, so columns
    # with large magnitudes dominate the ranking below.
    diff = abs(train_mean - test_mean)
    drift_list.append((column, train_mean, test_mean, diff))

# Converting results to DataFrame and sorting by difference
drift_df = pd.DataFrame(drift_list, columns=["feature", "train_mean", "test_mean", "diff"])
print("Mean comparison (numeric shared features):")
print(drift_df.sort_values("diff", ascending=False).head(10))

Mean comparison (numeric shared features):
    feature    train_mean  test_mean          diff
0   case_id  1.286077e+06    57592.4  1.228484e+06
1     MONTH  2.019363e+05   202201.0  2.647120e+02
2  WEEK_NUM  4.076904e+01      100.0  5.923096e+01
