In [2]:
pip install -q pandas

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
pip install -q dask[complete]

[0mNote: you may need to restart the kernel to use updated packages.


In [4]:
!pip install -q humanize  # Run once // for get sizes of each object/

[0m

In [5]:
pip install -q numpy

[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
import numpy as np

In [7]:
train_data_path = '/notebooks/data/train_data.csv';

In [8]:
import sys
import humanize

def print_variable_sizes(top_n=None, scope=None):
    """
    Prints all variables in memory (default: globals()), sorted by size (largest first).
    
    Args:
        top_n (int): Optional. Show only top N variables.
        scope (dict): Optional. Dictionary to inspect (e.g., locals() or globals()).
    """
    if scope is None:
        scope = globals()

    var_list = []
    for k, v in scope.items():
        if k.startswith('_'):
            continue  # skip internal vars
        try:
            size = sys.getsizeof(v)
            var_list.append((k, type(v).__name__, size, humanize.naturalsize(size)))
        except Exception:
            continue

    var_list.sort(key=lambda x: x[2], reverse=True)

    print(f"{'Variable':<20} {'Type':<20} {'Size':<15}")
    print("-" * 55)
    for i, (name, vtype, size, size_str) in enumerate(var_list):
        if top_n is not None and i >= top_n:
            break
        print(f"{name:<20} {vtype:<20} {size_str:<15}")

In [9]:
import pandas as pd

In [10]:
sample = pd.read_csv('/notebooks/data/train_data.csv',  nrows=100_000)

In [11]:
print(sample.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 145.0+ MB
None


In [12]:
int_columns = sample.select_dtypes(include="int64").columns
float_columns = sample.select_dtypes(include="float64").columns
object_cols = sample.select_dtypes(include="object").columns

In [13]:
print_variable_sizes()

Variable             Type                 Size           
-------------------------------------------------------
sample               DataFrame            179.2 MB       
float_columns        Index                11.3 kB        
object_cols          Index                282 Bytes      
In                   list                 184 Bytes      
open                 function             152 Bytes      
print_variable_sizes function             152 Bytes      
int_columns          Index                93 Bytes       
train_data_path      str                  79 Bytes       
pd                   module               72 Bytes       
np                   module               72 Bytes       
sys                  module               72 Bytes       
humanize             module               72 Bytes       
Out                  dict                 64 Bytes       
get_ipython          method               64 Bytes       
unique_vals          dict                 64 Bytes       
exit            

In [14]:
object_cols

Index(['customer_ID', 'S_2', 'D_63', 'D_64'], dtype='object')

In [15]:
sample['D_63'].unique()

array(['CR', 'CO', 'CL', 'XZ', 'XM', 'XL'], dtype=object)

In [16]:
import pandas as pd
from collections import Counter

# Initialize sets and counters
unique_d63 = set()
unique_d64 = set()
count_d63 = Counter()
count_d64 = Counter()
missing_d63 = 0
missing_d64 = 0

# Read in chunks
for chunk in pd.read_csv('/notebooks/data/train_data.csv', usecols=['D_63', 'D_64'], chunksize=100000):
    d63_series = chunk['D_63']
    d64_series = chunk['D_64']
    
    # Count missing values
    missing_d63 += d63_series.isna().sum()
    missing_d64 += d64_series.isna().sum()
    
    # Drop NaNs for unique value scan
    d63_vals = d63_series.dropna()
    d64_vals = d64_series.dropna()
    
    unique_d63.update(d63_vals.unique())
    unique_d64.update(d64_vals.unique())
    
    count_d63.update(d63_vals)
    count_d64.update(d64_vals)

# Add a special category for missing if desired
count_d63['[MISSING]'] = missing_d63
count_d64['[MISSING]'] = missing_d64

# Show results
print("Unique D_63 Categories (excluding NaN):", sorted(unique_d63))
print("D_63 Category Counts:")
for k, v in count_d63.most_common():
    print(f"  {k}: {v}")

print("\nUnique D_64 Categories (excluding NaN):", sorted(unique_d64))
print("D_64 Category Counts:")
for k, v in count_d64.most_common():
    print(f"  {k}: {v}")

Unique D_63 Categories (excluding NaN): ['CL', 'CO', 'CR', 'XL', 'XM', 'XZ']
D_63 Category Counts:
  CO: 4119621
  CR: 930133
  CL: 438390
  XZ: 25786
  XM: 10556
  XL: 6965
  [MISSING]: 0

Unique D_64 Categories (excluding NaN): ['-1', 'O', 'R', 'U']
D_64 Category Counts:
  O: 2913244
  U: 1523448
  R: 840112
  [MISSING]: 217442
  -1: 37205


Lets build solution for handling int and float features , for this will reduce the size like int 64 to int 16 and float 64 to float 32

In [17]:
int_columns

Index(['B_31'], dtype='object')

In [18]:
float_columns

Index(['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42',
       'D_43',
       ...
       'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143',
       'D_144', 'D_145'],
      dtype='object', length=185)

In [19]:
int_min = {}
int_max = {}
float_min = {}
float_max = {}

In [20]:
chunk_size = 100_000

for chunk in pd.read_csv('/notebooks/data/train_data.csv', chunksize=chunk_size):

    # Handle int columns
    for col in int_columns:
        current_min = chunk[col].min()
        current_max = chunk[col].max()

        int_min[col] = min(int_min.get(col, current_min), current_min)
        int_max[col] = max(int_max.get(col, current_max), current_max)

    # Handle float columns
    for col in float_columns:
        current_min = chunk[col].min()
        current_max = chunk[col].max()

        float_min[col] = min(float_min.get(col, current_min), current_min)
        float_max[col] = max(float_max.get(col, current_max), current_max)

In [21]:
print("=== INT COLUMNS ===")
for col in int_columns:
    print(f"{col} → min: {int_min[col]}, max: {int_max[col]}")

print("\n=== FLOAT COLUMNS ===")
for col in float_columns:
    print(f"{col} → min: {float_min[col]}, max: {float_max[col]}")

=== INT COLUMNS ===
B_31 → min: 0, max: 1

=== FLOAT COLUMNS ===
P_2 → min: -0.4589548331054999, max: 1.009999947398878
D_39 → min: 5.026190099677308e-09, max: 5.389619396242631
B_1 → min: -7.588799098308654, max: 1.3240604020685645
B_2 → min: 9.192279870617882e-09, max: 1.009999999641531
R_1 → min: 1.5342230152981531e-09, max: 3.256284433154198
S_3 → min: -0.6271320453080809, max: 5.482888258321006
D_41 → min: 5.566545069335403e-10, max: 8.988807033179892
B_3 → min: 6.2852931681511845e-09, max: 1.6252622436744049
D_42 → min: -0.0004543302584009, max: 4.191118756428802
D_43 → min: 1.1545498247667041e-07, max: 10.111619400622503
D_44 → min: 5.153088222797917e-10, max: 5.634723661789413
B_4 → min: 3.0993319777294914e-09, max: 19.803285707969977
D_45 → min: 1.5632410106380945e-08, max: 1.6100019829291632
B_5 → min: 2.804821708224381e-11, max: 144.2070226881703
R_2 → min: 8.26574798606927e-10, max: 1.0099999294503634
D_46 → min: -17.28934305264782, max: 16.3199007299845
D_47 → min: -0.0266

In [22]:
FLOAT16_MIN = -65504
FLOAT16_MAX = 65504
FLOAT32_MIN = -3.4e38
FLOAT32_MAX = 3.4e38

def is_low_precision(series, decimal_places=3):
    """Estimate whether values in a float column have low precision"""
    # Sample small chunk to avoid loading everything into memory
    return (series.dropna().head(1000).round(decimal_places) == series.dropna().head(1000)).all()

def choose_best_dtype(min_val, max_val, precision_low):
    if FLOAT16_MIN <= min_val <= FLOAT16_MAX and FLOAT16_MIN <= max_val <= FLOAT16_MAX and precision_low:
        return 'float16'
    elif FLOAT32_MIN <= min_val <= FLOAT32_MAX and FLOAT32_MIN <= max_val <= FLOAT32_MAX:
        return 'float32'
    else:
        return 'float64'

# Process recommendations
downcast_recommendations = {}

for col in float_columns:
    min_val = float_min[col]
    max_val = float_max[col]

    # You should sample a few rows from the original column to check precision
    # Let's assume you can grab a small sample from a chunk
    for chunk in pd.read_csv(train_data_path, usecols=[col], chunksize=10000):
        try:
            precision_low = is_low_precision(chunk[col])
            break  # only need first valid chunk
        except Exception:
            continue

    best_dtype = choose_best_dtype(min_val, max_val, precision_low)
    downcast_recommendations[col] = best_dtype

# Print the summary
print("=== FLOAT COLUMN TYPE RECOMMENDATIONS ===")
for col in float_columns:
    print(f"{col}: min={float_min[col]}, max={float_max[col]} → recommend: {downcast_recommendations[col]}")

=== FLOAT COLUMN TYPE RECOMMENDATIONS ===
P_2: min=-0.4589548331054999, max=1.009999947398878 → recommend: float32
D_39: min=5.026190099677308e-09, max=5.389619396242631 → recommend: float32
B_1: min=-7.588799098308654, max=1.3240604020685645 → recommend: float32
B_2: min=9.192279870617882e-09, max=1.009999999641531 → recommend: float32
R_1: min=1.5342230152981531e-09, max=3.256284433154198 → recommend: float32
S_3: min=-0.6271320453080809, max=5.482888258321006 → recommend: float32
D_41: min=5.566545069335403e-10, max=8.988807033179892 → recommend: float32
B_3: min=6.2852931681511845e-09, max=1.6252622436744049 → recommend: float32
D_42: min=-0.0004543302584009, max=4.191118756428802 → recommend: float32
D_43: min=1.1545498247667041e-07, max=10.111619400622503 → recommend: float32
D_44: min=5.153088222797917e-10, max=5.634723661789413 → recommend: float32
B_4: min=3.0993319777294914e-09, max=19.803285707969977 → recommend: float32
D_45: min=1.5632410106380945e-08, max=1.61000198292916

In [23]:
float16_cols = []
float32_cols = []
float64_cols = []

for col in float_columns:
    dtype = downcast_recommendations[col]
    if dtype == 'float16':
        float16_cols.append(col)
    elif dtype == 'float32':
        float32_cols.append(col)
    else:
        float64_cols.append(col)

# Now you have three lists with column names categorized by recommended dtype:
print("Float16 columns:", float16_cols)
print("Float32 columns:", float32_cols)
print("Float64 columns:", float64_cols)

Float16 columns: ['D_66', 'D_68', 'B_30', 'D_87', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126']
Float32 columns: ['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'S_18', 'D_86', 'R_17', 'R_18', 'D_88', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R

Handled integer data set

In [24]:
sample['B_31'] = sample['B_31'].astype('uint8')

In [25]:
sample[float16_cols] = sample[float16_cols].astype('float16')
sample[float32_cols] = sample[float32_cols].astype('float32')
sample[float64_cols] = sample[float64_cols].astype('float64')

In [26]:
num_zeros = (sample['B_31'] == 0).sum()
num_ones = (sample['B_31'] == 1).sum()
num_missing = sample['B_31'].isna().sum()

print(f"Zeros: {num_zeros}, Ones: {num_ones}, Missing: {num_missing}")

Zeros: 287, Ones: 99713, Missing: 0


In [27]:
import pandas as pd

# Initialize counters
zero_count = 0
one_count = 0
missing_count = 0

chunk_size = 100_000
file_path = '/notebooks/data/train_data.csv'

# Loop through dataset in chunks
for chunk in pd.read_csv(file_path, usecols=['B_31'], chunksize=chunk_size):
    zero_count += (chunk['B_31'] == 0).sum()
    one_count += (chunk['B_31'] == 1).sum()
    missing_count += chunk['B_31'].isna().sum()

# Print final summary
print(f"Total Zeros: {zero_count}")
print(f"Total Ones: {one_count}")
print(f"Total Missing: {missing_count}")

Total Zeros: 16907
Total Ones: 5514544
Total Missing: 0


In [28]:
sample['B_31'] = sample['B_31'].astype(bool)

Handling object dtype data.

customer_ID (Unique ID)
Nature: Completely unique.
Best action: Keep as string or drop if not needed for modeling.

In [29]:
sample['customer_ID'] = sample['customer_ID'].astype('string')

S_2 (Datetime stamp)
Nature: Dates.
Best action: Convert to datetime64.

In [30]:
sample['S_2'] = pd.to_datetime(sample['S_2'])

D_63 (Categorical)
6 unique categories, no missing values.
High cardinality with clear frequency difference

In [31]:
sample['D_63'] = sample['D_63'].astype('category')

D_64 (Categorical)
5 values: O, U, R, -1, and missing.
Convert to categorical with missing handled.

In [32]:
sample['D_64'] = sample['D_64'].astype('category')

In [33]:
# Add 'MISSING' to categories before filling NaNs
if 'MISSING' not in sample['D_64'].cat.categories:
    sample['D_64'] = sample['D_64'].cat.add_categories('MISSING')

# Now fill NaNs safely
sample['D_64'] = sample['D_64'].fillna('MISSING')

In [34]:
print_variable_sizes()

Variable             Type                 Size           
-------------------------------------------------------
sample               DataFrame            85.2 MB        
d63_vals             Series               2.1 MB         
d64_vals             Series               2.0 MB         
d63_series           Series               1.9 MB         
d64_series           Series               1.8 MB         
chunk                DataFrame            251.8 kB       
float_columns        Index                11.3 kB        
float_min            dict                 6.6 kB         
float_max            dict                 6.6 kB         
downcast_recommendations dict                 6.6 kB         
Counter              type                 1.7 kB         
float32_cols         list                 1.7 kB         
unique_d63           set                  728 Bytes      
In                   list                 376 Bytes      
count_d63            Counter              296 Bytes      
object_cols 

In [35]:
# Path to your full CSV
chunk_size = 100_000

# Initialize counters
total_rows = 0
nan_counts = None

# Iterate through file in chunks
for chunk in pd.read_csv(train_data_path, chunksize=chunk_size):
    total_rows += len(chunk)

    # Initialize nan_counts with column names in first chunk
    if nan_counts is None:
        nan_counts = chunk.isna().sum()
    else:
        nan_counts += chunk.isna().sum()

# Compute missing percentage
nan_percentage = (nan_counts / total_rows) * 100
nan_percentage = nan_percentage.sort_values(ascending=False)

# Display top 20 columns with highest missing percentage
print("Top 20 columns with most missing values:")
print(nan_percentage.head(20))

# Optional: columns with > 60% missing
high_nan_cols = nan_percentage[nan_percentage > 60].index.tolist()
print(f"\nColumns with more than 60% missing: {len(high_nan_cols)}")
print(high_nan_cols)

Top 20 columns with most missing values:
D_87     99.930127
D_88     99.891457
D_108    99.476846
D_111    99.433530
D_110    99.433530
B_39     99.391986
D_73     98.990211
B_42     98.707789
D_134    96.480146
D_135    96.480146
D_136    96.480146
D_137    96.480146
D_138    96.480146
R_9      94.349891
B_29     93.104594
D_106    90.213255
D_132    90.191055
D_49     90.137597
R_26     88.984717
D_76     88.746226
dtype: float64

Columns with more than 60% missing: 25
['D_87', 'D_88', 'D_108', 'D_111', 'D_110', 'B_39', 'D_73', 'B_42', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'R_9', 'B_29', 'D_106', 'D_132', 'D_49', 'R_26', 'D_76', 'D_66', 'D_42', 'D_142', 'D_53', 'D_82']


In [36]:
target = pd.read_csv('/notebooks/data/train_labels.csv',  nrows=100_000)

In [37]:
target.head(10)

Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0
5,000084e5023181993c2e1b665ac88dbb1ce9ef621ec537...,0
6,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,0
7,0000d17a1447b25a01e42e1ac56b091bb7cbb06317be4c...,0
8,0000f99513770170a1aba690daeeb8a96da4a39f11fc27...,1
9,00013181a0c5fc8f1ea38cd2b90fe8ad2fa8cad9d9f13e...,1


In [38]:
merged = sample.merge(target, on='customer_ID', how='left')

In [39]:
high_corr_cols = []
low_corr_cols = []

for col in high_nan_cols:
    missing_flag = merged[col].isna().astype(int)
    corr = missing_flag.corr(merged['target'])
    print(f"{col}: missingness vs target correlation = {corr:.4f}")
    
    if abs(corr) > 0.05:  # correlation threshold can be adjusted
        high_corr_cols.append(col)
    else:
        low_corr_cols.append(col)

print("\nHigh missingness correlation columns:", high_corr_cols)
print("Low missingness correlation columns:", low_corr_cols)

D_87: missingness vs target correlation = -0.0302
D_88: missingness vs target correlation = -0.0451
D_108: missingness vs target correlation = -0.0315
D_111: missingness vs target correlation = -0.0631
D_110: missingness vs target correlation = -0.0631
B_39: missingness vs target correlation = -0.0619
D_73: missingness vs target correlation = -0.0193
B_42: missingness vs target correlation = 0.0282
D_134: missingness vs target correlation = -0.1583
D_135: missingness vs target correlation = -0.1583
D_136: missingness vs target correlation = -0.1583
D_137: missingness vs target correlation = -0.1583
D_138: missingness vs target correlation = -0.1583
R_9: missingness vs target correlation = -0.1956
B_29: missingness vs target correlation = 0.0021
D_106: missingness vs target correlation = -0.2239
D_132: missingness vs target correlation = -0.2246
D_49: missingness vs target correlation = -0.2257
R_26: missingness vs target correlation = -0.2732
D_76: missingness vs target correlation = 0

In [40]:
# Load full train labels (assuming it fits in memory)
train_labels = pd.read_csv('/notebooks/data/train_labels.csv')  # adjust filename/path

chunk_size = 100_000  # or any chunk size suitable for your memory

# Initialize counters to accumulate data for correlation
missing_target_sum = {col: 0 for col in high_nan_cols}
missing_sum = {col: 0 for col in high_nan_cols}
target_sum = 0
total_rows = 0

for chunk in pd.read_csv(train_data_path, chunksize=chunk_size):
    # Merge chunk with labels on customer_ID
    merged = chunk.merge(train_labels, on='customer_ID', how='left')

    total_rows += len(merged)
    target_sum += merged['target'].sum()

    for col in high_nan_cols:
        missing_flag = merged[col].isna().astype(int)
        missing_sum[col] += missing_flag.sum()
        missing_target_sum[col] += ((missing_flag == 1) & (merged['target'] == 1)).sum()

# Calculate correlation for each column
high_corr_cols = []
low_corr_cols = []

for col in high_nan_cols:
    p_missing = missing_sum[col] / total_rows
    p_target = target_sum / total_rows
    p_missing_target = missing_target_sum[col] / total_rows

    numerator = p_missing_target - (p_missing * p_target)
    denominator = np.sqrt(p_missing * (1 - p_missing) * p_target * (1 - p_target))
    corr = numerator / denominator if denominator != 0 else 0

    print(f"{col}: missingness vs target correlation = {corr:.4f}")

    if abs(corr) > 0.05:  # your threshold
        high_corr_cols.append(col)
    else:
        low_corr_cols.append(col)

print("\nHigh missingness correlation columns:", high_corr_cols)
print("Low missingness correlation columns:", low_corr_cols)

D_87: missingness vs target correlation = -0.0378
D_88: missingness vs target correlation = -0.0417
D_108: missingness vs target correlation = -0.0302
D_111: missingness vs target correlation = -0.0539
D_110: missingness vs target correlation = -0.0539
B_39: missingness vs target correlation = -0.0541
D_73: missingness vs target correlation = -0.0295
B_42: missingness vs target correlation = 0.0353
D_134: missingness vs target correlation = -0.1582
D_135: missingness vs target correlation = -0.1582
D_136: missingness vs target correlation = -0.1582
D_137: missingness vs target correlation = -0.1582
D_138: missingness vs target correlation = -0.1582
R_9: missingness vs target correlation = -0.1867
B_29: missingness vs target correlation = 0.0183
D_106: missingness vs target correlation = -0.2292
D_132: missingness vs target correlation = -0.2293
D_49: missingness vs target correlation = -0.2302
R_26: missingness vs target correlation = -0.2689
D_76: missingness vs target correlation = 0

In [41]:
import pandas as pd
import numpy as np

# Create all missing flags in a dictionary first
missing_flag_cols = {
    f"{col}_was_missing": sample[col].isna().astype(np.uint8)
    for col in high_corr_cols
}

# Create a DataFrame from the dictionary
missing_flags_df = pd.DataFrame(missing_flag_cols, index=sample.index)

# Concatenate all at once to avoid fragmentation
sample = pd.concat([sample, missing_flags_df], axis=1)

# Now fill NaNs with appropriate sentinel values
for col in high_corr_cols:
    dtype = sample[col].dtype
    if dtype == np.float16:
        sentinel = -100.0
    elif dtype == np.float32:
        sentinel = -999.0
    else:
        sentinel = -1
    sample[col] = sample[col].fillna(sentinel)

In [42]:
sample.drop(columns=low_corr_cols, inplace=True)

In [43]:
# Perform one-hot encoding (no need to fillna or astype again)
dummies = pd.get_dummies(sample[['D_63', 'D_64']], prefix=['D_63', 'D_64'], dtype=np.uint8)

# Drop original categorical columns and concatenate encoded ones
sample = pd.concat([sample.drop(['D_63', 'D_64'], axis=1), dummies], axis=1)

In [44]:
print_variable_sizes()

Variable             Type                 Size           
-------------------------------------------------------
sample               DataFrame            85.8 MB        
train_labels         DataFrame            59.2 MB        
merged               DataFrame            56.6 MB        
chunk                DataFrame            56.4 MB        
target               DataFrame            12.9 MB        
d63_vals             Series               2.1 MB         
d64_vals             Series               2.0 MB         
missing_flags_df     DataFrame            1.9 MB         
d63_series           Series               1.9 MB         
d64_series           Series               1.8 MB         
dummies              DataFrame            1.1 MB         
missing_flag         Series               251.8 kB       
nan_percentage       Series               17.3 kB        
nan_counts           Series               13.2 kB        
float_columns        Index                11.3 kB        
float_min       

In [45]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 212 entries, customer_ID to D_64_MISSING
dtypes: bool(1), datetime64[ns](1), float16(9), float32(170), string(1), uint8(30)
memory usage: 71.0 MB


In [46]:
numeric_cols = sample.select_dtypes(include=['float16', 'float32']).columns

In [47]:
cardinality = sample[numeric_cols].nunique(dropna=True).sort_values()
print(cardinality)

D_116        2
D_114        2
D_120        2
D_66         3
D_126        3
         ...  
B_28     99922
B_23     99926
B_40     99927
B_37     99928
B_1      99929
Length: 179, dtype: int64


In [48]:
low_card = cardinality[cardinality < 20]
print("Low-cardinality numeric columns:\n", low_card)

Low-cardinality numeric columns:
 D_116    2
D_114    2
D_120    2
D_66     3
D_126    3
B_30     3
D_117    7
B_38     7
D_68     7
dtype: int64


In [10]:
import pandas as pd

chunk_size = 100_000
sample_size = 10_000
unique_vals = {} 

for chunk in pd.read_csv(train_data_path, chunksize=chunk_size):
    sample = chunk.sample(n=min(sample_size, len(chunk)), random_state=42)
    numeric_cols = sample.select_dtypes(include=['float16', 'float32','float64']).columns
    for col in numeric_cols:
        if col not in unique_vals:
            unique_vals[col] = set()
        unique_vals[col].update(sample[col].dropna().unique())


# Now compute actual cardinality
cardinality = {col: len(vals) for col, vals in unique_vals.items()}
cardinality_series = pd.Series(cardinality).sort_values()

# Show low-cardinality numeric columns (e.g., < 20 unique values)
low_card_cols = cardinality_series[cardinality_series < 20]
print("Low-cardinality numeric columns:")
print(low_card_cols)

Low-cardinality numeric columns:
D_87     1
D_120    2
D_66     2
D_116    2
D_114    2
D_126    3
B_30     3
D_117    7
B_38     7
D_68     7
dtype: int64
