In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Load the raw listings; the cells below all operate on this `df`.
df = pd.read_csv('used_cars.csv')

# Only the final expression of a cell is echoed, so the shape and the column
# index have to be displayed explicitly or their values are thrown away.
display(df.shape, df.columns)
df.head(10)

In [3]:

# Strip "$" and thousands separators, then parse; unparseable text becomes NaN.
df['price'] = pd.to_numeric(
    df['price'].astype(str).str.replace(r'[\$,]', '', regex=True),
    errors='coerce',
)

# Keep only the digits of the mileage text; an empty result becomes NaN.
df['milage'] = pd.to_numeric(
    df['milage'].astype(str).str.replace(r'[^\d]', '', regex=True),
    errors='coerce',
)

df['model_year'] = pd.to_numeric(df['model_year'], errors='coerce')

# Engine displacement in litres. The number must be followed by a litre unit
# ("3.7L", "3.5 Liter", "2.0 litres"); taking the first digit of the string
# would pick up a leading horsepower figure instead ("420.0HP 6.2L" -> 4).
# Strings with no litre figure yield NaN.
df['engine_l'] = pd.to_numeric(
    df['engine'].astype(str).str.extract(
        r'(?i)(\d+(?:\.\d+)?)\s*L(?:it(?:er|re)s?)?\b', expand=False
    ),
    errors='coerce',
)


In [None]:
# Per-column count of missing values.
na_per_column = df.isna().sum()
print(na_per_column)

# Distribution of the raw accident text, with NaN kept as its own bucket.
accident_counts = df['accident'].value_counts(dropna=False)
print(accident_counts)


In [7]:
# Binary target: 0 for "None reported", 1 for a reported accident/damage.
# Any value not listed here (including NaN) maps to NaN.
accident_to_label = {
    'None reported': 0,
    'At least 1 accident or damage reported': 1,
}
df['accident_label'] = df['accident'].map(accident_to_label)


In [None]:
# For each column: add a 0/1 `<col>_missing` flag, then print the share of each
# accident label within the flag=0 and flag=1 groups (rows normalised to 1).
for position, source_col in enumerate(['clean_title', 'fuel_type', 'engine_l']):
    flag_col = source_col + '_missing'
    df[flag_col] = df[source_col].isna().astype(int)
    label_share = pd.crosstab(df[flag_col], df['accident_label'], normalize='index')
    # The first report has no leading blank line; the following ones do.
    lead = "" if position == 0 else "\n"
    print(f"{lead}{source_col} missingness vs accident:\n", label_share)


In [None]:
# Rows whose accident text did not map to 0/1 have no usable target; drop them
# and store the label as an integer.
rows_before = len(df)
print("Rows before dropping missing target:", rows_before)

has_label = df['accident_label'].notna()
df = df.loc[has_label].copy()
df['accident_label'] = df['accident_label'].astype(int)

print("Rows after dropping missing target:", len(df))

In [None]:
# Make sure every `<col>_missing` flag exists before the NaNs are filled.
for source_col in ['clean_title', 'fuel_type', 'engine_l']:
    flag_name = source_col + '_missing'

    # Flag already built: just report how many rows it marks.
    if flag_name in df.columns:
        print(f"{flag_name} already exists. Sum(flag)= {df[flag_name].sum()}")
        continue

    # Source column absent altogether: add an all-NaN placeholder and a zero flag.
    if source_col not in df.columns:
        df[flag_name] = 0
        df[source_col] = np.nan
        print(f"{source_col} missing — created {flag_name}=0 and {source_col}=NaN placeholder")
        continue

    df[flag_name] = df[source_col].isna().astype(int)
    print(f"Created {flag_name}")


In [None]:
# Text columns: missing values become the explicit category 'Unknown'.
for text_col in ('clean_title', 'fuel_type'):
    if text_col not in df.columns:
        continue
    n_missing_before = df[text_col].isna().sum()
    df[text_col] = df[text_col].fillna('Unknown')
    n_missing_after = df[text_col].isna().sum()
    print(f"{text_col}: filled {n_missing_before - n_missing_after} missing -> 'Unknown'")

# Engine size: missing values take the column median.
# NOTE(review): the median is computed on the whole frame, i.e. before the
# train/test split performed further down.
if 'engine_l' in df.columns:
    n_missing_before = df['engine_l'].isna().sum()
    median_engine = df['engine_l'].median()
    df['engine_l'] = df['engine_l'].fillna(median_engine)
    n_filled = n_missing_before - df['engine_l'].isna().sum()
    print(f"engine_l: filled {n_filled} missing -> median {median_engine}")

In [None]:
# Candidate categorical columns, restricted to those present in the frame.
candidate_cat_cols = ['brand','model','fuel_type','transmission','ext_col','int_col','clean_title']
cat_cols = [name for name in candidate_cat_cols if name in df.columns]

# Distinct-value count per column (NaN counts as a value).
card = {}
for name in cat_cols:
    card[name] = df[name].nunique(dropna=False)

print("Categorical columns and unique counts:")
for name, n_unique in card.items():
    print(f"  - {name}: {n_unique} unique values")


In [17]:
# Encoding configuration read by the cells below.

# Columns expanded into one indicator column per category.
ONE_HOT_COLS = [
    'fuel_type',
    'clean_title',
]

# Columns replaced by the relative frequency of each category.
FREQ_COLS = [
    'brand',
    'model',
    'transmission',
    'ext_col',
    'int_col',
]

# Categories seen fewer than this many times are pooled into 'Other'.
RARE_GROUP_THRESH = 10

# When True, the source text columns are dropped once they are encoded.
DROP_ORIGINALS = True

In [18]:
# Warn about configured columns that the frame does not actually contain.
configured_cols = ONE_HOT_COLS + FREQ_COLS
missing_cols = [name for name in configured_cols if name not in df.columns]
if len(missing_cols) > 0:
    print("Warning: these columns were not found in df and will be skipped:", missing_cols)


In [None]:
# Fill any remaining NaNs in the columns about to be encoded.
# dict.fromkeys de-duplicates while keeping the configured order; iterating a
# set of strings would make the log order change between kernel restarts.
for c in dict.fromkeys(ONE_HOT_COLS + FREQ_COLS):
    if c not in df.columns:
        continue
    if df[c].isna().sum() > 0:
        df[c] = df[c].fillna('Unknown')
        print(f"Filled NaNs in {c} with 'Unknown' (was present).")
    else:
        print(f"No NaNs in {c} (or already filled).")

In [20]:
# Only columns with more than this many distinct values get rare-level grouping.
HIGH_CARDINALITY_MIN_UNIQUES = 200

# Pool rare categories of high-cardinality columns into 'Other', writing the
# result to `<col>_grp` and pointing FREQ_COLS at the grouped column.
# Iterate over a snapshot because FREQ_COLS is rebound inside the loop.
for c in list(FREQ_COLS):
    if c not in df.columns:
        continue

    n = df[c].nunique()
    if n <= HIGH_CARDINALITY_MIN_UNIQUES:
        print(f"{c} has n={n} uniques; skipping rare-grouping")
        continue

    counts = df[c].value_counts()
    rare_vals = counts[counts < RARE_GROUP_THRESH].index
    if len(rare_vals) == 0:
        print(f"No rare categories to group for {c} (n={n})")
        continue

    newcol = c + '_grp'
    if newcol in df.columns:
        # The grouped column was built on an earlier run. FREQ_COLS must still be
        # repointed at it: if the config cell was re-run in between, the raw
        # column would otherwise be frequency-encoded instead of the grouped one.
        FREQ_COLS = [newcol if x == c else x for x in FREQ_COLS]
        print(f"{newcol} already exists; skipping rare-group step for {c}")
        continue

    df[newcol] = df[c].where(~df[c].isin(rare_vals), other='Other')
    print(f"Grouped {len(rare_vals)} rare categories into 'Other' for {c} -> new column {newcol}")
    df[newcol] = df[newcol].astype(str)
    FREQ_COLS = [newcol if x == c else x for x in FREQ_COLS]

brand has n=57 uniques; skipping rare-grouping
Grouped 1830 rare categories into 'Other' for model -> new column model_grp
transmission has n=62 uniques; skipping rare-grouping
Grouped 298 rare categories into 'Other' for ext_col -> new column ext_col_grp
int_col has n=152 uniques; skipping rare-grouping


In [None]:
# One-hot encode each configured column, appending the indicator columns to df.
# `onehot_created` collects the names of the indicator columns.
onehot_created = []
for col in ONE_HOT_COLS:
    if col not in df.columns:
        print(f"{col} not in df; skipping one-hot.")
        continue

    prefix = col + '_'
    # `<col>_missing` is the missingness flag built earlier in the notebook, not
    # a one-hot column. It shares the prefix, so it must be excluded here;
    # otherwise the encoding is always judged "already done" and skipped.
    existing = [c for c in df.columns if c.startswith(prefix) and c != col + '_missing']
    if existing:
        print(f"One-hot columns for {col} appear to already exist; skipping creation.")
        onehot_created += existing
        continue

    # dtype=int keeps the indicators as 0/1; newer pandas defaults to bool,
    # which would be written to CSV as True/False.
    dummies = pd.get_dummies(df[col].astype(str), prefix=col, dummy_na=False, dtype=int)
    df = pd.concat([df, dummies], axis=1)
    created = list(dummies.columns)
    onehot_created += created
    print(f"One-hot encoded {col} -> created {len(created)} columns (examples: {created[:4]})")

In [None]:
# Frequency-encode each configured column: `<col>_freq` holds the share of rows
# carrying that row's category. `freq_created` collects the new column names.
# NOTE(review): shares are computed on the whole frame, i.e. before the
# train/test split performed further down.
freq_created = []
for source in FREQ_COLS:
    target = source + '_freq'
    if source not in df.columns:
        print(f"{source} not in df; skipping freq-encoding.")
    elif target in df.columns:
        print(f"{target} already exists; skipping freq-encoding for {source}.")
        freq_created.append(target)
    else:
        share_by_value = df[source].value_counts(normalize=True)
        df[target] = df[source].map(share_by_value).fillna(0)
        freq_created.append(target)
        print(f"Frequency-encoded {source} -> {target} (sample mapping for top 3: {share_by_value.head(3).to_dict()})")

In [None]:
# Drop the raw text columns once their encoded versions exist.
if DROP_ORIGINALS:
    grp_suffix = '_grp'
    # Map `<col>_grp` entries back to `<col>`. Only a trailing suffix is
    # stripped, so a name that merely contains "_grp" is left intact.
    base_names = [
        c[:-len(grp_suffix)] if c.endswith(grp_suffix) else c
        for c in FREQ_COLS
    ]
    # Order-preserving de-dup keeps the drop list (and its log line) stable
    # between runs, unlike iterating a set of strings.
    originals_to_drop = [
        c for c in dict.fromkeys(ONE_HOT_COLS + base_names) if c in df.columns
    ]
    if originals_to_drop:
        df = df.drop(columns=originals_to_drop)
        print("Dropped original text columns:", originals_to_drop)
    else:
        print("No original text columns found to drop.")

In [None]:
# Show up to ten names from each list of encoded columns.
print("\n=== Verification ===")
for caption, created_cols in (
    ("One-hot columns created (sample):", onehot_created),
    ("Freq columns created (sample):", freq_created),
):
    print(caption, created_cols[:10])

In [None]:
# Preview at most 20 of the encoded / flag columns that actually exist in df.
flag_cols = ['clean_title_missing','fuel_type_missing','engine_l_missing']
preview_candidates = onehot_created + freq_created + flag_cols
preview_cols = [name for name in preview_candidates if name in df.columns]
preview_cols = preview_cols[:20]

print("\nPreview (first 5 rows) of encoded columns:")
display(df[preview_cols].head(5))

In [None]:
# (rows, columns) of the working frame at this point.
encoded_shape = df.shape
print("Dataframe shape after encoding:", encoded_shape)


In [None]:
import pandas as pd, numpy as np

# Numeric inputs; names not present in df are silently left out.
# NOTE(review): no visible cell creates an 'age' column — confirm whether it
# was meant to be derived from model_year.
numeric_features = [
    name
    for name in ('price', 'milage', 'age', 'engine_l', 'model_year')
    if name in df.columns
]

# Frequency-encoded columns are recognised by their suffix.
freq_cols = [name for name in df.columns if name.endswith('_freq')]

# One-hot columns are recognised by prefix. This also matches the
# `fuel_type_missing` / `clean_title_missing` flags; the de-dup below keeps
# each such name only once.
onehot_cols = [
    name for name in df.columns if name.startswith(('fuel_type_', 'clean_title_'))
]

missing_flags = [
    name
    for name in ('clean_title_missing', 'fuel_type_missing', 'engine_l_missing')
    if name in df.columns
]

# Concatenate, then drop repeats while keeping first-occurrence order.
final_features = list(
    dict.fromkeys(numeric_features + freq_cols + onehot_cols + missing_flags)
)

print("Numeric features:", numeric_features)
print("Frequency-encoded features:", freq_cols)
print("One-hot features (detected):", onehot_cols[:10])
print("Missingness flags:", missing_flags)
print("\nNumber of final features:", len(final_features))
print("Sample final features (first 60):", final_features[:60])


In [None]:
# Summary statistics of the numeric inputs, one row per feature.
print("Numeric features summary:")
numeric_summary = df[numeric_features].describe().T
display(numeric_summary[['count','mean','50%','std']])

# For each frequency-encoded column, show the five most common values of its
# source column (when that column is still in df) and of the encoded column.
print("\nTop values / frequencies for freq-encoded columns (top 5):")
for freq_col in freq_cols:
    print(f"\n{freq_col} top 5 value->freq:")
    source_col = freq_col.replace('_freq','')
    if source_col in df.columns:
        top_source = df[source_col].value_counts().head(5)
        print(top_source)
    top_encoded = df[freq_col].value_counts().head(5)
    print(top_encoded)


In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix (remaining NaNs replaced by 0) and integer target.
X = df[final_features].fillna(0).copy()
y = df['accident_label'].copy()

# 80/20 split with a fixed seed, stratified on the target.
split_options = dict(test_size=0.20, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# Label balance: train counts, then train and test proportions.
label_reports = (
    ("\nTrain label distribution:", y_train.value_counts()),
    ("\nTrain label distribution (proportions):", y_train.value_counts(normalize=True)),
    ("\nTest label distribution (proportions):", y_test.value_counts(normalize=True)),
)
for caption, distribution in label_reports:
    print(caption)
    print(distribution)

# Negative-to-positive ratio of the training labels; 1.0 when there are no positives.
neg = int(y_train.eq(0).sum())
pos = int(y_train.eq(1).sum())
if pos != 0:
    scale_pos_weight = neg / pos
else:
    scale_pos_weight = 1.0
print(f"\nTraining negatives: {neg}, positives: {pos}")
print(f"Computed scale_pos_weight (neg/pos): {scale_pos_weight:.6f}")


In [33]:
# Put the label in the first column, followed by the features, for each split.
train_full, test_full = (
    pd.concat([labels, features], axis=1)
    for labels, features in ((y_train, X_train), (y_test, X_test))
)

In [None]:
train_full.to_csv("usedcars_xgb_train.csv", header=False, index=False)
test_full.to_csv("usedcars_xgb_test.csv", header=False, index=False)

print("Saved files:")
!ls -lh usedcars_xgb_train.csv usedcars_xgb_test.csv