In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
df = pd.read_csv('used_cars.csv')  
df.shape, df.columns
df.head(10)

In [3]:

df['price'] = df['price'].astype(str).str.replace(r'[\$,]', '', regex=True)
df['price'] = pd.to_numeric(df['price'], errors='coerce')

df['milage'] = df['milage'].astype(str).str.replace(r'[^\d]', '', regex=True)
df['milage'] = pd.to_numeric(df['milage'], errors='coerce')

df['model_year'] = pd.to_numeric(df['model_year'], errors='coerce')

df['engine_l'] = df['engine'].astype(str).str.extract(r'(\d\.\d|\d)')  # basic
df['engine_l'] = pd.to_numeric(df['engine_l'], errors='coerce')


In [None]:
print(df.isna().sum())
print(df['accident'].value_counts(dropna=False))


In [7]:
df['accident_label'] = df['accident'].map({
    'None reported': 0,
    'At least 1 accident or damage reported': 1
})


In [None]:
df['clean_title_missing'] = df['clean_title'].isna().astype(int)
print("clean_title missingness vs accident:\n", pd.crosstab(df['clean_title_missing'], df['accident_label'], normalize='index'))

df['fuel_type_missing'] = df['fuel_type'].isna().astype(int)
print("\nfuel_type missingness vs accident:\n", pd.crosstab(df['fuel_type_missing'], df['accident_label'], normalize='index'))

df['engine_l_missing'] = df['engine_l'].isna().astype(int)
print("\nengine_l missingness vs accident:\n", pd.crosstab(df['engine_l_missing'], df['accident_label'], normalize='index'))


In [None]:
print("Rows before dropping missing target:", len(df))
df = df[~df['accident_label'].isna()].copy()
df['accident_label'] = df['accident_label'].astype(int)
print("Rows after dropping missing target:", len(df))

In [None]:
for col in ['clean_title', 'fuel_type', 'engine_l']:
    flag = col + '_missing'
    if flag not in df.columns:
        if col in df.columns:
            df[flag] = df[col].isna().astype(int)
            print(f"Created {flag}")
        else:
            df[flag] = 0
            df[col] = np.nan
            print(f"{col} missing — created {flag}=0 and {col}=NaN placeholder")
    else:
        print(f"{flag} already exists. Sum(flag)= {df[flag].sum()}")


In [None]:
for c in ['clean_title', 'fuel_type']:
    if c in df.columns:
        before = df[c].isna().sum()
        df[c] = df[c].fillna('Unknown')
        after = df[c].isna().sum()
        print(f"{c}: filled {before - after} missing -> 'Unknown'")

if 'engine_l' in df.columns:
    median_engine = df['engine_l'].median()
    before = df['engine_l'].isna().sum()
    df['engine_l'] = df['engine_l'].fillna(median_engine)
    after = df['engine_l'].isna().sum()
    print(f"engine_l: filled {before - after} missing -> median {median_engine}")

In [None]:
cat_cols = [c for c in ['brand','model','fuel_type','transmission','ext_col','int_col','clean_title'] if c in df.columns]
card = {c: df[c].nunique(dropna=False) for c in cat_cols}
print("Categorical columns and unique counts:")
for k,v in card.items():
    print(f"  - {k}: {v} unique values")


In [17]:
ONE_HOT_COLS = ['fuel_type', 'clean_title']                
FREQ_COLS = ['brand','model','transmission','ext_col','int_col']  
RARE_GROUP_THRESH = 10   
DROP_ORIGINALS = True  

In [18]:
missing_cols = [c for c in ONE_HOT_COLS + FREQ_COLS if c not in df.columns]
if missing_cols:
    print("Warning: these columns were not found in df and will be skipped:", missing_cols)


In [None]:
for c in set(ONE_HOT_COLS + FREQ_COLS):
    if c in df.columns:
        if df[c].isna().sum() > 0:
            df[c] = df[c].fillna('Unknown')
            print(f"Filled NaNs in {c} with 'Unknown' (was present).")
        else:
            print(f"No NaNs in {c} (or already filled).")

In [20]:
for c in FREQ_COLS:
    if c in df.columns:
        n = df[c].nunique()
        if n > 200:  
            counts = df[c].value_counts()
            rare_vals = counts[counts < RARE_GROUP_THRESH].index
            if len(rare_vals) > 0:
                newcol = c + '_grp'
                if newcol not in df.columns:
                    df[newcol] = df[c].where(~df[c].isin(rare_vals), other='Other')
                    print(f"Grouped {len(rare_vals)} rare categories into 'Other' for {c} -> new column {newcol}")
                    df[newcol] = df[newcol].astype(str)
                    FREQ_COLS = [newcol if x==c else x for x in FREQ_COLS]
                else:
                    print(f"{newcol} already exists; skipping rare-group step for {c}")
            else:
                print(f"No rare categories to group for {c} (n={n})")
        else:
            print(f"{c} has n={n} uniques; skipping rare-grouping")

brand has n=57 uniques; skipping rare-grouping
Grouped 1830 rare categories into 'Other' for model -> new column model_grp
transmission has n=62 uniques; skipping rare-grouping
Grouped 298 rare categories into 'Other' for ext_col -> new column ext_col_grp
int_col has n=152 uniques; skipping rare-grouping


In [None]:
onehot_created = []
for col in ONE_HOT_COLS:
    if col in df.columns:
        prefix = col + '_'
        already = any(c.startswith(prefix) for c in df.columns)
        if already:
            print(f"One-hot columns for {col} appear to already exist; skipping creation.")
            onehot_created += [c for c in df.columns if c.startswith(prefix)]
            continue
        dummies = pd.get_dummies(df[col].astype(str), prefix=col, dummy_na=False)
        df = pd.concat([df, dummies], axis=1)
        created = list(dummies.columns)
        onehot_created += created
        print(f"One-hot encoded {col} -> created {len(created)} columns (examples: {created[:4]})")
    else:
        print(f"{col} not in df; skipping one-hot.")

In [None]:
freq_created = []
for col in FREQ_COLS:
    if col in df.columns:
        newcol = col + '_freq'
        if newcol in df.columns:
            print(f"{newcol} already exists; skipping freq-encoding for {col}.")
            freq_created.append(newcol)
            continue
        freqs = df[col].value_counts(normalize=True)
        df[newcol] = df[col].map(freqs).fillna(0)
        freq_created.append(newcol)
        print(f"Frequency-encoded {col} -> {newcol} (sample mapping for top 3: {freqs.head(3).to_dict()})")
    else:
        print(f"{col} not in df; skipping freq-encoding.")

In [None]:
if DROP_ORIGINALS:
    originals_to_drop = []
    for c in set(ONE_HOT_COLS + [c.replace('_grp','') for c in FREQ_COLS]):  
        if c in df.columns:
            originals_to_drop.append(c)
    if originals_to_drop:
        df.drop(columns=originals_to_drop, inplace=True)
        print("Dropped original text columns:", originals_to_drop)
    else:
        print("No original text columns found to drop.")

In [None]:
print("\n=== Verification ===")
print("One-hot columns created (sample):", onehot_created[:10])
print("Freq columns created (sample):", freq_created[:10])

In [None]:
preview_cols = (onehot_created + freq_created + ['clean_title_missing','fuel_type_missing','engine_l_missing'])
preview_cols = [c for c in preview_cols if c in df.columns][:20]
print("\nPreview (first 5 rows) of encoded columns:")
display(df[preview_cols].head(5))

In [None]:
print("Dataframe shape after encoding:", df.shape)


In [None]:
import pandas as pd, numpy as np

numeric_features = [c for c in ['price','milage','age','engine_l','model_year'] if c in df.columns]

freq_cols = [c for c in df.columns if c.endswith('_freq')]

onehot_cols = [c for c in df.columns if c.startswith('fuel_type_') or c.startswith('clean_title_')]

missing_flags = [c for c in ['clean_title_missing','fuel_type_missing','engine_l_missing'] if c in df.columns]

final_features = numeric_features + freq_cols + onehot_cols + missing_flags

final_features = [f for i,f in enumerate(final_features) if f not in final_features[:i]]

print("Numeric features:", numeric_features)
print("Frequency-encoded features:", freq_cols)
print("One-hot features (detected):", onehot_cols[:10])
print("Missingness flags:", missing_flags)
print("\nNumber of final features:", len(final_features))
print("Sample final features (first 60):", final_features[:60])


In [None]:
print("Numeric features summary:")
display(df[numeric_features].describe().T[['count','mean','50%','std']])

print("\nTop values / frequencies for freq-encoded columns (top 5):")
for c in freq_cols:
    print(f"\n{c} top 5 value->freq:")
    original = c.replace('_freq','')
    if original in df.columns:
        print(df[original].value_counts().head(5))
    print(df[c].value_counts().head(5))


In [None]:
from sklearn.model_selection import train_test_split

X = df[final_features].fillna(0).copy()   
y = df['accident_label'].copy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)
print("\nTrain label distribution:")
print(y_train.value_counts())
print("\nTrain label distribution (proportions):")
print(y_train.value_counts(normalize=True))
print("\nTest label distribution (proportions):")
print(y_test.value_counts(normalize=True))

neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale_pos_weight = neg / pos if pos != 0 else 1.0
print(f"\nTraining negatives: {neg}, positives: {pos}")
print(f"Computed scale_pos_weight (neg/pos): {scale_pos_weight:.6f}")


In [33]:
train_full = pd.concat([y_train, X_train], axis=1)
test_full = pd.concat([y_test, X_test], axis=1)

In [None]:
train_full.to_csv("usedcars_xgb_train.csv", header=False, index=False)
test_full.to_csv("usedcars_xgb_test.csv", header=False, index=False)

print("Saved files:")
!ls -lh usedcars_xgb_train.csv usedcars_xgb_test.csv

In [None]:
import os, glob, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split

train_patterns = ["usedcars_xgb_train.csv"]
test_patterns  = ["usedcars_xgb_test.csv"]

def find_first(patterns):
    files = []
    for p in patterns:
        files += glob.glob(p)
    files = sorted(list(dict.fromkeys(files)))
    return files

train_files = find_first(train_patterns)
test_files  = find_first(test_patterns)
print("Train candidates:", train_files)
print("Test candidates: ", test_files)
if not train_files or not test_files:
    raise FileNotFoundError("No train/test CSVs found in working dir. Rename or place them here. Patterns used: train_patterns/test_patterns")

train_path = train_files[0]
test_path  = test_files[0]
print("Using files:", train_path, test_path)

def read_detect(path):
    df0 = pd.read_csv(path, header=None)
    col0 = pd.to_numeric(df0.iloc[:,0], errors='coerce')
    frac_binary = ((col0==0) | (col0==1)).sum() / float(len(col0))
    if col0.notna().all() and frac_binary >= 0.9:
        ncols = df0.shape[1]
        cols = ['accident_label'] + [f'feat_{i}' for i in range(1,ncols)]
        df0.columns = cols
        return df0, 'sagemaker_no_header'
    dfh = pd.read_csv(path, header=0)
    # find label-like column
    lcands = [c for c in dfh.columns if c.lower() in ('accident_label','accident','label','target','y')]
    if lcands:
        dfh = dfh.rename(columns={lcands[0]:'accident_label'})
        return dfh, 'header_with_label'
    # try infer first column numeric 0/1
    col0h = pd.to_numeric(dfh.iloc[:,0], errors='coerce')
    if col0h.notna().all() and set(col0h.unique()).issubset({0.0,1.0}):
        dfh = dfh.rename(columns={dfh.columns[0]:'accident_label'})
        return dfh, 'header_inferred_label'
    return dfh, 'unknown'

train_df_raw, train_type = read_detect(train_path)
test_df_raw,  test_type  = read_detect(test_path)
print("Train type:", train_type, "shape:", train_df_raw.shape)
print("Test  type:", test_type,  "shape:", test_df_raw.shape)

if train_type=='unknown' or test_type=='unknown':
    print("Train head:\n", train_df_raw.head().to_string())
    print("Test  head:\n", test_df_raw.head().to_string())
    raise RuntimeError("Could not auto-detect label column. Ensure label column is named 'accident_label' or files are SageMaker-style (label first, no header).")

# ensure label numeric and available
for name, df in (('train',train_df_raw),('test',test_df_raw)):
    if 'accident_label' not in df.columns:
        raise RuntimeError(f"accident_label missing in {name}")
    df['accident_label'] = pd.to_numeric(df['accident_label'], errors='coerce').astype('Int64')
    if df['accident_label'].isna().any():
        raise RuntimeError(f"Some labels in {name} could not be converted to numeric 0/1")

# choose common numeric features between train and test
def numeric_cols(df):
    cols = [c for c in df.columns if c!='accident_label']
    num = []
    for c in cols:
        s = pd.to_numeric(df[c], errors='coerce')
        if s.notna().sum() > 0: num.append(c)
    return num

train_num = numeric_cols(train_df_raw)
test_num  = numeric_cols(test_df_raw)
common = [c for c in train_num if c in test_num]
if not common:
    # fallback to positional features if headerless sage-maker format
    common = [c for c in train_df_raw.columns if c!='accident_label']
print("Using", len(common), "common numeric features:", common[:10])

train_df = train_df_raw[['accident_label']+common].copy()
test_df  = test_df_raw[['accident_label']+common].copy()
combined = pd.concat([train_df, test_df], ignore_index=True)
print("Combined shape:", combined.shape)
display(combined.head())

# stratified split 70/15/15
X = combined.drop(columns=['accident_label']).copy()
y = combined['accident_label'].astype(int).copy()

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

print("\nSplit sizes:")
print(" X_train:", X_train.shape, "y_train:", y_train.value_counts().to_dict())
print(" X_val:  ", X_val.shape,   "y_val:  ", y_val.value_counts().to_dict())
print(" X_test: ", X_test.shape,  "y_test: ", y_test.value_counts().to_dict())

# compute scale_pos_weight
neg = int((y_train==0).sum()); pos = int((y_train==1).sum())
scale_pos_weight = neg/pos if pos!=0 else 1.0
print(f"\nscale_pos_weight (neg/pos) computed on training set: {scale_pos_weight:.6f} (neg={neg}, pos={pos})")

# Save SageMaker-style CSVs (label first, no header)
def save_sm(Xdf, ydf, fname):
    arr = np.hstack([ydf.values.reshape(-1,1), Xdf.values])
    pd.DataFrame(arr).to_csv(fname, index=False, header=False)

out_train = "usedcars_xgb_train_resplit.csv"
out_val   = "usedcars_xgb_val_resplit.csv"
out_test  = "usedcars_xgb_test_resplit.csv"
save_sm(X_train, y_train, out_train)
save_sm(X_val,   y_val,   out_val)
save_sm(X_test,  y_test,  out_test)
print("\nSaved files:")
!ls -lh usedcars_xgb_*resplit.csv

In [4]:
test_with_label = 'usedcars_xgb_test_resplit.csv'   
test_features_only = 'usedcars_xgb_test_features.csv'

# Load and drop first column (label)
df = pd.read_csv(test_with_label, header=None)
# Drop column 0 (label), save features only as CSV (no header, no index)
df.iloc[:, 1:].to_csv(test_features_only, index=False, header=False)
print("Saved features-only file:", test_features_only)

Saved features-only file: usedcars_xgb_test_features.csv


In [8]:
model_tar_path = "model.tar.gz"                     
extracted_model_dir = "model_extracted"             
test_csv_path = "usedcars_xgb_test_resplit.csv"

In [1]:
import os, tarfile, glob
import pandas as pd, numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, precision_recall_fscore_support

In [5]:
if model_tar_path and os.path.exists(model_tar_path):
    os.makedirs(extracted_model_dir, exist_ok=True)
    with tarfile.open(model_tar_path, "r:gz") as tar:
        tar.extractall(path=extracted_model_dir)

In [6]:
candidates = []
for root, dirs, files in os.walk(extracted_model_dir):
    for f in files:
        lf = f.lower()
        if lf.startswith("xgboost") or lf.endswith(".model") or lf.endswith(".bin") or lf.endswith(".bst") or lf.endswith(".json"):
            candidates.append(os.path.join(root, f))
if not candidates:
    candidates = [os.path.join(extracted_model_dir, f) for f in os.listdir(extracted_model_dir) if os.path.isfile(os.path.join(extracted_model_dir, f))]
if not candidates:
    raise FileNotFoundError(f"No model file found under {extracted_model_dir}. Files: {os.listdir(extracted_model_dir)}")
model_file = candidates[0]
print("Using model file:", model_file)

Using model file: model_extracted/xgboost-model


In [7]:
bst = xgb.Booster()
bst.load_model(model_file)

In [15]:
def load_test_csv(path):
    df_try = pd.read_csv(path, header=None)
    col0 = pd.to_numeric(df_try.iloc[:,0], errors='coerce')
    frac_binary = ((col0==0) | (col0==1)).sum() / float(len(col0))
    if col0.notna().all() and frac_binary >= 0.9:
        df_try.columns = ['accident_label'] + [f'feat_{i}' for i in range(1, df_try.shape[1])]
        return df_try
    df_h = pd.read_csv(path, header=0)
    label_candidates = [c for c in df_h.columns if c.lower() in ('accident_label','accident','label','target','y')]
    if label_candidates:
        df_h = df_h.rename(columns={label_candidates[0]: 'accident_label'})
        return df_h
    col0h = pd.to_numeric(df_h.iloc[:,0], errors='coerce')
    if col0h.notna().all() and set(col0h.unique()).issubset({0.0,1.0}):
        df_h = df_h.rename(columns={df_h.columns[0]:'accident_label'})
        return df_h
    raise RuntimeError("Could not detect label column in test CSV.")

df_test = load_test_csv(test_csv_path)
df_test['accident_label'] = pd.to_numeric(df_test['accident_label'], errors='coerce').astype(int)
y_test = df_test.iloc[:,0].values
X_test = df_test.iloc[:,1:].copy()

In [14]:
import pandas as pd, numpy as np
threshold = 0.3471
preds = (probs >= threshold).astype(int)

# save predictions with probs and true labels
res = pd.DataFrame({'prob_pos': probs, 'pred': preds, 'label': y_test})
res.to_csv('predictions_thresh_0.3471.csv', index=False)
print("Saved predictions_thresh_0.3471.csv")

# compute expected alerts per 1000
prop_pos = preds.mean()
print(f"Predicted positives: {preds.sum()} / {len(preds)} ({prop_pos:.3%})")
print(f"Expected alerts per 1000 samples: {prop_pos*1000:.1f}")


Saved predictions_thresh_0.3471.csv
Predicted positives: 340 / 585 (58.120%)
Expected alerts per 1000 samples: 581.2


In [21]:
# Convert to DMatrix for XGBoost
dtest = xgb.DMatrix(X_test, label=y_test)

# Predict probabilities
y_proba = bst.predict(dtest)


y_pred = (y_proba >= 0.3471).astype(int)

# Metrics
auc = roc_auc_score(y_test, y_proba)
print("Test AUC:", auc)

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

print("\nClassification report:")
print(classification_report(y_test, y_pred))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))


Test AUC: 0.701403921083555
Precision: 0.3500, Recall: 0.8041, F1: 0.4877

Classification report:
              precision    recall  f1-score   support

           0       0.88      0.49      0.63       437
           1       0.35      0.80      0.49       148

    accuracy                           0.57       585
   macro avg       0.62      0.65      0.56       585
weighted avg       0.75      0.57      0.60       585

Confusion matrix:
[[216 221]
 [ 29 119]]
