In [1]:
import pandas as pd
import numpy as np
from numpy import log1p
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


train_x = pd.read_csv("train_x.csv")
train_y_data = pd.read_csv("train_y.csv")
test = pd.read_csv("public_private_X.csv")
train = pd.merge(train_x, train_y_data, how='inner', on='ID')

In [2]:
x_train = train.drop(columns=['ID', "DIVISION_CODE", "RESERVABLE_INDICATOR", "PRODUCT_STATUS", 'ON_TIME_AND_COMPLETE', "PURCHASE_ORDER_DUE_DATE", "ORDER_DATE"])  
y_train = train['ON_TIME_AND_COMPLETE']
test = test.drop(columns=['ID', 'DIVISION_CODE', "PRODUCT_STATUS", 'RESERVABLE_INDICATOR', "PURCHASE_ORDER_DUE_DATE", "ORDER_DATE"])


In [3]:
categorical_vars = ["PURCHASE_ORDER_TYPE", "DUE_DATE_WEEKDAY", "ORDER_DAY_OF_WEEK", "PRODUCT_MARKET", "PRODUCT_CLASSIFICATION"]
# date_vars = ["PURCHASE_ORDER_DUE_DATE", "ORDER_DATE"]
# numerical vars includes date vars
numerical_vars = [col for col in x_train.columns if col not in categorical_vars]

In [4]:
bin_vars = ["ORDER_QUANTITY_DEVIATION", "SHIP_FROM_VENDOR", "TRANSIT_LEAD_TIME"]
log_vars = [col for col in numerical_vars if col not in bin_vars and col in [
    "GIVEN_TIME_TO_LEAD_TIME_RATIO", "AVERAGE_VENDOR_ORDER_CYCLE_DAYS", "AVERAGE_ORDER_CYCLE_DAYS",
    "AVERAGE_PRODUCT_ORDER_QUANTITY_MARKET", "AVERAGE_ORDER_CYCLE_CASES", "LEAD_TIME_TO_DISTANCE_RATIO",
    "AVERAGE_DAILY_DEMAND_CASES", "PURCHASING_LEAD_TIME", "DAYS_BETWEEN_ORDER_AND_DUE_DATE"]]
other_numerical_vars = [col for col in numerical_vars if col not in bin_vars + log_vars] + ["DISTANCE_IN_MILES"]
numeric_cols = log_vars + other_numerical_vars 

In [5]:
# def convert_date(df):
#     df_copy = df.copy()
#     date_cols = date_vars
#     for col in date_cols:
#         df_copy[col] = pd.to_datetime(df_copy[col])
#         df_copy[col] = df_copy[col].astype(int) / 10**9
#     return df_copy

# date_transformer = FunctionTransformer(convert_date, validate=False)

def log_transform(X, log_cols=None):
    X_copy = np.array(X, copy=True)
    if log_cols is None or not log_cols:
        return X_copy
    # Map log_cols to indices in numeric_cols (the columns passed to numeric_pipeline)
    col_indices = [numeric_cols.index(col) for col in log_cols if col in numeric_cols]
    if col_indices:
        X_copy[:, col_indices] = np.clip(X_copy[:, col_indices], a_min=-0.999, a_max=None)
        X_copy[:, col_indices] = np.log1p(X_copy[:, col_indices])
    return X_copy
log_transformer = FunctionTransformer(log_transform, kw_args={'log_cols': log_vars})

def bin_transform(X):
    X_copy = X.copy()
    bin_cols = [col for col in bin_vars if col in X_copy.columns]
    for col in bin_cols:
        X_copy[col] = pd.qcut(X_copy[col], q=5, labels=False, duplicates='drop')  # Quartiles
    return X_copy

bin_transformer = FunctionTransformer(bin_transform)

def numeric_interactions(X):
    X_copy = np.array(X)
    distance_idx = numeric_cols.index("DISTANCE_IN_MILES")
    demand_idx = numeric_cols.index("AVERAGE_DAILY_DEMAND_CASES")
    days_idx = numeric_cols.index("DAYS_BETWEEN_ORDER_AND_DUE_DATE")
    int1 = X_copy[:, distance_idx] * X_copy[:, demand_idx]
    int2 = X_copy[:, distance_idx] * X_copy[:, days_idx]
    int3 = X_copy[:, days_idx] * X_copy[:, demand_idx]
    return np.column_stack((X_copy, int1, int2, int3))

numeric_interaction_transformer = FunctionTransformer(numeric_interactions)

def bin_interaction(X):
    X_copy = np.array(X, copy=True)
    transit_idx = bin_vars.index("TRANSIT_LEAD_TIME")
    qty_idx = bin_vars.index("ORDER_QUANTITY_DEVIATION")
    interaction = X_copy[:, transit_idx] * X_copy[:, qty_idx]
    X_copy = np.column_stack((X_copy, interaction))
    return X_copy

bin_interaction_transformer = FunctionTransformer(bin_interaction)

def cat_interactions(X):
    X_copy = pd.DataFrame(X, columns=x_train.columns)  # Use full column names
    X_copy = X_copy[categorical_vars]  # Subset to categorical_vars
    X_copy["ORDER_TYPE_MARKET"] = X_copy["PURCHASE_ORDER_TYPE"].astype(str) + "_" + X_copy["PRODUCT_MARKET"].astype(str)
    return X_copy

cat_interaction_transformer = FunctionTransformer(cat_interactions)

# def add_interactions(X):
#     X_copy = np.array(X, copy=True)
#     lead_time_idx = numeric_cols.index("PURCHASING_LEAD_TIME")
#     distance_idx = numeric_cols.index("DISTANCE_IN_MILES")
#     buffer_idx = numeric_cols.index("DAYS_BETWEEN_ORDER_AND_DUE_DATE")
#     demand_idx = numeric_cols.index("AVERAGE_DAILY_DEMAND_CASES")
#     transit_idx = numeric_cols.index("TRANSIT_LEAD_TIME")
#     # Add interactions
#     interaction1 = X_copy[:, lead_time_idx] * X_copy[:, distance_idx]  # Lead time * Distance
#     interaction2 = X_copy[:, buffer_idx] - X_copy[:, lead_time_idx]    # Buffer time
#     interaction3 = X_copy[:, demand_idx] * X_copy[:, transit_idx]      # Demand * Transit
#     X_copy = np.column_stack((X_copy, interaction1, interaction2, interaction3))
#     return X_copy
# interaction_transformer = FunctionTransformer(add_interactions)



In [6]:
numeric_pipeline = Pipeline([
    # ("date_to_numeric", date_transformer),
    ("imputer", SimpleImputer(strategy="median")),
    ("log_transform", log_transformer),
    ("numerical_interaction", numeric_interaction_transformer),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
])

bin_pipeline = Pipeline(steps=[
    ('bin_transform', bin_transformer),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('bin_interaction', bin_interaction_transformer),
    ('scaler', StandardScaler(with_mean=False))
])

# Categorical pipeline
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('scaler', StandardScaler(with_mean=False))
])

In [7]:
print(f"Numeric cols: {len(numeric_cols)}")
print(f"Bin cols: {len(bin_vars)}")
print(f"Cat cols: {len(categorical_vars)}")

preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_pipeline, log_vars + other_numerical_vars),
    ('bin', bin_pipeline, bin_vars),
    ('categorical', cat_pipeline, categorical_vars)
])

Numeric cols: 15
Bin cols: 3
Cat cols: 5


In [8]:
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=250)),
    ('model', LogisticRegression(
        C=100000,
        max_iter=20000,
        solver='lbfgs',
        penalty='l2',
        random_state=42
    ))
])

full_pipeline.fit(x_train, y_train)

  f = msb / msw


In [9]:
# Get probabilities
train_accuracy = full_pipeline.score(x_train, y_train)
y_pred_train_proba = full_pipeline.predict_proba(x_train)[:, 1]
y_pred_test_proba = full_pipeline.predict_proba(test)[:, 1]

# Tune threshold
thresholds = np.arange(0.51, 0.91, 0.01)
best_threshold = 0.51
best_accuracy = train_accuracy
for threshold in thresholds:
    y_pred_train_adjusted = (y_pred_train_proba >= threshold).astype(int)
    accuracy = np.mean(y_pred_train_adjusted == y_train)
    if accuracy >= best_accuracy:
        best_accuracy = accuracy
        best_threshold = threshold

# Final predictions
y_pred_train = (y_pred_train_proba >= best_threshold).astype(int)
y_pred_test = (y_pred_test_proba >= best_threshold).astype(int)

print(f"Training Accuracy (default 0.5): {train_accuracy:.4f}")
print(f"Optimal Threshold: {best_threshold:.2f}")
print(f"Training Accuracy (optimal): {best_accuracy:.4f}")
print(f"Number of zeros in y_train: {np.sum(y_train == 0)}")
print(f"Number of zeros in y_pred_train (optimal): {np.sum(y_pred_train == 0)}")
print(f"Number of zeros in y_pred_test (optimal): {np.sum(y_pred_test == 0)}")

Training Accuracy (default 0.5): 0.7759
Optimal Threshold: 0.54
Training Accuracy (optimal): 0.7790
Number of zeros in y_train: 10434
Number of zeros in y_pred_train (optimal): 11790
Number of zeros in y_pred_test (optimal): 5317


In [11]:
original_test = pd.read_csv("public_private_X.csv")
submission = pd.DataFrame({"ID": original_test["ID"], "ON_TIME_AND_COMPLETE": y_pred_test})
submission.to_csv("model_test.csv", index=False)