In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    OneHotEncoder,
    StandardScaler,
)


In [40]:
# Load the raw dataset; the relative path is resolved against the current
# working directory of the kernel.
data = pd.read_csv('data_2016.csv')


In [41]:
# Separate features from the target and hold out a stratified test set.
# Rows whose target is missing are dropped first, in a single pass on the
# full frame, so X and y are guaranteed to stay row-aligned (no label-based
# re-lookup through the index, which would duplicate rows on a non-unique index).
labelled = data.dropna(subset=['bought_highbrow_wines'])
X = labelled.drop(columns=['bought_highbrow_wines'])
y = labelled['bought_highbrow_wines']

# 80/20 split; stratify on y so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [42]:
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with coerced numerics and indicator columns.

    The input frame is never modified. Every step is skipped when the columns
    it needs are absent:

    * ``cat_*`` columns and a fixed list of ticket/discount columns are coerced
      to numeric; unparseable values become NaN (no filling happens here).
    * ``SOW_type_colr``: adds ``SOW_!`` (1 where the raw value is ``"!"``), then
      maps both missing values and ``"!"`` to ``"unknown"``.
    * ``HOUSEHOLDTYPOLOGY``: missing values and ``"!"`` become ``"unknown"``.
    * For each ``cat_*`` column a ``<col>_neg_flag`` column is appended that is
      1 where the value is negative and 0 otherwise (NaN counts as 0).

    Columns that already end in ``_neg_flag`` are not treated as spend columns,
    so running the function on its own output does not create
    ``*_neg_flag_neg_flag`` columns.

    NOTE: a second call still recomputes ``SOW_!`` from the already-normalised
    ``SOW_type_colr`` and therefore resets that flag to 0.
    """
    df = df.copy()

    flag_suffix = "_neg_flag"
    # Raw "cat_" columns only; indicators produced by an earlier call are excluded.
    cat_cols = [
        c for c in df.columns
        if c.startswith("cat_") and not c.endswith(flag_suffix)
    ]

    # numeric coercion (NO filling here)
    extra_numeric = [
        "rev_ticket", "prod_ticket", "n_cogo", "cogo_rev",
        "total_discount", "price_sens_colr", "SOW_colr"
    ]
    num_cols = cat_cols + [c for c in extra_numeric if c in df.columns]

    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # SOW normalization + flag (the flag must be taken before "!" is replaced)
    if "SOW_type_colr" in df.columns:
        df["SOW_!"] = (df["SOW_type_colr"] == "!").astype(int)
        df["SOW_type_colr"] = (
            df["SOW_type_colr"]
            .fillna("unknown")
            .replace("!", "unknown")
        )

    # Household normalization
    if "HOUSEHOLDTYPOLOGY" in df.columns:
        df["HOUSEHOLDTYPOLOGY"] = (
            df["HOUSEHOLDTYPOLOGY"]
            .fillna("unknown")
            .replace("!", "unknown")
        )

    # Negative value flags: built for all columns at once and attached with a
    # single concat, instead of inserting one column at a time (which
    # fragments wide frames and triggers pandas' PerformanceWarning).
    if cat_cols:
        neg_flags = (df[cat_cols] < 0).astype(int).add_suffix(flag_suffix)
        df = pd.concat(
            [df.drop(columns=neg_flags.columns, errors="ignore"), neg_flags],
            axis=1,
        )

    return df


In [43]:
from sklearn.base import BaseEstimator, TransformerMixin

class ClipLog(BaseEstimator, TransformerMixin):
    """Stateless transformer: replace negative values with 0, then apply log1p.

    The input is converted to a float array; the caller's data is not modified.
    """

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        values = np.asarray(X, dtype=float)
        # np.where allocates a new array, so the input is left untouched.
        # NaN compares False against 0 and therefore passes through unchanged.
        non_negative = np.where(values < 0, 0.0, values)
        return np.log1p(non_negative)


In [None]:
# #making categories 
# # Spend / turnover-like features
# spend_cols = [col for col in X_train.columns if col.startswith("cat_")] + [
#     "cogo_rev",
#     "total_discount",
#     "rev_ticket"
# ]

# # Count-like features (can also be scaled)
# count_cols = [
#     "prod_ticket",
#     "n_cogo"
# ]

# # Other numeric features
# other_numeric_cols = [
#     "price_sens_colr",
#     "SOW_colr"
# ]

# numeric_cols = spend_cols + count_cols + other_numeric_cols

# categorical_cols = [
#     "HOUSEHOLDTYPOLOGY",
#     "SOW_type_colr"
# ]

import numpy as np
import pandas as pd

def infer_feature_groups(df: pd.DataFrame,
                          skew_thresh=1.0,
                          range_ratio_thresh=20):
    """Assign the columns of ``df`` to preprocessing groups based on dtype and values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are to be grouped.
    skew_thresh, range_ratio_thresh
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    dict
        ``"binary"``          numeric columns whose non-missing values are all
                              in {0, 1} (this includes columns with no
                              non-missing values at all).
        ``"bounded_numeric"`` remaining numeric columns whose non-missing
                              values all lie in [0, 1].
        ``"log_numeric"``     every other numeric column.
        ``"categorical"``     columns of dtype ``object`` or ``category``.

    Each numeric column lands in exactly one of the three numeric groups.
    Previously the bounded group was computed but not returned, leaving those
    columns invisible to callers.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

    binary_cols = []
    bounded_cols = []
    log_cols = []
    for c in numeric_cols:
        observed = df[c].dropna()
        vals = observed.unique()
        # ---- Binary (0/1) ----
        if len(vals) <= 2 and set(vals).issubset({0, 1}):
            binary_cols.append(c)
        # ---- Bounded [0,1] but not binary ----
        elif len(observed) and observed.min() >= 0 and observed.max() <= 1:
            bounded_cols.append(c)
        # ---- Everything else ----
        else:
            log_cols.append(c)

    return {
        "log_numeric": log_cols,
        "bounded_numeric": bounded_cols,
        "binary": binary_cols,
        "categorical": categorical_cols,
    }


In [None]:
def finalpipeline(X_train: pd.DataFrame):
    """Build the unfitted preprocessing + logistic-regression pipeline.

    Column groups are inferred from ``X_train`` via ``infer_feature_groups``
    and each group gets its own preprocessing branch:

    * log numeric     : fill missing with 0 -> clip negatives / log1p -> standardise
    * bounded numeric : fill missing with 0 -> standardise
    * binary          : fill missing with 0
    * categorical     : fill missing with "missing" -> one-hot (unknown levels ignored)

    Columns belonging to none of these groups are dropped (``remainder="drop"``).

    Previously the bounded-numeric branch was built but never registered in the
    ColumnTransformer, so every non-binary [0, 1] numeric column was silently
    dropped from the model. It is now wired in.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features; used only to decide which column goes to which branch.

    Returns
    -------
    (model, groups)
        ``model`` is the unfitted ``Pipeline`` with steps ``"preprocess"`` and
        ``"classifier"``; ``groups`` maps group name to the list of columns
        (including ``"bounded_numeric"``).
    """
    groups = infer_feature_groups(X_train)

    # Use the bounded group if infer_feature_groups reports it; otherwise
    # derive it. infer_feature_groups puts every numeric column that is neither
    # binary nor bounded into "log_numeric", so the numeric columns missing
    # from both lists are exactly the bounded ones.
    bounded_cols = groups.get("bounded_numeric")
    if bounded_cols is None:
        already_grouped = set(groups["log_numeric"]) | set(groups["binary"])
        bounded_cols = [
            c for c in X_train.select_dtypes(include=[np.number]).columns
            if c not in already_grouped
        ]
    groups = {**groups, "bounded_numeric": list(bounded_cols)}

    log_numeric_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="constant", fill_value=0)),
        ("clip_log", ClipLog()),
        ("scaler", StandardScaler())
    ])

    bounded_numeric_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", StandardScaler())
    ])

    binary_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="constant", fill_value=0))
    ])

    # The dense-output keyword differs between scikit-learn versions: try
    # `sparse_output` first and fall back to `sparse` if it is rejected.
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

    categorical_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="constant", fill_value="missing")),
        ("ohe", ohe)
    ])

    # Column transformer: one branch per feature group.
    preprocessor = ColumnTransformer(
        transformers=[
            ("log_num", log_numeric_pipeline, groups["log_numeric"]),
            ("bounded_num", bounded_numeric_pipeline, groups["bounded_numeric"]),
            ("bin", binary_pipeline, groups["binary"]),
            ("cat", categorical_pipeline, groups["categorical"]),
        ],
        remainder="drop",
        verbose_feature_names_out=False
    )

    # Model: preprocessing followed by the classifier. The step name
    # "classifier" is what hyper-parameter grids must use as prefix.
    model = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("classifier", LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            solver="lbfgs",
            random_state=42
        ))
    ])

    return model, groups


In [49]:
# Build the unfitted pipeline and the column -> group assignment from the
# training features.
gold_model, feature_groups = finalpipeline(X_train)

# Print every feature group with its size and member columns, so the
# assignment can be checked by eye.
for k, v in feature_groups.items():
    print(f"\n{k.upper()} ({len(v)}):")
    for c in v:
        print("  ", c)

# Diagnostic: show pipeline steps and a few classifier parameter keys
# (at most 30), i.e. the names that are valid in the grid below.
print('\nPipeline steps:', list(gold_model.named_steps.keys()))
print('Some classifier param keys:', [k for k in gold_model.get_params().keys() if k.startswith('classifier')][:30])

# Hyperparameter grid (note: use the step name 'classifier' not 'model')
# Use a list of dicts so we can avoid invalid solver/penalty combinations.
# Size: 4 C values x 2 class weights x 2 solvers = 16 l2 candidates,
# plus 4 x 2 x 1 = 8 l1 candidates, 24 in total.
param_grid = [
    {
        'classifier__penalty': ['l2'],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__class_weight': [None, 'balanced'],
        'classifier__solver': ['lbfgs', 'newton-cg']  # solvers compatible with l2
    },
    {
        'classifier__penalty': ['l1'],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__class_weight': [None, 'balanced'],
        'classifier__solver': ['saga']  # saga supports l1
    }
]




LOG_NUMERIC (8):
   cat_Bot_Mar_Boter
   cat_Chips
   cat_KoudeSauzen
   cat_Notengedroogdfruit_groenten
   cat_Wijn_Stillewijnen_RAYON
   rev_ticket
   prod_ticket
   SOW_colr

LINEAR_NUMERIC (44):
   masked_customer_id
   jaar
   cat_AP_STDR_PortoONLINE
   cat_AP_STDR_WhiskyONLINE
   cat_Babyluiers
   cat_Ber_Ger_DVPortioneerbaar
   cat_Ber_Ger_VersMaaltijdsalades
   cat_Bier_Genietbieren
   cat_Bloemen
   cat_BroodKorthoudbaar
   cat_ColruytMobile_Toestellen
   cat_DeegDV
   cat_EleKtroKeuken
   cat_Houtpelletskolen_briketten
   cat_Incontinentie_luiers
   cat_KaasSeizoenskazen
   cat_Kauwgum
   cat_MelkKarnemelk
   cat_Ontbijtgranen_Volwassenen
   cat_ParfumerieEHBO
   cat_Tapas
   cat_Textiel_Bedlinnen
   cat_Textiel_Herenondergoed
   cat_Textiel_Pantys
   cat_VNCBGBereidegerechten
   cat_VNCBerBurgers
   cat_VNCCharBHWildpasteien
   cat_VNCFSalades
   cat_VNCGevgeheel
   cat_VNCKalfStoofvlees
   cat_VNCLamSnedenkoteletsteak
   cat_VNCRest
   cat_VNCVarkenRest
   cat_VNCWildSteak

In [47]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# 5-fold stratified cross-validation; shuffling with a fixed seed keeps the
# fold assignment identical between runs.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)

# Exhaustive search over every candidate in `param_grid`, scored by F1 on the
# folds defined above. n_jobs=-1 runs the fits in parallel; verbose=2 logs
# each fit.
grid_search = GridSearchCV(
    estimator=gold_model,
    param_grid=param_grid,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    verbose=2
)
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# the search may be slow on the full training set - confirm the runtime
# before relying on Restart & Run All.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


KeyboardInterrupt: 

In [None]:
# Report the grid-search result, then evaluate the refitted best model on the
# held-out test set: precision / recall / F1 per class plus the raw confusion
# counts (true/false positives and negatives).
from sklearn.metrics import classification_report, confusion_matrix

print("Best CV F1 score:", grid_search.best_score_)
print("Best parameters:", grid_search.best_params_)

# GridSearchCV refits the best candidate on the whole training set by default,
# so predict() uses that final model.
y_pred = grid_search.predict(X_test)

print("\nTest-set classification report:")
print(classification_report(y_test, y_pred))

# For a binary target, ravel() on the 2x2 confusion matrix yields
# (tn, fp, fn, tp). Binary labels are assumed here, consistent with the
# scoring="f1" used for the search.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True negatives: ", tn)
print("False positives:", fp)
print("False negatives:", fn)
print("True positives: ", tp)


Fitting 5 folds for each of 16 candidates, totalling 80 fits



KeyboardInterrupt

