<a href="https://colab.research.google.com/github/xadicavadzade/binar_classification_bank/blob/main/binar_classification_bank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
# Load the submission template and the training data from the Colab
# /content directory (absolute paths; they only resolve inside Colab).
sample_submission = pd.read_csv('/content/sample_submission.csv')
train = pd.read_csv("/content/train.csv")

In [None]:
# Preview the submission template (columns: id, y).
sample_submission

Unnamed: 0,id,y
0,750000,0.5
1,750001,0.5
2,750002,0.5
3,750003,0.5
4,750004,0.5
...,...,...
249995,999995,0.5
249996,999996,0.5
249997,999997,0.5
249998,999998,0.5


In [None]:
# Preview the raw training data, including the target column y.
train

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,749995,29,services,single,secondary,no,1282,no,yes,unknown,4,jul,1006,2,-1,0,unknown,1
749996,749996,69,retired,divorced,tertiary,no,631,no,no,cellular,19,aug,87,1,-1,0,unknown,0
749997,749997,50,blue-collar,married,secondary,no,217,yes,no,cellular,17,apr,113,1,-1,0,unknown,0
749998,749998,32,technician,married,secondary,no,-274,no,no,cellular,26,aug,108,6,-1,0,unknown,0


In [None]:
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

# ----------------------------
# Feature Engineering Function
# ----------------------------
# Fixed category levels, listed in sorted order so that dummy column names and
# integer codes equal what per-call encoders produce when every level is
# present in the frame.
# NOTE(review): the levels are assumed from the UCI Bank Marketing schema this
# data appears to follow -- confirm against train.csv. A value outside a list
# is encoded as all-zero dummies (one-hot columns) or -1 (integer codes).
_ONE_HOT_LEVELS = {
    "marital": ["divorced", "married", "single"],
    "housing": ["no", "yes"],
    "loan": ["no", "yes"],
    "contact": ["cellular", "telephone", "unknown"],
    "poutcome": ["failure", "other", "success", "unknown"],
    "month": ["apr", "aug", "dec", "feb", "jan", "jul",
              "jun", "mar", "may", "nov", "oct", "sep"],
}

_JOB_LEVELS = [
    "admin.", "blue-collar", "entrepreneur", "housemaid", "management",
    "retired", "self-employed", "services", "student", "technician",
    "unemployed", "unknown",
]

# Codes follow the alphabetical order of the bin labels.
_PDAYS_BIN_CODES = {
    "1-3_months": 0,
    "3-6_months": 1,
    "6-12_months": 2,
    "more_than_1_year": 3,
    "never_contacted": 4,
    "within_1_month": 5,
}

_EDUCATION_ORDER = {"unknown": 0, "primary": 1, "secondary": 2, "tertiary": 3}


def feature_engineering(df):
    """Derive model features from a raw bank-marketing frame.

    The transformation is stateless and deterministic: the output columns and
    the integer codes depend only on each row's values, never on which
    categories happen to be present in ``df``. This matters because the
    function is called separately on training, validation and test frames;
    encoders fitted per call would assign different codes / dummy columns
    whenever a level is missing from one of those frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``age``, ``balance``, ``pdays``, ``job`` and the columns
        named in ``_ONE_HOT_LEVELS``. ``id`` is dropped when present;
        ``education`` is encoded when present. Any other column (for example
        ``y`` or ``default``) is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # drop() returns a new frame, so the assignments below never touch the input.
    df = df.drop("id", axis=1, errors='ignore')

    # Age group. The last bin is open-ended so ages above 100 land in "60+"
    # instead of becoming NaN (which, after one-hot encoding with
    # drop_first=True, would be indistinguishable from the "18-25" level).
    df["age_group"] = pd.cut(df["age"], bins=[17, 25, 40, 60, np.inf],
                             labels=["18-25", "26-40", "41-60", "60+"])

    # Balance status
    df["balance_status"] = pd.cut(df["balance"], bins=[-np.inf, 0, 500, 2000, np.inf],
                                  labels=["negative/zero", "low", "medium", "high"])

    # Pdays binning. np.select takes the first matching condition, so the
    # -1 sentinel is tested before the "<=" thresholds.
    conditions = [
        df["pdays"] == -1,
        df["pdays"] <= 30,
        df["pdays"] <= 90,
        df["pdays"] <= 180,
        df["pdays"] <= 365
    ]
    choices = ["never_contacted", "within_1_month", "1-3_months", "3-6_months", "6-12_months"]
    pdays_labels = np.select(conditions, choices, default="more_than_1_year")
    df["pdays_bin"] = (
        pd.Series(pdays_labels, index=df.index).map(_PDAYS_BIN_CODES).astype("int64")
    )

    # Low cardinality columns: pin the category levels first so get_dummies
    # always emits the same columns and always drops the same first level.
    for col, levels in _ONE_HOT_LEVELS.items():
        df[col] = pd.Categorical(df[col], categories=levels)
    low_card_cols = list(_ONE_HOT_LEVELS) + ["age_group", "balance_status"]
    df = pd.get_dummies(df, columns=low_card_cols, drop_first=True)

    # High cardinality column: stable integer code per job, -1 for unseen values.
    df["job"] = pd.Categorical(
        df["job"].astype(str), categories=_JOB_LEVELS
    ).codes.astype("int64")

    # Ordinal encoding for education; missing or unseen values count as "unknown".
    if "education" in df.columns:
        df["education"] = (
            df["education"].map(_EDUCATION_ORDER).fillna(0).astype("int64")
        )

    return df

# Wrap feature_engineering so it can serve as the first step of the sklearn
# Pipeline built in create_full_pipeline.
feature_engineering_transformer = FunctionTransformer(feature_engineering)

# ----------------------------
# Stacking Model
# ----------------------------
def create_binary_stacking_pipeline():
    """Build the unfitted stacking ensemble used as the final classifier.

    A random forest and a LightGBM classifier act as base learners; their
    ``predict_proba`` outputs, produced under a shuffled 2-fold stratified
    split, feed a class-balanced logistic-regression meta learner.

    Returns
    -------
    StackingClassifier
        The configured, not yet fitted, ensemble.
    """
    forest = RandomForestClassifier(n_estimators=50, max_depth=8, n_jobs=-1, random_state=42)
    booster = LGBMClassifier(n_estimators=100, learning_rate=0.1, n_jobs=-1, random_state=42)
    folds = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

    return StackingClassifier(
        estimators=[("rf", forest), ("lgbm", booster)],
        final_estimator=LogisticRegression(
            C=1.0, class_weight='balanced', max_iter=1000, random_state=42
        ),
        cv=folds,
        stack_method='predict_proba',
        n_jobs=-1,
    )

# ----------------------------
# Full Pipeline
# ----------------------------
def create_full_pipeline(X_train_raw):
    """Assemble the unfitted end-to-end pipeline for the raw training frame.

    ``feature_engineering`` is run once on a copy of ``X_train_raw`` purely to
    learn which engineered columns are numeric and which are object-typed;
    those column names configure the scaler / one-hot encoder. Columns of any
    other dtype are passed through untouched.

    Steps, in order: feature engineering, column-wise preprocessing,
    random-forest based feature selection (median importance threshold), and
    the stacking classifier.

    Parameters
    ----------
    X_train_raw : pandas.DataFrame
        Raw training features; a ``y`` column, if present, is ignored when
        the column lists are derived.

    Returns
    -------
    Pipeline
        The configured, not yet fitted, pipeline.
    """
    # Dry run of the feature engineering to discover the resulting columns.
    engineered = feature_engineering(X_train_raw.copy()).drop(columns=["y"], errors="ignore")

    numeric_names = list(engineered.select_dtypes(include=np.number).columns)
    object_names = list(engineered.select_dtypes(include='object').columns)

    column_steps = [
        ('num', StandardScaler(), numeric_names),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), object_names),
    ]

    return Pipeline([
        ('feature_engineering', feature_engineering_transformer),
        ('preprocessor', ColumnTransformer(transformers=column_steps, remainder='passthrough')),
        ('feature_selection', SelectFromModel(
            RandomForestClassifier(n_estimators=50, random_state=42),
            threshold='median',
        )),
        ('stacking_classifier', create_binary_stacking_pipeline()),
    ])

# ----------------------------
# Main
# ----------------------------
def main():
    """Train the stacking pipeline on a sample of train.csv and report results.

    Workflow: load ``train.csv``, draw a random sample (at most 50,000 rows),
    split it 80/20 with stratification on the target, fit the full pipeline,
    print a classification report for the hold-out part and, when
    ``test.csv`` exists, write ``submission.csv``.

    The submission uses the lower-case ``id`` column and the predicted
    probability of class 1, matching the layout of sample_submission.csv
    (columns ``id``/``y`` with fractional ``y`` values).

    Returns
    -------
    Pipeline or None
        The fitted pipeline, or ``None`` when ``train.csv`` is missing.
    """
    print("🚀 Starting Classification with Stacking Ensemble...")

    try:
        df = pd.read_csv("train.csv")
        print(f"✅ Training data loaded: {df.shape}")
    except FileNotFoundError:
        print("❌ train.csv file not found.")
        return

    X = df.drop('y', axis=1)
    y = df['y']

    # Sample the dataset for faster execution (sized for a first trial run).
    # Clamp to the number of rows so smaller files do not make sample() raise.
    sample_size = min(50000, len(X))
    X_train_small = X.sample(n=sample_size, random_state=42)
    y_train_small = y.loc[X_train_small.index]

    # Stratify so the imbalanced target keeps the same class ratio in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_small, y_train_small,
        test_size=0.2, random_state=42, stratify=y_train_small,
    )
    print(f"✅ Data split - Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")

    pipeline = create_full_pipeline(X_train)

    print("🔄 Training the pipeline...")
    pipeline.fit(X_train, y_train)
    print("✅ Model training completed!")

    y_pred = pipeline.predict(X_test)
    print("📊 Model Performance:")
    print(classification_report(y_test, y_pred))

    # Test set and submission (optional). Only the read is guarded, so a
    # failure while predicting or writing is not misreported as a missing file.
    try:
        test_df = pd.read_csv("test.csv")
    except FileNotFoundError:
        print("⚠️ test.csv not found. Skipping submission creation.")
    else:
        # Column 1 of predict_proba is the probability of class 1 (labels are 0/1).
        preds = pipeline.predict_proba(test_df)[:, 1]
        submission = pd.DataFrame({"id": test_df["id"], "y": preds})
        submission.to_csv("submission.csv", index=False)
        print("✅ submission.csv created successfully!")

    return pipeline

# ----------------------------
# Run
# ----------------------------
# Run the workflow when this cell/script is executed directly and keep the
# result in a global; main() returns None when train.csv is missing.
if __name__ == "__main__":
    pipeline = main()


🚀 Starting Classification with Stacking Ensemble...
✅ Training data loaded: (750000, 18)
✅ Data split - Train: 40000, Test: 10000
🔄 Training the pipeline...
✅ Model training completed!
📊 Model Performance:
              precision    recall  f1-score   support

           0       0.98      0.90      0.94      8804
           1       0.55      0.84      0.66      1196

    accuracy                           0.90     10000
   macro avg       0.76      0.87      0.80     10000
weighted avg       0.93      0.90      0.91     10000





✅ submission.csv created successfully!
