In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# --- 1. Load and Explore the Data ---
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print("Train Data Info:")
print(train.info())
print("\nTest Data Info:")
print(test.info())

# --- 2. Feature Engineering and Preprocessing ---
# Identify categorical and numerical columns
categorical_cols = train.select_dtypes(include=['object']).columns
numerical_cols = train.select_dtypes(include=np.number).columns

# Handle missing values
train = train.fillna(train.median(numeric_only=True))
test = test.fillna(test.median(numeric_only=True))

for col in categorical_cols:
    train[col].fillna(train[col].mode()[0], inplace=True)
    test[col].fillna(test[col].mode()[0], inplace=True)

# Pipeline components for preprocessing
label_encoders = {}
categorical_transformer = Pipeline(steps=[
    ('label_encoding', 'passthrough')  # Label encoding happens dynamically below
])

numerical_transformer = Pipeline(steps=[
    ('scaling', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Apply LabelEncoder dynamically for categorical columns
for col in categorical_cols:
    if col != 'target':
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col])
        label_encoders[col] = le
        test[col] = test[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

# --- 3. Prepare Training and Validation Data ---
X_train = train.drop('target', axis=1)
y_train = train['target'].apply(lambda x: 1 if x == 'yes' else 0)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

# --- 4. Build the LightGBM Pipeline ---
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=-1,  # No depth limit
        min_data_in_leaf=20,  # Prevent overfitting
        min_gain_to_split=0.1,  # Minimum gain required to split
        random_state=42
    ))
])

# --- 5. Train the Model ---
pipeline.fit(X_train_split, y_train_split)
y_lgbm_pred = pipeline.predict(X_val_split)

# --- 6. Evaluate the Model ---
print("\n--- LightGBM Results ---")
print(f"Validation F1 Score (macro): {f1_score(y_val_split, y_lgbm_pred, average='macro')}")
print(classification_report(y_val_split, y_lgbm_pred))

# Confusion matrix visualization
cm = confusion_matrix(y_val_split, y_lgbm_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - LightGBM")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# --- 7. Hyperparameter Tuning ---
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__min_gain_to_split': [0.0, 0.1, 0.2],
    'classifier__min_data_in_leaf': [20, 50, 100]
}

grid_search = GridSearchCV(pipeline, param_grid, scoring='f1_macro', cv=3)
grid_search.fit(X_train_split, y_train_split)

print("\n--- Best Parameters for LightGBM ---")
print(grid_search.best_params_)

# --- 8. Final Predictions ---
X_test = test.copy()
submission = pd.DataFrame({'id': range(len(X_test)), 'target': pipeline.predict(X_test)})
submission.to_csv('submission_lgbm_pipeline.csv', index=False)


Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39211 entries, 0 to 39210
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   last contact date  39211 non-null  object
 1   age                39211 non-null  int64 
 2   job                38982 non-null  object
 3   marital            39211 non-null  object
 4   education          37744 non-null  object
 5   default            39211 non-null  object
 6   balance            39211 non-null  int64 
 7   housing            39211 non-null  object
 8   loan               39211 non-null  object
 9   contact            28875 non-null  object
 10  duration           39211 non-null  int64 
 11  campaign           39211 non-null  int64 
 12  pdays              39211 non-null  int64 
 13  previous           39211 non-null  int64 
 14  poutcome           9760 non-null   object
 15  target             39211 non-null  object
dtypes: int64(6), object(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mode()[0], inplace=True)


KeyError: 'target'