In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report
import joblib

In [5]:
# Feature engineering function
def feature_engineering(df):
    df['age_group'] = pd.cut(df['age'], bins=[0, 30, 50, 100], labels=['<30', '30-50', '>50'])
    df['balance_bin'] = pd.cut(df['balance'], bins=[-float('inf'), 0, 1000, 5000, float('inf')],
                               labels=['negative', 'low', 'medium', 'high'])
    df['balance_duration_ratio'] = df['balance'] / (df['duration'] + 1)
    df['age_balance_interaction'] = df['age'] * df['balance']
    df['previous_contacted'] = (df['pdays'] != -1).astype(int)
    df['engagement_rate'] = df['previous'] / (df['campaign'] + 1)
    return df


# Load the train and test datasets
train_file_path = "cleaned_file.csv"
test_file_path = "test.csv"
train_data = pd.read_csv("/content/train (1).csv")
test_data = pd.read_csv("/content/test.csv")

# Apply feature engineering
train_data = feature_engineering(train_data)
test_data = feature_engineering(test_data)

# Define target and features
target = 'y'
features = [col for col in train_data.columns if col != target]

# Separate numeric and categorical features
numeric_features = train_data[features].select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_data[features].select_dtypes(include=['object', 'category']).columns

# Define preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define the model with class weights
model = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced')

# Combine preprocessing and model in a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Split the dataset
X = train_data[features]
y = train_data[target]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define hyperparameter grid
param_grid = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

# Perform GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,  # Reduced folds for faster runtime
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

# Calculate the F1 score and display the classification report
f1 = f1_score(y_val, y_val_pred, pos_label='yes')
print(classification_report(y_val, y_val_pred, target_names=['no', 'yes']))

# Save the best model
joblib.dump(best_model, 'best_model.pkl')
print("Best model saved as best_model.pkl")

# Predict on the test dataset
X_test = test_data[features]
test_predictions = best_model.predict(X_test)
test_data['y'] = test_predictions

Fitting 3 folds for each of 24 candidates, totalling 72 fits


 nan nan nan nan nan nan]


              precision    recall  f1-score   support

          no       0.98      0.89      0.93      7421
         yes       0.36      0.79      0.50       579

    accuracy                           0.88      8000
   macro avg       0.67      0.84      0.72      8000
weighted avg       0.94      0.88      0.90      8000

Best model saved as best_model.pkl


In [6]:
# Save predictions to a CSV file
test_data.to_csv('Final 2775.csv', index=False)
print("Predictions saved to Final 2775.csv")

Predictions saved to Final 2775.csv
