In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE  # For handling class imbalance
from scipy.stats import randint, uniform

# Load the data
train_path = "/content/train.csv"  # Update with your path
test_path = "/content/test.csv"      # Update with your path
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

target_column = 'Target'
id_column = 'id'
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

if target_column in numerical_cols:
    numerical_cols.remove(target_column)
if id_column in numerical_cols:
    numerical_cols.remove(id_column)

# Encode categorical variables
encoder = LabelEncoder()
for col in categorical_cols:
    train_df[col] = encoder.fit_transform(train_df[col])
    if col in test_df.columns:
        test_df[col] = encoder.transform(test_df[col])

# Scale numerical features
scaler = StandardScaler()
train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
if all(col in test_df.columns for col in numerical_cols):
    test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

# Prepare training and validation sets
X = train_df.drop(columns=[target_column, id_column])
y = train_df[target_column]

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Set parameters for XGBoost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'verbosity': 0
}

# Create XGBoost classifier
xgb_model = xgb.XGBClassifier(**params)

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 15),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.5, 1.0),
    'colsample_bytree': uniform(0.5, 1.0),
}

# Perform Randomized Search
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist,
                                   n_iter=100, scoring='accuracy', cv=3, verbose=2, n_jobs=-1, random_state=42)

print("Starting Randomized Search...")
random_search.fit(X_train, y_train)

# Best parameters from Randomized Search
print("Best parameters found: ", random_search.best_params_)

# Train the model with the best parameters
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

# Validate the model
y_val_pred = best_model.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy with XGBoost: ", validation_accuracy)
print(classification_report(y_val, y_val_pred))

# If you want to make predictions on the test set
X_test = test_df.drop(columns=[id_column])
y_test_pred = best_model.predict(X_test)

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    id_column: test_df[id_column],
    target_column: y_test_pred
})

# Save the submission DataFrame to a CSV file
submission_file_path = "/content/final_submission.csv"
submission_df.to_csv(submission_file_path, index=False)

print(f"Submission file created: {submission_file_path}")

Starting Randomized Search...
Fitting 3 folds for each of 100 candidates, totalling 300 fits


216 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
                    ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^

Best parameters found:  {'colsample_bytree': np.float64(0.8998609717152555), 'learning_rate': np.float64(0.023999698964084628), 'max_depth': 14, 'n_estimators': 882, 'subsample': np.float64(0.9560699842170359)}
Validation Accuracy with XGBoost:  0.9523297491039426
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      5571
           1       0.95      0.96      0.95      5589

    accuracy                           0.95     11160
   macro avg       0.95      0.95      0.95     11160
weighted avg       0.95      0.95      0.95     11160

Submission file created: /content/final_submission.csv


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from scipy.stats import randint, uniform

# Load the data
train_path = "/content/train.csv"  # Update with your path
test_path = "/content/test.csv"      # Update with your path
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

target_column = 'Target'
id_column = 'id'
categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

if target_column in numerical_cols:
    numerical_cols.remove(target_column)
if id_column in numerical_cols:
    numerical_cols.remove(id_column)

# Encode categorical variables
encoder = LabelEncoder()
for col in categorical_cols:
    train_df[col] = encoder.fit_transform(train_df[col])
    if col in test_df.columns:
        test_df[col] = encoder.transform(test_df[col])

# Scale numerical features
scaler = StandardScaler()
train_df[numerical_cols] = scaler.fit_transform(train_df[numerical_cols])
if all(col in test_df.columns for col in numerical_cols):
    test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

# Prepare training and validation sets
X = train_df.drop(columns=[target_column, id_column])
y = train_df[target_column]

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Stratified split for better representation of classes
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Set parameters for XGBoost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'verbosity': 0,
    'scale_pos_weight': len(y) / sum(y)  # Adjust for class imbalance
}

# Create XGBoost classifier
xgb_model = xgb.XGBClassifier(**params)

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(300, 1500),  # Increased range for more trees
    'max_depth': randint(3, 20),          # Increased depth for more complex trees
    'learning_rate': uniform(0.01, 0.3),  # Learning rate
    'subsample': uniform(0.5, 1.0),       # Subsample ratio (0.5 to 1.0)
    'colsample_bytree': uniform(0.5, 1.0), # Column sample ratio (0.5 to 1.0)
    'gamma': uniform(0, 5),               # Minimum loss reduction required to make a further partition
    'reg_alpha': uniform(0, 1),           # L1 regularization term
    'reg_lambda': uniform(0, 1)            # L2 regularization term
}

# Perform Randomized Search with Stratified K-Folds
skf = StratifiedKFold(n_splits=2)  # Increased folds for better validation
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist,
                                   n_iter=200, scoring='roc_auc', cv=skf, verbose=2, n_jobs=-1, random_state=42)

print("Starting Randomized Search...")
random_search.fit(X_train, y_train)

# Best parameters from Randomized Search
print("Best parameters found: ", random_search.best_params_)

# Train the model with the best parameters
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

# Validate the model
y_val_pred = best_model.predict(X_val)
validation_accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, best_model.predict_proba(X_val)[:, 1])  # AUC score for better evaluation
print("Validation Accuracy with XGBoost: ", validation_accuracy)
print("ROC AUC Score: ", roc_auc)
print(classification_report(y_val, y_val_pred))

# If you want to make predictions on the test set
X_test = test_df.drop(columns=[id_column])
y_test_pred = best_model.predict(X_test)

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    id_column: test_df[id_column],
    target_column: y_test_pred
})

# Save the submission DataFrame to a CSV file
submission_file_path = "/content/final_submission.csv"
submission_df.to_csv(submission_file_path, index=False)

print(f"Submission file created: {submission_file_path}")

Starting Randomized Search...
Fitting 2 folds for each of 200 candidates, totalling 400 fits


312 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/xgboost/sklearn.py", line 1599, in fit
    self._Booster = train(
                    ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^

Best parameters found:  {'colsample_bytree': np.float64(0.5094854336284981), 'gamma': np.float64(0.26341917988614294), 'learning_rate': np.float64(0.03656302318408544), 'max_depth': 14, 'n_estimators': 996, 'reg_alpha': np.float64(0.5807572072273277), 'reg_lambda': np.float64(0.2716869710690041), 'subsample': np.float64(0.8982873839756356)}
Validation Accuracy with XGBoost:  0.942921146953405
ROC AUC Score:  0.9916291221849667
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      5580
           1       0.92      0.98      0.94      5580

    accuracy                           0.94     11160
   macro avg       0.94      0.94      0.94     11160
weighted avg       0.94      0.94      0.94     11160

Submission file created: /content/final_submission.csv
