In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e11/sample_submission.csv
/kaggle/input/playground-series-s4e11/train.csv
/kaggle/input/playground-series-s4e11/test.csv


# **1. Import Necessary Libraries**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [3]:
# Read the competition's train and test CSV files from the Kaggle input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e11/train.csv",
        "/kaggle/input/playground-series-s4e11/test.csv",
    )
)

# Separate the 'Depression' target from the remaining columns.
y = df_train['Depression']
X = df_train.drop(columns='Depression')

# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column groups come from the dtypes of the training split: int64/float64
# columns are handled as numerical, object columns as categorical.
numerical_columns = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_columns = list(X_train.select_dtypes(include=['object']).columns)

# Numerical branch: fill missing values with the column mean, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numerical_preprocessor = Pipeline(steps=numerical_steps)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' lets transform() accept categories
# that were not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_preprocessor = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns),
    ]
)

# Fit on the training split only, then apply the fitted transformation to the
# hold-out split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)




In [5]:
from sklearn.ensemble import IsolationForest

# Detect outliers in the training split with an Isolation Forest and drop them.
#
# The inputs are rebuilt from X_train, y and the already-fitted preprocessor on
# every run instead of filtering X_train_preprocessed / y_train in place.
# Filtering the previous result would remove a further share of rows each time
# this cell is re-executed, and re-running the preprocessing cell in between
# would leave the feature matrix and the labels with different lengths.
X_train_all_rows = preprocessor.transform(X_train)
y_train_all_rows = y.loc[X_train.index]  # labels for exactly the rows of X_train

# contamination=0.01 sets the expected proportion of outliers to 1%;
# random_state makes the forest reproducible.
isolation_forest = IsolationForest(contamination=0.01, random_state=42)
outlier_labels = isolation_forest.fit_predict(X_train_all_rows)

# Rows labelled -1 are the detected outliers; keep every other row, applying
# the same mask to the features and to the labels so they stay aligned.
non_outliers_mask = outlier_labels != -1
X_train_preprocessed = X_train_all_rows[non_outliers_mask]
y_train = y_train_all_rows[non_outliers_mask]




In [6]:
from xgboost import XGBClassifier

# Hyperparameters for the XGBoost classifier. The original author describes
# them as the best values from an Optuna search; that search is not part of
# this notebook, so they are hard-coded here.
best_params = {
    'colsample_bytree': 0.3431507276242327,
    'learning_rate': 0.21386105990207085,
    'max_depth': 4,
    'min_child_weight': 7,
    'n_estimators': 738,
    'subsample': 0.7533763963831641,
    'gamma': 4.540114884762189,
    'reg_lambda': 5.059981920152652
}

# `use_label_encoder` is intentionally not passed: the argument is deprecated
# in XGBoost and, on releases where it has been removed, only triggers an
# "unused parameter" warning without affecting the model.
model = XGBClassifier(**best_params, random_state=42)

# Train on the preprocessed training features and their labels.
model.fit(X_train_preprocessed, y_train)


In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict on the test data
y_pred = model.predict(X_test_preprocessed)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.93909026297086
Precision: 0.8416765991259436
Recall: 0.8220799379123012
F1 Score: 0.831762858264625


In [8]:
# Build the competition submission file from the unlabelled test set.

# Reuse the test frame that was loaded together with the training data
# instead of reading the same CSV a second time.
test_data = df_test

# The transformed submission features get their own name so that
# X_test_preprocessed (the hold-out matrix that pairs with y_test) is not
# overwritten; clobbering it would make a later re-run of the evaluation cell
# compare y_test against predictions for a different dataset.
X_submission_preprocessed = preprocessor.transform(test_data)

# Generate predictions with the trained model.
y_pred_test = model.predict(X_submission_preprocessed)

# Fail loudly if predictions and rows ever disagree, rather than printing a
# message and finishing the cell without writing a submission file.
if len(y_pred_test) != len(test_data):
    raise ValueError("Error: The lengths of predictions and test data do not match.")

submission_df = pd.DataFrame({
    'id': test_data['id'],      # row identifiers taken from the test file
    'Depression': y_pred_test,  # model predictions
})

# Save the submission to the working directory, without the DataFrame index.
submission_df.to_csv('submission.csv', index=False)

# Show the first 5 rows as a quick sanity check.
print(submission_df.head())



       id  Depression
0  140700           0
1  140701           0
2  140702           0
3  140703           1
4  140704           0
