In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e10/sample_submission.csv
/kaggle/input/playground-series-s4e10/train.csv
/kaggle/input/playground-series-s4e10/test.csv


In [2]:
import pandas as pd

# Load the train and test datasets
train_data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# Display the first few rows of the train data
train_data.head()

# Check for missing values and data types
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the train and test datasets
train_data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# 1. Fill missing values for numeric columns with the median
numeric_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].median())

# Since 'loan_status' or target column does not exist in test data, we ignore it for the test set
test_numeric_cols = test_data.select_dtypes(include=['float64', 'int64']).columns
test_data[test_numeric_cols] = test_data[test_numeric_cols].fillna(test_data[test_numeric_cols].median())

# 2. Fill missing values for categorical columns with the mode
categorical_cols = train_data.select_dtypes(include=['object']).columns
train_data[categorical_cols] = train_data[categorical_cols].fillna(train_data[categorical_cols].mode().iloc[0])

test_categorical_cols = test_data.select_dtypes(include=['object']).columns
test_data[test_categorical_cols] = test_data[test_categorical_cols].fillna(test_data[test_categorical_cols].mode().iloc[0])

# Verify that there are no missing values remaining
print(train_data.isnull().sum())
print(test_data.isnull().sum())

# 3. Encode categorical variables using one-hot encoding
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Ensure both datasets have the same columns (align test dataset with train)
train_columns = train_data.columns.drop('loan_status')  # Exclude the target column
test_data = test_data.reindex(columns=train_columns, fill_value=0)

# 4. Split train data into features (X) and target (y)
X = train_data.drop(['loan_status'], axis=1)
y = train_data['loan_status']

# 5. Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train a Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# 7. Make predictions on the validation set
y_pred = rf_model.predict(X_val)

# 8. Evaluate the model's accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

# 9. Make predictions on the test dataset
test_predictions = rf_model.predict(test_data)

# 10. Create a submission file
submission = pd.DataFrame({
    'Id': test_data.index,  # Replace 'Id' with the actual identifier column in test.csv if available
    'loan_status': test_predictions  # Replace 'loan_status' with the correct target name for submission
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")


id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64
id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64
Validation Accuracy: 0.9510
Submission file saved as 'submission.csv'.


In [4]:
import pandas as pd

# Load the test dataset
test_data = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# Create a submission DataFrame with 'id' and 'loan_status' (set to 0.5 for all entries)
submission = pd.DataFrame({
    'id': test_data['id'],  # Use the correct identifier column 'id'
    'loan_status': 0.5  # Set the 'loan_status' to 0.5 for all rows
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")


Submission file saved as 'submission.csv'.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the train and test datasets
train_data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# 1. Fill missing values for numeric columns with the median
numeric_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].median())
test_numeric_cols = test_data.select_dtypes(include=['float64', 'int64']).columns
test_data[test_numeric_cols] = test_data[test_numeric_cols].fillna(test_data[test_numeric_cols].median())

# 2. Fill missing values for categorical columns with the mode
categorical_cols = train_data.select_dtypes(include=['object']).columns
train_data[categorical_cols] = train_data[categorical_cols].fillna(train_data[categorical_cols].mode().iloc[0])
test_categorical_cols = test_data.select_dtypes(include=['object']).columns
test_data[test_categorical_cols] = test_data[test_categorical_cols].fillna(test_data[test_categorical_cols].mode().iloc[0])

# 3. Encode categorical variables using one-hot encoding
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Ensure both datasets have the same columns (align test dataset with train)
train_columns = train_data.columns.drop('loan_status')  # Exclude the target column
test_data = test_data.reindex(columns=train_columns, fill_value=0)

# 4. Split train data into features (X) and target (y)
X = train_data.drop(['loan_status'], axis=1)
y = train_data['loan_status']

# 5. Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train a Random Forest model
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# 7. Make predictions on the validation set
y_pred = rf_model.predict(X_val)

# 8. Evaluate the model's accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

# 9. Make predictions on the test dataset
test_predictions = rf_model.predict(test_data)

# 10. Create a submission file with the predicted loan statuses
submission = pd.DataFrame({
    'id': test_data.index,  # Use the row index as the identifier
    'loan_status': test_predictions  # Predicted loan status
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")


Validation Accuracy: 0.9515
Submission file saved as 'submission.csv'.


In [6]:
import pandas as pd

# Load the test dataset
test_data = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# Make sure to use the 'id' column from the test_data as the identifier in the submission
submission = pd.DataFrame({
    'id': test_data['id'],  # Use the correct 'id' column
    'loan_status': test_predictions  # Predicted loan status from the model
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")


Submission file saved as 'submission.csv'.


In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# Load train and test datasets
train_data = pd.read_csv('/kaggle/input/playground-series-s4e10/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')

# Drop missing values and unnecessary columns for training
train_data = train_data.drop(['loan_int_rate'], axis=1)

# One-hot encoding for categorical variables
train_data = pd.get_dummies(train_data, drop_first=True)
test_data = pd.get_dummies(test_data, drop_first=True)

# Align train and test columns
train_data, test_data = train_data.align(test_data, join='left', axis=1)

# Fill missing values in the test set (if any) with 0
test_data.fillna(0, inplace=True)

# Drop the 'loan_status' column from the test data if it was added during alignment
if 'loan_status' in test_data.columns:
    test_data = test_data.drop('loan_status', axis=1)

# Split data into features and target
X = train_data.drop('loan_status', axis=1)
y = train_data['loan_status']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest model with GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print(f"Best parameters found: {grid_search.best_params_}")

# Validate the model
val_accuracy = grid_search.score(X_val, y_val)
print(f"Validation Accuracy with best parameters: {val_accuracy:.4f}")

# Predict on the test data (ensure test data does not contain 'loan_status')
test_predictions = grid_search.predict(test_data)

# Prepare the submission file with correct 'id' from the test set
submission = pd.DataFrame({
    'id': pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')['id'],  # Use 'id' column from the original test set
    'loan_status': test_predictions
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters found: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Validation Accuracy with best parameters: 0.9511
Submission file saved as 'submission.csv'.
