In [22]:
sample_submission_path = "/kaggle/input/playground-series-s5e7/sample_submission.csv"
train_path = "/kaggle/input/playground-series-s5e7/train.csv"
test_path = "/kaggle/input/playground-series-s5e7/test.csv"

In [23]:
sample_submission_path = "/home/yanncauchepin/Datasets/SupervisedLearning/kaggle_introvertsextroverts/sample_submission.csv"
train_path = "/home/yanncauchepin/Datasets/SupervisedLearning/kaggle_introvertsextroverts/train.csv"
test_path = "/home/yanncauchepin/Datasets/SupervisedLearning/kaggle_introvertsextroverts/test.csv"

In [24]:
import pandas as pd

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission_df = pd.read_csv(sample_submission_path)

In [25]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

def impute_nan_non_numeric(df):
    for col in df.columns:
        if df[col].isnull().any():
            if not pd.api.types.is_numeric_dtype(df[col]):
                   # For non-numeric columns, use mode imputation
                df[col] = df[col].fillna(df[col].mode()[0])
    return df
                
def impute_nan_numeric(df):
    for col in df.columns:
        if df[col].isnull().any():
            if pd.api.types.is_numeric_dtype(df[col]):
                print(f"Imputing missing values in column: {col}")
                missing_mask = df[col].isnull()
                df_complete = df[~missing_mask]
                df_incomplete = df[missing_mask]
                if df_complete.empty:
                    imputer = SimpleImputer(strategy='mean')
                    df[col] = imputer.fit_transform(df[[col]])
                    continue
                X_complete = df_complete.drop(col, axis=1, errors='ignore')  # Drop target, handle missing columns
                y_complete = df_complete[col]
                X_incomplete = df_incomplete.drop(col, axis=1, errors='ignore') # Drop target, handle missing columns

                # Handle missing values in features using SimpleImputer (before model training)
                # num_imputer = SimpleImputer(strategy='median')  # Or use 'mean', 'most_frequent', 'constant'
                # X_complete = num_imputer.fit_transform(X_complete)
                # X_incomplete = num_imputer.transform(X_incomplete)

                # Train the RandomForestRegressor model
                print(f"{X_incomplete} - {X_complete}")
                model = RandomForestRegressor(n_estimators=100, random_state=42)  # Adjust hyperparameters as needed
                model.fit(X_complete, y_complete)

                # Predict the missing values
                predicted_values = model.predict(X_incomplete)

                # Fill the missing values in the original DataFrame
                df.loc[missing_mask, col] = predicted_values
    return df

In [None]:
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df, label_encoders=None):
    
    df = impute_nan_non_numeric(df)
    
    if label_encoders is None:
        label_encoders = {}
    
    for col in df.select_dtypes(include=['object']).columns:
        if col in label_encoders:
            le = label_encoders[col]
            df[col] = le.transform(df[col])
        else:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le
    
    df = impute_nan_numeric(df)
    
    return df, label_encoders

In [27]:
preprocessed_train_df, label_encoders = preprocess_data(train_df)
preprocessed_test_df, _ = preprocess_data(test_df, label_encoders)

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score

X = preprocessed_train_df.drop(columns=['Personality']).values
Y = preprocessed_train_df['Personality'].values
X_test = preprocessed_test_df.values

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.15, random_state=42)

In [29]:
import xgboost as xgb

model = xgb.XGBClassifier(
    objective='binary:logistic',  # Or 'multi:softmax' for multi-class
    n_estimators=1000,             # Number of boosting rounds
    learning_rate=0.1,            # Step size shrinkage
    max_depth=5,                  # Maximum depth of a tree
    random_state=42               # Random seed for reproducibility
)

In [30]:
model.fit(
    X_train,
    Y_train,
    eval_set=[(X_val, Y_val)],  # Use test set for validation
    verbose=True  # Suppress verbose output during training
)

[0]	validation_0-logloss:0.49639
[1]	validation_0-logloss:0.43643
[2]	validation_0-logloss:0.38952
[3]	validation_0-logloss:0.35146
[4]	validation_0-logloss:0.31978
[5]	validation_0-logloss:0.29317
[6]	validation_0-logloss:0.27053
[7]	validation_0-logloss:0.25133
[8]	validation_0-logloss:0.23474
[9]	validation_0-logloss:0.22029
[10]	validation_0-logloss:0.20795
[11]	validation_0-logloss:0.19720
[12]	validation_0-logloss:0.18784
[13]	validation_0-logloss:0.17964
[14]	validation_0-logloss:0.17254
[15]	validation_0-logloss:0.16633
[16]	validation_0-logloss:0.16102
[17]	validation_0-logloss:0.15640
[18]	validation_0-logloss:0.15239
[19]	validation_0-logloss:0.14889
[20]	validation_0-logloss:0.14582
[21]	validation_0-logloss:0.14321
[22]	validation_0-logloss:0.14099
[23]	validation_0-logloss:0.13907
[24]	validation_0-logloss:0.13744
[25]	validation_0-logloss:0.13615
[26]	validation_0-logloss:0.13495
[27]	validation_0-logloss:0.13392
[28]	validation_0-logloss:0.13282
[29]	validation_0-loglos

In [31]:
accuracy_score(Y_val, model.predict(X_val))

0.9697732997481109

In [32]:
Y_pred = model.predict(X_test)
Y_pred = (Y_pred > 0.5).astype(int)

In [19]:
Y_pred = label_encoders['Personality'].inverse_transform(Y_pred.flatten())

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Personality': Y_pred.flatten()
})
submission_df.to_csv('xgboost_submission.csv', index=False)

In [21]:
submission_df

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
...,...,...
6170,24694,Extrovert
6171,24695,Introvert
6172,24696,Extrovert
6173,24697,Extrovert
