In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, auc
from sklearn.feature_selection import RFE

# Load the datasets
train_features = pd.read_csv(r"C:\Users\99846\OneDrive\Desktop\training_set_features.csv")
train_labels = pd.read_csv(r"C:\Users\99846\OneDrive\Desktop\training_set_labels.csv")
test_features = pd.read_csv(r"C:\Users\99846\Downloads\test_set_features.csv")

# Save respondent IDs for submission
test_respondent_ids = test_features['respondent_id']

# Drop 'respondent_id' column
train_features.drop('respondent_id', axis=1, inplace=True)
train_labels.drop('respondent_id', axis=1, inplace=True)
test_features.drop('respondent_id', axis=1, inplace=True)

# Identify categorical and numerical columns
cat_cols = [col for col in train_features.columns if train_features[col].dtype == 'O']
num_cols = [col for col in train_features.columns if col not in cat_cols]

# Impute numerical columns using KNNImputer
df_knn_imputer = train_features.copy()
knn_imputer = KNNImputer(n_neighbors=7)
df_knn_imputer[num_cols] = knn_imputer.fit_transform(df_knn_imputer[num_cols])

# Function to encode categorical columns with missing values
def encode_missing_cols(df, col):
    le = LabelEncoder()
    unique_without_nan = pd.Series([i for i in df[col].unique() if type(i) == str])
    le.fit(unique_without_nan)
    df[col] = df[col].apply(lambda x: le.transform([x])[0] if type(x) == str else x)

# Apply the encoding to categorical columns
for col in cat_cols:
    encode_missing_cols(df_knn_imputer, col)

X_imputed = df_knn_imputer.copy()

# Function for predictive imputation
def predictive_imputation(df, column):
    df = df.copy()
    train_df = df[df[column].notna()]
    test_df = df[df[column].isna()]
    
    model = RandomForestClassifier()
    model.fit(train_df.drop(column, axis=1), train_df[column])
    
    df.loc[df[column].isna(), column] = model.predict(test_df.drop(column, axis=1))
    return df

# Apply predictive imputation
for col in ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'employment_industry', 'employment_occupation']:
    X_imputed = predictive_imputation(X_imputed, col)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_imputed, train_labels, test_size=0.2, random_state=42)

# Initialize the base model
base_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Recursive Feature Elimination (RFE) for multi-output
rfe = RFE(estimator=base_model, n_features_to_select=20)
rfe.fit(X_train, y_train)

# Selected features
selected_features = X_imputed.columns[rfe.support_]
print("Selected Features:\n", selected_features)

# Reduce features
X_train_rfe = X_train[selected_features]
X_test_rfe = X_test[selected_features]

# Train with reduced features using MultiOutputClassifier
model_rfe = MultiOutputClassifier(base_model, n_jobs=-1)
model_rfe.fit(X_train_rfe, y_train)

# Evaluate
y_pred = model_rfe.predict(X_test_rfe)
print("Accuracy with RFE:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

# Calculate ROC AUC for internal validation
y_prob = model_rfe.predict_proba(X_test_rfe)

roc_auc_scores = []
for i, label in enumerate(y_train.columns):
    roc_auc = roc_auc_score(y_test.iloc[:, i], y_prob[i][:, 1])
    roc_auc_scores.append(roc_auc)
    print(f'ROC AUC for {label}: {roc_auc:.4f}')

mean_roc_auc = np.mean(roc_auc_scores)
print(f'Mean ROC AUC: {mean_roc_auc:.4f}')

# Prepare the submission file
test_features_imputed = df_knn_imputer.iloc[:test_features.shape[0]].copy()

# Encode categorical columns in test set
for col in cat_cols:
    encode_missing_cols(test_features_imputed, col)

# Apply predictive imputation on test set
for col in ['education', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'employment_industry', 'employment_occupation']:
    test_features_imputed = predictive_imputation(test_features_imputed, col)

# Reduce features for test set
X_test_rfe_final = test_features_imputed[selected_features]

# Predict probabilities on test set
test_prob = model_rfe.predict_proba(X_test_rfe_final)

# Prepare the submission dataframe
submission_df = pd.DataFrame({
    'respondent_id': test_respondent_ids,
    'xyz_vaccine': test_prob[0][:, 1],
    'seasonal_vaccine': test_prob[1][:, 1]
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values