In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sa-hackathon/training_set_features.csv
/kaggle/input/sa-hackathon/test_set_features.csv
/kaggle/input/sa-hackathon/training_set_labels.csv
/kaggle/input/sa-hackathon/submission_format.csv


In [2]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Loading training and test set features
train_features = pd.read_csv('/kaggle/input/sa-hackathon/training_set_features.csv')
test_features = pd.read_csv('/kaggle/input/sa-hackathon/test_set_features.csv')

# Loading training set labels
train_labels = pd.read_csv('/kaggle/input/sa-hackathon/training_set_labels.csv')

# Merge training features and labels based on respondent_id
train_data = pd.merge(train_features, train_labels, on='respondent_id')

In [3]:
# Separating features and target variables
X_train = train_data.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)
y_train_xyz = train_data['xyz_vaccine']
y_train_seasonal = train_data['seasonal_vaccine']

# Preprocessing test set (remove respondent_id)
X_test = test_features.drop('respondent_id', axis=1)

In [4]:
# Encode categorical variables
X_train = pd.get_dummies(X_train)  
X_test = pd.get_dummies(X_test) 

missing_cols = set(X_train.columns) - set(X_test.columns)
for col in missing_cols:
    X_test[col] = 0
X_test = X_test[X_train.columns]

In [6]:
from sklearn.impute import SimpleImputer

# Columns with missing values
missing_cols_train = X_train.columns[X_train.isnull().any()]
missing_cols_test = X_test.columns[X_test.isnull().any()]

for col in missing_cols_train:
    if X_train[col].dtype == 'object':
        X_train[col].fillna(X_train[col].mode()[0], inplace=True)
    else:  # numerical columns
        X_train[col].fillna(X_train[col].mean(), inplace=True)

for col in missing_cols_test:
    if X_test[col].dtype == 'object':  # categorical columns
        X_test[col].fillna(X_train[col].mode()[0], inplace=True)
    else:  # numerical columns
        X_test[col].fillna(X_train[col].mean(), inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(X_train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test[col].fillna(X_train[col].mean(), inplace=True)


In [8]:
from sklearn.linear_model import LogisticRegression

model_xyz = LogisticRegression(max_iter=1000)
model_seasonal = LogisticRegression(max_iter=1000)


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_xyz = LogisticRegression(max_iter=1000)
model_seasonal = LogisticRegression(max_iter=1000)

model_xyz.fit(X_train_scaled, y_train_xyz)
model_seasonal.fit(X_train_scaled, y_train_seasonal)


In [10]:
model_xyz = LogisticRegression(max_iter=1000, solver='saga')
model_seasonal = LogisticRegression(max_iter=1000, solver='saga')


In [11]:
model_xyz = LogisticRegression(max_iter=1000, C=0.1)
model_seasonal = LogisticRegression(max_iter=1000, C=0.1)


In [12]:
# Fitting models
model_xyz.fit(X_train, y_train_xyz)
model_seasonal.fit(X_train, y_train_seasonal)

# Predicting probabilities for test set
pred_proba_xyz = model_xyz.predict_proba(X_test)[:, 1]
pred_proba_seasonal = model_seasonal.predict_proba(X_test)[:, 1]

In [13]:
from sklearn.metrics import roc_auc_score

# Predict probabilities for training set to compute ROC AUC score
train_pred_proba_xyz = model_xyz.predict_proba(X_train)[:, 1]
train_pred_proba_seasonal = model_seasonal.predict_proba(X_train)[:, 1]

# Compute ROC AUC score for training set
train_auc_xyz = roc_auc_score(y_train_xyz, train_pred_proba_xyz)
train_auc_seasonal = roc_auc_score(y_train_seasonal, train_pred_proba_seasonal)

print(f"Train ROC AUC for xyz_vaccine: {train_auc_xyz:.4f}")
print(f"Train ROC AUC for seasonal_vaccine: {train_auc_seasonal:.4f}")

# Predict probabilities for test set
pred_proba_xyz = model_xyz.predict_proba(X_test)[:, 1]
pred_proba_seasonal = model_seasonal.predict_proba(X_test)[:, 1]


Train ROC AUC for xyz_vaccine: 0.8395
Train ROC AUC for seasonal_vaccine: 0.8550


In [14]:
# Create submission DataFrame
submission_df = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': pred_proba_xyz,
    'seasonal_vaccine': pred_proba_seasonal
})

# Save submission to CSV file
submission_df.to_csv('submission.csv', index=False)

print('Submission file saved successfully.')


Submission file saved successfully.
