In [54]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

In [55]:
# Training feature table; it carries a `respondent_id` column that is dropped
# before modelling.
data1 = pd.read_csv(filepath_or_buffer='training_set_features.csv')

In [56]:
# Training labels; the two target columns are selected from this frame later.
data2 = pd.read_csv(filepath_or_buffer='training_set_labels.csv')

In [57]:
# Columns sent through the categorical pipeline (impute, then one-hot encode).
categorical_cols = [
    'age_group',
    'education',
    'race',
    'sex',
    'income_poverty',
    'marital_status',
    'rent_or_own',
    'employment_status',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

# Columns sent through the numerical pipeline (impute, then standardise).
numerical_cols = [
    'xyz_concern',
    'xyz_knowledge',
    'behavioral_antiviral_meds',
    'behavioral_avoidance',
    'behavioral_face_mask',
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    'doctor_recc_xyz',
    'doctor_recc_seasonal',
    'chronic_med_condition',
    'child_under_6_months',
    'health_worker',
    'health_insurance',
    'opinion_xyz_vacc_effective',
    'opinion_xyz_risk',
    'opinion_xyz_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
    'household_adults',
    'household_children',
]

In [58]:
# Numerical branch: fill gaps with the column median, then standardise.
numeric_imputer = SimpleImputer(strategy='median')
numeric_scaler = StandardScaler()
numerical_transformer = Pipeline(
    steps=[('imputer', numeric_imputer), ('scaler', numeric_scaler)]
)

In [59]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps categories that were not seen during
# fitting from raising at transform time.
category_imputer = SimpleImputer(strategy='most_frequent')
category_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_transformer = Pipeline(
    steps=[('imputer', category_imputer), ('onehot', category_encoder)]
)


In [60]:
# Route each column group through its own sub-pipeline. No `remainder` is
# given, so the ColumnTransformer default applies to any column not listed.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [61]:
# Features and labels are paired purely by row position, so confirm that both
# files describe the same respondents in the same order before pairing them.
# A silent mismatch would train every row against someone else's labels.
if len(data1) != len(data2):
    raise ValueError(
        f'features have {len(data1)} rows but labels have {len(data2)} rows'
    )
# The id comparison only runs when the labels file actually carries the id.
if 'respondent_id' in data2.columns and not np.array_equal(
    data1['respondent_id'].to_numpy(), data2['respondent_id'].to_numpy()
):
    raise ValueError('respondent_id order differs between features and labels')

# The id is an identifier, not a predictor, so it is excluded from X.
X = data1.drop(columns=['respondent_id'])
# Two target columns -> multi-output classification.
y = data2[['xyz_vaccine', 'seasonal_vaccine']]

In [62]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [63]:
# MultiOutputClassifier wraps the base estimator so that it is fitted once per
# target column. random_state pins the seed handed to the liblinear solver,
# matching the seed already used for the train/test split, so a fresh
# Restart & Run All reproduces the same fitted model.
model = MultiOutputClassifier(
    LogisticRegression(solver='liblinear', random_state=42)
)

In [64]:
# End-to-end estimator: preprocessing first, then the multi-output classifier,
# so the same transformations are applied at fit and predict time.
pipeline_steps = [('preprocessor', preprocessor), ('classifier', model)]
clf = Pipeline(steps=pipeline_steps)


In [65]:
# Fit the preprocessing steps and the classifier on the training split only.
clf.fit(X=X_train, y=y_train)

In [66]:
y_pred_prob = clf.predict_proba(X_test)

In [67]:
y_pred_prob = np.hstack([pred[:, 1].reshape(-1, 1) for pred in y_pred_prob])


In [68]:
# Score each target against its own probability column (column order follows
# the order of the target columns in y), then average the two AUCs.
roc_auc_xyz, roc_auc_seasonal = (
    roc_auc_score(y_test[target_name], y_pred_prob[:, column_idx])
    for column_idx, target_name in enumerate(['xyz_vaccine', 'seasonal_vaccine'])
)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

In [69]:
# Report the per-target and mean hold-out scores, one per line, to two decimals.
print(
    f'ROC AUC for xyz_vaccine: {roc_auc_xyz:.2f}',
    f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal:.2f}',
    f'Mean ROC AUC: {mean_roc_auc:.2f}',
    sep='\n',
)

ROC AUC for xyz_vaccine: 0.83
ROC AUC for seasonal_vaccine: 0.86
Mean ROC AUC: 0.84


In [70]:
# Feature table for the rows that predictions will be submitted for.
data3 = pd.read_csv(filepath_or_buffer='test_set_features.csv')

In [71]:
X_test = data3.drop(columns=['respondent_id'])

In [72]:
y_pred_prob_full = clf.predict_proba(X_test)

In [73]:
y_pred_prob_full = np.hstack([pred[:, 1].reshape(-1, 1) for pred in y_pred_prob_full])


In [74]:
# The provided template is loaded under its own name. Binding it to
# `submissionfinal` was a dead store: the next cell rebuilds `submissionfinal`
# from the predictions before the template is ever read.
# NOTE(review): nothing reads `submission_format` yet; either validate the
# submission against it or drop this cell.
submission_format = pd.read_csv("submission_format.csv", index_col=0)

In [75]:
# Start from the id column of the submission features and attach one
# probability column per target (column 0 -> xyz, column 1 -> seasonal).
submissionfinal = data3[['respondent_id']].assign(
    xyz_vaccine=y_pred_prob_full[:, 0],
    seasonal_vaccine=y_pred_prob_full[:, 1],
)

In [85]:
# Preview the first five submission rows.
submissionfinal.head(n=5)

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0.050071,0.297075
1,26708,0.046355,0.046452
2,26709,0.367333,0.515693
3,26710,0.514121,0.881451
4,26711,0.150038,0.457432


In [95]:
# Persist the submission. The default integer index is written as well, which
# is why an 'Unnamed: 0' column shows up when this file is read back.
submissionfinal.to_csv(path_or_buf='submissionfinal.csv')

In [96]:
# Read the saved submission back in for clean-up.
df = pd.read_csv(filepath_or_buffer='submissionfinal.csv')

In [97]:
# Make respondent_id the index by rebinding instead of mutating in place.
# The guard makes the cell safe to re-run: once the column has become the
# index, a second set_index call would raise KeyError.
if df.index.name != 'respondent_id':
    df = df.set_index('respondent_id')

In [98]:
# Preview after re-indexing by respondent_id.
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 0,xyz_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26707,0,0.050071,0.297075
26708,1,0.046355,0.046452
26709,2,0.367333,0.515693
26710,3,0.514121,0.881451
26711,4,0.150038,0.457432


In [99]:
# Remove the leftover positional index that was written out with the CSV and
# came back as 'Unnamed: 0'. errors='ignore' makes the cell safe to re-run:
# without it, a second run raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [100]:
# Preview the cleaned submission: respondent_id index plus the two
# probability columns.
df.head(n=5)

Unnamed: 0_level_0,xyz_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.050071,0.297075
26708,0.046355,0.046452
26709,0.367333,0.515693
26710,0.514121,0.881451
26711,0.150038,0.457432


In [102]:
# Write the cleaned submission; the index written here is respondent_id.
df.to_csv(path_or_buf='df.csv')