In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/training-set/training_set_features.csv
/kaggle/input/submission-file/Submission.xlsx
/kaggle/input/test-file/test_set_features.csv
/kaggle/input/training-set-labels/training_set_labels.csv


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [3]:
data = pd.read_csv('/kaggle/input/training-set/training_set_features.csv')
print(data.head())

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [20]:
test_file = pd.read_csv('/kaggle/input/test-file/test_set_features.csv')
training_set_labels=pd.read_csv('/kaggle/input/training-set-labels/training_set_labels.csv')


In [5]:
target1 = data.drop(columns=['respondent_id']).pop('doctor_recc_xyz')
target2 = data.drop(columns=['respondent_id']).pop('doctor_recc_seasonal')

In [6]:

target1.fillna(target1.mode()[0], inplace=True)
target2.fillna(target2.mode()[0], inplace=True)

In [7]:
columns_categorical = data.drop(columns=['respondent_id']).select_dtypes(include=['object', 'category']).columns
columns_numerical =  data.drop(columns=['respondent_id']).select_dtypes(include=['int64', 'float64']).columns

In [8]:
tranform_numerical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [9]:
transform_categorical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', tranform_numerical, columns_numerical),
        ('cat', transform_categorical, columns_categorical)
    ])

In [12]:
xyz = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

seasonal = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [13]:
xyz.fit(data.drop(columns=['respondent_id']), target1)
y_prob_xyz = xyz.predict_proba(test_file.drop(columns=['respondent_id']))[:, 1]

In [14]:
seasonal.fit(data.drop(columns=['respondent_id']), target2)
y_prob_seasonal = seasonal.predict_proba(test_file.drop(columns=['respondent_id']))[:, 1]


In [15]:
predictions = pd.DataFrame({
    'respondent_id': test_file['respondent_id'],
    'xyz_vaccine': y_prob_xyz,
    'seasonal_vaccine': y_prob_seasonal
})

In [16]:
print(predictions.head())

   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707     0.000075          0.000132
1          26708     0.000063          0.000070
2          26709     0.000116          0.000180
3          26710     0.999538          0.999880
4          26711     0.000102          0.000187


In [17]:
predictions.to_csv('/kaggle/working/submissions.csv', index=False)