In [9]:
import pandas as pd

train = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test = pd.read_csv('test_set_features.csv')
train = train.merge(train_labels, on='respondent_id')

In [10]:
from sklearn.impute import SimpleImputer

X = train.drop(columns=['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'])
y_xyz = train[['xyz_vaccine']].values
y_s = train[['seasonal_vaccine']].values
X_test = test.drop(columns=['respondent_id'])

imputer = SimpleImputer(strategy='most_frequent')
X_i = imputer.fit_transform(X)
X_test_i = imputer.transform(X_test)
X_i = pd.DataFrame(X_i, columns=X.columns)
X_test_i = pd.DataFrame(X_test_i, columns=X_test.columns)

In [11]:
from sklearn.preprocessing import LabelEncoder

headings = {}
for column in X.columns:
    if X[column].dtype == 'object':
        le = LabelEncoder()
        X_i[column] = le.fit_transform(X_i[column])
        X_test_i[column] = le.transform(X_test_i[column])
        headings[column] = le

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X_i)
X_test = scaler.transform(X_test_i)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

lr_xyz = LogisticRegression(max_iter=1000)
lr_xyz.fit(X_i, y_xyz.ravel())

lr_seasonal = LogisticRegression(max_iter=1000)
lr_seasonal.fit(X_i, y_s.ravel())

roc_auc_xyz_lr = roc_auc_score(y_xyz, lr_xyz.predict_proba(X_i)[:, 1],average='macro')
roc_auc_seasonal_lr = roc_auc_score(y_s, lr_seasonal.predict_proba(X_i)[:, 1],average='macro')
print(f"ROC AUC in Logistic {(roc_auc_xyz_lr+roc_auc_seasonal_lr )/2}")

xyz_vaccine_lr = lr_xyz.predict_proba(X_test_i)[:, 1] 
seasonal_vaccine_lr = lr_seasonal.predict_proba(X_test_i)[:, 1] 

ROC AUC in Logistic 0.8398171059647985
[0.08754912 0.04488835 0.44553873 ... 0.13580028 0.05609342 0.55317203]


In [24]:
from sklearn.naive_bayes import GaussianNB

gnb_xyz = GaussianNB()
gnb_seasonal = GaussianNB()

gnb_xyz.fit(X_i, y_xyz.ravel())
gnb_seasonal.fit(X_i, y_s.ravel())

roc_auc_xyz_gnb = roc_auc_score(y_xyz, gnb_xyz.predict_proba(X_i)[:, 1])
roc_auc_seasonal_gnb = roc_auc_score(y_s, gnb_seasonal.predict_proba(X_i)[:, 1]) 
print(f"ROC AUC in Guassian Naive Bias is {(roc_auc_xyz_gnb+roc_auc_seasonal_gnb)/2}")

xyz_vaccine_gnb = gnb_xyz.predict_proba(X_test_i)[:, 1]
seasonal_vaccine_gnb = gnb_seasonal.predict_proba(X_test_i)[:, 1]

ROC AUC in Guassian Naive Bias is 0.7931017761076203
[1.77670661e-02 2.02827471e-06 9.70925489e-01 ... 1.12145017e-03
 5.89355481e-02 5.24741491e-01]


In [17]:
from sklearn import svm

svm_xyz = svm.SVC(probability=True, kernel='rbf')
svm_seasonal = svm.SVC(probability=True, kernel='rbf')

svm_xyz.fit(X_i, y_xyz.ravel())
svm_seasonal.fit(X_i, y_s.ravel())

roc_auc_xyz_svm = roc_auc_score(y_xyz, svm_xyz.predict_proba(X_i)[:, 1])
roc_auc_seasonal_svm = roc_auc_score(y_s, svm_seasonal.predict_proba(X_i)[:, 1])
print(f"ROC AUC in SVM is {(roc_auc_xyz_svm+roc_auc_seasonal_svm)/2}")

xyz_vaccine_svm = svm_xyz.predict_proba(X_test_i)[:, 1]
seasonal_vaccine_svm = svm_seasonal.predict_proba(X_test_i)[:, 1]

ROC AUC in SVM is 0.8434488200259773


In [19]:
test_ids = test['respondent_id']
submission = pd.DataFrame({
    'respondent_id': test_ids,
    'xyz_vaccine': xyz_vaccine_lr,
    'seasonal_vaccine': seasonal_vaccine_lr
})
submission.to_csv('submission.csv', index=False)