In [181]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


In [182]:
X=pd.read_csv("/kaggle/input/my-dataset/training_set_features.csv")
y=pd.read_csv("/kaggle/input/my-dataset/training_set_labels.csv")

In [184]:
def normalize_num_cols(df):
    scaler = StandardScaler()
    numeric_cols = df.select_dtypes(include=['number']).columns
    
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return df

In [185]:
df_dropped = X.drop(columns=['respondent_id'])

In [186]:
X.dtypes

respondent_id                    int64
xyz_concern                    float64
xyz_knowledge                  float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_xyz                float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_xyz_vacc_effective     float64
opinion_xyz_risk               float64
opinion_xyz_sick_from_vacc     float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [187]:
cat_cols=["age_group","education","race","sex","income_poverty","marital_status","rent_or_own","employment_status","hhs_geo_region","census_msa","employment_industry","employment_occupation"]

In [188]:
df = pd.get_dummies(df_dropped, columns = cat_cols)

In [189]:
df.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,False,False,False,False,False,False,True,False,False,False
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,False,False,False,False,False,False,False,False,True,False
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,False,False,False,False,False,False,False,False,False,False


In [190]:
# Identify columns with any null values
columns_with_null = df.columns[df.isnull().any()].tolist()

print("Columns with null values:", columns_with_null)

Columns with null values: ['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk', 'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 'household_children']


In [191]:
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the data and transform the data
# Note: imputer.fit_transform() returns a numpy array
imputed_data = imputer.fit_transform(df)

# Create a DataFrame from the imputed data
df_imputed = pd.DataFrame(imputed_data, columns=df.columns)

df_imputed.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.220312,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [192]:
null_values_sum = df_imputed.isnull().sum()

print("Sum of null values in each column:")
print(null_values_sum)

Sum of null values in each column:
xyz_concern                       0
xyz_knowledge                     0
behavioral_antiviral_meds         0
behavioral_avoidance              0
behavioral_face_mask              0
                                 ..
employment_occupation_vlluhbov    0
employment_occupation_xgwztkwe    0
employment_occupation_xqwwgdyp    0
employment_occupation_xtkaffoo    0
employment_occupation_xzmlyyjv    0
Length: 105, dtype: int64


In [193]:
y.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0


In [194]:
normalize_num_cols(df_imputed)

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,-0.680609,-2.046928,-0.226911,-1.632555,-0.272298,-2.177583,-0.749009,1.403796,0.691971,-0.554462,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,-0.205486,-0.136,-0.267063,-0.096814
1,1.520279,1.195647,-0.226911,0.617345,-0.272298,0.459948,-0.749009,1.403796,0.691971,-0.554462,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,4.866518,-0.136,-0.267063,-0.096814
2,-0.680609,-0.425641,-0.226911,0.617345,-0.272298,-2.177583,-0.749009,-0.714548,-1.452107,0.000000,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,-0.205486,-0.136,3.744437,-0.096814
3,-0.680609,-0.425641,-0.226911,0.617345,-0.272298,0.459948,1.339461,-0.714548,-1.452107,-0.554462,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,-0.205486,-0.136,-0.267063,-0.096814
4,0.419835,-0.425641,-0.226911,0.617345,-0.272298,0.459948,1.339461,-0.714548,0.691971,-0.554462,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,-0.205486,-0.136,-0.267063,-0.096814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,0.419835,-2.046928,-0.226911,0.617345,-0.272298,-2.177583,-0.749009,1.403796,-1.452107,-0.554462,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,-0.205486,-0.136,-0.267063,-0.096814
26703,-0.680609,1.195647,-0.226911,0.617345,-0.272298,0.459948,-0.749009,-0.714548,-1.452107,1.962251,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,-0.205486,-0.136,-0.267063,-0.096814
26704,0.419835,1.195647,-0.226911,0.617345,3.675062,0.459948,1.339461,-0.714548,0.691971,-0.554462,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,-0.205486,-0.136,-0.267063,-0.096814
26705,-0.680609,-0.425641,-0.226911,-1.632555,-0.272298,-2.177583,-0.749009,-0.714548,0.000000,-0.554462,...,-0.144737,-0.102187,-0.121417,-0.118852,-0.131209,-0.115901,-0.205486,-0.136,-0.267063,-0.096814


In [195]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train_1, y_test_1 = train_test_split(
   df_imputed, y["xyz_vaccine"], test_size=0.20, random_state=42)
X_train, X_test, y_train_2, y_test_2 = train_test_split(
   df_imputed, y["seasonal_vaccine"], test_size=0.20, random_state=42)

In [196]:
clf_1 = LogisticRegression(random_state=42,max_iter=5000)
clf_1.fit(X_train,y_train_1)

In [197]:
y_pred_1=clf_1.predict(X_test)
print(y_pred_1)

[0 0 0 ... 1 0 0]


In [206]:
# Predict probabilities on the testing data
y_pred_proba_1 = clf_1.predict_proba(X_test)[:, 1]

# Calculate the ROC AUC score
roc_auc1 = roc_auc_score(y_test_1, y_pred_proba_1)

print("ROC AUC Score for xyz vac:", roc_auc1)

ROC AUC Score for xyz vac: 0.834735143584701


In [199]:
clf_2 = LogisticRegression(random_state=42,max_iter=5000)
clf_2.fit(X_train,y_train_2)

In [200]:
y_pred_2=clf_2.predict(X_test)
print(y_pred_2)

[0 0 1 ... 1 0 1]


In [207]:
# Predict probabilities on the testing data
y_pred_proba_2 = clf_2.predict_proba(X_test)[:, 1]

# Calculate the ROC AUC score
roc_auc2 = roc_auc_score(y_test_2, y_pred_proba_2)

print("ROC AUC Score for seasonal vac:", roc_auc2)

ROC AUC Score for seasonal vac: 0.8563981325575891


In [208]:
print("avg_roc_auc:", (roc_auc1+roc_auc2)/2)

avg_roc_auc: 0.8455666380711451


In [209]:
test_set=pd.read_csv("/kaggle/input/my-dataset/test_set_features.csv")

In [210]:
test_set.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [211]:
submission_format=pd.read_csv("/kaggle/input/submission/submission.csv")

In [212]:
submission_format.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0.5,0.7
1,26708,0.5,0.7
2,26709,0.5,0.7
3,26710,0.5,0.7
4,26711,0.5,0.7


In [213]:
sub1=pd.DataFrame(columns=submission_format.columns)

In [214]:
respondent_id_column=submission_format["respondent_id"]
respondent_id_column

0        26707
1        26708
2        26709
3        26710
4        26711
         ...  
26703    53410
26704    53411
26705    53412
26706    53413
26707    53414
Name: respondent_id, Length: 26708, dtype: int64

In [215]:
sub1=sub1.drop(columns=['respondent_id'])

In [216]:
sub1.columns

Index(['h1n1_vaccine', 'seasonal_vaccine'], dtype='object')

In [217]:
sub1=pd.concat([sub1,respondent_id_col])

In [218]:
sub1.columns

Index(['h1n1_vaccine', 'seasonal_vaccine', 'respondent_id'], dtype='object')

In [219]:
cols_set = ['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']
sub1 = sub1[cols_set]
sub1.columns

Index(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'], dtype='object')

In [220]:
sub1.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707.0,,
1,26708.0,,
2,26709.0,,
3,26710.0,,
4,26711.0,,


In [221]:
test_set = pd.get_dummies(test_set, columns = cat_cols)

In [222]:
# Identify columns with any null values
columns_with_null = test_set.columns[test_set.isnull().any()].tolist()

print("Columns with null values:", columns_with_null)

Columns with null values: ['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk', 'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults', 'household_children']


In [223]:
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the data and transform the data
# Note: imputer.fit_transform() returns a numpy array
test_imputed_data = imputer.fit_transform(test_set)

# Create a DataFrame from the imputed data
df_test_imputed = pd.DataFrame(test_imputed_data, columns=test_set.columns)

df_test_imputed.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,26707.0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26708.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,26709.0,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26710.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26711.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [224]:
df_test_imputed=normalize_num_cols(df_test_imputed)
df_test_imputed.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,-1.731986,0.418123,1.194987,-0.228896,0.610916,-0.272927,0.459181,1.360074,-0.714408,0.68173,...,-0.137983,-0.106044,-0.123619,-0.111678,-0.138264,-0.110989,-0.210963,-0.134562,-0.264636,-0.090296
1,-1.731856,-0.691385,-0.433154,-0.228896,-1.650045,-0.272927,-2.181059,-0.737242,-0.714408,-1.473919,...,-0.137983,-0.106044,-0.123619,-0.111678,-0.138264,-0.110989,-0.210963,7.431512,-0.264636,-0.090296
2,-1.731727,0.418123,1.194987,-0.228896,-1.650045,3.666586,0.459181,1.360074,1.404071,0.68173,...,-0.137983,-0.106044,-0.123619,-0.111678,-0.138264,-0.110989,-0.210963,-0.134562,-0.264636,-0.090296
3,-1.731597,-0.691385,-0.433154,-0.228896,-1.650045,-0.272927,-2.181059,-0.737242,-0.714408,-1.473919,...,-0.137983,-0.106044,-0.123619,-0.111678,-0.138264,-0.110989,-0.210963,-0.134562,-0.264636,-0.090296
4,-1.731467,1.527632,-0.433154,4.381753,0.610916,-0.272927,0.459181,1.360074,1.404071,0.68173,...,-0.137983,-0.106044,-0.123619,-0.111678,-0.138264,-0.110989,-0.210963,-0.134562,-0.264636,-0.090296


In [225]:
p1 = clf_1.predict_proba(df_test_imputed.drop(columns=['respondent_id']))
p_1 = p1[:, 1]
p2 = clf_2.predict_proba(df_test_imputed.drop(columns=['respondent_id']))
p_2 = p2[:, 1]

In [226]:
sub1["h1n1_vaccine"]=p_1
sub1["seasonal_vaccine"]=p_2
sub1.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707.0,0.057585,0.311732
1,26708.0,0.019591,0.038163
2,26709.0,0.371173,0.512821
3,26710.0,0.503751,0.871925
4,26711.0,0.169317,0.48589


In [227]:
sub1.to_csv('submission1.csv', index=False)