In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer

# Load the dataset
X = pd.read_csv("training_set_features.csv")
y = pd.read_csv("training_set_labels.csv")

# Drop columns with high cardinality or irrelevant features, and drop respondent_id
X = X.drop(columns=['respondent_id', 'employment_industry', 'employment_occupation', 'health_insurance'])

# Identify ordinal and nominal categorical columns
ordinal_features = ['income_poverty']
nominal_features = ['age_group', 'education', 'race', 'sex',
                    'marital_status', 'rent_or_own', 'employment_status',
                    'hhs_geo_region', 'census_msa']
numerical_features = [col for col in X.columns if col not in ordinal_features + nominal_features]

# Define transformers
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('nom', nominal_transformer, nominal_features)
    ])

# Define the models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVC': SVC(probability=True)
}

# Function to train and evaluate models
def train_and_evaluate(X, y, vaccine_type):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    results = {}
    for model_name, model in models.items():
        print(model_name)
        clf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', model)])
        
        clf.fit(X_train, y_train)
        
        y_pred_proba = clf.predict_proba(X_test)[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        results[model_name] = roc_auc
    
    print(f"Results for {vaccine_type}:")
    print(results)
    print("\n")

# Train and evaluate models for xyz_vaccine
train_and_evaluate(X, y['xyz_vaccine'], "xyz_vaccine")

# Train and evaluate models for seasonal_vaccine
train_and_evaluate(X, y['seasonal_vaccine'], "seasonal_vaccine")


Logistic Regression
Random Forest
SVC
Results for xyz_vaccine:
{'Logistic Regression': 0.826700787467749, 'Random Forest': 0.8254671650320616, 'SVC': 0.7993514106345966}


Logistic Regression
Random Forest
SVC
Results for seasonal_vaccine:
{'Logistic Regression': 0.8514959621589024, 'Random Forest': 0.8483982352976873, 'SVC': 0.8507263993081415}




In [6]:
y.seasonal_vaccine.value_counts()

seasonal_vaccine
0    14272
1    12435
Name: count, dtype: int64

In [5]:
21033/(21033+5674)

0.7875463361665481

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

# Load the dataset
X = pd.read_csv("training_set_features.csv")
y = pd.read_csv("training_set_labels.csv")

# Drop columns with high cardinality or irrelevant features, and drop respondent_id
X = X.drop(columns=['respondent_id', 'employment_industry', 'employment_occupation', 'health_insurance'])

# Identify ordinal and nominal categorical columns
ordinal_features = ['income_poverty']
nominal_features = ['age_group', 'education', 'race', 'sex',
                    'marital_status', 'rent_or_own', 'employment_status',
                    'hhs_geo_region', 'census_msa']
numerical_features = [col for col in X.columns if col not in ordinal_features + nominal_features]

# Define transformers
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('nom', nominal_transformer, nominal_features)
    ])

# Define the models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'SVC': SVC(probability=True),
    'KNN': KNeighborsClassifier()
}

# Function to train and evaluate models
def train_and_evaluate(X, y, vaccine_type, use_smote=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Apply the preprocessor to training data
    X_train_transformed = preprocessor.fit_transform(X_train)
    X_test_transformed = preprocessor.transform(X_test)
    
    if use_smote:
        smote = SMOTE(random_state=42)
        X_train_transformed, y_train = smote.fit_resample(X_train_transformed, y_train)
    
    results = {}
    for model_name, model in models.items():
        print(model_name)
        model.fit(X_train_transformed, y_train)
        
        y_pred = model.predict(X_test_transformed)
        y_pred_proba = model.predict_proba(X_test_transformed)[:, 1]
        
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        results[model_name] = {'roc_auc': roc_auc, 'accuracy': accuracy, 'f1_score': f1}
    
    print(f"Results for {vaccine_type}:")
    for model_name, metrics in results.items():
        print(f"{model_name}: ROC AUC = {metrics['roc_auc']:.4f}, Accuracy = {metrics['accuracy']:.4f}, F1 Score = {metrics['f1_score']:.4f}")
    print("\n")

# Train and evaluate models for xyz_vaccine with SMOTE
train_and_evaluate(X, y['xyz_vaccine'], "xyz_vaccine", use_smote=True)

# Train and evaluate models for seasonal_vaccine without SMOTE
train_and_evaluate(X, y['seasonal_vaccine'], "seasonal_vaccine", use_smote=False)


Logistic Regression
Random Forest
SVC
KNN
Results for xyz_vaccine:
Logistic Regression: ROC AUC = 0.8242, Accuracy = 0.7780, F1 Score = 0.5764
Random Forest: ROC AUC = 0.8247, Accuracy = 0.8371, F1 Score = 0.5256
SVC: ROC AUC = 0.7914, Accuracy = 0.8098, F1 Score = 0.5613
KNN: ROC AUC = 0.7329, Accuracy = 0.6511, F1 Score = 0.4711


Logistic Regression
Random Forest
SVC
KNN
Results for seasonal_vaccine:
Logistic Regression: ROC AUC = 0.8515, Accuracy = 0.7827, F1 Score = 0.7586
Random Forest: ROC AUC = 0.8499, Accuracy = 0.7787, F1 Score = 0.7545
SVC: ROC AUC = 0.8507, Accuracy = 0.7799, F1 Score = 0.7562
KNN: ROC AUC = 0.7842, Accuracy = 0.7239, F1 Score = 0.7008




In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.impute import SimpleImputer

# Load the training dataset
X_train = pd.read_csv("training_set_features.csv")
y_train = pd.read_csv("training_set_labels.csv")

# Drop columns with high cardinality or irrelevant features, and drop respondent_id
X_train = X_train.drop(columns=['respondent_id', 'employment_industry', 'employment_occupation', 'health_insurance'])

# Identify ordinal and nominal categorical columns
ordinal_features = ['income_poverty']
nominal_features = ['age_group', 'education', 'race', 'sex',
                    'marital_status', 'rent_or_own', 'employment_status',
                    'hhs_geo_region', 'census_msa']
numerical_features = [col for col in X_train.columns if col not in ordinal_features + nominal_features]

# Define transformers
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('nom', nominal_transformer, nominal_features)
    ])

# Define the logistic regression model
logistic_model = LogisticRegression()

# Function to train and predict
def train_and_predict(X, y, test_data):
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', logistic_model)])
    
    clf.fit(X, y)
    
    predictions = clf.predict(test_data)
    return predictions

# Load the test dataset
X_test = pd.read_csv("test_set_features.csv")
respondent_ids = X_test['respondent_id']
X_test = X_test.drop(columns=['respondent_id', 'employment_industry', 'employment_occupation', 'health_insurance'])

# Train and predict for xyz_vaccine
xyz_vaccine_predictions = train_and_predict(X_train, y_train['xyz_vaccine'], X_test)

# Train and predict for seasonal_vaccine
seasonal_vaccine_predictions = train_and_predict(X_train, y_train['seasonal_vaccine'], X_test)

# Create the submission dataframe
submission = pd.DataFrame({
    'respondent_id': respondent_ids,
    'xyz_vaccine': xyz_vaccine_predictions,
    'seasonal_vaccine': seasonal_vaccine_predictions
})

# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully.")


Submission file created successfully.


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

# Load the dataset
X_train = pd.read_csv("training_set_features.csv")
y_train = pd.read_csv("training_set_labels.csv")
X_test = pd.read_csv("test_set_features.csv")

# Preserve respondent_id for final output
respondent_ids = X_test['respondent_id']

# Drop columns with high cardinality or irrelevant features, and drop respondent_id
X_train = X_train.drop(columns=['respondent_id', 'employment_industry', 'employment_occupation', 'health_insurance'])
X_test = X_test.drop(columns=['respondent_id', 'employment_industry', 'employment_occupation', 'health_insurance'])

# Identify ordinal and nominal categorical columns
ordinal_features = ['income_poverty']
nominal_features = ['age_group', 'education', 'race', 'sex',
                    'marital_status', 'rent_or_own', 'employment_status',
                    'hhs_geo_region', 'census_msa']
numerical_features = [col for col in X_train.columns if col not in ordinal_features + nominal_features]

# Define transformers
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('nom', nominal_transformer, nominal_features)
    ])

# Define the logistic regression model
logistic_model = LogisticRegression()

# Create pipeline with preprocessor and logistic regression model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', logistic_model)])

# Train and predict function
def train_and_predict(X_train, y_train, X_test):
    pipeline.fit(X_train, y_train)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    return y_pred_proba

# Train and predict for xyz_vaccine
y_train_xyz = y_train['xyz_vaccine']
xyz_vaccine_pred = train_and_predict(X_train, y_train_xyz, X_test)

# Train and predict for seasonal_vaccine
y_train_seasonal = y_train['seasonal_vaccine']
seasonal_vaccine_pred = train_and_predict(X_train, y_train_seasonal, X_test)

# Create the final output dataframe
output = pd.DataFrame({
    'respondent_id': respondent_ids,
    'xyz_vaccine': xyz_vaccine_pred,
    'seasonal_vaccine': seasonal_vaccine_pred
})

# Save the output to a CSV file
output.to_csv('predictions.csv', index=False)

# Print a sample of the output
print(output.head())


   respondent_id  xyz_vaccine  seasonal_vaccine
0          26707     0.085034          0.223102
1          26708     0.052961          0.055415
2          26709     0.477814          0.631209
3          26710     0.496202          0.881146
4          26711     0.178540          0.442073


In [18]:
submission.tail()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
26703,53410,0,1
26704,53411,0,0
26705,53412,0,0
26706,53413,0,0
26707,53414,1,0


In [17]:
output.tail()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
26703,53410,0.313331,0.513397
26704,53411,0.100461,0.316928
26705,53412,0.133954,0.209855
26706,53413,0.058366,0.364629
26707,53414,0.547563,0.491216


In [19]:
X_test.tail()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
26703,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,White,Female,,,,,dqpwygqj,"MSA, Principle City",1.0,1.0
26704,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,White,Male,Below Poverty,Married,Rent,Employed,qufhixun,Non-MSA,1.0,3.0
26705,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,qufhixun,"MSA, Not Principle City",1.0,0.0
26706,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,White,Female,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,bhuqouqj,"MSA, Not Principle City",1.0,0.0
26707,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,White,Female,,Not Married,Rent,Employed,lrircsnp,"MSA, Principle City",0.0,0.0


In [20]:
X_test.shape

(26708, 32)

In [21]:
X_train.shape

(26707, 32)

In [22]:
X_train.tail()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,White,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,White,Female,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,Hispanic,Female,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0
26706,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,mlyzmhmf,"MSA, Principle City",1.0,0.0


In [26]:
submission.head()

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,26707,0,0
1,26708,0,0
2,26709,0,1
3,26710,0,1
4,26711,0,0
