In [1]:
# import packages
import csv
import pandas as pd
import numpy as np
!pip install matplotlib --quiet
import matplotlib.pyplot as plt





In [2]:
#read data
training_features_data = pd.read_csv("training_set_features.csv",
                    sep=',')


training_set_labels = pd.read_csv("training_set_labels.csv",
                    sep=',')


test_features_data = pd.read_csv("test_set_features.csv",
                    sep=',')


In [3]:
print(test_features_data.shape)  
print(training_set_labels.shape) 

(26708, 36)
(26707, 3)


# **here is preprocessing for train dataset**

In [4]:
# Separate numeric and non-numeric columns
numeric_cols = training_features_data.select_dtypes(include=['number']).columns
non_numeric_cols = training_features_data.select_dtypes(exclude=['number']).columns

# Fill missing values in numeric columns with mean
training_features_data[numeric_cols] = training_features_data[numeric_cols].fillna(training_features_data[numeric_cols].mean())

# Fill missing values in non-numeric columns with a placeholder value
training_features_data[non_numeric_cols] = training_features_data[non_numeric_cols].fillna('out-of-category')


In [5]:
#check no missing values are left 
training_features_data.isna().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [6]:
#encoding categorical features (str-->float)

from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()

enc.fit(training_features_data)
training_features_data_arr=enc.transform(training_features_data)

col_names_list=training_features_data.columns
encoded_categorical_df=pd.DataFrame(training_features_data_arr, columns=col_names_list)

In [7]:
#normalization(make all values bet. 0-1)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(encoded_categorical_df)
normalized_arr=scaler.transform(encoded_categorical_df)

normalized_df=pd.DataFrame(normalized_arr, columns=col_names_list)

In [8]:
#check if data types are correct or not 
normalized_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  float64
 1   xyz_concern                  26707 non-null  float64
 2   xyz_knowledge                26707 non-null  float64
 3   behavioral_antiviral_meds    26707 non-null  float64
 4   behavioral_avoidance         26707 non-null  float64
 5   behavioral_face_mask         26707 non-null  float64
 6   behavioral_wash_hands        26707 non-null  float64
 7   behavioral_large_gatherings  26707 non-null  float64
 8   behavioral_outside_home      26707 non-null  float64
 9   behavioral_touch_face        26707 non-null  float64
 10  doctor_recc_xyz              26707 non-null  float64
 11  doctor_recc_seasonal         26707 non-null  float64
 12  chronic_med_condition        26707 non-null  float64
 13  child_under_6_mo

# **here is preprocessing for test dataset**

In [9]:
#check types of test dataset
test_features_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  int64  
 1   xyz_concern                  26623 non-null  float64
 2   xyz_knowledge                26586 non-null  float64
 3   behavioral_antiviral_meds    26629 non-null  float64
 4   behavioral_avoidance         26495 non-null  float64
 5   behavioral_face_mask         26689 non-null  float64
 6   behavioral_wash_hands        26668 non-null  float64
 7   behavioral_large_gatherings  26636 non-null  float64
 8   behavioral_outside_home      26626 non-null  float64
 9   behavioral_touch_face        26580 non-null  float64
 10  doctor_recc_xyz              24548 non-null  float64
 11  doctor_recc_seasonal         24548 non-null  float64
 12  chronic_med_condition        25776 non-null  float64
 13  child_under_6_mo

In [10]:
# Separate numeric and non-numeric columns
numeric_cols = test_features_data.select_dtypes(include=['number']).columns
non_numeric_cols = test_features_data.select_dtypes(exclude=['number']).columns

# Fill missing values in numeric columns with mean
test_features_data[numeric_cols] = test_features_data[numeric_cols].fillna(test_features_data[numeric_cols].mean())

# Fill missing values in non-numeric columns with a placeholder value
test_features_data[non_numeric_cols] = test_features_data[non_numeric_cols].fillna('out-of-category')


In [11]:
#check no missing values are left 
test_features_data.isna().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

In [12]:
#encoding categorical features  (str-->float)

from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(test_features_data)
test_features_data_arr=enc.transform(test_features_data)

col_names_list=test_features_data.columns
test_encoded_categorical_df=pd.DataFrame(test_features_data_arr, columns=col_names_list)

In [13]:
#check data types
test_encoded_categorical_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26708 entries, 0 to 26707
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26708 non-null  float64
 1   xyz_concern                  26708 non-null  float64
 2   xyz_knowledge                26708 non-null  float64
 3   behavioral_antiviral_meds    26708 non-null  float64
 4   behavioral_avoidance         26708 non-null  float64
 5   behavioral_face_mask         26708 non-null  float64
 6   behavioral_wash_hands        26708 non-null  float64
 7   behavioral_large_gatherings  26708 non-null  float64
 8   behavioral_outside_home      26708 non-null  float64
 9   behavioral_touch_face        26708 non-null  float64
 10  doctor_recc_xyz              26708 non-null  float64
 11  doctor_recc_seasonal         26708 non-null  float64
 12  chronic_med_condition        26708 non-null  float64
 13  child_under_6_mo

In [14]:
#normalization(bet. 0-1)

#using minmax scaler(look up)
test_normalized_arr=scaler.transform(test_encoded_categorical_df)
test_normalized_df=pd.DataFrame(test_normalized_arr, columns=col_names_list)

# **here is regression**

In [34]:
#import sklearn methods 
from sklearn.metrics import roc_curve, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multiclass import OneVsRestClassifier

In [16]:
# split df to X and Y
y = training_set_labels.loc[:, 'xyz_vaccine'].values
X = normalized_df


In [17]:
# split data into 80-20 for training set / test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

# cross-validation with 5 splits
cv = StratifiedShuffleSplit(n_splits=5, random_state = 42)

# Regressor-1: Decision Tree regressor

In [18]:
#decision tree regressor
regressor = DecisionTreeRegressor(random_state = 0)

# parameters 
parameters = {
                "criterion": ["mse", "friedman_mse", "mae"],
                "splitter": ["best","random"],
                }

# grid search for parameters
grid = GridSearchCV(estimator=regressor, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid.best_params_, grid.best_score_))

# detailed dataframe of gridsearch
detailed_grid_results = pd.DataFrame(grid.cv_results_)
detailed_grid_results


20 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Anaconda\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda\Lib\site-packages\sklearn\tree\_classes.py", line 1247, in fit
    super().fit(
  File "D:\Anaconda\Lib\site-packages\sklearn\tree\_classes.py", line 177, in fit
    self._validate_params()
  File "D:\Anaconda\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "D:\Anaconda\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_pa

The best parameters are {'criterion': 'friedman_mse', 'splitter': 'best'} with a score of -0.3257


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005905,0.001628,0.0,0.0,mse,best,"{'criterion': 'mse', 'splitter': 'best'}",,,,,,,,3
1,0.005193,0.000255,0.0,0.0,mse,random,"{'criterion': 'mse', 'splitter': 'random'}",,,,,,,,3
2,0.126625,0.007038,0.002166,7.8e-05,friedman_mse,best,"{'criterion': 'friedman_mse', 'splitter': 'best'}",-0.378832,-0.303318,-0.325693,-0.286537,-0.334083,-0.325693,0.031394,1
3,0.095636,0.013463,0.00418,0.004346,friedman_mse,random,"{'criterion': 'friedman_mse', 'splitter': 'ran...",-0.342474,-0.384426,-0.272553,-0.409597,-0.334083,-0.348627,0.04698,2
4,0.003513,0.002309,0.0,0.0,mae,best,"{'criterion': 'mae', 'splitter': 'best'}",,,,,,,,3
5,0.008693,0.004581,0.0,0.0,mae,random,"{'criterion': 'mae', 'splitter': 'random'}",,,,,,,,3


In [19]:
# display test scores and return result string and indexes of false samples
def display_test_scores(test, pred):
    str_out = ""
    str_out += ("TEST SCORES\n")
    str_out += ("\n")

    #print AUC score
    auc = roc_auc_score(test, pred)
    str_out += ("AUC: {:.4f}\n".format(auc))
    str_out += ("\n")
    
    false_indexes = np.where(test != pred)
    return str_out, false_indexes


In [20]:
# prediction results
y_pred = grid.predict(X_test)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

TEST SCORES

AUC: 0.6802




# Regressor-2: Bayesian-Ridge

In [21]:
#Bayesian Ridge for regression 

clf_ridge = linear_model.BayesianRidge()


# parameters 
parameters = {
                'alpha_init': [None, 1],
                'lambda_init': [1, 1e-3],
            }


# grid search for parameters
grid = GridSearchCV(estimator=clf_ridge, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f\n"
      % (grid.best_params_, grid.best_score_))

# prediction results
y_pred = grid.predict(X_test)


# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

The best parameters are {'alpha_init': None, 'lambda_init': 0.001} with a score of 0.2833

TEST SCORES

AUC: 0.8311




# Regressor-3: SVR

In [22]:
regr = SVR(C=1.0, epsilon=0.2)

# parameters 
parameters = {
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': [0.01,0.1,1,10,100],
                'max_iter': [100,1000],
            }

# grid search for parameters
grid = GridSearchCV(estimator=regr, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f\n"
      % (grid.best_params_, grid.best_score_))

# prediction results
y_pred = grid.predict(X_test)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)



The best parameters are {'C': 0.1, 'kernel': 'poly', 'max_iter': 1000} with a score of 0.0616

TEST SCORES

AUC: 0.7188




# Regressor-4: SGDRegressor

In [23]:
reg = SGDRegressor( tol=1e-3)


# parameters 
parameters = {
                'alpha': [0.0001, 0.001, 0.01, 1],
                'max_iter': [10,100,1000],
                'learning_rate': ['invscaling', 'optimal', 'adaptive'],
            }

# grid search for parameters
grid = GridSearchCV(estimator=reg, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f\n"
      % (grid.best_params_, grid.best_score_))


# prediction results
y_pred = grid.predict(X_test)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

The best parameters are {'alpha': 0.0001, 'learning_rate': 'adaptive', 'max_iter': 100} with a score of 0.2833

TEST SCORES

AUC: 0.8311




# Regressor-5: RandomForestRegressor

In [24]:
rfr = RandomForestRegressor(random_state=0)

# parameters 
parameters = {
                'n_estimators': [20, 50, 100],
            }

# grid search for parameters
grid = GridSearchCV(estimator=rfr, param_grid=parameters, cv=cv, n_jobs=-1)
grid.fit(X_train, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f\n"
      % (grid.best_params_, grid.best_score_))



# prediction results
y_pred = grid.predict(X_test)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)


The best parameters are {'n_estimators': 100} with a score of 0.3252

TEST SCORES

AUC: 0.8404




In [58]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier


#model1 = OneVsRestClassifier(RandomForestClassifier(n_jobs=-1,random_state=42, max_depth=15,n_estimators=200))
model1 = OneVsRestClassifier(RandomForestClassifier(n_jobs=-1,random_state=42,n_estimators= 250,max_depth=15))
model1.fit(X_train, y_train)

In [59]:

y_pred = model1.predict_proba(X_train)
y_pred.shape

(21365, 2)

In [64]:
test_predict = model1.predict_proba(X_test)
test_predict
test_predict_df= pd.DataFrame(test_predict)
test_predict_df.columns = ['h1n1_vaccine','seasonal_vaccine']
test_predict_df
submission_df = pd.read_csv('submission_format.csv')
submission_df.drop(columns=['h1n1_vaccine','seasonal_vaccine'], inplace=True)
submission_df = pd.concat([submission_df, test_predict_df], axis=1)
submission_df
submission_df.to_csv('submission.csv', index=None)

In [65]:
from IPython.display import FileLink
FileLink('submission.csv')