# EXPLORING BAGGING TECHNIQUES

### IMPORTING PREPROCESSED DATA

In [1]:
%run Selected_Questions_Combined.ipynb
dataframes = preprocessed_data()
salary_data = dataframes["all_questions_dataframe"]
salary_data_as_num = dataframes["selected_numeric_questions"]
salary_data_selected_questions = dataframes["selected_questions_dataframe"]

### FURTHER DATA PREPROCESSING

#### Features and Target

In [2]:
# Index labels of the rows where q24 (the target column) was left unanswered.
q24_is_missing = salary_data['q24'].isna()
null_indices = salary_data.loc[q24_is_missing].index

In [3]:
# Features: the numeric frame without the rows whose q24 answer is missing.
# NOTE(review): assumes salary_data_as_num shares salary_data's index labels - confirm.
X = salary_data_as_num.drop(null_indices, axis='index')
# Target: the answered q24 values only.
y = salary_data.loc[:, 'q24'].dropna()

#### Bin the levels of the target variable to reduce the problem to 4-class classification

In [4]:
# Inspect how many rows fall into each raw q24 level before the levels are binned.
y.value_counts()

$0-999             2128
10,000-14,999       665
1,000-1,999         581
100,000-124,999     573
40,000-49,999       552
30,000-39,999       540
50,000-59,999       510
5,000-7,499         488
15,000-19,999       449
60,000-69,999       408
20,000-24,999       404
70,000-79,999       394
7,500-9,999         371
150,000-199,999     347
2,000-2,999         330
125,000-149,999     315
25,000-29,999       310
90,000-99,999       280
4,000-4,999         279
80,000-89,999       273
3,000-3,999         264
200,000-249,999     115
300,000-500,000      55
> $500,000           50
250,000-299,999      48
Name: q24, dtype: int64

In [5]:
def binning_categories(c):
    """Map a raw q24 salary level onto one of four coarse salary bins.

    Parameters
    ----------
    c : object
        A raw salary level such as ``'$0-999'`` or ``'40,000-49,999'``.

    Returns
    -------
    str
        ``"0-9,999"``, ``"10,000-99,999"`` or ``"100,000-500,000"`` when ``c``
        is one of the levels listed for that bin; ``"> $500,000"`` for every
        other value, including values that are not recognised at all.
    """
    salary_bins = (
        ("0-9,999", (
            '$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999',
            '4,000-4,999', '5,000-7,499', '7,500-9,999',
        )),
        ("10,000-99,999", (
            '10,000-14,999', '15,000-19,999', '20,000-24,999',
            '25,000-29,999', '30,000-39,999', '40,000-49,999',
            '50,000-59,999', '60,000-69,999', '70,000-79,999',
            '80,000-89,999', '90,000-99,999',
        )),
        ("100,000-500,000", (
            '100,000-124,999', '125,000-149,999', '150,000-199,999',
            '200,000-249,999', '250,000-299,999', '300,000-500,000',
        )),
    )
    for bin_label, raw_levels in salary_bins:
        if c in raw_levels:
            return bin_label
    # Catch-all: anything not listed above lands in the top bin.
    return "> $500,000"

In [6]:
# Derive the binned target from the raw q24 column on every run instead of
# re-binning `y` in place. The binned labels are not raw levels, so applying
# binning_categories to an already-binned `y` (e.g. when this cell is re-run)
# falls through to the catch-all branch and relabels every row "> $500,000".
y = salary_data['q24'].dropna().apply(binning_categories)
y.value_counts()

10,000-99,999      4785
0-9,999            4441
100,000-500,000    1453
> $500,000           50
Name: q24, dtype: int64

In [7]:
# Hold out 20% of the rows as the test set, stratifying on the binned target
# and fixing the seed so the split is repeatable.

from sklearn.model_selection import train_test_split

X_dev, X_test, y_dev, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [8]:
# Integer-encode the binned target labels.

from sklearn.preprocessing import LabelEncoder

l_enc = LabelEncoder()
# Fit on the development labels only, then reuse the same mapping for the test
# labels. The encoded arrays are stored under new names (they used to be
# computed and discarded); y_dev / y_test keep their string labels, so the
# cells below behave exactly as before.
y_dev_enc = l_enc.fit_transform(y_dev)
y_test_enc = l_enc.transform(y_test)
y_test_enc

array([1, 0, 0, ..., 1, 1, 1])

In [9]:
from sklearn.metrics import roc_auc_score

## BAGGING CLASSIFIER

In [10]:
from sklearn.ensemble import BaggingClassifier

### BASELINE MODEL

#### Training

In [11]:
# Baseline bagging ensemble: only the seed is set, every other setting is the
# library default.
bgc = BaggingClassifier(random_state=84)
bgc.fit(X=X_dev, y=y_dev)

#### Evaluation

In [12]:
# Score the baseline on the held-out split: weighted average of the
# one-vs-rest AUCs computed from the predicted class probabilities.
bgc_baseline_proba = bgc.predict_proba(X_test)
bgc_baseline_roc = roc_auc_score(
    y_test,
    bgc_baseline_proba,
    multi_class='ovr',
    average='weighted',
)
print(f"The ROC-AUC score for this model is: {bgc_baseline_roc:.4f}")


The ROC-AUC score for this model is: 0.7776


### HYPERPARAMETER OPTIMIZATION USING GRID SEARCH


In [13]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

#### Preparing the hyperparameter space and performing GridSearch CV

In [14]:
# Disabled cell: the grid search over n_estimators for the bagging classifier
# is stored as an inert string literal (presumably so the search is not
# repeated on every run), followed by a second string recording its output.
# The recorded best value (39) is what the re-training cell below hard-codes.
# NOTE(review): range(50) starts at 0, so the grid includes n_estimators=0,
# which is presumably not a valid ensemble size - confirm and start the range
# at 1 if this search is ever re-enabled.
'''
Input:

n_estimators = [i for i in range(50)]
sfold = StratifiedKFold(n_splits=10)

grid_search = GridSearchCV(estimator = bgc, param_grid = {'n_estimators': n_estimators}, scoring='roc_auc_ovr', cv=sfold, n_jobs=-1)
grid_search.fit(X_dev, y_dev)
grid_search.best_params_
'''

'''
Output:

{'n_estimators': 39}

'''

"\nOutput:\n\n{'n_estimators': 39}\n\n"

#### Re-training Bagging Classifier using the best parameters obtained

In [15]:
# Refit the bagging ensemble with the ensemble size recorded from the grid
# search above, keeping the same seed as the baseline.
bgc_best_params = {"n_estimators": 39}
bgc_best = BaggingClassifier(random_state=84, **bgc_best_params)
bgc_best.fit(X_dev, y_dev)

#### Evaluation

In [16]:
# Test-set ROC-AUC of the tuned bagging model (weighted one-vs-rest); the
# value is kept in `bagging_roc` for the summary function at the end.
bagging_test_proba = bgc_best.predict_proba(X_test)
bagging_roc = roc_auc_score(
    y_test,
    bagging_test_proba,
    multi_class='ovr',
    average='weighted',
)
print(f"The ROC-AUC score for this model is: {bagging_roc}")

The ROC-AUC score for this model is: 0.8007806692134761


## RANDOM FOREST

In [17]:
from sklearn.ensemble import RandomForestClassifier

### BASELINE MODEL

#### Training

In [18]:
# Baseline random forest: only the seed is set, every other setting is the
# library default.
rf = RandomForestClassifier(random_state=84)
rf.fit(X=X_dev, y=y_dev)

#### Evaluation

In [19]:
# Score the baseline forest on the held-out split: weighted average of the
# one-vs-rest AUCs computed from the predicted class probabilities.
rf_baseline_proba = rf.predict_proba(X_test)
rf_baseline_roc = roc_auc_score(
    y_test,
    rf_baseline_proba,
    multi_class='ovr',
    average='weighted',
)
print(f"The ROC-AUC score for this model is: {rf_baseline_roc:.4f}")


The ROC-AUC score for this model is: 0.7870


### RANDOMIZED HYPERPARAMETER OPTIMIZATION

In [20]:
from sklearn.model_selection import RandomizedSearchCV

#### Preparing the hyperparameter space

In [21]:
# Disabled cell: the candidate space for the randomized search is stored as an
# inert string literal. It draws 50 random n_estimators values from [75, 200)
# and 50 random max_depth values from [10, 200), and lists three split criteria.
# NOTE(review): no seed is set before the np.random draws in this string, so the
# candidate lists would differ between runs - seed them if this is re-enabled.
# NOTE(review): `np` is not imported in any visible cell; presumably it comes
# from the %run'd preprocessing notebook - confirm.
'''
n_estimators = [np.random.randint(75,200) for i in range(50)]
criterion = ["gini", "entropy", "log_loss"]
max_depth = [np.random.randint(10,200) for i in range(50)]

param_distributions = {"n_estimators": n_estimators, "criterion": criterion, "max_depth": max_depth}
'''

'\nn_estimators = [np.random.randint(75,200) for i in range(50)]\ncriterion = ["gini", "entropy", "log_loss"]\nmax_depth = [np.random.randint(10,200) for i in range(50)]\n\nparam_distributions = {"n_estimators": n_estimators, "criterion": criterion, "max_depth": max_depth}\n'

#### Performing RandomSearch CV

In [22]:
# Disabled cell: the randomized search over the random-forest space (20
# sampled candidates, 10-fold stratified CV, scored with 'roc_auc_ovr') is
# stored as an inert string literal, followed by a string recording its output.
# The recorded parameters are what the re-training cell below hard-codes.
# NOTE(review): RandomizedSearchCV is given no random_state in this string, so
# the sampled candidates are presumably not reproducible - confirm.
'''
Input:

sfold = StratifiedKFold(n_splits=10)

rm_search = RandomizedSearchCV(estimator = rf, param_distributions = param_distributions, scoring='roc_auc_ovr', cv=sfold, n_jobs=-1, n_iter=20)
rm_search.fit(X_dev, y_dev)
print(rm_search.best_params_)

'''


'''
Output : 

{'n_estimators': 156,
 'max_depth': 11,
 'criterion': 'entropy'}
 
'''


"\nOutput : \n\n{'n_estimators': 156,\n 'max_depth': 11,\n 'criterion': 'entropy'}\n \n"

#### Re-training Random Forest using the best parameters obtained

In [23]:
# Refit the forest with the parameters recorded from the randomized search,
# keeping the same seed as the baseline.
rf_new_params = {
    "n_estimators": 156,
    "criterion": 'entropy',
    "max_depth": 11,
}
rf_new = RandomForestClassifier(random_state=84, **rf_new_params)
rf_new.fit(X_dev, y_dev)

#### Evaluation

In [24]:
# Score the randomized-search forest on the held-out split (weighted
# one-vs-rest ROC-AUC from the predicted class probabilities).
rf_new_proba = rf_new.predict_proba(X_test)
rf_new_roc = roc_auc_score(
    y_test,
    rf_new_proba,
    multi_class='ovr',
    average='weighted',
)
print(f"The ROC-AUC score for this model is: {rf_new_roc:.4f}")


The ROC-AUC score for this model is: 0.7904


### HYPERPARAMETER OPTIMIZATION USING GRID SEARCH
##### (In and around the values obtained using Randomized Search)

#### Preparing the hyperparameter space

In [25]:
# Disabled cell: the exhaustive grid around the randomized-search result
# (n_estimators=156, max_depth=11) is stored as an inert string literal.
# It spans n_estimators 151..160, max_depth 6..15 and three split criteria,
# i.e. 10 * 10 * 3 = 300 parameter combinations.
'''
n_estimators = [i for i in range(151,161)]
criterion = ["gini", "entropy", "log_loss"]
max_depth = [i for i in range(6,16)]

param_grid = {"n_estimators": n_estimators, "criterion": criterion, "max_depth": max_depth}
'''

'\nn_estimators = [i for i in range(151,161)]\ncriterion = ["gini", "entropy", "log_loss"]\nmax_depth = [i for i in range(6,16)]\n\nparam_grid = {"n_estimators": n_estimators, "criterion": criterion, "max_depth": max_depth}\n'

#### Performing GridSearch CV

In [26]:
# Disabled cell: the grid search over the random-forest grid (10-fold
# stratified CV, scored with 'roc_auc_ovr') is stored as an inert string
# literal, followed by a string recording its output. The recorded parameters
# are what the final re-training cell below hard-codes.
# NOTE(review): the recorded best n_estimators (151) is the smallest value in
# the grid defined in the preceding cell (range(151,161)); a best value on the
# grid edge suggests the range may need widening - confirm.
'''
Input:

sfold = StratifiedKFold(n_splits=10)

grid_search = GridSearchCV(estimator = rf_new, param_grid = param_grid, scoring='roc_auc_ovr', cv = sfold, n_jobs=-1)
grid_search.fit(X_dev, y_dev)
print(grid_search.best_params_)

'''

'''
Output:

{'criterion': 'entropy',
 'max_depth': 10,
 'n_estimators': 151}
 
'''

"\nOutput:\n\n{'criterion': 'entropy',\n 'max_depth': 10,\n 'n_estimators': 151}\n \n"

#### Re-training Random Forest using the best parameters obtained

In [27]:
# Final forest: refit with the parameters recorded from the grid search,
# keeping the same seed as the earlier forests.
rf_best_params = {
    "n_estimators": 151,
    "criterion": 'entropy',
    "max_depth": 10,
}
rf_best = RandomForestClassifier(random_state=84, **rf_best_params)
rf_best.fit(X_dev, y_dev)

#### Evaluation

In [28]:
# Test-set ROC-AUC of the final forest (weighted one-vs-rest); the value is
# kept in `rf_roc` for the summary function at the end.
rf_test_proba = rf_best.predict_proba(X_test)
rf_roc = roc_auc_score(
    y_test,
    rf_test_proba,
    multi_class='ovr',
    average='weighted',
)
print(f"The ROC-AUC score for this model is: {rf_roc}")

The ROC-AUC score for this model is: 0.7903886423382891


In [None]:
from sklearn import metrics

# roc_curve handles binary targets only, while y_test holds four salary bins,
# so the curve is built one-vs-rest for the class behind probability column 1
# (the column this cell was already slicing).
# The tuned bagging model is used here: the name `model` that this cell used
# before is not defined in any visible cell of this notebook.
roc_class_idx = 1
roc_positive_class = bgc_best.classes_[roc_class_idx]
bagging_fpr, bagging_tpr, bagging_thresholds = metrics.roc_curve(
    (y_test == roc_positive_class).astype(int),  # 1 = chosen class, 0 = rest
    bgc_best.predict_proba(X_test)[:, roc_class_idx],
    pos_label=1,
)

In [29]:
def BaggingBT_ROC_scores():
    """Return the test-set ROC-AUC of the two tuned models, keyed by model name.

    Reads the notebook-level variables ``bagging_roc`` and ``rf_roc``, so the
    evaluation cells that assign them must have run first.

    Returns
    -------
    dict
        ``{"Bagging Classifier": bagging_roc, "Random Forest Classifier": rf_roc}``
    """
    scores = {}
    scores["Bagging Classifier"] = bagging_roc
    scores["Random Forest Classifier"] = rf_roc
    return scores

### COMMENTS

The dataset, with roughly 10,700 non-null observations, presents challenges due to its high dimensionality. Notably, the bagging techniques explored here, the Bagging Classifier and Random Forest, underperform the simpler Logistic Regression, possibly because tree-based ensemble methods are sensitive to noise in the data. The superior performance of Logistic Regression suggests that the relationships between the features and the target variable are largely linear. Another factor that may contribute to the performance gap is non-uniform feature importance: Logistic Regression is well suited to discerning and leveraging varying degrees of feature influence, particularly when the underlying relationships are linear.