In [156]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import math

In [157]:
%matplotlib inline
sns.set_theme()

In [158]:
df = pd.read_csv("kaggle_survey_2020_responses.csv")

In [159]:
salary_data = df.drop(columns = ["time_from_start_to_finish_seconds"])

In [160]:
salary_data_as_num = pd.DataFrame()

In [161]:
def convert_to_category(col_name: str, order_rules: list, data):
    data[col_name] = pd.Categorical(data[col_name], order_rules)

In [162]:
def convert_to_category_no_specified_order(col_name, data):
    if sum(data[col_name].isna().astype(int)) > 0:
        data[col_name].fillna("No response", inplace = True)

    order = list(set(data[col_name]))
    convert_to_category(col_name, order, data)

In [163]:
def convert_category_to_code(col_name: str, data, inplace = False):
    if inplace:
        data[col_name] = data[col_name].cat.codes + 1 # because NaN automatically becomes -1
    else:
        return data[col_name].cat.codes + 1

In [164]:
def process_column(col_name: str, order_rules = None, data = salary_data, num_data = salary_data_as_num):
    if order_rules:
        convert_to_category(col_name, order_rules, data)
    else:
        convert_to_category_no_specified_order(col_name, data)
    num_data[col_name] = convert_category_to_code(col_name, data)

In [165]:
def one_hot_column_to_binary(col_name, data = salary_data):
    data[col_name].fillna(0, inplace = True)
    data[col_name].mask(data[col_name] != 0, 1, inplace = True)

In [166]:
def process_one_hot_encoded_columns(columns, data = salary_data, num_data = salary_data_as_num):
    for col in columns:
        one_hot_column_to_binary(col, data)
        num_data[col] = data[col]

In [167]:
def column_text_to_binary(col_name, data = salary_data, num_data = salary_data_as_num):
    data[col_name] = data[col_name].notna().astype(int)
    num_data[col_name] = data[col_name]

In [168]:
def combine_multiple_columns_into_one_binary(columns, new_col_name, data = salary_data, num_data = salary_data_as_num):
    for col_name in columns:
        one_hot_column_to_binary(col_name)

    data[new_col_name] = data[columns].sum(axis = 1)
    data[new_col_name] = data[new_col_name].astype(int)

    data[new_col_name].mask(data[new_col_name] > 0, 1, inplace = True)
    num_data[new_col_name] = data[new_col_name]

## Q1 Age

In [169]:
salary_data["q1"].value_counts()

25-29    4011
22-24    3786
18-21    3469
30-34    2811
35-39    1991
40-44    1397
45-49     988
50-54     698
55-59     411
60-69     398
70         76
Name: q1, dtype: int64

In [170]:
q1_order = [
    "18-21",
    "22-24",
    "25-29",
    "30-34",
    "35-39",
    "40-44",
    "45-49",
    "50-54",
    "55-59",
    "60-69",
    "70"
]

In [171]:
convert_to_category("q1", q1_order, salary_data)

In [172]:
salary_data_as_num["q1"] = convert_category_to_code("q1", salary_data, False)
salary_data_as_num

Unnamed: 0,q1
0,5
1,4
2,5
3,4
4,4
...,...
20031,1
20032,9
20033,4
20034,2


In [173]:
salary_data_as_num["q1"].value_counts()

3     4011
2     3786
1     3469
4     2811
5     1991
6     1397
7      988
8      698
9      411
10     398
11      76
Name: q1, dtype: int64

## Q2 Gender

In [174]:
salary_data["q2"].value_counts()

Man                        15789
Woman                       3878
Prefer not to say            263
Prefer to self-describe       54
Nonbinary                     52
Name: q2, dtype: int64

In [175]:
q2_order = ['Man', "Woman", "Nonbinary", 'Prefer to self-describe', 'Prefer not to say']

In [176]:
convert_to_category("q2", q2_order, salary_data)

In [177]:
salary_data_as_num["q2"] = convert_category_to_code("q2", salary_data, False)
salary_data_as_num

Unnamed: 0,q1,q2
0,5,1
1,4,1
2,5,1
3,4,1
4,4,1
...,...,...
20031,1,1
20032,9,2
20033,4,1
20034,2,1


In [178]:
salary_data_as_num["q2"].value_counts()

1    15789
2     3878
5      263
4       54
3       52
Name: q2, dtype: int64

## Q3 Country

In [179]:
convert_to_category_no_specified_order("q3", salary_data)

In [180]:
salary_data_as_num["q3"] = convert_category_to_code("q3", salary_data)

## Q4 Education

In [181]:
salary_data["q4"].value_counts()

Master’s degree                                                      7859
Bachelor’s degree                                                    6978
Doctoral degree                                                      2302
Some college/university study without earning a bachelor’s degree    1092
Professional degree                                                   699
I prefer not to answer                                                399
No formal education past high school                                  240
Name: q4, dtype: int64

In [182]:
q4_order = ["No formal education past high school",
            "Some college/university study without earning a bachelor’s degree",
            "Professional degree",
            "Bachelor’s degree",
            "Master’s degree",
            "Doctoral degree",
            "I prefer not to answer"
            ]

In [183]:
convert_to_category("q4", q4_order, salary_data)

In [184]:
salary_data_as_num["q4"] = convert_category_to_code("q4", salary_data)

In [185]:
salary_data_as_num["q4"].value_counts()

5    7859
4    6978
6    2302
2    1092
3     699
0     467
7     399
1     240
Name: q4, dtype: int64

## Q5 Job Title

In [186]:
salary_data["q5"].value_counts()

Student                      5171
Data Scientist               2676
Software Engineer            1968
Other                        1737
Currently not employed       1652
Data Analyst                 1475
Research Scientist           1174
Machine Learning Engineer    1082
Business Analyst              798
Product/Project Manager       692
Data Engineer                 437
Statistician                  290
DBA/Database Engineer         125
Name: q5, dtype: int64

In [187]:
convert_to_category_no_specified_order("q5", salary_data)

In [188]:
salary_data_as_num["q5"] = convert_category_to_code("q5", salary_data)

## Q6 Years Coding

In [189]:
q6_order = [
 'I have never written code',
 '< 1 years',
 '1-2 years',
 '3-5 years',
 '5-10 years',
 '10-20 years',
 '20+ years']

In [190]:
process_column("q6", q6_order)

## Q7 Language

In [191]:
q7_columns = [
     'q7_part_1',
 'q7_part_2',
 'q7_part_3',
 'q7_part_4',
 'q7_part_5',
 'q7_part_6',
 'q7_part_7',
 'q7_part_8',
 'q7_part_9',
 'q7_part_10',
 'q7_part_11',
 'q7_part_12',
 'q7_other'
]

In [192]:
process_one_hot_encoded_columns(q7_columns)

## Q11 Computing Platform

In [193]:
process_column("q11")

## Q12 Specialized Hardware

In [194]:
q12_columns = [
    'q12_part_1',
 'q12_part_2',
 'q12_part_3',
 'q12_other'
]

In [195]:
process_one_hot_encoded_columns(q12_columns)

## Q14 Visualization

In [196]:
q14_columns = [
    'q14_part_1',
 'q14_part_2',
 'q14_part_3',
 'q14_part_4',
 'q14_part_5',
 'q14_part_6',
 'q14_part_7',
 'q14_part_8',
 'q14_part_9',
 'q14_part_10',
 'q14_part_11',
 'q14_other'
]

In [197]:
process_one_hot_encoded_columns(q14_columns)

## Q15 Years ML

In [198]:
q15_order = [
    'I do not use machine learning methods',
    'Under 1 year',
    '1-2 years',
    '2-3 years',
    '3-4 years',
    '4-5 years',
    '5-10 years',
    '10-20 years',
    '20 or more years'
]

In [199]:
process_column("q15", q15_order)

## Q17 ML Algorithms

In [200]:
q17_columns = [
    'q17_part_1',
 'q17_part_2',
 'q17_part_3',
 'q17_part_4',
 'q17_part_5',
 'q17_part_6',
 'q17_part_7',
 'q17_part_8',
 'q17_part_9',
 'q17_part_10',
 'q17_part_11',
 'q17_other'
]

In [201]:
process_one_hot_encoded_columns(q17_columns)

## Q20 Company Size

In [202]:
q20_order = [
    '0-49 employees',
    '50-249 employees',
    '250-999 employees',
    '1000-9,999 employees',
    '10,000 or more employees'
]

In [203]:
process_column("q20", q20_order)

## Q21 Datascience Workloads

In [204]:
q21_order = [
    '0',
    '1-2',
    '3-4',
    '5-9',
    '10-14',
    '15-19',
    '20'
]

In [205]:
process_column("q21", q21_order)

## Q22 Incorporating ML

i'm not super sure about the proper "order" for this question. Feel free to change this if you find it more appropriate. Just please let the chat know in case it affects others' encoding.

In [206]:
q22_order = [
    'I do not know',
    'No (we do not use ML methods)',
    'We are exploring ML methods (and may one day put a model into production)',
    'We use ML methods for generating insights (but do not put working models into production)',
    'We recently started using ML methods (i.e., models in production for less than 2 years)',
    'We have well established ML methods (i.e., models in production for more than 2 years)'
]

In [207]:
process_column("q22", q22_order)

## Q30 Big Data Products

In [208]:
column_text_to_binary("q30")

## Q32 Business Intelligence Tools

In [209]:
column_text_to_binary("q32")

## Q33 Automated ML Tools

In [210]:
q33_columns = [
    'q33_a_part_1',
 'q33_a_part_2',
 'q33_a_part_3',
 'q33_a_part_4',
 'q33_a_part_5',
 'q33_a_part_6',
 'q33_a_part_7',
 'q33_a_other'
]

In [211]:
combine_multiple_columns_into_one_binary(q33_columns, "q33")

## Q37 Data Science Courses

In [212]:
q37_columns = [
    'q37_part_1',
 'q37_part_2',
 'q37_part_3',
 'q37_part_4',
 'q37_part_5',
 'q37_part_6',
 'q37_part_7',
 'q37_part_8',
 'q37_part_9',
 'q37_part_10',
 'q37_part_11',
 'q37_other'
]

In [213]:
process_one_hot_encoded_columns(q37_columns)

## Q38 Primary Data Analysis Tool

In [214]:
process_column("q38")

## Q39 Media Sources

In [215]:
q39_columns = [
    'q39_part_1',
 'q39_part_2',
 'q39_part_3',
 'q39_part_4',
 'q39_part_5',
 'q39_part_6',
 'q39_part_7',
 'q39_part_8',
 'q39_part_9',
 'q39_part_10',
 'q39_part_11',
 'q39_other'
]

In [216]:
process_one_hot_encoded_columns(q39_columns)

## Dropped Columns

In [217]:
one_hot_dropped = [
    'q33_a_part_1',
 'q33_a_part_2',
 'q33_a_part_3',
 'q33_a_part_4',
 'q33_a_part_5',
 'q33_a_part_6',
 'q33_a_part_7',
 'q33_a_other',
]

In [218]:
dropped_questions = [
    "q8",
    'q9_part_1',
 'q9_part_2',
 'q9_part_3',
 'q9_part_4',
 'q9_part_5',
 'q9_part_6',
 'q9_part_7',
 'q9_part_8',
 'q9_part_9',
 'q9_part_10',
 'q9_part_11',
 'q9_other',
 'q10_part_1',
 'q10_part_2',
 'q10_part_3',
 'q10_part_4',
 'q10_part_5',
 'q10_part_6',
 'q10_part_7',
 'q10_part_8',
 'q10_part_9',
 'q10_part_10',
 'q10_part_11',
 'q10_part_12',
 'q10_part_13',
 'q10_other',
 "q13",
 'q16_part_1',
 'q16_part_2',
 'q16_part_3',
 'q16_part_4',
 'q16_part_5',
 'q16_part_6',
 'q16_part_7',
 'q16_part_8',
 'q16_part_9',
 'q16_part_10',
 'q16_part_11',
 'q16_part_12',
 'q16_part_13',
 'q16_part_14',
 'q16_part_15',
 'q16_other',
 'q18_part_1',
 'q18_part_2',
 'q18_part_3',
 'q18_part_4',
 'q18_part_5',
 'q18_part_6',
 'q18_other',
 'q19_part_1',
 'q19_part_2',
 'q19_part_3',
 'q19_part_4',
 'q19_part_5',
 'q19_other',
 'q23_part_1',
 'q23_part_2',
 'q23_part_3',
 'q23_part_4',
 'q23_part_5',
 'q23_part_6',
 'q23_part_7',
 'q23_other',
 'q24',
 'q25',
 'q26_a_part_1',
 'q26_a_part_2',
 'q26_a_part_3',
 'q26_a_part_4',
 'q26_a_part_5',
 'q26_a_part_6',
 'q26_a_part_7',
 'q26_a_part_8',
 'q26_a_part_9',
 'q26_a_part_10',
 'q26_a_part_11',
 'q26_a_other',
 'q27_a_part_1',
 'q27_a_part_2',
 'q27_a_part_3',
 'q27_a_part_4',
 'q27_a_part_5',
 'q27_a_part_6',
 'q27_a_part_7',
 'q27_a_part_8',
 'q27_a_part_9',
 'q27_a_part_10',
 'q27_a_part_11',
 'q27_a_other',
 'q28_a_part_1',
 'q28_a_part_2',
 'q28_a_part_3',
 'q28_a_part_4',
 'q28_a_part_5',
 'q28_a_part_6',
 'q28_a_part_7',
 'q28_a_part_8',
 'q28_a_part_9',
 'q28_a_part_10',
 'q28_a_other',
 'q29_a_part_1',
 'q29_a_part_2',
 'q29_a_part_3',
 'q29_a_part_4',
 'q29_a_part_5',
 'q29_a_part_6',
 'q29_a_part_7',
 'q29_a_part_8',
 'q29_a_part_9',
 'q29_a_part_10',
 'q29_a_part_11',
 'q29_a_part_12',
 'q29_a_part_13',
 'q29_a_part_14',
 'q29_a_part_15',
 'q29_a_part_16',
 'q29_a_part_17',
 'q29_a_other',
 'q31_a_part_1',
 'q31_a_part_2',
 'q31_a_part_3',
 'q31_a_part_4',
 'q31_a_part_5',
 'q31_a_part_6',
 'q31_a_part_7',
 'q31_a_part_8',
 'q31_a_part_9',
 'q31_a_part_10',
 'q31_a_part_11',
 'q31_a_part_12',
 'q31_a_part_13',
 'q31_a_part_14',
 'q31_a_other',
 'q34_a_part_1',
 'q34_a_part_2',
 'q34_a_part_3',
 'q34_a_part_4',
 'q34_a_part_5',
 'q34_a_part_6',
 'q34_a_part_7',
 'q34_a_part_8',
 'q34_a_part_9',
 'q34_a_part_10',
 'q34_a_part_11',
 'q34_a_other',
 'q35_a_part_1',
 'q35_a_part_2',
 'q35_a_part_3',
 'q35_a_part_4',
 'q35_a_part_5',
 'q35_a_part_6',
 'q35_a_part_7',
 'q35_a_part_8',
 'q35_a_part_9',
 'q35_a_part_10',
 'q35_a_other',
 'q36_part_1',
 'q36_part_2',
 'q36_part_3',
 'q36_part_4',
 'q36_part_5',
 'q36_part_6',
 'q36_part_7',
 'q36_part_8',
 'q36_part_9',
 'q36_other',
]

In [219]:
part_b_dropped = [
    'q26_b_part_1',
 'q26_b_part_2',
 'q26_b_part_3',
 'q26_b_part_4',
 'q26_b_part_5',
 'q26_b_part_6',
 'q26_b_part_7',
 'q26_b_part_8',
 'q26_b_part_9',
 'q26_b_part_10',
 'q26_b_part_11',
 'q26_b_other',
 'q27_b_part_1',
 'q27_b_part_2',
 'q27_b_part_3',
 'q27_b_part_4',
 'q27_b_part_5',
 'q27_b_part_6',
 'q27_b_part_7',
 'q27_b_part_8',
 'q27_b_part_9',
 'q27_b_part_10',
 'q27_b_part_11',
 'q27_b_other',
 'q28_b_part_1',
 'q28_b_part_2',
 'q28_b_part_3',
 'q28_b_part_4',
 'q28_b_part_5',
 'q28_b_part_6',
 'q28_b_part_7',
 'q28_b_part_8',
 'q28_b_part_9',
 'q28_b_part_10',
 'q28_b_other',
 'q29_b_part_1',
 'q29_b_part_2',
 'q29_b_part_3',
 'q29_b_part_4',
 'q29_b_part_5',
 'q29_b_part_6',
 'q29_b_part_7',
 'q29_b_part_8',
 'q29_b_part_9',
 'q29_b_part_10',
 'q29_b_part_11',
 'q29_b_part_12',
 'q29_b_part_13',
 'q29_b_part_14',
 'q29_b_part_15',
 'q29_b_part_16',
 'q29_b_part_17',
 'q29_b_other',
 'q31_b_part_1',
 'q31_b_part_2',
 'q31_b_part_3',
 'q31_b_part_4',
 'q31_b_part_5',
 'q31_b_part_6',
 'q31_b_part_7',
 'q31_b_part_8',
 'q31_b_part_9',
 'q31_b_part_10',
 'q31_b_part_11',
 'q31_b_part_12',
 'q31_b_part_13',
 'q31_b_part_14',
 'q31_b_other',
 'q33_b_part_1',
 'q33_b_part_2',
 'q33_b_part_3',
 'q33_b_part_4',
 'q33_b_part_5',
 'q33_b_part_6',
 'q33_b_part_7',
 'q33_b_other',
 'q34_b_part_1',
 'q34_b_part_2',
 'q34_b_part_3',
 'q34_b_part_4',
 'q34_b_part_5',
 'q34_b_part_6',
 'q34_b_part_7',
 'q34_b_part_8',
 'q34_b_part_9',
 'q34_b_part_10',
 'q34_b_part_11',
 'q34_b_other',
 'q35_b_part_1',
 'q35_b_part_2',
 'q35_b_part_3',
 'q35_b_part_4',
 'q35_b_part_5',
 'q35_b_part_6',
 'q35_b_part_7',
 'q35_b_part_8',
 'q35_b_part_9',
 'q35_b_part_10',
 'q35_b_other'
]

In [220]:
salary_data = salary_data.drop(columns = one_hot_dropped)

In [221]:
salary_data = salary_data.drop(columns = part_b_dropped)

In [222]:
salary_data_selected_questions = salary_data.drop(columns = dropped_questions)

### DATA PREPROCESSING

#### Features and Target

In [223]:
null_indices = salary_data[salary_data['q24'].isnull()].index

In [224]:
y = salary_data['q24'].dropna()
X = salary_data_as_num.drop(index=null_indices)

#### Bin the levels of the target variables to reduce it to 4-class classification

In [225]:
y.value_counts()

$0-999             2128
10,000-14,999       665
1,000-1,999         581
100,000-124,999     573
40,000-49,999       552
30,000-39,999       540
50,000-59,999       510
5,000-7,499         488
15,000-19,999       449
60,000-69,999       408
20,000-24,999       404
70,000-79,999       394
7,500-9,999         371
150,000-199,999     347
2,000-2,999         330
125,000-149,999     315
25,000-29,999       310
90,000-99,999       280
4,000-4,999         279
80,000-89,999       273
3,000-3,999         264
200,000-249,999     115
300,000-500,000      55
> $500,000           50
250,000-299,999      48
Name: q24, dtype: int64

In [226]:
def binning_categories(c):
    if c in ['$0-999','1,000-1,999','2,000-2,999','3,000-3,999','4,000-4,999','5,000-7,499','7,500-9,999']:
        return "0-9,999"
    elif c in ['10,000-14,999','15,000-19,999','20,000-24,999',
             '25,000-29,999','30,000-39,999','40,000-49,999',
             '50,000-59,999','60,000-69,999','70,000-79,999',
             '80,000-89,999','90,000-99,999']:
        return "10,000-99,999"
    elif c in ['100,000-124,999','125,000-149,999','150,000-199,999',
             '200,000-249,999','250,000-299,999','300,000-500,000']:
        return "100,000-500,000"
    else:
        return "> $500,000"

In [227]:
y = y.apply(binning_categories)
y.value_counts()

10,000-99,999      4785
0-9,999            4441
100,000-500,000    1453
> $500,000           50
Name: q24, dtype: int64

In [228]:
# Train-test split

from sklearn.model_selection import train_test_split

X_dev, X_test, y_dev, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify = y)


In [229]:
# label enc y

from sklearn.preprocessing import LabelEncoder

l_enc = LabelEncoder()
y_dev_encoded = l_enc.fit_transform(y_dev)
y_test_encoded = l_enc.transform(y_test)

In [230]:
from sklearn.metrics import roc_auc_score

In [231]:
y_dev_encoded_series = pd.Series(y_dev_encoded)

# Now you can use value_counts
value_counts = y_dev_encoded_series.value_counts()
print(value_counts)

1    3828
0    3553
2    1162
3      40
dtype: int64


In [232]:
from sklearn.metrics import roc_auc_score

In [233]:
!pip install scikit-learn  lightgbm catboostxgboost

[31mERROR: Could not find a version that satisfies the requirement catboostxgboost (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for catboostxgboost[0m[31m
[0m

## BOOSTING CLASSIFIER

In [234]:
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

### BASELINE MODEL

#### Training

In [235]:
bgc = GradientBoostingClassifier(random_state = 84)
bgc.fit(X_dev, y_dev_encoded)

#### Evaluation

In [236]:
print(f"The ROC-AUC score for this model is: {roc_auc_score(y_test_encoded, bgc.predict_proba(X_test), average='weighted', multi_class='ovr'):.4f}")

The ROC-AUC score for this model is: 0.8204


In [237]:
# # Identify columns with object dtype
print(X_dev.columns)
categorical_columns = X_dev.columns
for col in categorical_columns:
    X_dev[col] = X_dev[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# Now, fit the XGBoost model
xgbc = xgb.XGBClassifier(random_state=84,enable_categorical=True)
xgbc.fit(X_dev, y_dev_encoded)

Index(['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7_part_1', 'q7_part_2',
       'q7_part_3', 'q7_part_4', 'q7_part_5', 'q7_part_6', 'q7_part_7',
       'q7_part_8', 'q7_part_9', 'q7_part_10', 'q7_part_11', 'q7_part_12',
       'q7_other', 'q11', 'q12_part_1', 'q12_part_2', 'q12_part_3',
       'q12_other', 'q14_part_1', 'q14_part_2', 'q14_part_3', 'q14_part_4',
       'q14_part_5', 'q14_part_6', 'q14_part_7', 'q14_part_8', 'q14_part_9',
       'q14_part_10', 'q14_part_11', 'q14_other', 'q15', 'q17_part_1',
       'q17_part_2', 'q17_part_3', 'q17_part_4', 'q17_part_5', 'q17_part_6',
       'q17_part_7', 'q17_part_8', 'q17_part_9', 'q17_part_10', 'q17_part_11',
       'q17_other', 'q20', 'q21', 'q22', 'q30', 'q32', 'q33', 'q37_part_1',
       'q37_part_2', 'q37_part_3', 'q37_part_4', 'q37_part_5', 'q37_part_6',
       'q37_part_7', 'q37_part_8', 'q37_part_9', 'q37_part_10', 'q37_part_11',
       'q37_other', 'q38', 'q39_part_1', 'q39_part_2', 'q39_part_3',
       'q39_part_4', 'q39_part_5', 

In [238]:
print(f"The ROC-AUC score for this model is: {roc_auc_score(y_test_encoded, xgbc.predict_proba(X_test), average='weighted', multi_class='ovr'):.4f}")

The ROC-AUC score for this model is: 0.8287


In [239]:
import lightgbm as lgb

lgbm = lgb.LGBMClassifier(random_state=84)
lgbm.fit(X_dev, y_dev_encoded)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001763 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 8583, number of used features: 80
[LightGBM] [Info] Start training from score -0.881991
[LightGBM] [Info] Start training from score -0.807441
[LightGBM] [Info] Start training from score -1.999641
[LightGBM] [Info] Start training from score -5.368659


In [240]:
print(f"The ROC-AUC score for this model is: {roc_auc_score(y_test_encoded, lgbm.predict_proba(X_test), average='weighted', multi_class='ovr'):.4f}")

The ROC-AUC score for this model is: 0.8364


In [241]:
from catboost import CatBoostClassifier
categorical_features = X_dev.columns.tolist()
catboost = CatBoostClassifier(random_state=84, cat_features=categorical_features)
catboost.fit(X_dev, y_dev_encoded)

Learning rate set to 0.088296
0:	learn: 1.2945819	total: 19.3ms	remaining: 19.3s
1:	learn: 1.2217874	total: 41ms	remaining: 20.5s
2:	learn: 1.1622689	total: 58.9ms	remaining: 19.6s
3:	learn: 1.1081777	total: 78.2ms	remaining: 19.5s
4:	learn: 1.0661290	total: 99.8ms	remaining: 19.9s
5:	learn: 1.0265370	total: 121ms	remaining: 20s
6:	learn: 0.9924280	total: 143ms	remaining: 20.3s
7:	learn: 0.9638557	total: 164ms	remaining: 20.4s
8:	learn: 0.9370651	total: 186ms	remaining: 20.5s
9:	learn: 0.9148388	total: 208ms	remaining: 20.6s
10:	learn: 0.8959417	total: 224ms	remaining: 20.2s
11:	learn: 0.8767641	total: 243ms	remaining: 20s
12:	learn: 0.8639732	total: 252ms	remaining: 19.1s
13:	learn: 0.8494394	total: 269ms	remaining: 18.9s
14:	learn: 0.8357664	total: 289ms	remaining: 19s
15:	learn: 0.8231348	total: 311ms	remaining: 19.1s
16:	learn: 0.8122122	total: 334ms	remaining: 19.3s
17:	learn: 0.8024805	total: 355ms	remaining: 19.4s
18:	learn: 0.7947657	total: 378ms	remaining: 19.5s
19:	learn: 0.7

<catboost.core.CatBoostClassifier at 0x7e549ba441c0>

In [242]:
print(f"The ROC-AUC score for this model is: {roc_auc_score(y_test_encoded, catboost.predict_proba(X_test), average='weighted', multi_class='ovr'):.4f}")

The ROC-AUC score for this model is: 0.8447


### HYPERPARAMETER OPTIMIZATION USING GRID SEARCH


In [149]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

#### Preparing the hyperparameter space and performing GridSearch CV

In [150]:
results_df = pd.DataFrame(np.zeros(shape=(4, 8)))

# Initialize the DataFrame without specifying initial values
results_df = pd.DataFrame(columns=["Model_Name", "Training accuracy", "Testing accuracy", "Precision", "Recall", "F1_score", "AUC", "AP"])
results_df.index.name = "Model"

# Add model names
results_df["Model_Name"] = ["Gradient Boosting", "XGBoost", "LightGBM", "CatBoost"]


In [151]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Define the StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=84)

# Hyperparameter tuning with GridSearchCV and StratifiedKFold
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

gbm = GradientBoostingClassifier(random_state=84)
grid_search_gbm = GridSearchCV(gbm, param_grid, cv=stratified_kfold, scoring='accuracy')
grid_search_gbm.fit(X_dev, y_dev_encoded)

# Fit the model with the best parameters
best_gbm = grid_search_gbm.best_estimator_
best_gbm.fit(X_dev, y_dev_encoded)

In [152]:
print(grid_search_gbm.best_params_)

{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}


In [153]:
print(f"The ROC-AUC score for this model is: {roc_auc_score(y_test_encoded, best_gbm.predict_proba(X_test), average='weighted', multi_class='ovr'):.4f}")

The ROC-AUC score for this model is: 0.8254


In [244]:
roc_auc_gbm = roc_auc_score(y_test_encoded, best_gbm.predict_proba(X_test), average='weighted', multi_class='ovr')

In [252]:
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# # Assuming best_gbm is your trained model, X_test is your test set, and y_test_encoded is the encoded true labels
# y_pred_proba = best_gbm.predict_proba(X_test)
# y_pred = best_gbm.predict(X_test)
# y_pred_encoded = l_enc.transform(y_pred)
# y_dev_pred = best_gbm.predict(X_dev)
# y_dev_pred_encoded = l_enc.transform(y_dev_pred)

# # Accuracy
# accuracy_training = accuracy_score(y_dev_encoded, y_dev_pred_encoded)
# accuracy_testing = accuracy_score(y_test_encoded, y_pred_encoded)

# # Precision, Recall, F1-score
# precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted')
# recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted')
# f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted')

# # ROC-AUC Score
# roc_auc_gbm = roc_auc_score(y_test_encoded, best_gbm.predict_proba(X_test), average='weighted', multi_class='ovr')

# # Average Precision (AP)
# from sklearn.preprocessing import label_binarize
# y_test_encoded_bin = label_binarize(y_test_encoded, classes=[0, 1, 2, 3])
# average_precision = average_precision_score(y_test_encoded_bin, best_gbm.predict_proba(X_test), average='weighted')

# results_df.loc[0, 'Training accuracy'] = accuracy_training
# results_df.loc[0, 'Testing accuracy'] = accuracy_testing
# results_df.loc[0, 'Precision'] = precision
# results_df.loc[0, 'Recall'] = recall
# results_df.loc[0, 'F1_score'] = f1
# results_df.loc[0, 'AUC'] = roc_auc_gbm
# results_df.loc[0, 'AP'] = average_precision

# # Display the updated DataFrame
# results_df

In [253]:
# import xgboost as xgb
# from sklearn.model_selection import StratifiedKFold, GridSearchCV

# # Define the StratifiedKFold
# stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=84)

# # Hyperparameter tuning with GridSearchCV and StratifiedKFold
# param_grid = {
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [50, 100, 200],
#     'max_depth': [3, 5, 7],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0],
#     'reg_alpha': [0, 0.1, 0.5],
#     'reg_lambda': [0, 0.1, 0.5]
# }

# xgbc = xgb.XGBClassifier(random_state=84,enable_categorical=True)
# grid_search_xgb = GridSearchCV(xgbc, param_grid, cv=stratified_kfold, scoring='accuracy')
# grid_search_xgb.fit(X_dev, y_dev_encoded)

XGBoostError: ignored

In [254]:
# Fit the model with the best parameters
best_xgbc = grid_search_xgb.best_estimator_
best_xgbc.fit(X_dev, y_dev_encoded)

In [255]:
# print(grid_search_xgb.best_params_)

{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.8}


In [256]:
print(f"The ROC-AUC score for this model is: {roc_auc_score(y_test_encoded, best_xgbc.predict_proba(X_test), average='weighted', multi_class='ovr'):.4f}")

The ROC-AUC score for this model is: 0.8393


In [257]:
roc_auc_xgb = roc_auc_score(y_test_encoded, best_xgbc.predict_proba(X_test), average='weighted', multi_class='ovr')

In [258]:
# y_pred_proba = grid_search_xgb.predict_proba(X_test)
# y_pred = grid_search_xgb.predict(X_test)
# y_pred_encoded = l_enc.transform(y_pred)
# y_dev_pred = grid_search_xgb.predict(X_dev)
# y_dev_pred_encoded = l_enc.transform(y_dev_pred)

# # Accuracy
# accuracy_training = accuracy_score(y_dev_encoded, y_dev_pred_encoded)
# accuracy_testing = accuracy_score(y_test_encoded, y_pred_encoded)

# # Precision, Recall, F1-score
# precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted')
# recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted')
# f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted')

# # ROC-AUC Score
# roc_auc = roc_auc_score(y_test_encoded, grid_search_xgb.predict_proba(X_test), average='weighted', multi_class='ovr')

# # Average Precision (AP)
# from sklearn.preprocessing import label_binarize
# y_test_encoded_bin = label_binarize(y_test_encoded, classes=[0, 1, 2, 3])
# average_precision = average_precision_score(y_test_encoded_bin, grid_search_xgb.predict_proba(X_test), average='weighted')

# results_df.loc[1, 'Training accuracy'] = accuracy_training
# results_df.loc[1, 'Testing accuracy'] = accuracy_testing
# results_df.loc[1, 'Precision'] = precision
# results_df.loc[1, 'Recall'] = recall
# results_df.loc[1, 'F1_score'] = f1
# results_df.loc[1, 'AUC'] = roc_auc
# results_df.loc[1, 'AP'] = average_precision

# # Display the updated DataFrame
# results_df

In [259]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Define the StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=84)

# Hyperparameter tuning with GridSearchCV and StratifiedKFold
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

lgbm = lgb.LGBMClassifier(random_state=84)
grid_search_lgbm = GridSearchCV(lgbm, param_grid, cv=stratified_kfold, scoring='accuracy')
grid_search_lgbm.fit(X_dev, y_dev_encoded)

# Fit the model with the best parameters
best_lgbm = grid_search_lgbm.best_estimator_
best_lgbm.fit(X_dev, y_dev_encoded)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 6866, number of used features: 80
[LightGBM] [Info] Start training from score -0.882074
[LightGBM] [Info] Start training from score -0.807513
[LightGBM] [Info] Start training from score -1.999152
[LightGBM] [Info] Start training from score -5.368601
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 349
[LightGBM] [Info] Number of data points in the train set: 6866, number of used features: 80
[LightGBM] [Info]

In [260]:
print(f"The ROC-AUC score for this model is: {roc_auc_score(y_test_encoded, best_lgbm.predict_proba(X_test), average='weighted', multi_class='ovr'):.4f}")

The ROC-AUC score for this model is: 0.8369


In [261]:
print(grid_search_lgbm.best_params_)

{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}


In [262]:
roc_auc_lgbm = roc_auc_score(y_test_encoded, best_lgbm.predict_proba(X_test), average='weighted', multi_class='ovr')

In [263]:
# y_pred_proba = grid_search_lgbm.predict_proba(X_test)
# y_pred = grid_search_lgbm.predict(X_test)
# y_pred_encoded = l_enc.transform(y_pred)
# y_dev_pred = grid_search_lgbm.predict(X_dev)
# y_dev_pred_encoded = l_enc.transform(y_dev_pred)

# # Accuracy
# accuracy_training = accuracy_score(y_dev_encoded, y_dev_pred_encoded)
# accuracy_testing = accuracy_score(y_test_encoded, y_pred_encoded)

# # Precision, Recall, F1-score
# precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted')
# recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted')
# f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted')

# # ROC-AUC Score
# roc_auc = roc_auc_score(y_test_encoded, grid_search_lgbm.predict_proba(X_test), average='weighted', multi_class='ovr')

# # Average Precision (AP)
# from sklearn.preprocessing import label_binarize
# y_test_encoded_bin = label_binarize(y_test_encoded, classes=[0, 1, 2, 3])
# average_precision = average_precision_score(y_test_encoded_bin, grid_search_lgbm.predict_proba(X_test), average='weighted')

# results_df.loc[2, 'Training accuracy'] = accuracy_training
# results_df.loc[2, 'Testing accuracy'] = accuracy_testing
# results_df.loc[2, 'Precision'] = precision
# results_df.loc[2, 'Recall'] = recall
# results_df.loc[2, 'F1_score'] = f1
# results_df.loc[2, 'AUC'] = roc_auc
# results_df.loc[2, 'AP'] = average_precision

# # Display the updated DataFrame
# results_df

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV

# Define the StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=84)

# Hyperparameter tuning with GridSearchCV and StratifiedKFold
param_grid = {
    'learning_rate': [0.01,0.001, 0.1, 0.2],
    'n_estimators': [50, 100, 200,300],
    'max_depth': [3, 5, 7,15]
}

categorical_features = X_dev.columns.tolist()
catboost = CatBoostClassifier(random_state=84, cat_features=categorical_features)

grid_search_ct = GridSearchCV(catboost, param_grid, cv=stratified_kfold, scoring='accuracy')
grid_search_ct.fit(X_dev, y_dev_encoded)

# Fit the model with the best parameters
best_catboost = grid_search_ct.best_estimator_
best_catboost.fit(X_dev, y_dev_encoded)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
81:	learn: 1.0262467	total: 1.27s	remaining: 279ms
82:	learn: 1.0239752	total: 1.29s	remaining: 264ms
83:	learn: 1.0215896	total: 1.31s	remaining: 249ms
84:	learn: 1.0195037	total: 1.33s	remaining: 234ms
85:	learn: 1.0173233	total: 1.34s	remaining: 218ms
86:	learn: 1.0151099	total: 1.35s	remaining: 202ms
87:	learn: 1.0129493	total: 1.37s	remaining: 187ms
88:	learn: 1.0107448	total: 1.39s	remaining: 172ms
89:	learn: 1.0085518	total: 1.41s	remaining: 157ms
90:	learn: 1.0064753	total: 1.43s	remaining: 141ms
91:	learn: 1.0043713	total: 1.45s	remaining: 126ms
92:	learn: 1.0003152	total: 1.47s	remaining: 110ms
93:	learn: 0.9963052	total: 1.48s	remaining: 94.8ms
94:	learn: 0.9924524	total: 1.5s	remaining: 79.1ms
95:	learn: 0.9884147	total: 1.52s	remaining: 63.4ms
96:	learn: 0.9845565	total: 1.54s	remaining: 47.7ms
97:	learn: 0.9807021	total: 1.56s	remaining: 31.9ms
98:	learn: 0.9769340	total: 1.58s	remaining: 16ms
99:	learn: 0.9

In [None]:
print(f"The ROC-AUC score for this model is: {roc_auc_score(y_test_encoded, best_catboost.predict_proba(X_test), average='weighted', multi_class='ovr'):.4f}")

In [None]:
print(grid_search_ct.best_params_)

In [None]:
roc_auc_catboost = roc_auc_score(y_test_encoded, best_catboost.predict_proba(X_test), average='weighted', multi_class='ovr')

In [None]:
# y_pred_proba = grid_search_ct.predict_proba(X_test)
# y_pred = grid_search_ct.predict(X_test)
# y_pred_encoded = l_enc.transform(y_pred)
# y_dev_pred = grid_search_ct.predict(X_dev)
# y_dev_pred_encoded = l_enc.transform(y_dev_pred)

# # Accuracy
# accuracy_training = accuracy_score(y_dev_encoded, y_dev_pred_encoded)
# accuracy_testing = accuracy_score(y_test_encoded, y_pred_encoded)

# # Precision, Recall, F1-score
# precision = precision_score(y_test_encoded, y_pred_encoded, average='weighted')
# recall = recall_score(y_test_encoded, y_pred_encoded, average='weighted')
# f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted')

# # ROC-AUC Score
# roc_auc = roc_auc_score(y_test_encoded, grid_search_ct.predict_proba(X_test), average='weighted', multi_class='ovr')

# # Average Precision (AP)
# from sklearn.preprocessing import label_binarize
# y_test_encoded_bin = label_binarize(y_test_encoded, classes=[0, 1, 2, 3])
# average_precision = average_precision_score(y_test_encoded_bin, grid_search_ct.predict_proba(X_test), average='weighted')

# results_df.loc[0, 'Training accuracy'] = accuracy_training
# results_df.loc[0, 'Testing accuracy'] = accuracy_testing
# results_df.loc[0, 'Precision'] = precision
# results_df.loc[0, 'Recall'] = recall
# results_df.loc[0, 'F1_score'] = f1
# results_df.loc[0, 'AUC'] = roc_auc
# results_df.loc[0, 'AP'] = average_precision

# # Display the updated DataFrame
# results_df

In [None]:
def BoostingBT_ROC_scores():
  return {"Gradient Boosting":roc_auc_gbm, "XGBoost":roc_auc_xgb, "LightGBM":roc_auc_lgbm, "CatBoost":roc_auc_catboost}