In [10]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import math

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.calibration import LabelEncoder
from sklearn.metrics import roc_auc_score

In [13]:
from scikeras.wrappers import KerasClassifier

In [14]:
%matplotlib inline
sns.set_theme()

In [15]:
df = pd.read_csv("kaggle_survey_2020_responses.csv")

In [16]:
salary_data = df.drop(columns = ["time_from_start_to_finish_seconds"])

In [17]:
salary_data = salary_data.dropna(subset = ["q24"])

In [18]:
salary_data_as_num = pd.DataFrame()

In [19]:
def convert_to_category(col_name: str, order_rules: list, data):
    data[col_name] = pd.Categorical(data[col_name], order_rules)

In [20]:
def convert_to_category_no_specified_order(col_name, data):
    if sum(data[col_name].isna().astype(int)) > 0:
        data[col_name].fillna("No response", inplace = True)
    
    order = list(set(data[col_name]))
    convert_to_category(col_name, order, data)

In [21]:
def convert_category_to_code(col_name: str, data, inplace = False):
    if inplace:
        data[col_name] = data[col_name].cat.codes + 1 # because NaN automatically becomes -1
    else:
        return data[col_name].cat.codes + 1

In [22]:
def process_column(col_name: str, order_rules = None, data = salary_data, num_data = salary_data_as_num):
    if order_rules:
        convert_to_category(col_name, order_rules, data)
    else:
        convert_to_category_no_specified_order(col_name, data)
    num_data[col_name] = convert_category_to_code(col_name, data)

In [23]:
def one_hot_column_to_binary(col_name, data = salary_data):
    data[col_name].fillna(0, inplace = True)
    data[col_name].mask(data[col_name] != 0, 1, inplace = True)

In [24]:
def process_one_hot_encoded_columns(columns, data = salary_data, num_data = salary_data_as_num):
    for col in columns:
        one_hot_column_to_binary(col, data)
        num_data[col] = data[col]

In [25]:
def column_text_to_binary(col_name, data = salary_data, num_data = salary_data_as_num):
    data[col_name] = data[col_name].notna().astype(int)
    num_data[col_name] = data[col_name]

In [26]:
def combine_multiple_columns_into_one_binary(columns, new_col_name, data = salary_data, num_data = salary_data_as_num):
    for col_name in columns:
        one_hot_column_to_binary(col_name)
        
    data[new_col_name] = data[columns].sum(axis = 1)
    data[new_col_name] = data[new_col_name].astype(int)
    
    data[new_col_name].mask(data[new_col_name] > 0, 1, inplace = True)
    num_data[new_col_name] = data[new_col_name]

## Q24 Target Column

### v1: original bins

In [27]:
q24_order = ["$0-999",
             '1,000-1,999',
             '2,000-2,999',
             '3,000-3,999',
             '4,000-4,999',
             '5,000-7,499',
             '7,500-9,999',
             '10,000-14,999',
             '15,000-19,999',
             '20,000-24,999',
             '25,000-29,999',
             '30,000-39,999',
             '40,000-49,999',
             '50,000-59,999',
              '60,000-69,999',
              '70,000-79,999',
              '80,000-89,999',
              '90,000-99,999',
            '100,000-124,999',
            '125,000-149,999',
            '150,000-199,999',
             '200,000-249,999',
             '250,000-299,999',
              '300,000-500,000',
              '> $500,000'
             ]

## v2: numerical

In [28]:
q24_mapped_to = [
    0, #"$0-999",
    0, #        '1,000-1,999',
    0, #         '2,000-2,999',
    0, #         '3,000-3,999',
    0, #         '4,000-4,999',
    0, #        '5,000-7,499',
    0, #         '7,500-9,999',
    0, #         '10,000-14,999',
    0, #         '15,000-19,999',
    0, #         '20,000-24,999',
    0, #         '25,000-29,999',
    0, #         '30,000-39,999',
    0, #         '40,000-49,999',
    0, #         '50,000-59,999',
    0, #          '60,000-69,999',
    0, #          '70,000-79,999',
    0, #          '80,000-89,999',
    0, #          '90,000-99,999',
    1, #        '100,000-124,999',
    1, #        '125,000-149,999',
    1, #        '150,000-199,999',
    1, #         '200,000-249,999',
    1, #         '250,000-299,999',
    1, #          '300,000-500,000',
    1 #          '> $500,000'
]

In [29]:
q24_mapping = dict(zip(q24_order, q24_mapped_to))

In [30]:
test_col = salary_data["q24"].copy()
test_col

1        100,000-124,999
2          15,000-19,999
3        125,000-149,999
8          70,000-79,999
11         30,000-39,999
              ...       
20024        2,000-2,999
20029      15,000-19,999
20033             $0-999
20034             $0-999
20035             $0-999
Name: q24, Length: 10729, dtype: object

In [31]:
test_col = test_col.replace(q24_mapping)

In [32]:
salary_data["q24"] = test_col
salary_data_as_num["q24"] = test_col

## Q1 Age

In [33]:
salary_data["q1"].value_counts()

q1
25-29    2350
30-34    1979
35-39    1467
22-24    1424
40-44    1042
45-49     771
50-54     536
18-21     498
60-69     309
55-59     301
70         52
Name: count, dtype: int64

In [34]:
q1_order = [
    "18-21",
    "22-24",
    "25-29",
    "30-34",
    "35-39",
    "40-44",
    "45-49",
    "50-54",
    "55-59",
    "60-69",
    "70"
]

In [35]:
convert_to_category("q1", q1_order, salary_data)

In [36]:
salary_data_as_num["q1"] = convert_category_to_code("q1", salary_data, False)
salary_data_as_num

Unnamed: 0,q24,q1
1,1,4
2,0,5
3,1,4
8,0,5
11,0,5
...,...,...
20024,0,5
20029,0,5
20033,0,4
20034,0,2


In [37]:
salary_data_as_num["q1"].value_counts()

q1
3     2350
4     1979
5     1467
2     1424
6     1042
7      771
8      536
1      498
10     309
9      301
11      52
Name: count, dtype: int64

## Q2 Gender

In [38]:
salary_data["q2"].value_counts()

q2
Man                        8872
Woman                      1683
Prefer not to say           131
Prefer to self-describe      23
Nonbinary                    20
Name: count, dtype: int64

In [39]:
q2_order = ['Man', "Woman", "Nonbinary", 'Prefer to self-describe', 'Prefer not to say']

In [40]:
convert_to_category("q2", q2_order, salary_data)

In [41]:
salary_data_as_num["q2"] = convert_category_to_code("q2", salary_data, False)
salary_data_as_num

Unnamed: 0,q24,q1,q2
1,1,4,1
2,0,5,1
3,1,4,1
8,0,5,1
11,0,5,1
...,...,...,...
20024,0,5,1
20029,0,5,1
20033,0,4,1
20034,0,2,1


In [42]:
salary_data_as_num["q2"].value_counts()

q2
1    8872
2    1683
5     131
4      23
3      20
Name: count, dtype: int64

## Q3 Country

In [43]:
convert_to_category_no_specified_order("q3", salary_data)

In [44]:
salary_data_as_num["q3"] = convert_category_to_code("q3", salary_data)

## Q4 Education

In [45]:
salary_data["q4"].value_counts()

q4
Master’s degree                                                      4879
Bachelor’s degree                                                    3013
Doctoral degree                                                      1718
Professional degree                                                   470
Some college/university study without earning a bachelor’s degree     385
I prefer not to answer                                                158
No formal education past high school                                  106
Name: count, dtype: int64

In [46]:
q4_order = ["No formal education past high school",
            "Some college/university study without earning a bachelor’s degree",
            "Professional degree",
            "Bachelor’s degree",
            "Master’s degree",
            "Doctoral degree",
            "I prefer not to answer"
            ]

In [47]:
convert_to_category("q4", q4_order, salary_data)

In [48]:
salary_data_as_num["q4"] = convert_category_to_code("q4", salary_data)

In [49]:
salary_data_as_num["q4"].value_counts()

q4
5    4879
4    3013
6    1718
3     470
2     385
7     158
1     106
Name: count, dtype: int64

## Q5 Job Title

In [50]:
salary_data["q5"].value_counts()

q5
Data Scientist               2398
Software Engineer            1620
Other                        1508
Data Analyst                 1260
Research Scientist           1028
Machine Learning Engineer     918
Business Analyst              678
Product/Project Manager       590
Data Engineer                 369
Statistician                  248
DBA/Database Engineer         112
Name: count, dtype: int64

In [51]:
convert_to_category_no_specified_order("q5", salary_data)

In [52]:
salary_data_as_num["q5"] = convert_category_to_code("q5", salary_data)

## Q6 Years Coding

In [53]:
q6_order = [
 'I have never written code',
 '< 1 years',
 '1-2 years',
 '3-5 years',
 '5-10 years',
 '10-20 years',
 '20+ years']

In [54]:
process_column("q6", q6_order)

## Q7 Language

In [55]:
q7_columns = [
     'q7_part_1',
 'q7_part_2',
 'q7_part_3',
 'q7_part_4',
 'q7_part_5',
 'q7_part_6',
 'q7_part_7',
 'q7_part_8',
 'q7_part_9',
 'q7_part_10',
 'q7_part_11',
 'q7_part_12',
 'q7_other'
]

In [56]:
process_one_hot_encoded_columns(q7_columns)

## Q11 Computing Platform

In [57]:
process_column("q11")

## Q12 Specialized Hardware

In [58]:
q12_columns = [
    'q12_part_1',
 'q12_part_2',
 'q12_part_3',
 'q12_other'
]

In [59]:
process_one_hot_encoded_columns(q12_columns)

## Q14 Visualization

In [60]:
q14_columns = [
    'q14_part_1',
 'q14_part_2',
 'q14_part_3',
 'q14_part_4',
 'q14_part_5',
 'q14_part_6',
 'q14_part_7',
 'q14_part_8',
 'q14_part_9',
 'q14_part_10',
 'q14_part_11',
 'q14_other'
]

In [61]:
process_one_hot_encoded_columns(q14_columns)

## Q15 Years ML

In [62]:
q15_order = [
    'I do not use machine learning methods',
    'Under 1 year',
    '1-2 years',
    '2-3 years',
    '3-4 years',
    '4-5 years',
    '5-10 years',
    '10-20 years',
    '20 or more years'
]

In [63]:
process_column("q15", q15_order)

## Q17 ML Algorithms

In [64]:
q17_columns = [
    'q17_part_1',
 'q17_part_2',
 'q17_part_3',
 'q17_part_4',
 'q17_part_5',
 'q17_part_6',
 'q17_part_7',
 'q17_part_8',
 'q17_part_9',
 'q17_part_10',
 'q17_part_11',
 'q17_other'
]

In [65]:
process_one_hot_encoded_columns(q17_columns)

## Q20 Company Size

In [66]:
q20_order = [
    '0-49 employees',
    '50-249 employees',
    '250-999 employees',
    '1000-9,999 employees',
    '10,000 or more employees'
]

In [67]:
process_column("q20", q20_order)

## Q21 Datascience Workloads

In [68]:
q21_order = [
    '0',
    '1-2',
    '3-4',
    '5-9',
    '10-14',
    '15-19',
    '20'
]

In [69]:
process_column("q21", q21_order)

## Q22 Incorporating ML

i'm not super sure about the proper "order" for this question. Feel free to change this if you find it more appropriate. Just please let the chat know in case it affects others' encoding.

In [70]:
q22_order = [
    'I do not know',
    'No (we do not use ML methods)',
    'We are exploring ML methods (and may one day put a model into production)',
    'We use ML methods for generating insights (but do not put working models into production)',
    'We recently started using ML methods (i.e., models in production for less than 2 years)',
    'We have well established ML methods (i.e., models in production for more than 2 years)'
]

In [71]:
process_column("q22", q22_order)

## Q30 Big Data Products

In [72]:
column_text_to_binary("q30")

## Q32 Business Intelligence Tools

In [73]:
column_text_to_binary("q32")

## Q33 Automated ML Tools

In [74]:
q33_columns = [
    'q33_a_part_1',
 'q33_a_part_2',
 'q33_a_part_3',
 'q33_a_part_4',
 'q33_a_part_5',
 'q33_a_part_6',
 'q33_a_part_7',
 'q33_a_other'
]

In [75]:
combine_multiple_columns_into_one_binary(q33_columns, "q33")

## Q37 Data Science Courses

In [76]:
q37_columns = [
    'q37_part_1',
 'q37_part_2',
 'q37_part_3',
 'q37_part_4',
 'q37_part_5',
 'q37_part_6',
 'q37_part_7',
 'q37_part_8',
 'q37_part_9',
 'q37_part_10',
 'q37_part_11',
 'q37_other'
]

In [77]:
process_one_hot_encoded_columns(q37_columns)

## Q38 Primary Data Analysis Tool

In [78]:
process_column("q38")

## Q39 Media Sources

In [79]:
q39_columns = [
    'q39_part_1',
 'q39_part_2',
 'q39_part_3',
 'q39_part_4',
 'q39_part_5',
 'q39_part_6',
 'q39_part_7',
 'q39_part_8',
 'q39_part_9',
 'q39_part_10',
 'q39_part_11',
 'q39_other'
]

In [80]:
process_one_hot_encoded_columns(q39_columns)

## Dropped Columns

In [81]:
one_hot_dropped = [
    'q33_a_part_1',
 'q33_a_part_2',
 'q33_a_part_3',
 'q33_a_part_4',
 'q33_a_part_5',
 'q33_a_part_6',
 'q33_a_part_7',
 'q33_a_other',
]

In [82]:
dropped_questions = [
    "q8",
    'q9_part_1',
 'q9_part_2',
 'q9_part_3',
 'q9_part_4',
 'q9_part_5',
 'q9_part_6',
 'q9_part_7',
 'q9_part_8',
 'q9_part_9',
 'q9_part_10',
 'q9_part_11',
 'q9_other',
 'q10_part_1',
 'q10_part_2',
 'q10_part_3',
 'q10_part_4',
 'q10_part_5',
 'q10_part_6',
 'q10_part_7',
 'q10_part_8',
 'q10_part_9',
 'q10_part_10',
 'q10_part_11',
 'q10_part_12',
 'q10_part_13',
 'q10_other',
 "q13",
 'q16_part_1',
 'q16_part_2',
 'q16_part_3',
 'q16_part_4',
 'q16_part_5',
 'q16_part_6',
 'q16_part_7',
 'q16_part_8',
 'q16_part_9',
 'q16_part_10',
 'q16_part_11',
 'q16_part_12',
 'q16_part_13',
 'q16_part_14',
 'q16_part_15',
 'q16_other',
 'q18_part_1',
 'q18_part_2',
 'q18_part_3',
 'q18_part_4',
 'q18_part_5',
 'q18_part_6',
 'q18_other',
 'q19_part_1',
 'q19_part_2',
 'q19_part_3',
 'q19_part_4',
 'q19_part_5',
 'q19_other',
 'q23_part_1',
 'q23_part_2',
 'q23_part_3',
 'q23_part_4',
 'q23_part_5',
 'q23_part_6',
 'q23_part_7',
 'q23_other',
 'q25',
 'q26_a_part_1',
 'q26_a_part_2',
 'q26_a_part_3',
 'q26_a_part_4',
 'q26_a_part_5',
 'q26_a_part_6',
 'q26_a_part_7',
 'q26_a_part_8',
 'q26_a_part_9',
 'q26_a_part_10',
 'q26_a_part_11',
 'q26_a_other',
 'q27_a_part_1',
 'q27_a_part_2',
 'q27_a_part_3',
 'q27_a_part_4',
 'q27_a_part_5',
 'q27_a_part_6',
 'q27_a_part_7',
 'q27_a_part_8',
 'q27_a_part_9',
 'q27_a_part_10',
 'q27_a_part_11',
 'q27_a_other',
 'q28_a_part_1',
 'q28_a_part_2',
 'q28_a_part_3',
 'q28_a_part_4',
 'q28_a_part_5',
 'q28_a_part_6',
 'q28_a_part_7',
 'q28_a_part_8',
 'q28_a_part_9',
 'q28_a_part_10',
 'q28_a_other',
 'q29_a_part_1',
 'q29_a_part_2',
 'q29_a_part_3',
 'q29_a_part_4',
 'q29_a_part_5',
 'q29_a_part_6',
 'q29_a_part_7',
 'q29_a_part_8',
 'q29_a_part_9',
 'q29_a_part_10',
 'q29_a_part_11',
 'q29_a_part_12',
 'q29_a_part_13',
 'q29_a_part_14',
 'q29_a_part_15',
 'q29_a_part_16',
 'q29_a_part_17',
 'q29_a_other',
 'q31_a_part_1',
 'q31_a_part_2',
 'q31_a_part_3',
 'q31_a_part_4',
 'q31_a_part_5',
 'q31_a_part_6',
 'q31_a_part_7',
 'q31_a_part_8',
 'q31_a_part_9',
 'q31_a_part_10',
 'q31_a_part_11',
 'q31_a_part_12',
 'q31_a_part_13',
 'q31_a_part_14',
 'q31_a_other',
 'q34_a_part_1',
 'q34_a_part_2',
 'q34_a_part_3',
 'q34_a_part_4',
 'q34_a_part_5',
 'q34_a_part_6',
 'q34_a_part_7',
 'q34_a_part_8',
 'q34_a_part_9',
 'q34_a_part_10',
 'q34_a_part_11',
 'q34_a_other',
 'q35_a_part_1',
 'q35_a_part_2',
 'q35_a_part_3',
 'q35_a_part_4',
 'q35_a_part_5',
 'q35_a_part_6',
 'q35_a_part_7',
 'q35_a_part_8',
 'q35_a_part_9',
 'q35_a_part_10',
 'q35_a_other',
 'q36_part_1',
 'q36_part_2',
 'q36_part_3',
 'q36_part_4',
 'q36_part_5',
 'q36_part_6',
 'q36_part_7',
 'q36_part_8',
 'q36_part_9',
 'q36_other',
]

In [83]:
part_b_dropped = [
    'q26_b_part_1',
 'q26_b_part_2',
 'q26_b_part_3',
 'q26_b_part_4',
 'q26_b_part_5',
 'q26_b_part_6',
 'q26_b_part_7',
 'q26_b_part_8',
 'q26_b_part_9',
 'q26_b_part_10',
 'q26_b_part_11',
 'q26_b_other',
 'q27_b_part_1',
 'q27_b_part_2',
 'q27_b_part_3',
 'q27_b_part_4',
 'q27_b_part_5',
 'q27_b_part_6',
 'q27_b_part_7',
 'q27_b_part_8',
 'q27_b_part_9',
 'q27_b_part_10',
 'q27_b_part_11',
 'q27_b_other',
 'q28_b_part_1',
 'q28_b_part_2',
 'q28_b_part_3',
 'q28_b_part_4',
 'q28_b_part_5',
 'q28_b_part_6',
 'q28_b_part_7',
 'q28_b_part_8',
 'q28_b_part_9',
 'q28_b_part_10',
 'q28_b_other',
 'q29_b_part_1',
 'q29_b_part_2',
 'q29_b_part_3',
 'q29_b_part_4',
 'q29_b_part_5',
 'q29_b_part_6',
 'q29_b_part_7',
 'q29_b_part_8',
 'q29_b_part_9',
 'q29_b_part_10',
 'q29_b_part_11',
 'q29_b_part_12',
 'q29_b_part_13',
 'q29_b_part_14',
 'q29_b_part_15',
 'q29_b_part_16',
 'q29_b_part_17',
 'q29_b_other',
 'q31_b_part_1',
 'q31_b_part_2',
 'q31_b_part_3',
 'q31_b_part_4',
 'q31_b_part_5',
 'q31_b_part_6',
 'q31_b_part_7',
 'q31_b_part_8',
 'q31_b_part_9',
 'q31_b_part_10',
 'q31_b_part_11',
 'q31_b_part_12',
 'q31_b_part_13',
 'q31_b_part_14',
 'q31_b_other',
 'q33_b_part_1',
 'q33_b_part_2',
 'q33_b_part_3',
 'q33_b_part_4',
 'q33_b_part_5',
 'q33_b_part_6',
 'q33_b_part_7',
 'q33_b_other',
 'q34_b_part_1',
 'q34_b_part_2',
 'q34_b_part_3',
 'q34_b_part_4',
 'q34_b_part_5',
 'q34_b_part_6',
 'q34_b_part_7',
 'q34_b_part_8',
 'q34_b_part_9',
 'q34_b_part_10',
 'q34_b_part_11',
 'q34_b_other',
 'q35_b_part_1',
 'q35_b_part_2',
 'q35_b_part_3',
 'q35_b_part_4',
 'q35_b_part_5',
 'q35_b_part_6',
 'q35_b_part_7',
 'q35_b_part_8',
 'q35_b_part_9',
 'q35_b_part_10',
 'q35_b_other'
]

In [84]:
salary_data = salary_data.drop(columns = one_hot_dropped)

In [85]:
salary_data = salary_data.drop(columns = part_b_dropped)

In [86]:
salary_data_selected_questions = salary_data.drop(columns = dropped_questions)

# DNN Model

In [87]:
X = salary_data_as_num.drop(columns = ["q24"])
y = salary_data_as_num["q24"]

In [88]:
y

1        1
2        0
3        1
8        0
11       0
        ..
20024    0
20029    0
20033    0
20034    0
20035    0
Name: q24, Length: 10729, dtype: int64

In [89]:
x_dev, x_test, y_dev, y_test = train_test_split(X, y, test_size = 0.2, random_state = 6)

In [90]:
x_dev = tf.convert_to_tensor(x_dev.astype("int64"))
x_test = tf.convert_to_tensor(x_test.astype("int64"))
y_dev = tf.convert_to_tensor(y_dev.astype("int64"))
y_test = tf.convert_to_tensor(y_test.astype("int64"))

2023-11-29 12:24:10.494877: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2023-11-29 12:24:10.494935: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2023-11-29 12:24:10.494949: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2023-11-29 12:24:10.495354: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-29 12:24:10.495861: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [99]:
class DNN:
    
    def __init__(self, x_dev = x_dev, y_dev = y_dev, x_test = x_test, y_test = y_test):
        self.x_dev = x_dev
        self.y_dev = y_dev
        self.x_test = x_test
        self.y_test = y_test
        
        self.layers = [Dense(32, input_shape = (80, ), activation = "relu")]
        
        self.optimizer = "adam"
        self.loss = "sparse_categorical_crossentropy"
        self.metrics =  ["accuracy"]
        
        self.batch_size = 100
        self.epochs = 50
        
    def customize_first_layer(self, node_count = 32):
        self.layers = [Dense(node_count, input_shape = (80, ), activation = "relu")]
        
    def add_one_dense_layer(self, node_count = 32):
        self.layers.append(Dense(node_count, activation = "relu"))
        
    def customize_middle_layers(self, layers):
        self.layers.extend(layers)
    
    def customize_compile(self,
                          optimizer = "adam",
                          loss = "sparse_categorical_crossentropy",
                          metrics = ["accuracy"]):
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = metrics
    
    def customize_fit(self,
                      batch_size = 100,
                      epochs = 50
                      ):
        self.batch_size = batch_size
        self.epochs = epochs
    
    def build_compile_and_evaluate(self, metric_to_return = "accuracy", selection_criteria = max):
        # final layer must be softmax and outputs 2
        # TODO: MAKE SURE TO UPDATE THIS FOR EACH DENSE
        self.layers.append(Dense(2, activation = "softmax"))
        
        self.model = Sequential(self.layers)
        self.model.compile(optimizer = self.optimizer,
                      loss = self.loss,
                      metrics = self.metrics)
        fit_history = self.model.fit(self.x_dev, self.y_dev,
                                batch_size = self.batch_size,
                                epochs = self.epochs,
                                validation_split = 0.2,
                                verbose = 1
                                )
        self.fit_history = pd.DataFrame(fit_history.history)
        return selection_criteria(self.fit_history[metric_to_return])
        #return self.fit_history
    
    def get_model_summary(self):
        self.model.summary()
        
    def get_fit_history(self):
        return self.fit_history
        
    def evaluate_model_with_test(self):
        return self.model.evaluate(self.x_test, self.y_test, verbose = 0)

In [92]:
d = DNN()
d.customize_fit(epochs = 2)
d.add_one_dense_layer()
d.build_compile_and_evaluate()

Epoch 1/2


2023-11-29 12:24:11.097769: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




Epoch 2/2


0.8408097624778748

## 2 Layer Random Search

In [93]:
results = []

for i in range(10):
    random.seed(i)
    r1 = random.randint(2, 12)
    r2 = random.randint(2, 12)
    
    l1 = 2 ** r1
    l2 = 2 ** r2
    
    print("---- building model for layer width of " + str(l1) + " and " + str(l2))
    d = DNN()
    d.customize_first_layer(l1)
    d.add_one_dense_layer(l2)
    
    d.customize_fit(epochs = 2) ## TODO: REMOVE
    
    val_acc_result = d.build_compile_and_evaluate()
    test_loss, test_accuracy = d.evaluate_model_with_test()
    
    results.append((l1, l2, val_acc_result, "Validation Accuracy"))
    results.append((l1, l2, test_loss, "Test Loss"))
    results.append((l1, l2, test_accuracy, "Test Accuracy"))
    
results = pd.DataFrame(results)
results = results.rename(columns = {
    0: "Layer 1 Width",
    1: "Layer 2 Width",
    2: "Metrics Value",
    3: "Metrics Type"
})
results

---- building model for layer width of 256 and 256


Epoch 1/2
Epoch 2/2
---- building model for layer width of 16 and 2048
Epoch 1/2
Epoch 2/2
---- building model for layer width of 4 and 8
Epoch 1/2
Epoch 2/2
---- building model for layer width of 32 and 2048
Epoch 1/2
Epoch 2/2
---- building model for layer width of 32 and 64
Epoch 1/2
Epoch 2/2
---- building model for layer width of 2048 and 64
Epoch 1/2
Epoch 2/2
---- building model for layer width of 2048 and 8
Epoch 1/2
Epoch 2/2
---- building model for layer width of 128 and 16
Epoch 1/2
Epoch 2/2
---- building model for layer width of 32 and 128
Epoch 1/2
Epoch 2/2
---- building model for layer width of 512 and 2048
Epoch 1/2
Epoch 2/2


Unnamed: 0,Layer 1 Width,Layer 2 Width,Metrics Value,Metrics Type
0,256,256,0.847655,Validation Accuracy
1,256,256,0.53955,Test Loss
2,256,256,0.731594,Test Accuracy
3,16,2048,0.85552,Validation Accuracy
4,16,2048,0.42316,Test Loss
5,16,2048,0.853215,Test Accuracy
6,4,8,0.512234,Validation Accuracy
7,4,8,0.5199,Test Loss
8,4,8,0.83411,Test Accuracy
9,32,2048,0.851005,Validation Accuracy


In [94]:
validation_accuracy_only = results[results["Metrics Type"] == "Validation Accuracy"]
validation_accuracy_only

Unnamed: 0,Layer 1 Width,Layer 2 Width,Metrics Value,Metrics Type
0,256,256,0.847655,Validation Accuracy
3,16,2048,0.85552,Validation Accuracy
6,4,8,0.512234,Validation Accuracy
9,32,2048,0.851005,Validation Accuracy
12,32,64,0.857996,Validation Accuracy
15,2048,64,0.859452,Validation Accuracy
18,2048,8,0.859452,Validation Accuracy
21,128,16,0.852461,Validation Accuracy
24,32,128,0.85319,Validation Accuracy
27,512,2048,0.811972,Validation Accuracy


In [95]:
y.value_counts()

q24
0    9226
1    1503
Name: count, dtype: int64

In [103]:
d = DNN()
d.customize_fit(epochs = 2)
d.customize_first_layer(4)
d.add_one_dense_layer(8)
d.build_compile_and_evaluate()
y_pred_prob = d.model.predict(x_test)
#roc_auc_score(y_test, y_pred)

Epoch 1/2
Epoch 2/2


In [104]:
y_pred = []

for i in range(len(y_pred_prob)):
    y_pred.append(np.argmax(y_pred_prob[i]))

y_pred

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,


In [105]:
roc_auc_score(y_test, y_pred)

0.4717982504937001