In [65]:
import pandas as pd
import numpy as np
import pprint

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, RandomizedSearchCV, GridSearchCV, StratifiedKFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras import layers



In [66]:
dfRaw = pd.read_csv('/Users/taddbackus/School/fall23/qtw/cs7/final_project(5).csv')
print(dfRaw.head())

         x0        x1         x2        x3  ...       x47       x48        x49  y
0 -0.166563 -3.961588   4.621113  2.481908  ... -7.689696  0.151589  -8.040166  0
1 -0.149894 -0.585676  27.839856  4.152333  ... -4.896678 -0.320283  16.719974  0
2 -0.321707 -1.429819  12.251561  6.586874  ... -7.428573 -2.090804  -7.869421  0
3 -0.245594  5.076677 -24.149632  3.637307  ...  5.361375  1.806070  -7.670847  0
4 -0.273366  0.306326 -11.352593  1.676758  ... -0.208351 -0.894942  15.724742  1

[5 rows x 51 columns]


In [67]:
print(dfRaw.describe())

                  x0             x1  ...            x49              y
count  159974.000000  159975.000000  ...  159968.000000  160000.000000
mean       -0.001028       0.001358  ...      -0.674224       0.401231
std         0.371137       6.340632  ...      15.036738       0.490149
min        -1.592635     -26.278302  ...     -65.791191       0.000000
25%        -0.251641      -4.260973  ...     -10.931753       0.000000
50%        -0.002047       0.004813  ...      -0.574410       0.000000
75%         0.248532       4.284220  ...       9.651072       1.000000
max         1.600849      27.988178  ...      66.877604       1.000000

[8 rows x 46 columns]


In [68]:
print(dfRaw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160000 entries, 0 to 159999
Data columns (total 51 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   x0      159974 non-null  float64
 1   x1      159975 non-null  float64
 2   x2      159962 non-null  float64
 3   x3      159963 non-null  float64
 4   x4      159974 non-null  float64
 5   x5      159963 non-null  float64
 6   x6      159974 non-null  float64
 7   x7      159973 non-null  float64
 8   x8      159979 non-null  float64
 9   x9      159970 non-null  float64
 10  x10     159957 non-null  float64
 11  x11     159970 non-null  float64
 12  x12     159964 non-null  float64
 13  x13     159969 non-null  float64
 14  x14     159966 non-null  float64
 15  x15     159965 non-null  float64
 16  x16     159974 non-null  float64
 17  x17     159973 non-null  float64
 18  x18     159960 non-null  float64
 19  x19     159965 non-null  float64
 20  x20     159962 non-null  float64
 21  x21     15

In [69]:
df = dfRaw.copy()

In [70]:
catColumns = df.select_dtypes(include=['object']).columns

for col in catColumns:
    uniqueValues = df[col].unique()
    uniqueCount = len(uniqueValues)
    print(f"Unique values in {col} ({uniqueCount} unique values): \n{uniqueValues}\n\n")

Unique values in x24 (4 unique values): 
['euorpe' 'asia' 'america' nan]


Unique values in x29 (13 unique values): 
['July' 'Aug' 'Jun' 'May' 'sept.' 'Apr' 'Nov' 'Oct' nan 'Mar' 'Feb' 'Dev'
 'January']


Unique values in x30 (6 unique values): 
['tuesday' 'wednesday' 'thurday' 'monday' 'friday' nan]


Unique values in x32 (13 unique values): 
['0.0%' '-0.02%' '-0.01%' '0.01%' '-0.03%' '0.02%' '-0.0%' '-0.04%' nan
 '0.03%' '0.04%' '-0.05%' '0.05%']


Unique values in x37 (129199 unique values): 
['$1313.96' '$1962.78' '$430.47' ... '$1588.65' '$439.21' '$-1229.34']




In [71]:
def missing_counts(data):
    mCount = 0
    for i in data:
        if data[i].isnull().sum() > 0:
            print(i,':',data[i].isnull().sum(),'missing')
            print('    -',(data[i].isnull().sum() / len(data)) * 100, '%')
            print('=====================')
            mCount += 1
    if mCount == 0:
        print('No Missing Values')

In [72]:
missing_counts(df)

x0 : 26 missing
    - 0.01625 %
x1 : 25 missing
    - 0.015625 %
x2 : 38 missing
    - 0.02375 %
x3 : 37 missing
    - 0.023125 %
x4 : 26 missing
    - 0.01625 %
x5 : 37 missing
    - 0.023125 %
x6 : 26 missing
    - 0.01625 %
x7 : 27 missing
    - 0.016875 %
x8 : 21 missing
    - 0.013125 %
x9 : 30 missing
    - 0.01875 %
x10 : 43 missing
    - 0.026875 %
x11 : 30 missing
    - 0.01875 %
x12 : 36 missing
    - 0.0225 %
x13 : 31 missing
    - 0.019375 %
x14 : 34 missing
    - 0.021249999999999998 %
x15 : 35 missing
    - 0.021875000000000002 %
x16 : 26 missing
    - 0.01625 %
x17 : 27 missing
    - 0.016875 %
x18 : 40 missing
    - 0.025 %
x19 : 35 missing
    - 0.021875000000000002 %
x20 : 38 missing
    - 0.02375 %
x21 : 29 missing
    - 0.018125 %
x22 : 27 missing
    - 0.016875 %
x23 : 47 missing
    - 0.029375000000000002 %
x24 : 28 missing
    - 0.017499999999999998 %
x25 : 22 missing
    - 0.01375 %
x26 : 36 missing
    - 0.0225 %
x27 : 30 missing
    - 0.01875 %
x28 : 35 missin

### Imputing categorical data

In [73]:
columnsToEncode = ['x24','x29','x30']
for i in columnsToEncode:
    df[i] = df[i].fillna('unknown')

In [74]:
missing_counts(df)

x0 : 26 missing
    - 0.01625 %
x1 : 25 missing
    - 0.015625 %
x2 : 38 missing
    - 0.02375 %
x3 : 37 missing
    - 0.023125 %
x4 : 26 missing
    - 0.01625 %
x5 : 37 missing
    - 0.023125 %
x6 : 26 missing
    - 0.01625 %
x7 : 27 missing
    - 0.016875 %
x8 : 21 missing
    - 0.013125 %
x9 : 30 missing
    - 0.01875 %
x10 : 43 missing
    - 0.026875 %
x11 : 30 missing
    - 0.01875 %
x12 : 36 missing
    - 0.0225 %
x13 : 31 missing
    - 0.019375 %
x14 : 34 missing
    - 0.021249999999999998 %
x15 : 35 missing
    - 0.021875000000000002 %
x16 : 26 missing
    - 0.01625 %
x17 : 27 missing
    - 0.016875 %
x18 : 40 missing
    - 0.025 %
x19 : 35 missing
    - 0.021875000000000002 %
x20 : 38 missing
    - 0.02375 %
x21 : 29 missing
    - 0.018125 %
x22 : 27 missing
    - 0.016875 %
x23 : 47 missing
    - 0.029375000000000002 %
x25 : 22 missing
    - 0.01375 %
x26 : 36 missing
    - 0.0225 %
x27 : 30 missing
    - 0.01875 %
x28 : 35 missing
    - 0.021875000000000002 %
x31 : 39 missin

In [75]:
df['x37'] = df['x37'].str.replace('$','').astype(float)
df['x32'] = df['x32'].str.replace('%','').astype(float)

  df['x37'] = df['x37'].str.replace('$','').astype(float)


In [76]:
missingData = []
for i in df:
    if df[i].isnull().sum() > 0:
        missingData.append(i)

In [77]:
for i in missingData:
    df[i].fillna(df[i].median(),inplace=True)
missing_counts(df)

No Missing Values


In [78]:
df = pd.get_dummies(df,
                    columns=columnsToEncode,
                    prefix=columnsToEncode)

# Model Setup

In [79]:
X = df.drop(columns='y')
y = df['y']
print(df.shape)
print(X.shape)
print(y.shape)

(160000, 71)
(160000, 70)
(160000,)


In [80]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Score Functions

In [81]:
valueCounts = df['y'].value_counts()
print(valueCounts)
lowestTotalCost = valueCounts[0] * 40 + valueCounts[1] * 100
print(lowestTotalCost)

0    95803
1    64197
Name: y, dtype: int64
10251820


In [124]:
def threshold_test(probabilities, th):
    return [1 if prob >= th else 0 for prob in probabilities]
def cost_score(confMatrix):
    return (confMatrix[0,1] * 40 + confMatrix[1][0] * 100)
def find_cost(yTrue, yProb):
    thresholds = np.linspace(0,1,101)
    lowestTh = 1
    lowestCost = lowestTotalCost
    for t in thresholds:
        confMatrix = confusion_matrix(yTrue, threshold_test(yProb,t))
        cost = cost_score(confMatrix)
        if cost < lowestCost:
            lowestCost = cost
            lowestTh = t
            lowestCM = confMatrix
        print('Threshold:',t)
        print('Total Money Lost:',cost)
        print('=====================')
    return lowestCost, lowestTh, lowestCM

In [83]:
def cross_validation_run(model):
    probs = cross_val_predict(model,
                              X,
                              y,
                              cv=5,
                              n_jobs=-1,
                              method='predict_proba')
    return probs[:,1]

# Random Forest
### Cost: 704,380

In [25]:
params = {'criterion':['gini','entropy'],
          'max_depth':[5,10,15,20,30,40],
          'min_samples_split':[16,12,10,8,6],
          'min_samples_leaf':[6,5,4,3,2],
          'max_features':[5,10,15,20,30,40],
          'class_weight':[None,'balanced','balanced_subsample']}

rfModel = RandomForestClassifier(n_estimators=10)

rfSearchModel = RandomizedSearchCV(rfModel,
                                   params,
                                   n_iter=20,
                                   cv=5,
                                   n_jobs=-1,
                                   verbose=True)
rfSearchModel.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [26]:
print('RF Search Accuracy:', rfSearchModel.best_score_)
rfBestParams = rfSearchModel.best_params_
print(rfBestParams)

RF Search Accuracy: 0.9202375
{'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 40, 'max_depth': 20, 'criterion': 'entropy', 'class_weight': 'balanced'}


In [38]:
rfModel = RandomForestClassifier(n_estimators=50, **rfBestParams)
rfProb = cross_validation_run(rfModel)

In [39]:
print(find_cost(rfProb))

Threshold: 0.0
Total Money Lost: 3832120
Threshold: 0.01
Total Money Lost: 3474100
Threshold: 0.02
Total Money Lost: 3215600
Threshold: 0.03
Total Money Lost: 2949860
Threshold: 0.04
Total Money Lost: 2716540
Threshold: 0.05
Total Money Lost: 2503160
Threshold: 0.06
Total Money Lost: 2320700
Threshold: 0.07
Total Money Lost: 2156340
Threshold: 0.08
Total Money Lost: 2012400
Threshold: 0.09
Total Money Lost: 1879440
Threshold: 0.1
Total Money Lost: 1761320
Threshold: 0.11
Total Money Lost: 1660440
Threshold: 0.12
Total Money Lost: 1566480
Threshold: 0.13
Total Money Lost: 1476900
Threshold: 0.14
Total Money Lost: 1399540
Threshold: 0.15
Total Money Lost: 1328660
Threshold: 0.16
Total Money Lost: 1267240
Threshold: 0.17
Total Money Lost: 1206780
Threshold: 0.18
Total Money Lost: 1153160
Threshold: 0.19
Total Money Lost: 1104260
Threshold: 0.2
Total Money Lost: 1059500
Threshold: 0.21
Total Money Lost: 1015180
Threshold: 0.22
Total Money Lost: 977580
Threshold: 0.23
Total Money Lost: 9407

# Logistic Regression
### Cost: 2,798,840

In [32]:
logModel = LogisticRegression()

In [33]:
params = {'penalty':['l1','l2','elasticnet'],
          'C':[0.001,0.01,0.1,1,10,100],
          'l1_ratio':[0,0.01,0.25,0.5,0.75,1],
          'solver':['lbfgs','sag','saga','newton-cholesky'],
          'class_weight':[None,'balanced']}

logSearchModel = RandomizedSearchCV(logModel,
                                 params,
                                 n_iter=20,
                                 cv=5,
                                 n_jobs=-1,
                                 verbose=True)
logSearchModel.fit(X,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


45 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/taddbackus/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cholesky supports only 'l2' or 'n

In [35]:
print(logSearchModel.best_score_)
logBestParams = logSearchModel.best_params_
print(logBestParams)

0.7083250000000001
{'solver': 'newton-cholesky', 'penalty': 'l2', 'l1_ratio': 0.01, 'class_weight': 'balanced', 'C': 0.001}


In [40]:
logModel = LogisticRegression(**logBestParams)
logProb = cross_validation_run(logModel)



In [41]:
print(find_cost(logProb))

Threshold: 0.0
Total Money Lost: 3832120
Threshold: 0.01
Total Money Lost: 3832220
Threshold: 0.02
Total Money Lost: 3832580
Threshold: 0.03
Total Money Lost: 3832480
Threshold: 0.04
Total Money Lost: 3832940
Threshold: 0.05
Total Money Lost: 3833720
Threshold: 0.06
Total Money Lost: 3833940
Threshold: 0.07
Total Money Lost: 3833020
Threshold: 0.08
Total Money Lost: 3832260
Threshold: 0.09
Total Money Lost: 3829540
Threshold: 0.1
Total Money Lost: 3826360
Threshold: 0.11
Total Money Lost: 3819600
Threshold: 0.12
Total Money Lost: 3810980
Threshold: 0.13
Total Money Lost: 3797700
Threshold: 0.14
Total Money Lost: 3780500
Threshold: 0.15
Total Money Lost: 3760740
Threshold: 0.16
Total Money Lost: 3737960
Threshold: 0.17
Total Money Lost: 3708220
Threshold: 0.18
Total Money Lost: 3674320
Threshold: 0.19
Total Money Lost: 3639360
Threshold: 0.2
Total Money Lost: 3595420
Threshold: 0.21
Total Money Lost: 3551060
Threshold: 0.22
Total Money Lost: 3504680
Threshold: 0.23
Total Money Lost: 345

# Linear SVC
### Cost: 3,795,080

In [47]:
cValues = {'C':[0.001, 0.01, 0.1, 1, 10, 100]}
svmModel = LinearSVC(dual=True)
svmSearchModel = GridSearchCV(svmModel,
                                cValues,
                                cv=5,
                                n_jobs=-1,
                                verbose=True)
svmSearchModel.fit(X,y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




In [48]:
print(svmSearchModel.best_score_)
svmBestParams = svmSearchModel.best_params_
print(svmBestParams)

0.7021000000000001
{'C': 0.01}


In [50]:
svmModel = LinearSVC(dual=True,**svmBestParams)
svmProb = cross_val_predict(svmModel,
                            X,
                            y,
                            cv=5,
                            n_jobs=-1)
#svmProb = cross_validation_run(svmModel)



In [51]:
print(find_cost(svmProb))

Threshold: 0.0
Total Money Lost: 3832120
Threshold: 0.01
Total Money Lost: 3795080
Threshold: 0.02
Total Money Lost: 3795080
Threshold: 0.03
Total Money Lost: 3795080
Threshold: 0.04
Total Money Lost: 3795080
Threshold: 0.05
Total Money Lost: 3795080
Threshold: 0.06
Total Money Lost: 3795080
Threshold: 0.07
Total Money Lost: 3795080
Threshold: 0.08
Total Money Lost: 3795080
Threshold: 0.09
Total Money Lost: 3795080
Threshold: 0.1
Total Money Lost: 3795080
Threshold: 0.11
Total Money Lost: 3795080
Threshold: 0.12
Total Money Lost: 3795080
Threshold: 0.13
Total Money Lost: 3795080
Threshold: 0.14
Total Money Lost: 3795080
Threshold: 0.15
Total Money Lost: 3795080
Threshold: 0.16
Total Money Lost: 3795080
Threshold: 0.17
Total Money Lost: 3795080
Threshold: 0.18
Total Money Lost: 3795080
Threshold: 0.19
Total Money Lost: 3795080
Threshold: 0.2
Total Money Lost: 3795080
Threshold: 0.21
Total Money Lost: 3795080
Threshold: 0.22
Total Money Lost: 3795080
Threshold: 0.23
Total Money Lost: 379

# SGD
### Cost: 2,819,680

In [52]:
sgdModel = SGDClassifier(early_stopping=True)

In [53]:
params = {'loss':['log_loss'],
          'alpha':[0.0001,0.001,0.01,0.1,1,10,100]
          }
sgdSearch = RandomizedSearchCV(sgdModel,
                               params,
                               cv=5,
                               scoring='accuracy')
sgdSearch.fit(X,y)



In [54]:
print(sgdSearch.best_score_)
sgdBestParams = sgdSearch.best_params_
print(sgdBestParams)

0.7014875
{'loss': 'log_loss', 'alpha': 0.01}


In [55]:
sgdModel = SGDClassifier(early_stopping=True, **sgdBestParams)
sgdProb = cross_validation_run(sgdModel)

In [56]:
print(find_cost(sgdProb))

Threshold: 0.0
Total Money Lost: 3832120
Threshold: 0.01
Total Money Lost: 3832420
Threshold: 0.02
Total Money Lost: 3832640
Threshold: 0.03
Total Money Lost: 3833940
Threshold: 0.04
Total Money Lost: 3834820
Threshold: 0.05
Total Money Lost: 3835440
Threshold: 0.06
Total Money Lost: 3834640
Threshold: 0.07
Total Money Lost: 3832640
Threshold: 0.08
Total Money Lost: 3826160
Threshold: 0.09
Total Money Lost: 3813160
Threshold: 0.1
Total Money Lost: 3796320
Threshold: 0.11
Total Money Lost: 3769760
Threshold: 0.12
Total Money Lost: 3738860
Threshold: 0.13
Total Money Lost: 3699340
Threshold: 0.14
Total Money Lost: 3651960
Threshold: 0.15
Total Money Lost: 3599920
Threshold: 0.16
Total Money Lost: 3538880
Threshold: 0.17
Total Money Lost: 3475980
Threshold: 0.18
Total Money Lost: 3405020
Threshold: 0.19
Total Money Lost: 3332820
Threshold: 0.2
Total Money Lost: 3259680
Threshold: 0.21
Total Money Lost: 3193480
Threshold: 0.22
Total Money Lost: 3124020
Threshold: 0.23
Total Money Lost: 305

In [103]:
y = np.array(y)

In [116]:
skf = StratifiedKFold(n_splits=5,shuffle=True)
#skf.get_n_splits(X,y)
cvDict = {}

for i, (trainIDX, testIDX) in enumerate(skf.split(X,y)):
    X_train, X_test = X[trainIDX], X[testIDX]
    y_train, y_test = y[trainIDX], y[testIDX]
    cvDict[f'Split {i}'] = [X_train,y_train,X_test,y_test]
    #cvDict[f'Test {i}'] = [X_test,y_test]

pprint.pprint(cvDict)

{'Split 0': [array([[-0.44605786, -0.62505897,  0.43485253, ...,  2.17340491,
        -0.01369435, -1.31783158],
       [-0.40114163, -0.09259046,  2.18432376, ..., -0.46010755,
        -0.01369435,  0.75882231],
       [-0.86411733, -0.22573363,  1.00978676, ..., -0.46010755,
        -0.01369435,  0.75882231],
       ...,
       [ 2.22714248,  0.75755837,  1.75647722, ..., -0.46010755,
        -0.01369435,  0.75882231],
       [-2.15965573,  0.84562115,  0.63243641, ..., -0.46010755,
        -0.01369435,  0.75882231],
       [ 0.91689652,  1.20006241,  0.49115971, ..., -0.46010755,
        -0.01369435,  0.75882231]]),
             array([0, 0, 0, ..., 0, 1, 0]),
             array([[-0.73385659,  0.0481012 , -0.76872425, ...,  2.17340491,
        -0.01369435, -1.31783158],
       [-0.6081221 ,  1.79003085,  0.90277881, ..., -0.46010755,
        -0.01369435,  0.75882231],
       [-1.15928713, -0.15395949, -0.4718607 , ..., -0.46010755,
        -0.01369435,  0.75882231],
       ...,
   

In [118]:
for i,j in cvDict.items():
    print(i)
    print(j[0].shape)
    print(j[1].shape)
    print(j[2].shape)
    print(j[3].shape)

Split 0
(128000, 70)
(128000,)
(32000, 70)
(32000,)
Split 1
(128000, 70)
(128000,)
(32000, 70)
(32000,)
Split 2
(128000, 70)
(128000,)
(32000, 70)
(32000,)
Split 3
(128000, 70)
(128000,)
(32000, 70)
(32000,)
Split 4
(128000, 70)
(128000,)
(32000, 70)
(32000,)


# XG Boost
### Cost: 638,500

In [154]:
num_round = 5000
params = {'objective':'binary:logistic',
          'max_depth':8,
          'eta':0.1,
          'eval_metric':'error'}

in cv dict    
0 = xtrain    
1 = ytrain    
2 = xtest    
3 = ytest    

In [152]:
#xgbscores = pd.DataFrame()
xgbCost = []
for i,j in cvDict.items():
    print('====================')
    print(i)
    dTrain = xgb.DMatrix(j[0], label=j[1])
    dTest = xgb.DMatrix(j[2], label=j[3])
    evalList = [(dTest,'test'),(dTrain,'train')]
    
    xgbModel = xgb.train(params=params,
                         dtrain=dTrain,
                         num_boost_round=num_round,
                         verbose_eval=False,
                         early_stopping_rounds=5,
                         evals=evalList)
    xgbProb = xgbModel.predict(dTest)
    #xgbProb[i] = xgbModel.predict(dTest)
    splitCost, splitTH, splitCM = find_cost(j[3],xgbProb)
    xgbCost.append(splitCost)

    print('====================')

Split 0
Threshold: 0.0
Total Money Lost: 766440
Threshold: 0.01
Total Money Lost: 269780
Threshold: 0.02
Total Money Lost: 210300
Threshold: 0.03
Total Money Lost: 181760
Threshold: 0.04
Total Money Lost: 165120
Threshold: 0.05
Total Money Lost: 153520
Threshold: 0.06
Total Money Lost: 146140
Threshold: 0.07
Total Money Lost: 139560
Threshold: 0.08
Total Money Lost: 135500
Threshold: 0.09
Total Money Lost: 131800
Threshold: 0.1
Total Money Lost: 128460
Threshold: 0.11
Total Money Lost: 124500
Threshold: 0.12
Total Money Lost: 121400
Threshold: 0.13
Total Money Lost: 119520
Threshold: 0.14
Total Money Lost: 117080
Threshold: 0.15
Total Money Lost: 115900
Threshold: 0.16
Total Money Lost: 114360
Threshold: 0.17
Total Money Lost: 113300
Threshold: 0.18
Total Money Lost: 112580
Threshold: 0.19
Total Money Lost: 112080
Threshold: 0.2
Total Money Lost: 110960
Threshold: 0.21
Total Money Lost: 111580
Threshold: 0.22
Total Money Lost: 111500
Threshold: 0.23
Total Money Lost: 112440
Threshold: 

In [153]:
print(sum(xgbCost))

558140


best xg    
max depth 8   
eta 0.1   
533880

# Neural Net
### Cost: 290,860

In [130]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=5)
nnModel = tf.keras.Sequential()
nnModel.add(tf.keras.Input(shape=(70,)))
nnModel.add(layers.Dense(128,activation='relu'))
nnModel.add(layers.Dense(128,activation='relu'))
nnModel.add(layers.Dropout(0.3))
nnModel.add(layers.Dense(128,activation='relu'))
nnModel.add(layers.Dense(128,activation='relu'))
nnModel.add(layers.Dropout(0.3))
nnModel.add(layers.Dense(1,activation='sigmoid'))

2023-12-01 10:06:16.855802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [131]:
nnCost = []
for i,j in cvDict.items():
    print('====================')
    print(i)
    xTrain, xVal, yTrain, yVal = train_test_split(j[0],
                                                  j[1],
                                                  test_size=0.1)
    nnModel.compile(optimizer=tf.keras.optimizers.Adam(),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
    nnFit = nnModel.fit(xTrain,
                        yTrain,
                        epochs=1000,
                        batch_size=32,
                        callbacks=[callback],
                        validation_data=[xVal,yVal])
    nnProb = nnModel.predict(j[2])
    splitCost, splitTH, splitCM = find_cost(j[3],nnProb)
    nnCost.append(splitCost)
    

Split 0
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Threshold: 0.0
Total Money Lost: 766440
Threshold: 0.01
Total Money Lost: 359480
Threshold: 0.02
Total Money Lost: 246460
Threshold: 0.03
Total Money Lost: 186300
Threshold: 0.04
Total Money Lost: 152800
Threshold: 0.05
Total Money Lost: 130220
Threshold: 0.06
Total Money Lost: 115680
Threshold: 0.07
Total Money Lost: 105260
Threshold: 0.08
Total Money Lost: 99540
Threshold: 0.09
Total Money Lost: 94860
Threshold: 0.1
Total Money Lost: 92040
Threshold: 0.11
Total Money Lost: 89820
Threshold: 0.12
Total Money Lost: 86500
Threshold: 0.13
Total Money Lost: 85280
Threshold: 0.14
Total Money Lost: 83600
Threshold: 0.15
Total Money Lost: 83060
Threshold: 0.16
Total Money Lost: 82500
Threshold: 0.17
Total Money Lost: 81320
Threshold: 0.18
Total Money Lost: 809

In [132]:
print(sum(nnCost))

290860
