In [36]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

In [6]:
df = pd.read_pickle('../data/clean_data.pickle')

In [20]:
df['Sex'] = df.Sex.apply(lambda x: 1 if x=='Female' else 0 if x=='Male' else pd.nan)

In [21]:
pca_pipe = Pipeline(
    [
        ('scaler', MinMaxScaler()),
        ('pca', PCA(n_components=100))        
    ]
)

In [22]:
pca_cols = [col for col in df.columns.tolist() if 'timestamp' in col]

In [23]:
preprocessor = ColumnTransformer(
    [('pca_pipe', pca_pipe, pca_cols)],
    remainder='passthrough'
)

In [28]:
model = Pipeline(
    [
        ('preproc', preprocessor),
        ('clf', CatBoostClassifier())
    ]
)

In [29]:
df.target.value_counts()

target
1    370
0     80
Name: count, dtype: int64

In [30]:
X = df.drop(columns='target')
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [31]:
model.fit(X_train, y_train)

Learning rate set to 0.006475
0:	learn: 0.6892266	total: 64.3ms	remaining: 1m 4s
1:	learn: 0.6852699	total: 69.3ms	remaining: 34.6s
2:	learn: 0.6814971	total: 74ms	remaining: 24.6s
3:	learn: 0.6769464	total: 78.4ms	remaining: 19.5s
4:	learn: 0.6724192	total: 81.7ms	remaining: 16.3s
5:	learn: 0.6692472	total: 84.8ms	remaining: 14s
6:	learn: 0.6659109	total: 87.9ms	remaining: 12.5s
7:	learn: 0.6628276	total: 91.6ms	remaining: 11.4s
8:	learn: 0.6597160	total: 96ms	remaining: 10.6s
9:	learn: 0.6556079	total: 98.9ms	remaining: 9.79s
10:	learn: 0.6526058	total: 102ms	remaining: 9.17s
11:	learn: 0.6493147	total: 105ms	remaining: 8.64s
12:	learn: 0.6463419	total: 108ms	remaining: 8.18s
13:	learn: 0.6427390	total: 111ms	remaining: 7.82s
14:	learn: 0.6390849	total: 114ms	remaining: 7.48s
15:	learn: 0.6359512	total: 117ms	remaining: 7.2s
16:	learn: 0.6327013	total: 120ms	remaining: 6.93s
17:	learn: 0.6294947	total: 123ms	remaining: 6.7s
18:	learn: 0.6260531	total: 126ms	remaining: 6.51s
19:	learn

0,1,2
,steps,"[('preproc', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('pca_pipe', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_components,100
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [33]:
balanced_accuracy_score(y_train, model.predict(X_train))

1.0

In [34]:
balanced_accuracy_score(y_test, model.predict(X_test))

0.5294117647058824

In [35]:
balanced_accuracy_score(y, model.predict(X))

0.9

In [38]:
cm = confusion_matrix(y_test, model.predict(X_test))

In [None]:
tn, fp, fn, tp = cm.ravel()

print(f"\nTrue Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")


True Positives (TP): 96
True Negatives (TN): 1
False Positives (FP): 16
False Negatives (FN): 0


In [40]:
c = len(y_test)

print(f"\nTrue Positives (TP): {tp/c}")
print(f"True Negatives (TN): {tn/c}")
print(f"False Positives (FP): {fp/c}")
print(f"False Negatives (FN): {fn/c}")


True Positives (TP): 0.8495575221238938
True Negatives (TN): 0.008849557522123894
False Positives (FP): 0.1415929203539823
False Negatives (FN): 0.0


In [41]:
model_data_path = '../data/model.pickle'

In [42]:
import pickle

In [43]:
with open(model_data_path, 'wb') as f:
    pickle.dump(model, f)