In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
import random
import pickle
from src.models.train import load_cnc_features, prepare_cnc_data

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Locate the shared data directory relative to the notebook's working
# directory: two levels above cwd, then into "data".
project_root = Path.cwd().parent.parent
path_data_dir = project_root.joinpath("data")

print(path_data_dir)

/home/tim/Documents/feat-store/data


In [37]:
# Load the CNC feature table together with its labels.
# Inputs are named up front so the call below reads as a plain mapping of
# "which file from which directory".

label_file_name = "high_level_labels_MASTER_update2020-08-06_new-jan-may-data_with_case.csv"
feat_file_name = "cnc_features_54_comp.csv"
path_processed_dir = path_data_dir / "processed/cnc/cnc_features_comp"

df = load_cnc_features(
    path_data_dir,
    path_processed_dir=path_processed_dir,
    feat_file_name=feat_file_name,
    label_file_name=label_file_name,
)

# Quick sanity check on the size of what was loaded.
print(df.shape)

(43973, 756)


In [38]:
# Prepare the feature table, then carve out the rows used as the test set.
#
# NOTE(review): this cell rebinds `df` to the output of prepare_cnc_data, so
# re-running it feeds the already-prepared frame back into the function.

META_LABEL_COLS = ["id", "unix_date", "tool_no", "index_no", "case_tool_54"]

# Values of "case_tool_54" whose rows make up the test split.
TEST_CASE_IDS = [9, 17, 21, 23, 25, 31, 35, 15]

prepared = prepare_cnc_data(
    df,
    dataprep_method="cnc_standard_index_select",
    meta_label_cols=META_LABEL_COLS,
    cnc_indices_keep=[2, 3, 4, 5, 6, 7],
    cnc_cases_drop=None,
)
# The function hands back the frame followed by four further values, which
# are unpacked under the same names as the keyword arguments above.
df, dataprep_method, meta_label_cols, cnc_indices_keep, cnc_cases_drop = prepared

print(df.shape)

is_test_case = df["case_tool_54"].isin(TEST_CASE_IDS)
df_test = df[is_test_case]
print(df_test.shape)

# Feature columns fed to the model (one per line for readability).
selected_feat_cols = [
    'current__cwt_coefficients__coeff_5__w_2__widths_(2, 5, 10, 20)',
    'current__fft_coefficient__attr_"imag"__coeff_50',
    'current__fft_coefficient__attr_"imag"__coeff_59',
    'current__fft_coefficient__attr_"angle"__coeff_33',
    'current__agg_linear_trend__attr_"slope"__chunk_len_5__f_agg_"min"',
    'current__augmented_dickey_fuller__attr_"usedlag"__autolag_"AIC"',
    'current__fft_coefficient__attr_"abs"__coeff_19',
    'current__c3__lag_3',
    'current__agg_linear_trend__attr_"intercept"__chunk_len_5__f_agg_"mean"',
    'current__index_mass_quantile__q_0.3',
]



(29544, 756)
(7411, 756)


In [39]:
# Preview the first five rows of `df`.
df.head(n=5)

Unnamed: 0,id,current__time_reversal_asymmetry_statistic__lag_1,current__time_reversal_asymmetry_statistic__lag_2,current__time_reversal_asymmetry_statistic__lag_3,current__c3__lag_1,current__c3__lag_2,current__c3__lag_3,current__cid_ce__normalize_True,current__cid_ce__normalize_False,current__symmetry_looking__r_0.0,...,"current__matrix_profile__feature_""max""__threshold_0.98","current__matrix_profile__feature_""mean""__threshold_0.98","current__matrix_profile__feature_""median""__threshold_0.98","current__matrix_profile__feature_""25""__threshold_0.98","current__matrix_profile__feature_""75""__threshold_0.98",unix_date,tool_no,index_no,case_tool_54,y
1,1540298934_54_2,-6912358.0,-13106910.0,-19315070.0,6473963000.0,6450330000.0,6453858000.0,9.929669,8008.548807,0.0,...,7.838717,3.066461,1.857761,1.624711,4.336304,1540298934,54,2,1,0
2,1540298934_54_5,-270712.8,-452583.9,-518830.6,3376005000.0,3375504000.0,3373084000.0,4.38928,2097.344273,0.0,...,3.672813,2.370622,2.496121,1.804606,2.813998,1540298934,54,5,1,0
3,1540298934_54_3,-38746.79,18835.53,98456.39,2574682000.0,2572420000.0,2568122000.0,7.268473,2443.065083,0.0,...,3.613585,1.835852,1.862852,1.627676,2.047316,1540298934,54,3,1,0
6,1540298934_54_4,-65383.21,-7659.306,40525.51,4650431000.0,4649158000.0,4646561000.0,5.674855,2679.747003,0.0,...,4.504875,2.893055,3.04039,2.601408,3.31569,1540298934,54,4,1,0
7,1540298934_54_6,-354431.4,-600889.4,-806040.1,2749130000.0,2747418000.0,2743708000.0,4.874719,2010.304952,0.0,...,3.648908,2.125288,2.19723,1.760687,2.458047,1540298934,54,6,1,0


In [40]:
# Load the persisted scaler and model used for inference.
# (This cell only loads the artifacts; scaling and prediction happen later.)

model_name = "model_401172_rf_2022-08-05-0003-19_cnc.pkl"
scaler_name = "scaler_401172_rf_2022-08-05-0003-19_cnc.pkl"

# NOTE(review): both files are opened relative to the current working
# directory, and pickle.load can execute arbitrary code while unpickling --
# only point these names at trusted artifacts.

# Scaler first (described as an sklearn scaler by the original author).
with open(scaler_name, mode="rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Then the fitted model.
with open(model_name, mode="rb") as model_file:
    model = pickle.load(model_file)


In [41]:
# Every column of the test frame that is neither a meta/label column nor the
# target "y", in the frame's original column order.
feat_cols_all = df_test.columns.drop(META_LABEL_COLS + ["y"])

In [44]:
# Build the model input from the test frame: strip the meta/label columns and
# the target, scale when a scaler is available, then keep only the selected
# feature columns.
raw_features = df_test.drop(columns=META_LABEL_COLS + ["y"])

if scaler is None:
    x_test = raw_features
    print("No scaler used.")
else:
    features = scaler.transform(raw_features.values)
    # Re-attach the column names so the selection below can be done by name.
    x_test = pd.DataFrame(features, columns=feat_cols_all)

# The model is fed a plain array restricted to the selected features.
x_test = x_test[selected_feat_cols].values

probabilities = model.predict_proba(x_test)
predictions = model.predict(x_test)

In [46]:
# Sum of the predicted labels (presumably the number of positive predictions,
# if the labels are 0/1 -- confirm against model.classes_).
np.sum(predictions)

307

In [59]:
# Assemble the prediction table: the non-feature columns of the test frame
# plus the predicted label and one probability column per model class.
# Dropping before copying means only the remaining columns are copied.
df_pred = df_test.drop(columns=feat_cols_all).copy()
df_pred["y_pred"] = predictions

# Each column of `probabilities` lines up with the matching entry of
# model.classes_, so walk the two together.
for class_label, class_proba in zip(model.classes_, probabilities.T):
    df_pred[f"proba_{class_label}"] = class_proba

# Order rows by date, then by index number (ascending is the default).
df_pred = df_pred.sort_values(by=["unix_date", "index_no"])

df_pred.head()

Unnamed: 0,id,unix_date,tool_no,index_no,case_tool_54,y,y_pred,proba_0,proba_1
24813,1548261254_54_2,1548261254,54,2,9,0,0,1.0,0.0
24818,1548261254_54_3,1548261254,54,3,9,0,0,0.995035,0.004965
24820,1548261254_54_4,1548261254,54,4,9,0,0,0.995127,0.004873
24817,1548261254_54_5,1548261254,54,5,9,0,0,1.0,0.0
24812,1548261254_54_6,1548261254,54,6,9,0,0,1.0,0.0


In [60]:
# Persist the prediction table as CSV; the row index is not written.
predictions_csv_name = "predictions_401172_rf_2022-08-05-0003-19_cnc.csv"
df_pred.to_csv(predictions_csv_name, index=False)

In [54]:
# Preview the first five rows of the prediction table.
# NOTE(review): this cell's execution count (In [54]) is lower than that of
# the preceding cell (In [60]), so the output recorded below it may be stale.
df_pred.head(n=5)

Unnamed: 0,id,unix_date,tool_no,index_no,case_tool_54,y,y_pred,proba_0,proba_1
1046,1548261767_54_6,1548261767,54,6,9,0,0,1.0,0.0
1047,1548261767_54_3,1548261767,54,3,9,0,0,1.0,0.0
1048,1548261767_54_4,1548261767,54,4,9,0,0,0.990084,0.009916
1050,1548261767_54_2,1548261767,54,2,9,0,0,1.0,0.0
1052,1548261767_54_7,1548261767,54,7,9,0,0,1.0,0.0


In [69]:
# False positives per case: rows labelled 0 but predicted 1, counted for each
# value of "case_tool_54".
false_positive_mask = df_pred["y"].eq(0) & df_pred["y_pred"].eq(1)
(
    df_pred.loc[false_positive_mask]
    .groupby("case_tool_54")["y_pred"]
    .count()
    .reset_index(name="false_positive_count")
)

Unnamed: 0,case_tool_54,false_positive_count
0,17,1
1,23,31
2,25,5
3,35,21


In [70]:
# False negatives per case: rows labelled 1 but predicted 0, counted for each
# value of "case_tool_54".
false_negative_mask = df_pred["y"].eq(1) & df_pred["y_pred"].eq(0)
(
    df_pred.loc[false_negative_mask]
    .groupby("case_tool_54")["y_pred"]
    .count()
    .reset_index(name="false_negative_count")
)

Unnamed: 0,case_tool_54,false_negative_count
0,15,6
1,31,84
