# Model XYZ
 Using aggregated invoice data

Summary here...

In [1]:
import pandas as pd
import numpy as np

import pyarrow
import fastparquet

from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt

#from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report, accuracy_score, recall_score, precision_score, roc_auc_score

# for saving model to disk
import time
from skops.io import dump, load, get_untrusted_types

## Run data cleaning script, generate parquet files. 
Requires pyarrow and fastparquet (see requirements.txt)

In [2]:
# Open and run **ETL_export_parquet_invoice-level_+_client-level_mean.py**

## Import parquet files

In [3]:
# Import clean data 
# - Each row represents one invoice.
df_train_non_agg = pd.read_parquet("data/df_train_non_agg.parquet")
df_final_test_non_agg = pd.read_parquet("data/df_final_test_non_agg.parquet") # this is not our "test" data, we will still need to perform the test-train split

# Import clean data
# - Each row represents one client. 
# - Invoices are summarised. 
# - The "consommation_level_x" columns show mean energy consumption per client.
df_train_agg = pd.read_parquet("data/df_train_agg.parquet")
df_final_test_agg = pd.read_parquet("data/df_final_test_agg.parquet") # this is not our "test" data, we will still need to perform the test-train split

In [4]:
# Choose dataset to use in this document
df = df_train_agg
df_final_test = df_final_test_agg

In [5]:
df.head()

Unnamed: 0,creation_date,target,transactions_count,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,counter_type_GAZ,counter_type_ELEC,...,counter_code_450,counter_code_453,counter_code_467,counter_code_483,counter_code_5,counter_code_506,counter_code_532,counter_code_565,counter_code_600,counter_code_65
0,34699,Not Fraud,35.0,1.0,352.4,10.571429,0.0,0.0,False,True,...,False,False,False,False,False,False,False,False,False,False
1,37405,Not Fraud,37.0,1.0,557.540541,0.0,0.0,0.0,False,True,...,False,False,False,False,False,False,False,False,False,False
2,31484,Not Fraud,18.0,1.0,798.611111,37.888889,0.0,0.0,False,True,...,False,False,False,False,False,False,False,False,False,False
3,35257,Not Fraud,20.0,1.0,1.2,0.0,0.0,0.0,False,True,...,False,False,False,False,False,False,False,False,False,False
4,41926,Not Fraud,14.0,1.0,663.714286,104.857143,117.357143,36.714286,False,True,...,False,False,False,False,False,False,False,False,False,False


In [6]:
df.describe()

Unnamed: 0,creation_date,transactions_count,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,reading_remarque,counter_statue
count,135493.0,128000.0,128000.0,128000.0,128000.0,128000.0,128000.0,128000.0,128000.0
mean,37530.781435,34.615531,1.001211,407.277247,117.61988,27.496823,75.918575,8.781734,0.010695
std,4215.414221,25.78289,0.183005,342.787037,756.054243,96.626119,833.338704,0.726306,0.160331
min,28161.0,1.0,1.0,0.0,0.0,0.0,0.0,6.0,0.0
25%,34346.0,13.0,1.0,220.530312,0.0,0.0,0.0,9.0,0.0
50%,38614.0,31.5,1.0,355.161002,6.892857,0.0,0.0,9.0,0.0
75%,41003.0,52.0,1.0,533.078947,66.688702,5.560244,0.0,9.0,0.0
max,43718.0,434.0,40.0,34024.0,115683.0,2400.0,79179.777778,9.0,5.0


## Split Data to Train and Test

In [18]:
# Make sure final_test has same dummy columns in as training data (filled with False)
def add_missing_dummy_columns_and_fill_false(train, final_test): 

    cols_missing_from_final_test = (set(train) - set(final_test)) - set(['target'])

    for cols in cols_missing_from_final_test:   
        final_test[cols] = False
    print(f"Adding new cols to final_test: {cols_missing_from_final_test}")

add_missing_dummy_columns_and_fill_false(df, df_final_test)

Adding new cols to final_test: set()


In [19]:
X = df.loc[:, ~df.columns.isin(["target"])]
y = df["target"]
X_final_test = df_final_test

# make sure column order the same in X and X_final_test
X, X_final_test = X.align(X_final_test, join="right", axis=1)

RSEED = 42

# 30% examples in test data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=RSEED
)


## Imputing missing values

In [107]:
# % of missing values for each column in Training data
(100 * df.isnull().sum() / df.count()).sort_values(ascending=False)

counter_type_ELEC       5.853906
transactions_count      5.853906
counter_coefficient     5.853906
consommation_level_1    5.853906
consommation_level_2    5.853906
                          ...   
region_309              0.000000
region_308              0.000000
region_307              0.000000
region_306              0.000000
counter_code_65         0.000000
Length: 89, dtype: float64

In [108]:
# % of missing values for each column in the final test data. Compare with training data above to see if similar proportion.
(100 * df_final_test.isnull().sum() / df_final_test.count()).sort_values(ascending=False)

transactions_count      5.814716
consommation_level_4    5.814716
counter_statue          5.814716
counter_type_ELEC       5.814716
counter_type_GAZ        5.814716
                          ...   
region_310              0.000000
region_309              0.000000
region_308              0.000000
region_307              0.000000
counter_code_307        0.000000
Length: 88, dtype: float64

In [134]:
# What does the data look like for rows with missing data
df_missing_values = df[pd.isnull(df["transactions_count"]) == True]

In [20]:
X_train

Unnamed: 0,creation_date,transactions_count,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,counter_type_GAZ,counter_type_ELEC,reading_remarque,...,counter_code_483,counter_code_5,counter_code_506,counter_code_532,counter_code_565,counter_code_600,counter_code_65,region_199,counter_code_307,counter_code_305
119285,38610,41.0,1.0,398.195122,2.390244,0.000000,0.0,False,True,9.0,...,False,False,False,False,False,False,False,False,False,False
19300,38428,41.0,1.0,636.439024,13.585366,0.000000,0.0,False,True,9.0,...,False,False,False,False,False,False,False,False,False,False
21012,33821,70.0,1.0,677.085714,0.857143,0.000000,0.0,True,True,9.0,...,False,False,False,False,False,False,False,False,False,False
122551,39862,4.0,1.0,0.000000,0.000000,0.000000,0.0,True,False,6.0,...,False,False,False,False,False,False,False,False,False,False
98915,35256,74.0,1.0,639.121622,107.635135,1.932432,0.0,True,True,9.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97141,30606,61.0,1.0,501.754098,31.180328,6.655738,0.0,True,True,9.0,...,False,False,False,False,False,False,False,False,False,False
113043,39620,2.0,1.0,842.500000,1442.500000,0.000000,0.0,False,True,6.0,...,False,False,False,False,False,False,False,False,False,False
117510,42408,9.0,1.0,299.666667,0.000000,0.000000,0.0,False,True,9.0,...,False,False,False,False,False,False,False,False,False,False
14944,40743,10.0,1.0,0.400000,0.000000,0.000000,0.0,False,True,9.0,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Pipeline

cat_features = list(X_train.columns[X_train.dtypes==object])
num_features = list(X_train.columns[X_train.dtypes!=object])

from sklearn.pipeline import Pipeline

# Pipeline for numerical features
# Initiating Pipeline and calling one step after another
# each step is built as a list of (key, value)
# key is the name of the processing step
# value is an estimator object (processing step)
num_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(missing_values=np.nan, strategy='mean')), # replace 0s with means 
   # ('std_scaler', StandardScaler())
])

# Pipeline for categorical features 
cat_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
   # ('1hot', OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

# Complete pipeline for numerical and categorical features
# 'ColumnTransformer' applies transformers (num_pipeline/ cat_pipeline)
# to specific columns of an array or DataFrame (num_features/cat_features)
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

# Train Model

## Metrics for Best Model

In [None]:
# conv matrix

# f1 score

# Save best model for future use

In [None]:
# https://scikit-learn.org/stable/model_persistence.html#skops-persistence

timestamp_for_filename = time.strftime("%Y-%m %d_%H%M%S")
model_name = "example_model"

filepath = f"./models/{model_name} {timestamp_for_filename}.skops"

obj = dump(best_model, filepath)
