In [2]:
pip install pandas numpy scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np

## Load the client data
client_df = pd.read_csv('data_files/client.csv')
#client_arr = np.array(client_df)

## Load the invoice data
invoice_df = pd.read_csv('data_files/invoice.csv')
#invoice_arr = np.array(invoice_df)


In [4]:
# Convert date columns to datetime format
client_df['date'] = pd.to_datetime(client_df['date'], errors='coerce')
invoice_df['date'] = pd.to_datetime(invoice_df['date'], errors='coerce')

# Check if any date conversions failed (i.e., resulted in NaT)
print(client_df[client_df['date'].isna()])
print(invoice_df[invoice_df['date'].isna()])


  client_df['date'] = pd.to_datetime(client_df['date'], errors='coerce')
  invoice_df['date'] = pd.to_datetime(invoice_df['date'], errors='coerce')


Empty DataFrame
Columns: [region, date, dis, id, catg, target]
Index: []
Empty DataFrame
Columns: [id, date, tarif_type, counter_statue, reading_remarque, consommation_level_4, months_number, counter_type, counter_coefficient, consommation_level_1, consommation_level_2, consommation_level_3]
Index: []


In [5]:
## Identify missing data

print(client_df.isnull().sum())
print(invoice_df.isnull().sum())

region    0
date      0
dis       0
id        0
catg      0
target    0
dtype: int64
id                      0
date                    0
tarif_type              0
counter_statue          0
reading_remarque        0
consommation_level_4    0
months_number           0
counter_type            0
counter_coefficient     0
consommation_level_1    0
consommation_level_2    0
consommation_level_3    0
dtype: int64


It can be observed that there are no missing data!

In [6]:
## Merging data sets based on id variable

merged_df = pd.merge(invoice_df, client_df, on='id', how='inner')
#merged_arr = np.array(merged_df)

## Preview the merged dataframe
print(merged_df.head())


   id     date_x  tarif_type  counter_statue  reading_remarque  \
0   0 2014-03-24          11               0                 8   
1   0 2013-03-29          11               0                 6   
2   0 2015-03-23          11               0                 8   
3   0 2015-07-13          11               0                 8   
4   0 2016-11-17          11               0                 9   

   consommation_level_4  months_number counter_type  counter_coefficient  \
0                     0              4         ELEC                    1   
1                     0              4         ELEC                    1   
2                     0              4         ELEC                    1   
3                     0              4         ELEC                    1   
4                     0             12         ELEC                    1   

   consommation_level_1  consommation_level_2  consommation_level_3  region  \
0                    82                     0                     0

In [7]:
# Converting counter_statue into a categorical variable
merged_df['counter_statue'] = merged_df['counter_statue'].astype('category').cat.codes

# Adding a new column for year and month for each row
merged_df['year'] = merged_df['date_x'].dt.year
merged_df['month'] = merged_df['date_x'].dt.month

In [8]:
from sklearn.model_selection import train_test_split

# Define features and target
xval = merged_df.drop(columns=['target','date_x', 'date_y', 'counter_type'])
yval = merged_df['target']

# Split the data train and test sets
xtrain, xtest, ytrain, ytest = train_test_split(xval, yval, test_size=0.3, random_state=0)

print('xtrain shape:', xtrain.shape)
print('xtest shape:', xtest.shape)
print('ytrain shape:', ytrain.shape)
print('ytest shape:', ytest.shape)

xtrain shape: (350455, 15)
xtest shape: (150196, 15)
ytrain shape: (350455,)
ytest shape: (150196,)


NOTE TO SELF: 
Current task is to standardise the data, but need to be mindful of DATE column

In [9]:
## Standardising the train and test data

from sklearn.preprocessing import StandardScaler
std_x = StandardScaler()
xtrain = std_x.fit_transform(xtrain)
xtest = std_x.transform(xtest)

print('xtrain shape:', xtrain.shape)
print('xtest shape:', xtest.shape)

xtrain shape: (350455, 15)
xtest shape: (150196, 15)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize fraud distribution
sns.countplot(x='target', data=client_df)
plt.title('Fraud vs Non-Fraud Distribution')
plt.show()

# Visualize counter status and its impact on fraud
sns.countplot(x='counter_statue', hue='target', data=merged_df)
plt.title('Counter Status vs Fraud')
plt.show()

TRAINING LOGISTIC REGRESSION MODEL


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the logistic regression model
logit_reg = LogisticRegression(random_state=0)

# Fit the model to the training data
logit_reg.fit(xtrain, ytrain)


In [17]:
y_pred = logit_reg.predict(xtest)

# Print evaluation metrics
print("Accuracy:", accuracy_score(ytest, y_pred))
print("Confusion Matrix:\n", confusion_matrix(ytest, y_pred))
print("Classification Report:\n", classification_report(ytest, y_pred))

Accuracy: 0.9374350848224986
Confusion Matrix:
 [[140798     13]
 [  9384      1]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97    140811
           1       0.07      0.00      0.00      9385

    accuracy                           0.94    150196
   macro avg       0.50      0.50      0.48    150196
weighted avg       0.88      0.94      0.91    150196



In [18]:
# Retrieve feature names (after scaling)
feature_names = merged_df.drop(columns=['target', 'id', 'date_x', 'date_y']).columns

# Print coefficients with feature names
for feature, coef in zip(feature_names, log_reg.coef_[0]):
    print(f"{feature}: {coef:.4f}")

tarif_type: -0.0474
counter_statue: -0.0166
reading_remarque: 0.0231
consommation_level_4: 0.0342
months_number: -0.0268
counter_type: -0.9813
counter_coefficient: -0.2804
consommation_level_1: 0.0523
consommation_level_2: 0.0050
consommation_level_3: 0.1262
region: 0.1203
dis: 0.2671
catg: 0.1266
year: -0.0182
month: -0.0119


EXPERIMENTING WITH NEURAL NETWORKS

In [21]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-macosx_12_0_arm64.whl (239.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.4/239.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting numpy<2.1.0,>=1.26.0
  Downloading numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting absl-py>=1.0.0
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3
  Downloading protobuf-5.28.3-cp38-abi3-macosx_10_9_universal2.whl (414 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [22]:
import keras
import tensorflow as tf
from keras.utils import to_categorical
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten

  if dtype.type == np.bool:
