# conseption et realisation d'un dashbord et d'un modele de detection de fraude des données de la Direction des Grandes Entreprise

## Preparation des données

### Importation des modules 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

### parametrages des modules 

In [2]:
sns.set_theme()
#pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

### selection, affichage et filtrage des fichiers

In [3]:
TVA = pd.read_excel('VraiTVA.xlsx')
#TVA

In [4]:
ERA = pd.read_excel('VraiERA.xlsx')
#ERA

In [5]:
jointure = pd.merge(ERA, TVA, on='BP', how='inner')
#jointure = jointure[['BP', 'Wilaya', 'Code CNRC', 'Code ONS', 'Chiffre d’affaire (C.A)', 'Total TVA anuelle']]  

In [6]:
jointure.rename(columns={'Chiffre d’affaire (C.A)': 'ChAff'}, inplace=True)
jointure.rename(columns={'Total TVA anuelle': 'Total TVA anunelle'}, inplace=True)
#jointure.head()

### créer la colonne feature et cible


In [7]:
jointure['feature'] =  jointure['ChAff']  - jointure['Total TVA anunelle'] 

In [8]:
def estime(row):
    if row['ChAff'] == 0 :
        return 'sus'
    elif abs(row['feature']) > 0.2 * row['ChAff']:
        return 'fraude'
    else:
        return 'bon'

jointure['cible'] = jointure.apply(estime, axis=1)

In [9]:
jointure.head()

Unnamed: 0,BP,Wilaya,Code CNRC,Code ONS,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,cible
0,2000000147,DIW Alger est,607047,/,0,4381490000.0,4710011000.0,5358351000.0,4811742000.0,4835182000.0,4467203000.0,3692317000.0,3713033000.0,5274394000.0,6056911000.0,4706919000.0,5852010000.0,57859560000.0,-57859560000.0,sus
1,2000000200,DIW Alger est,608001,/,93115698,644181.0,1478702.0,2304256.0,1226741.0,21477610.0,33439220.0,9184847.0,4261077.0,3033041.0,4966790.0,5425418.0,5673810.0,93115700.0,0.0,bon
2,2000007178,DIW Bordj Bou Arréridj,110202,/,155423274,4872747.0,4275042.0,3918504.0,6082352.0,7774514.0,27067300.0,22407960.0,8738202.0,5113770.0,5666626.0,26437940.0,33068320.0,155423300.0,0.0,bon
3,2000009570,DIW Alger est,405105,/,5798168362,423208600.0,396558100.0,411129300.0,628866000.0,313280200.0,410063900.0,333267400.0,561521000.0,713319000.0,495072400.0,596409800.0,527251500.0,5809947000.0,-11778630.0,bon
4,2000011509,DIW Alger est,409001,/,2062602782,160768700.0,293782400.0,332108000.0,123468900.0,105158600.0,233051700.0,130518600.0,172088600.0,159896900.0,143759600.0,74881360.0,132886200.0,2062370000.0,233138.0,bon


## Over sample de la donnée fraude dans le data set

In [27]:
import pandas as pd

# Assuming 'jointure' is your DataFrame and 'cible' is the target column

# Get the counts of each class
fraud_count = jointure['cible'].value_counts()
print(fraud_count)

# Specify the class you want to oversample, in this case, 'fraude'
target_class = 'fraude'

# Define the number of samples you want for the 'fraude' class after oversampling
# Here, you can use a multiplier to specify how many times larger the 'fraude' class should be
multiplier = 18
target_sample_count = fraud_count[target_class] * multiplier

# Calculate the additional samples you need
additional_samples = target_sample_count - fraud_count[target_class]

# Filter the 'fraude' class
fraude_data = jointure[jointure['cible'] == target_class]

# Randomly sample from the 'fraude' data
oversampled_fraude = fraude_data.sample(n=additional_samples, replace=True, random_state=42)

# Append the oversampled data to the original DataFrame
jointure_oversampled = pd.concat([jointure, oversampled_fraude])

# Shuffle the dataset to mix the rows up
jointure_oversampled = jointure_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)


cible
bon       669
fraude    371
sus       230
Name: count, dtype: int64


### Verification du nouveau data set

In [28]:
print(jointure_oversampled['cible'].value_counts())

cible
fraude    6678
bon        669
sus        230
Name: count, dtype: int64


In [29]:
print(jointure_oversampled.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7577 entries, 0 to 7576
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   7577 non-null   int64  
 1   Wilaya               7577 non-null   object 
 2   Code CNRC            7577 non-null   object 
 3   Code ONS             7577 non-null   object 
 4   ChAff                7577 non-null   int64  
 5   Total TVA Janvier    7577 non-null   float64
 6   Total TVA Février    7577 non-null   float64
 7   Total TVA Mars       7577 non-null   float64
 8   Total TVA Avril      7577 non-null   float64
 9   Total TVA Mai        7577 non-null   float64
 10  Total TVA Juin       7577 non-null   float64
 11  Total TVA Juillet    7577 non-null   float64
 12  Total TVA Août       7577 non-null   float64
 13  Total TVA Septembre  7577 non-null   float64
 14  Total TVA Octobre    7577 non-null   float64
 15  Total TVA Novembre   7577 non-null   f

## Encodage categoriel & Normalisation/Standardisation 

### encodage categoriel des wilaya

In [30]:
wilaya_count = jointure_oversampled['Wilaya'].value_counts()
print(wilaya_count)

Wilaya
DIW Alger centre          3430
DIW Alger est             1266
DIW Alger ouest            544
DIW Ouargla                324
DIW Oran Est               266
DIW Constantine            210
DIW Bordj Bou Arréridj     179
DIW Sidi Bel Abbes         159
DIW Blida                  154
DIW Boumerdès              139
DIW Batna                  112
DIW Béjaïa                  78
DIW Tipaza                  59
DIW Annaba                  54
DIW Sétif                   54
DIW Tlemcen                 53
DIW Médéa                   44
DIW El Oued                 44
Non affecté                 42
DIW M'Sila                  41
DIW Relizane                41
DIW Chlef                   40
DIW Mostaganem              39
DIW Jijel                   26
DIW Bouira                  25
DIW Mascara                 25
DIW Biskra                  23
DIW Aïn Témouchent          22
DIW Mila                    18
DIW Aïn Defla               15
DIW Khenchela               13
DIW Skikda                  12
D

In [31]:
# Clean and standardize the Wilaya names
jointure_oversampled['Standardized_Wilaya'] = jointure_oversampled['Wilaya'].str.replace('DIW ', '')

# Label Encoding
wilaya_mapping = {name: i + 1 for i, name in enumerate(jointure_oversampled['Standardized_Wilaya'].unique())}
jointure_oversampled['Wilaya_encoded'] = jointure_oversampled['Standardized_Wilaya'].map(wilaya_mapping)

# Display the DataFrame to check the new columns
print(jointure_oversampled[['Wilaya', 'Standardized_Wilaya', 'Wilaya_encoded']])

                Wilaya Standardized_Wilaya  Wilaya_encoded
0     DIW Alger centre        Alger centre               1
1            DIW Blida               Blida               2
2        DIW Boumerdès           Boumerdès               3
3     DIW Alger centre        Alger centre               1
4     DIW Alger centre        Alger centre               1
...                ...                 ...             ...
7572  DIW Alger centre        Alger centre               1
7573       DIW Tlemcen             Tlemcen              24
7574  DIW Alger centre        Alger centre               1
7575   DIW Alger ouest         Alger ouest               9
7576     DIW Alger est           Alger est               4

[7577 rows x 3 columns]


### encodage categoriel des Codes ONS et CNRC

In [32]:
# Label Encoding for 'code CNRC'
cnrc_unique = pd.unique(jointure_oversampled['Code CNRC'])
code_cnrc_mapping = {code: idx + 1 for idx, code in enumerate(cnrc_unique)}
jointure_oversampled['code CNRC_encoded'] = jointure_oversampled['Code CNRC'].map(code_cnrc_mapping)

# Label Encoding for 'code ONS'
ons_unique = pd.unique(jointure_oversampled['Code ONS'])
code_ons_mapping = {code: idx + 1 for idx, code in enumerate(ons_unique)}
jointure_oversampled['code ONS_encoded'] = jointure_oversampled['Code ONS'].map(code_ons_mapping)

# Display the DataFrame to check the new columns
print(jointure_oversampled)

              BP            Wilaya Code CNRC Code ONS        ChAff  \
0     2000045699  DIW Alger centre         /        /   2368236415   
1     2000045485         DIW Blida         /        /   4972529420   
2     2000045851     DIW Boumerdès    613203     4329      3270000   
3     2000045333  DIW Alger centre    615051        /     79528342   
4     2000046049  DIW Alger centre    613203     4329   1904035846   
...          ...               ...       ...      ...          ...   
7572  2000045279  DIW Alger centre         /        /     68491535   
7573  2000045636       DIW Tlemcen         /        /   2161481995   
7574  2000047138  DIW Alger centre         /        /  42416962978   
7575  2000045829   DIW Alger ouest         /        /    113184009   
7576  2000075783     DIW Alger est    406204        /  11655291128   

      Total TVA Janvier  Total TVA Février  Total TVA Mars  Total TVA Avril  \
0           208271800.0        375822309.0     138881343.0      258853851.0   
1

### encodage categoriel de target ENFIN

In [33]:
target_mapping = {
    'bon': 0,      # Typically, the "normal" class can be zero
    'fraude': 1,   # Positive class can be one, especially if it's a binary classification
    'sus': 2       # Additional class can take the next integer
}

# Apply the mapping to the target column
jointure_oversampled['target_encoded'] = jointure_oversampled['cible'].map(target_mapping)

# Display the DataFrame to check the new column
print(jointure_oversampled[['cible', 'target_encoded']])

       cible  target_encoded
0        bon               0
1        bon               0
2     fraude               1
3     fraude               1
4     fraude               1
...      ...             ...
7572  fraude               1
7573  fraude               1
7574  fraude               1
7575  fraude               1
7576  fraude               1

[7577 rows x 2 columns]


### verification du Data set :SOB: + supression des columns not needed 

In [34]:
jointure_oversampled

Unnamed: 0,BP,Wilaya,Code CNRC,Code ONS,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,cible,Standardized_Wilaya,Wilaya_encoded,code CNRC_encoded,code ONS_encoded,target_encoded
0,2000045699,DIW Alger centre,/,/,2368236415,208271800.0,375822309.0,138881343.0,258853851.0,146356450.0,181860698.0,100697034.0,212282367.0,158430688.0,158680673.0,1.821715e+08,2.455568e+08,2.367865e+09,3.709372e+05,bon,Alger centre,1,1,1,0
1,2000045485,DIW Blida,/,/,4972529420,347527855.0,555571991.0,395049064.0,366687004.0,231591882.0,537435575.0,568663251.0,194194923.0,447391115.0,513004750.0,2.806076e+08,3.837142e+08,4.821439e+09,1.510902e+08,bon,Blida,2,1,1,0
2,2000045851,DIW Boumerdès,613203,4329,3270000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,3.270000e+06,fraude,Boumerdès,3,2,2,1
3,2000045333,DIW Alger centre,615051,/,79528342,512000.0,613949.0,1235900.0,2616950.0,3739900.0,10332893.0,9307997.0,351800.0,5810800.0,339512.0,6.856620e+05,5.082900e+06,4.063026e+07,3.889808e+07,fraude,Alger centre,1,3,1,1
4,2000046049,DIW Alger centre,613203,4329,1904035846,83431802.0,513724859.0,332714153.0,118568816.0,22208827.0,226770091.0,67667519.0,243446498.0,436709543.0,41580023.0,6.279605e+07,1.990076e+08,2.348626e+09,-4.445900e+08,fraude,Alger centre,1,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7572,2000045279,DIW Alger centre,/,/,68491535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,6.849154e+07,fraude,Alger centre,1,1,1,1
7573,2000045636,DIW Tlemcen,/,/,2161481995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,2.161482e+09,fraude,Tlemcen,24,1,1,1
7574,2000047138,DIW Alger centre,/,/,42416962978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,4.241696e+10,fraude,Alger centre,1,1,1,1
7575,2000045829,DIW Alger ouest,/,/,113184009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,1.131840e+08,fraude,Alger ouest,9,1,1,1


In [35]:
print(jointure_oversampled.columns)

Index(['BP', 'Wilaya', 'Code CNRC', 'Code ONS', 'ChAff', 'Total TVA Janvier',
       'Total TVA Février', 'Total TVA Mars', 'Total TVA Avril',
       'Total TVA Mai', 'Total TVA Juin', 'Total TVA Juillet',
       'Total TVA Août', 'Total TVA Septembre', 'Total TVA Octobre',
       'Total TVA Novembre', 'Total TVA Décembre', 'Total TVA anunelle',
       'feature', 'cible', 'Standardized_Wilaya', 'Wilaya_encoded',
       'code CNRC_encoded', 'code ONS_encoded', 'target_encoded'],
      dtype='object')


In [36]:
jointure_oversampled = jointure_oversampled.drop(columns = ['Wilaya', 'Code CNRC', 'Code ONS','cible'])

In [37]:
jointure_oversampled = jointure_oversampled.drop(columns = ['Standardized_Wilaya'])

In [38]:
jointure_oversampled

Unnamed: 0,BP,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,Wilaya_encoded,code CNRC_encoded,code ONS_encoded,target_encoded
0,2000045699,2368236415,208271800.0,375822309.0,138881343.0,258853851.0,146356450.0,181860698.0,100697034.0,212282367.0,158430688.0,158680673.0,1.821715e+08,2.455568e+08,2.367865e+09,3.709372e+05,1,1,1,0
1,2000045485,4972529420,347527855.0,555571991.0,395049064.0,366687004.0,231591882.0,537435575.0,568663251.0,194194923.0,447391115.0,513004750.0,2.806076e+08,3.837142e+08,4.821439e+09,1.510902e+08,2,1,1,0
2,2000045851,3270000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,3.270000e+06,3,2,2,1
3,2000045333,79528342,512000.0,613949.0,1235900.0,2616950.0,3739900.0,10332893.0,9307997.0,351800.0,5810800.0,339512.0,6.856620e+05,5.082900e+06,4.063026e+07,3.889808e+07,1,3,1,1
4,2000046049,1904035846,83431802.0,513724859.0,332714153.0,118568816.0,22208827.0,226770091.0,67667519.0,243446498.0,436709543.0,41580023.0,6.279605e+07,1.990076e+08,2.348626e+09,-4.445900e+08,1,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7572,2000045279,68491535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,6.849154e+07,1,1,1,1
7573,2000045636,2161481995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,2.161482e+09,24,1,1,1
7574,2000047138,42416962978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,4.241696e+10,1,1,1,1
7575,2000045829,113184009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,1.131840e+08,9,1,1,1


### normalisation et standardisation 
 vu que nous allons faire a random forest and/or a regression tree, il serait preferable de standardiser les données 

In [39]:
jointure_oversampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7577 entries, 0 to 7576
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   7577 non-null   int64  
 1   ChAff                7577 non-null   int64  
 2   Total TVA Janvier    7577 non-null   float64
 3   Total TVA Février    7577 non-null   float64
 4   Total TVA Mars       7577 non-null   float64
 5   Total TVA Avril      7577 non-null   float64
 6   Total TVA Mai        7577 non-null   float64
 7   Total TVA Juin       7577 non-null   float64
 8   Total TVA Juillet    7577 non-null   float64
 9   Total TVA Août       7577 non-null   float64
 10  Total TVA Septembre  7577 non-null   float64
 11  Total TVA Octobre    7577 non-null   float64
 12  Total TVA Novembre   7577 non-null   float64
 13  Total TVA Décembre   7577 non-null   float64
 14  Total TVA anunelle   7577 non-null   float64
 15  feature              7577 non-null   f

In [40]:
columns_to_standardize = [
    'ChAff',
    'Total TVA Janvier', 'Total TVA Février', 'Total TVA Mars', 'Total TVA Avril',
    'Total TVA Mai', 'Total TVA Juin', 'Total TVA Juillet', 'Total TVA Août',
    'Total TVA Septembre', 'Total TVA Octobre', 'Total TVA Novembre', 'Total TVA Décembre',
    'Total TVA anunelle'
]

# Apply standardization
for column in columns_to_standardize:
    jointure_oversampled[column] = (jointure_oversampled[column] - jointure_oversampled[column].mean()) / jointure_oversampled[column].std()

# Now your columns are standardized
print(jointure_oversampled[columns_to_standardize].head())

      ChAff  Total TVA Janvier  Total TVA Février  Total TVA Mars  \
0 -0.070108          -0.107860          -0.016986       -0.149337   
1 -0.064099          -0.061838           0.071686       -0.034652   
2 -0.075564          -0.176690          -0.202383       -0.211513   
3 -0.075388          -0.176521          -0.202081       -0.210960   
4 -0.071179          -0.149117           0.051043       -0.062559   

   Total TVA Avril  Total TVA Mai  Total TVA Juin  Total TVA Juillet  \
0        -0.093202      -0.144344       -0.126753          -0.162302   
1        -0.056872      -0.107698        0.031998           0.053146   
2        -0.180413      -0.207269       -0.207948          -0.208662   
3        -0.179532      -0.205661       -0.203334          -0.204377   
4        -0.140466      -0.197721       -0.106703          -0.177509   

   Total TVA Août  Total TVA Septembre  Total TVA Octobre  Total TVA Novembre  \
0       -0.110689            -0.143377          -0.145779           -0.

## MODEL SKLEARN RANDOM FOREST 

In [41]:
jointure_oversampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7577 entries, 0 to 7576
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   7577 non-null   int64  
 1   ChAff                7577 non-null   float64
 2   Total TVA Janvier    7577 non-null   float64
 3   Total TVA Février    7577 non-null   float64
 4   Total TVA Mars       7577 non-null   float64
 5   Total TVA Avril      7577 non-null   float64
 6   Total TVA Mai        7577 non-null   float64
 7   Total TVA Juin       7577 non-null   float64
 8   Total TVA Juillet    7577 non-null   float64
 9   Total TVA Août       7577 non-null   float64
 10  Total TVA Septembre  7577 non-null   float64
 11  Total TVA Octobre    7577 non-null   float64
 12  Total TVA Novembre   7577 non-null   float64
 13  Total TVA Décembre   7577 non-null   float64
 14  Total TVA anunelle   7577 non-null   float64
 15  feature              7577 non-null   f

In [42]:

X = jointure_oversampled.drop(['feature'], axis=1) 
y = jointure_oversampled['target_encoded']  

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on the test data
rf_predictions = rf_classifier.predict(X_test)

# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00      1350
           2       1.00      1.00      1.00        36

    accuracy                           1.00      1516
   macro avg       1.00      1.00      1.00      1516
weighted avg       1.00      1.00      1.00      1516

Random Forest Confusion Matrix:
[[ 130    0    0]
 [   0 1350    0]
 [   0    0   36]]


In [43]:
# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the classifier
rf = RandomForestClassifier(random_state=42)

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# The best estimator after hyperparameter tuning
best_rf = grid_search.best_estimator_

# Evaluate the best estimator on the test data
predictions = best_rf.predict(X_test)
print("Optimized RandomForest Classification Report:")
print(classification_report(y_test, predictions))


Optimized RandomForest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00      1350
           2       1.00      1.00      1.00        36

    accuracy                           1.00      1516
   macro avg       1.00      1.00      1.00      1516
weighted avg       1.00      1.00      1.00      1516

