# conseption et realisation d'un dashbord et d'un modele de detection de fraude des données de la Direction des Grandes Entreprise

## Preparation des données

### Importation des modules 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

### parametrages des modules 

In [2]:
sns.set_theme()
#pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

### selection, affichage et filtrage des fichiers

In [3]:
TVA = pd.read_excel('VraiTVA.xlsx')
#TVA

In [4]:
ERA = pd.read_excel('VraiERA.xlsx')
#ERA

In [5]:
jointure = pd.merge(ERA, TVA, on='BP', how='inner')
#jointure = jointure[['BP', 'Wilaya', 'Code CNRC', 'Code ONS', 'Chiffre d’affaire (C.A)', 'Total TVA anuelle']]  

In [6]:
jointure.rename(columns={'Chiffre d’affaire (C.A)': 'ChAff'}, inplace=True)
jointure.rename(columns={'Total TVA anuelle': 'Total TVA anunelle'}, inplace=True)
#jointure.head()

### créer la colonne feature et cible


In [7]:
jointure['feature'] =  jointure['ChAff']  - jointure['Total TVA anunelle'] 

In [8]:
def estime(row):
    if row['ChAff'] == 0 :
        return 'sus'
    elif abs(row['feature']) > 0.2 * row['ChAff']:
        return 'fraude'
    else:
        return 'bon'

jointure['cible'] = jointure.apply(estime, axis=1)

In [9]:
jointure.head()

Unnamed: 0,BP,Wilaya,Code CNRC,Code ONS,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,cible
0,2000000147,DIW Alger est,607047,/,0,4381490000.0,4710011000.0,5358351000.0,4811742000.0,4835182000.0,4467203000.0,3692317000.0,3713033000.0,5274394000.0,6056911000.0,4706919000.0,5852010000.0,57859560000.0,-57859560000.0,sus
1,2000000200,DIW Alger est,608001,/,93115698,644181.0,1478702.0,2304256.0,1226741.0,21477610.0,33439220.0,9184847.0,4261077.0,3033041.0,4966790.0,5425418.0,5673810.0,93115700.0,0.0,bon
2,2000007178,DIW Bordj Bou Arréridj,110202,/,155423274,4872747.0,4275042.0,3918504.0,6082352.0,7774514.0,27067300.0,22407960.0,8738202.0,5113770.0,5666626.0,26437940.0,33068320.0,155423300.0,0.0,bon
3,2000009570,DIW Alger est,405105,/,5798168362,423208600.0,396558100.0,411129300.0,628866000.0,313280200.0,410063900.0,333267400.0,561521000.0,713319000.0,495072400.0,596409800.0,527251500.0,5809947000.0,-11778630.0,bon
4,2000011509,DIW Alger est,409001,/,2062602782,160768700.0,293782400.0,332108000.0,123468900.0,105158600.0,233051700.0,130518600.0,172088600.0,159896900.0,143759600.0,74881360.0,132886200.0,2062370000.0,233138.0,bon


## Over sample de la donnée fraude dans le data set

In [10]:
# Get the counts of each class
fraud_count = jointure['cible'].value_counts()
print(fraud_count)

# Specify the class you want to oversample, in this case, 'fraude'
target_class = 'fraude'

# Define the number of samples you want for the 'fraude' class after oversampling
# Here, you can use a multiplier to specify how many times larger the 'fraude' class should be
multiplier = 2
target_sample_count = fraud_count[target_class] * multiplier

# Calculate the additional samples you need
additional_samples = target_sample_count - fraud_count[target_class]

# Filter the 'fraude' class
fraude_data = jointure[jointure['cible'] == target_class]

# Randomly sample from the 'fraude' data
oversampled_fraude = fraude_data.sample(n=additional_samples, replace=True, random_state=42)

# Append the oversampled data to the original DataFrame
jointure_oversampled = pd.concat([jointure, oversampled_fraude])

# Shuffle the dataset to mix the rows up
jointure_oversampled = jointure_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)


cible
bon       669
fraude    371
sus       230
Name: count, dtype: int64


### Verification du nouveau data set

In [11]:
print(jointure_oversampled['cible'].value_counts())

cible
fraude    742
bon       669
sus       230
Name: count, dtype: int64


In [12]:
print(jointure_oversampled.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1641 entries, 0 to 1640
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   1641 non-null   int64  
 1   Wilaya               1641 non-null   object 
 2   Code CNRC            1641 non-null   object 
 3   Code ONS             1641 non-null   object 
 4   ChAff                1641 non-null   int64  
 5   Total TVA Janvier    1641 non-null   float64
 6   Total TVA Février    1641 non-null   float64
 7   Total TVA Mars       1641 non-null   float64
 8   Total TVA Avril      1641 non-null   float64
 9   Total TVA Mai        1641 non-null   float64
 10  Total TVA Juin       1641 non-null   float64
 11  Total TVA Juillet    1641 non-null   float64
 12  Total TVA Août       1641 non-null   float64
 13  Total TVA Septembre  1641 non-null   float64
 14  Total TVA Octobre    1641 non-null   float64
 15  Total TVA Novembre   1641 non-null   f

## Encodage categoriel & Normalisation/Standardisation 

### encodage categoriel des wilaya

In [13]:
wilaya_count = jointure_oversampled['Wilaya'].value_counts()
print(wilaya_count)

Wilaya
DIW Alger centre          673
DIW Alger est             261
DIW Alger ouest           133
DIW Oran Est               85
DIW Ouargla                61
DIW Blida                  53
DIW Constantine            40
DIW Boumerdès              40
DIW Bordj Bou Arréridj     30
DIW Sétif                  29
DIW Béjaïa                 26
DIW Sidi Bel Abbes         26
DIW Batna                  24
DIW Annaba                 22
DIW Tlemcen                13
DIW Skikda                 12
DIW Tipaza                 11
DIW Tizi Ouzou             10
DIW M'Sila                  8
DIW Mostaganem              8
DIW Bouira                  8
DIW Jijel                   7
DIW Chlef                   6
DIW Médéa                   6
DIW Aïn Témouchent          5
DIW El Oued                 5
DIW Oum el-Bouaghi          5
DIW Biskra                  5
Non affecté                 4
DIW Mila                    4
DIW Mascara                 4
DIW Relizane                3
DIW Saïda                   2
DIW

In [14]:
# Clean and standardize the Wilaya names
jointure_oversampled['Standardized_Wilaya'] = jointure_oversampled['Wilaya'].str.replace('DIW ', '')

# Label Encoding
wilaya_mapping = {name: i + 1 for i, name in enumerate(jointure_oversampled['Standardized_Wilaya'].unique())}
jointure_oversampled['Wilaya_encoded'] = jointure_oversampled['Standardized_Wilaya'].map(wilaya_mapping)

# Display the DataFrame to check the new columns
print(jointure_oversampled[['Wilaya', 'Standardized_Wilaya', 'Wilaya_encoded']])

                Wilaya Standardized_Wilaya  Wilaya_encoded
0     DIW Alger centre        Alger centre               1
1          DIW Ouargla             Ouargla               2
2          DIW Ouargla             Ouargla               2
3            DIW Sétif               Sétif               3
4      DIW Constantine         Constantine               4
...                ...                 ...             ...
1636  DIW Alger centre        Alger centre               1
1637  DIW Alger centre        Alger centre               1
1638   DIW Alger ouest         Alger ouest               9
1639     DIW Alger est           Alger est               5
1640      DIW Relizane            Relizane              33

[1641 rows x 3 columns]


### encodage categoriel des Codes ONS et CNRC

In [15]:
# Label Encoding for 'code CNRC'
cnrc_unique = pd.unique(jointure_oversampled['Code CNRC'])
code_cnrc_mapping = {code: idx + 1 for idx, code in enumerate(cnrc_unique)}
jointure_oversampled['code CNRC_encoded'] = jointure_oversampled['Code CNRC'].map(code_cnrc_mapping)

# Label Encoding for 'code ONS'
ons_unique = pd.unique(jointure_oversampled['Code ONS'])
code_ons_mapping = {code: idx + 1 for idx, code in enumerate(ons_unique)}
jointure_oversampled['code ONS_encoded'] = jointure_oversampled['Code ONS'].map(code_ons_mapping)

# Display the DataFrame to check the new columns
print(jointure_oversampled)

              BP            Wilaya Code CNRC Code ONS        ChAff  \
0     2000045556  DIW Alger centre         /        /  19169949866   
1     2000045784       DIW Ouargla    601401        /  13529381423   
2     2000046284       DIW Ouargla         /        /   1381173351   
3     2000045447         DIW Sétif         /        /    980327896   
4     2000045765   DIW Constantine         /        /  11514626226   
...          ...               ...       ...      ...          ...   
1636  2000046633  DIW Alger centre    613125     4321     22988920   
1637  2000046563  DIW Alger centre         /        /       900298   
1638  2000045829   DIW Alger ouest         /        /    113184009   
1639  2000045855     DIW Alger est    613203     4329     96508199   
1640  2000046628      DIW Relizane    613203     4329            0   

      Total TVA Janvier  Total TVA Février  Total TVA Mars  Total TVA Avril  \
0          1.372017e+09       2.054031e+09    1.389913e+09     1.271843e+09   
1

### encodage categoriel de target ENFIN

In [16]:
target_mapping = {
    'bon': 0,      # Typically, the "normal" class can be zero
    'fraude': 1,   # Positive class can be one, especially if it's a binary classification
    'sus': 2       # Additional class can take the next integer
}

# Apply the mapping to the target column
jointure_oversampled['target_encoded'] = jointure_oversampled['cible'].map(target_mapping)

# Display the DataFrame to check the new column
print(jointure_oversampled[['cible', 'target_encoded']])

       cible  target_encoded
0        bon               0
1     fraude               1
2     fraude               1
3        bon               0
4     fraude               1
...      ...             ...
1636  fraude               1
1637  fraude               1
1638  fraude               1
1639  fraude               1
1640     sus               2

[1641 rows x 2 columns]


### verification du Data set :SOB: + supression des columns not needed 

In [17]:
jointure_oversampled

Unnamed: 0,BP,Wilaya,Code CNRC,Code ONS,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,cible,Standardized_Wilaya,Wilaya_encoded,code CNRC_encoded,code ONS_encoded,target_encoded
0,2000045556,DIW Alger centre,/,/,19169949866,1.372017e+09,2.054031e+09,1.389913e+09,1.271843e+09,1.259380e+09,1.572574e+09,1.278637e+09,1.591988e+09,1.447266e+09,2.002831e+09,1.594309e+09,2.198231e+09,1.903302e+10,1.369299e+08,bon,Alger centre,1,1,1,0
1,2000045784,DIW Ouargla,601401,/,13529381423,3.713920e+06,1.144565e+07,1.490171e+06,1.051046e+07,2.133098e+07,1.047435e+06,6.569350e+05,2.211274e+06,7.731177e+06,1.056250e+06,4.151800e+06,1.654013e+07,8.188619e+07,1.344750e+10,fraude,Ouargla,2,2,1,1
2,2000046284,DIW Ouargla,/,/,1381173351,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.381173e+09,fraude,Ouargla,2,1,1,1
3,2000045447,DIW Sétif,/,/,980327896,6.712789e+07,5.456508e+07,9.936844e+07,1.081006e+08,9.438972e+07,1.129405e+08,5.925930e+07,6.431806e+07,5.594104e+07,6.818184e+07,6.572074e+07,1.304146e+08,9.803279e+08,-2.000000e+00,bon,Sétif,3,1,1,0
4,2000045765,DIW Constantine,/,/,11514626226,1.197369e+09,1.782966e+07,6.589633e+06,4.311954e+06,6.481398e+06,1.275021e+07,5.625770e+06,1.381263e+07,1.134318e+07,1.808584e+07,3.897775e+07,3.519033e+07,1.368367e+09,1.014626e+10,fraude,Constantine,4,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1636,2000046633,DIW Alger centre,613125,4321,22988920,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.954700e+06,2.954700e+06,2.003422e+07,fraude,Alger centre,1,3,2,1
1637,2000046563,DIW Alger centre,/,/,900298,9.700161e+07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.027761e+06,0.000000e+00,1.293981e+06,0.000000e+00,0.000000e+00,0.000000e+00,1.003234e+08,-9.942305e+07,fraude,Alger centre,1,1,1,1
1638,2000045829,DIW Alger ouest,/,/,113184009,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.131840e+08,fraude,Alger ouest,9,1,1,1
1639,2000045855,DIW Alger est,613203,4329,96508199,3.481244e+07,5.147958e+06,0.000000e+00,3.107339e+06,0.000000e+00,8.224781e+07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.253155e+08,-2.880734e+07,fraude,Alger est,5,5,3,1


In [18]:
print(jointure_oversampled.columns)

Index(['BP', 'Wilaya', 'Code CNRC', 'Code ONS', 'ChAff', 'Total TVA Janvier',
       'Total TVA Février', 'Total TVA Mars', 'Total TVA Avril',
       'Total TVA Mai', 'Total TVA Juin', 'Total TVA Juillet',
       'Total TVA Août', 'Total TVA Septembre', 'Total TVA Octobre',
       'Total TVA Novembre', 'Total TVA Décembre', 'Total TVA anunelle',
       'feature', 'cible', 'Standardized_Wilaya', 'Wilaya_encoded',
       'code CNRC_encoded', 'code ONS_encoded', 'target_encoded'],
      dtype='object')


In [19]:
jointure_oversampled = jointure_oversampled.drop(columns = ['Wilaya', 'Code CNRC', 'Code ONS','cible'])

In [20]:
jointure_oversampled = jointure_oversampled.drop(columns = ['Standardized_Wilaya'])

In [21]:
jointure_oversampled

Unnamed: 0,BP,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,Wilaya_encoded,code CNRC_encoded,code ONS_encoded,target_encoded
0,2000045556,19169949866,1.372017e+09,2.054031e+09,1.389913e+09,1.271843e+09,1.259380e+09,1.572574e+09,1.278637e+09,1.591988e+09,1.447266e+09,2.002831e+09,1.594309e+09,2.198231e+09,1.903302e+10,1.369299e+08,1,1,1,0
1,2000045784,13529381423,3.713920e+06,1.144565e+07,1.490171e+06,1.051046e+07,2.133098e+07,1.047435e+06,6.569350e+05,2.211274e+06,7.731177e+06,1.056250e+06,4.151800e+06,1.654013e+07,8.188619e+07,1.344750e+10,2,2,1,1
2,2000046284,1381173351,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.381173e+09,2,1,1,1
3,2000045447,980327896,6.712789e+07,5.456508e+07,9.936844e+07,1.081006e+08,9.438972e+07,1.129405e+08,5.925930e+07,6.431806e+07,5.594104e+07,6.818184e+07,6.572074e+07,1.304146e+08,9.803279e+08,-2.000000e+00,3,1,1,0
4,2000045765,11514626226,1.197369e+09,1.782966e+07,6.589633e+06,4.311954e+06,6.481398e+06,1.275021e+07,5.625770e+06,1.381263e+07,1.134318e+07,1.808584e+07,3.897775e+07,3.519033e+07,1.368367e+09,1.014626e+10,4,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1636,2000046633,22988920,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.954700e+06,2.954700e+06,2.003422e+07,1,3,2,1
1637,2000046563,900298,9.700161e+07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.027761e+06,0.000000e+00,1.293981e+06,0.000000e+00,0.000000e+00,0.000000e+00,1.003234e+08,-9.942305e+07,1,1,1,1
1638,2000045829,113184009,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.131840e+08,9,1,1,1
1639,2000045855,96508199,3.481244e+07,5.147958e+06,0.000000e+00,3.107339e+06,0.000000e+00,8.224781e+07,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,1.253155e+08,-2.880734e+07,5,5,3,1


### normalisation et standardisation 
 vu que nous allons faire a random forest and/or a regression tree, il serait preferable de standardiser les données 

In [22]:
jointure_oversampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1641 entries, 0 to 1640
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   1641 non-null   int64  
 1   ChAff                1641 non-null   int64  
 2   Total TVA Janvier    1641 non-null   float64
 3   Total TVA Février    1641 non-null   float64
 4   Total TVA Mars       1641 non-null   float64
 5   Total TVA Avril      1641 non-null   float64
 6   Total TVA Mai        1641 non-null   float64
 7   Total TVA Juin       1641 non-null   float64
 8   Total TVA Juillet    1641 non-null   float64
 9   Total TVA Août       1641 non-null   float64
 10  Total TVA Septembre  1641 non-null   float64
 11  Total TVA Octobre    1641 non-null   float64
 12  Total TVA Novembre   1641 non-null   float64
 13  Total TVA Décembre   1641 non-null   float64
 14  Total TVA anunelle   1641 non-null   float64
 15  feature              1641 non-null   f

In [23]:
columns_to_standardize = [
    'ChAff',
    'Total TVA Janvier', 'Total TVA Février', 'Total TVA Mars', 'Total TVA Avril',
    'Total TVA Mai', 'Total TVA Juin', 'Total TVA Juillet', 'Total TVA Août',
    'Total TVA Septembre', 'Total TVA Octobre', 'Total TVA Novembre', 'Total TVA Décembre',
    'Total TVA anunelle'
]

# Apply standardization
for column in columns_to_standardize:
    jointure_oversampled[column] = (jointure_oversampled[column] - jointure_oversampled[column].mean()) / jointure_oversampled[column].std()

# Now your columns are standardized
print(jointure_oversampled[columns_to_standardize].head())

      ChAff  Total TVA Janvier  Total TVA Février  Total TVA Mars  \
0 -0.001033           0.304097           0.691865        0.337203   
1 -0.019700          -0.190335          -0.195918       -0.210195   
2 -0.059902          -0.191677          -0.200893       -0.210782   
3 -0.061229          -0.167421          -0.177177       -0.171605   
4 -0.026367           0.240989          -0.193143       -0.208184   

   Total TVA Avril  Total TVA Mai  Total TVA Juin  Total TVA Juillet  \
0         0.275089       0.268076        0.421634           0.332041   
1        -0.184589      -0.186417       -0.203015          -0.196504   
2        -0.188419      -0.194247       -0.203432          -0.196776   
3        -0.149023      -0.159597       -0.158540          -0.172267   
4        -0.186848      -0.191868       -0.198364          -0.194449   

   Total TVA Août  Total TVA Septembre  Total TVA Octobre  Total TVA Novembre  \
0        0.434711             0.362270           0.569706            0.

## MODEL SKLEARN RANDOM FOREST 

In [24]:
jointure_oversampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1641 entries, 0 to 1640
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   1641 non-null   int64  
 1   ChAff                1641 non-null   float64
 2   Total TVA Janvier    1641 non-null   float64
 3   Total TVA Février    1641 non-null   float64
 4   Total TVA Mars       1641 non-null   float64
 5   Total TVA Avril      1641 non-null   float64
 6   Total TVA Mai        1641 non-null   float64
 7   Total TVA Juin       1641 non-null   float64
 8   Total TVA Juillet    1641 non-null   float64
 9   Total TVA Août       1641 non-null   float64
 10  Total TVA Septembre  1641 non-null   float64
 11  Total TVA Octobre    1641 non-null   float64
 12  Total TVA Novembre   1641 non-null   float64
 13  Total TVA Décembre   1641 non-null   float64
 14  Total TVA anunelle   1641 non-null   float64
 15  feature              1641 non-null   f

In [25]:

X = jointure_oversampled.drop(['target_encoded'], axis=1) 
y = jointure_oversampled['target_encoded']  

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on the test data
rf_predictions = rf_classifier.predict(X_test)

# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.94       122
           1       0.97      0.93      0.95       164
           2       1.00      0.98      0.99        43

    accuracy                           0.95       329
   macro avg       0.96      0.96      0.96       329
weighted avg       0.96      0.95      0.95       329

Random Forest Confusion Matrix:
[[119   3   0]
 [ 11 153   0]
 [  0   1  42]]
