# conseption et realisation d'un dashbord et d'un modele de detection de fraude des données de la Direction des Grandes Entreprise

## Preparation des données

### Importation des modules 

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

### parametrages des modules 

In [2]:
sns.set_theme()
#pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

### selection, affichage et filtrage des fichiers

In [3]:
TVA = pd.read_excel('VraiTVA.xlsx')
#TVA

In [4]:
ERA = pd.read_excel('VraiERA.xlsx')
#ERA

In [5]:
jointure = pd.merge(ERA, TVA, on='BP', how='inner')
#jointure = jointure[['BP', 'Wilaya', 'Code CNRC', 'Code ONS', 'Chiffre d’affaire (C.A)', 'Total TVA anuelle']]  

In [6]:
jointure.rename(columns={'Chiffre d’affaire (C.A)': 'ChAff'}, inplace=True)
jointure.rename(columns={'Total TVA anuelle': 'Total TVA anunelle'}, inplace=True)
#jointure.head()

### créer la colonne feature et cible


In [7]:
jointure['feature'] =  jointure['ChAff']  - jointure['Total TVA anunelle'] 

In [8]:
def estime(row):
    if row['ChAff'] == 0 or row['Total TVA anunelle'] ==0 :
        return 'sus'
    elif abs(row['feature']) > 1000:
        return 'fraude'
    else:
        return 'bon'

jointure['cible'] = jointure.apply(estime, axis=1)


In [9]:
jointure.head()

Unnamed: 0,BP,Wilaya,Code CNRC,Code ONS,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,cible
0,2000000147,DIW Alger est,607047,/,0,4381490000.0,4710011000.0,5358351000.0,4811742000.0,4835182000.0,4467203000.0,3692317000.0,3713033000.0,5274394000.0,6056911000.0,4706919000.0,5852010000.0,57859560000.0,-57859560000.0,sus
1,2000000200,DIW Alger est,608001,/,93115698,644181.0,1478702.0,2304256.0,1226741.0,21477610.0,33439220.0,9184847.0,4261077.0,3033041.0,4966790.0,5425418.0,5673810.0,93115700.0,0.0,bon
2,2000007178,DIW Bordj Bou Arréridj,110202,/,155423274,4872747.0,4275042.0,3918504.0,6082352.0,7774514.0,27067300.0,22407960.0,8738202.0,5113770.0,5666626.0,26437940.0,33068320.0,155423300.0,0.0,bon
3,2000009570,DIW Alger est,405105,/,5798168362,423208600.0,396558100.0,411129300.0,628866000.0,313280200.0,410063900.0,333267400.0,561521000.0,713319000.0,495072400.0,596409800.0,527251500.0,5809947000.0,-11778630.0,fraude
4,2000011509,DIW Alger est,409001,/,2062602782,160768700.0,293782400.0,332108000.0,123468900.0,105158600.0,233051700.0,130518600.0,172088600.0,159896900.0,143759600.0,74881360.0,132886200.0,2062370000.0,233138.0,fraude


## Over sample de la donnée fraude dans le data set

In [10]:
# Get the counts of each class
fraud_count = jointure['cible'].value_counts()
print(fraud_count)

# Specify the class you want to oversample, in this case, 'fraude'
target_class = 'fraude'

# Define the number of samples you want for the 'fraude' class after oversampling
# Here, you can use a multiplier to specify how many times larger the 'fraude' class should be
multiplier = 2
target_sample_count = fraud_count[target_class] * multiplier

# Calculate the additional samples you need
additional_samples = target_sample_count - fraud_count[target_class]

# Filter the 'fraude' class
fraude_data = jointure[jointure['cible'] == target_class]

# Randomly sample from the 'fraude' data
oversampled_fraude = fraude_data.sample(n=additional_samples, replace=True, random_state=42)

# Append the oversampled data to the original DataFrame
jointure_oversampled = pd.concat([jointure, oversampled_fraude])

# Shuffle the dataset to mix the rows up
jointure_oversampled = jointure_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)


cible
fraude    720
sus       340
bon       210
Name: count, dtype: int64


### Verification du nouveau data set

In [11]:
print(jointure_oversampled['cible'].value_counts())

cible
fraude    1440
sus        340
bon        210
Name: count, dtype: int64


In [12]:
print(jointure_oversampled.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990 entries, 0 to 1989
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   1990 non-null   int64  
 1   Wilaya               1990 non-null   object 
 2   Code CNRC            1990 non-null   object 
 3   Code ONS             1990 non-null   object 
 4   ChAff                1990 non-null   int64  
 5   Total TVA Janvier    1990 non-null   float64
 6   Total TVA Février    1990 non-null   float64
 7   Total TVA Mars       1990 non-null   float64
 8   Total TVA Avril      1990 non-null   float64
 9   Total TVA Mai        1990 non-null   float64
 10  Total TVA Juin       1990 non-null   float64
 11  Total TVA Juillet    1990 non-null   float64
 12  Total TVA Août       1990 non-null   float64
 13  Total TVA Septembre  1990 non-null   float64
 14  Total TVA Octobre    1990 non-null   float64
 15  Total TVA Novembre   1990 non-null   f

## Encodage categoriel & Normalisation/Standardisation 

### encodage categoriel des wilaya

In [13]:
wilaya_count = jointure_oversampled['Wilaya'].value_counts()
print(wilaya_count)

Wilaya
DIW Alger centre          849
DIW Alger est             305
DIW Alger ouest           157
DIW Oran Est               94
DIW Blida                  75
DIW Ouargla                65
DIW Constantine            44
DIW Boumerdès              44
DIW Béjaïa                 39
DIW Sétif                  31
DIW Annaba                 29
DIW Batna                  28
DIW Skikda                 20
DIW Sidi Bel Abbes         20
DIW Bordj Bou Arréridj     17
DIW Tlemcen                16
DIW Tizi Ouzou             15
DIW Tipaza                 14
DIW Bouira                 13
DIW Mostaganem             13
DIW M'Sila                 12
DIW Chlef                  12
DIW Jijel                   9
DIW Biskra                  7
DIW Mila                    7
DIW Saïda                   5
DIW Khenchela               5
DIW Guelma                  5
DIW Oum el-Bouaghi          5
DIW El Oued                 4
Non affecté                 4
DIW Oran Ouest              4
DIW Médéa                   3
DIW

In [14]:
# Clean and standardize the Wilaya names
jointure_oversampled['Standardized_Wilaya'] = jointure_oversampled['Wilaya'].str.replace('DIW ', '')

# Label Encoding
wilaya_mapping = {name: i + 1 for i, name in enumerate(jointure_oversampled['Standardized_Wilaya'].unique())}
jointure_oversampled['Wilaya_encoded'] = jointure_oversampled['Standardized_Wilaya'].map(wilaya_mapping)

# Display the DataFrame to check the new columns
print(jointure_oversampled[['Wilaya', 'Standardized_Wilaya', 'Wilaya_encoded']])

                Wilaya Standardized_Wilaya  Wilaya_encoded
0            DIW Chlef               Chlef               1
1     DIW Alger centre        Alger centre               2
2       DIW Tizi Ouzou          Tizi Ouzou               3
3     DIW Alger centre        Alger centre               2
4     DIW Alger centre        Alger centre               2
...                ...                 ...             ...
1985  DIW Alger centre        Alger centre               2
1986  DIW Alger centre        Alger centre               2
1987   DIW Alger ouest         Alger ouest               4
1988  DIW Alger centre        Alger centre               2
1989      DIW Relizane            Relizane              37

[1990 rows x 3 columns]


### encodage categoriel des Codes ONS et CNRC

In [15]:
# Label Encoding for 'code CNRC'
cnrc_unique = pd.unique(jointure_oversampled['Code CNRC'])
code_cnrc_mapping = {code: idx + 1 for idx, code in enumerate(cnrc_unique)}
jointure_oversampled['code CNRC_encoded'] = jointure_oversampled['Code CNRC'].map(code_cnrc_mapping)

# Label Encoding for 'code ONS'
ons_unique = pd.unique(jointure_oversampled['Code ONS'])
code_ons_mapping = {code: idx + 1 for idx, code in enumerate(ons_unique)}
jointure_oversampled['code ONS_encoded'] = jointure_oversampled['Code ONS'].map(code_ons_mapping)

# Display the DataFrame to check the new columns
print(jointure_oversampled)

              BP            Wilaya Code CNRC Code ONS       ChAff  \
0     2000045866         DIW Chlef    613203     4329  1317435130   
1     2000045315  DIW Alger centre         /        /  5896802984   
2     2000045159    DIW Tizi Ouzou         /        /    76347682   
3     2000046520  DIW Alger centre    613125     4321   647558407   
4     2000046565  DIW Alger centre         /        /  3340936844   
...          ...               ...       ...      ...         ...   
1985  2000046633  DIW Alger centre    613125     4321    22988920   
1986  2000044945  DIW Alger centre         /        /  1802091898   
1987  2000045829   DIW Alger ouest         /        /   113184009   
1988  2000046267  DIW Alger centre         /        /  2929819984   
1989  2000046628      DIW Relizane    613203     4329           0   

      Total TVA Janvier  Total TVA Février  Total TVA Mars  Total TVA Avril  \
0            44929712.0         60954275.0      59384343.0       56305559.0   
1           5

### encodage categoriel de target ENFIN

In [16]:
target_mapping = {
    'bon': 0,      # Typically, the "normal" class can be zero
    'fraude': 1,   # Positive class can be one, especially if it's a binary classification
    'sus': 2       # Additional class can take the next integer
}

# Apply the mapping to the target column
jointure_oversampled['target_encoded'] = jointure_oversampled['cible'].map(target_mapping)

# Display the DataFrame to check the new column
print(jointure_oversampled[['cible', 'target_encoded']])

       cible  target_encoded
0     fraude               1
1     fraude               1
2        bon               0
3     fraude               1
4     fraude               1
...      ...             ...
1985  fraude               1
1986  fraude               1
1987     sus               2
1988  fraude               1
1989     sus               2

[1990 rows x 2 columns]


### verification du Data set :SOB: + supression des columns not needed 

In [17]:
jointure_oversampled

Unnamed: 0,BP,Wilaya,Code CNRC,Code ONS,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,cible,Standardized_Wilaya,Wilaya_encoded,code CNRC_encoded,code ONS_encoded,target_encoded
0,2000045866,DIW Chlef,613203,4329,1317435130,44929712.0,60954275.0,59384343.0,56305559.0,52420492.0,54261654.0,62863997.0,59103690.0,61228643.0,109905797.0,112937374.0,99691817.0,8.339874e+08,483447777.0,fraude,Chlef,1,1,1,1
1,2000045315,DIW Alger centre,/,/,5896802984,556520160.0,707600228.0,575909896.0,520202480.0,765348500.0,237845000.0,232349323.0,109386240.0,351546193.0,164726130.0,626152250.0,311000220.0,5.158587e+09,738216364.0,fraude,Alger centre,2,2,2,1
2,2000045159,DIW Tizi Ouzou,/,/,76347682,4448947.0,7517880.0,5758165.0,5380874.0,6528608.0,6413053.0,6931153.0,2396290.0,5816103.0,6855432.0,7628333.0,10672840.0,7.634768e+07,4.0,bon,Tizi Ouzou,3,2,2,0
3,2000046520,DIW Alger centre,613125,4321,647558407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,907735409.0,0.0,0.0,9.077354e+08,-260177002.0,fraude,Alger centre,2,3,3,1
4,2000046565,DIW Alger centre,/,/,3340936844,33961305.0,480871725.0,905873987.0,214487912.0,151801286.0,252157974.0,126354176.0,68191653.0,660484077.0,27099557.0,86182754.0,203845222.0,3.211312e+09,129625216.0,fraude,Alger centre,2,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985,2000046633,DIW Alger centre,613125,4321,22988920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2954700.0,2.954700e+06,20034220.0,fraude,Alger centre,2,3,3,1
1986,2000044945,DIW Alger centre,/,/,1802091898,108453762.0,106550220.0,150698252.0,42561555.0,165152128.0,190382304.0,151130637.0,169358678.0,168960276.0,221934866.0,178735475.0,166549636.0,1.820468e+09,-18375891.0,fraude,Alger centre,2,2,2,1
1987,2000045829,DIW Alger ouest,/,/,113184009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,113184009.0,sus,Alger ouest,4,2,2,2
1988,2000046267,DIW Alger centre,/,/,2929819984,0.0,0.0,0.0,0.0,192945347.0,100547189.0,299271381.0,309586048.0,330413933.0,0.0,818467935.0,733341232.0,2.784573e+09,145246919.0,fraude,Alger centre,2,2,2,1


In [18]:
print(jointure_oversampled.columns)

Index(['BP', 'Wilaya', 'Code CNRC', 'Code ONS', 'ChAff', 'Total TVA Janvier',
       'Total TVA Février', 'Total TVA Mars', 'Total TVA Avril',
       'Total TVA Mai', 'Total TVA Juin', 'Total TVA Juillet',
       'Total TVA Août', 'Total TVA Septembre', 'Total TVA Octobre',
       'Total TVA Novembre', 'Total TVA Décembre', 'Total TVA anunelle',
       'feature', 'cible', 'Standardized_Wilaya', 'Wilaya_encoded',
       'code CNRC_encoded', 'code ONS_encoded', 'target_encoded'],
      dtype='object')


In [19]:
jointure_oversampled = jointure_oversampled.drop(columns = ['Wilaya', 'Code CNRC', 'Code ONS','cible'])

In [20]:
jointure_oversampled = jointure_oversampled.drop(columns = ['Standardized_Wilaya'])

In [21]:
jointure_oversampled

Unnamed: 0,BP,ChAff,Total TVA Janvier,Total TVA Février,Total TVA Mars,Total TVA Avril,Total TVA Mai,Total TVA Juin,Total TVA Juillet,Total TVA Août,Total TVA Septembre,Total TVA Octobre,Total TVA Novembre,Total TVA Décembre,Total TVA anunelle,feature,Wilaya_encoded,code CNRC_encoded,code ONS_encoded,target_encoded
0,2000045866,1317435130,44929712.0,60954275.0,59384343.0,56305559.0,52420492.0,54261654.0,62863997.0,59103690.0,61228643.0,109905797.0,112937374.0,99691817.0,8.339874e+08,483447777.0,1,1,1,1
1,2000045315,5896802984,556520160.0,707600228.0,575909896.0,520202480.0,765348500.0,237845000.0,232349323.0,109386240.0,351546193.0,164726130.0,626152250.0,311000220.0,5.158587e+09,738216364.0,2,2,2,1
2,2000045159,76347682,4448947.0,7517880.0,5758165.0,5380874.0,6528608.0,6413053.0,6931153.0,2396290.0,5816103.0,6855432.0,7628333.0,10672840.0,7.634768e+07,4.0,3,2,2,0
3,2000046520,647558407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,907735409.0,0.0,0.0,9.077354e+08,-260177002.0,2,3,3,1
4,2000046565,3340936844,33961305.0,480871725.0,905873987.0,214487912.0,151801286.0,252157974.0,126354176.0,68191653.0,660484077.0,27099557.0,86182754.0,203845222.0,3.211312e+09,129625216.0,2,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985,2000046633,22988920,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2954700.0,2.954700e+06,20034220.0,2,3,3,1
1986,2000044945,1802091898,108453762.0,106550220.0,150698252.0,42561555.0,165152128.0,190382304.0,151130637.0,169358678.0,168960276.0,221934866.0,178735475.0,166549636.0,1.820468e+09,-18375891.0,2,2,2,1
1987,2000045829,113184009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000e+00,113184009.0,4,2,2,2
1988,2000046267,2929819984,0.0,0.0,0.0,0.0,192945347.0,100547189.0,299271381.0,309586048.0,330413933.0,0.0,818467935.0,733341232.0,2.784573e+09,145246919.0,2,2,2,1


### normalisation et standardisation 
 vu que nous allons faire a random forest and/or a regression tree, il serait preferable de standardiser les données 

In [22]:
jointure_oversampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990 entries, 0 to 1989
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   1990 non-null   int64  
 1   ChAff                1990 non-null   int64  
 2   Total TVA Janvier    1990 non-null   float64
 3   Total TVA Février    1990 non-null   float64
 4   Total TVA Mars       1990 non-null   float64
 5   Total TVA Avril      1990 non-null   float64
 6   Total TVA Mai        1990 non-null   float64
 7   Total TVA Juin       1990 non-null   float64
 8   Total TVA Juillet    1990 non-null   float64
 9   Total TVA Août       1990 non-null   float64
 10  Total TVA Septembre  1990 non-null   float64
 11  Total TVA Octobre    1990 non-null   float64
 12  Total TVA Novembre   1990 non-null   float64
 13  Total TVA Décembre   1990 non-null   float64
 14  Total TVA anunelle   1990 non-null   float64
 15  feature              1990 non-null   f

In [23]:
columns_to_standardize = [
    'ChAff',
    'Total TVA Janvier', 'Total TVA Février', 'Total TVA Mars', 'Total TVA Avril',
    'Total TVA Mai', 'Total TVA Juin', 'Total TVA Juillet', 'Total TVA Août',
    'Total TVA Septembre', 'Total TVA Octobre', 'Total TVA Novembre', 'Total TVA Décembre',
    'Total TVA anunelle'
]

# Apply standardization
for column in columns_to_standardize:
    jointure_oversampled[column] = (jointure_oversampled[column] - jointure_oversampled[column].mean()) / jointure_oversampled[column].std()

# Now your columns are standardized
print(jointure_oversampled[columns_to_standardize].head())

      ChAff  Total TVA Janvier  Total TVA Février  Total TVA Mars  \
0 -0.067318          -0.182325          -0.196293       -0.211847   
1 -0.045202          -0.027252           0.040918       -0.040062   
2 -0.073312          -0.194596          -0.215895       -0.229681   
3 -0.070553          -0.195944          -0.218653       -0.231597   
4 -0.057545          -0.185650          -0.042253        0.069677   

   Total TVA Avril  Total TVA Mai  Total TVA Juin  Total TVA Juillet  \
0        -0.190568      -0.195906       -0.201176          -0.186317   
1        -0.039577       0.031948       -0.140590          -0.128361   
2        -0.207143      -0.210574       -0.216967          -0.205444   
3        -0.208895      -0.212660       -0.219083          -0.207814   
4        -0.139082      -0.164144       -0.135867          -0.164607   

   Total TVA Août  Total TVA Septembre  Total TVA Octobre  Total TVA Novembre  \
0       -0.196273            -0.205577          -0.192924           -0.

## MODEL SKLEARN KNN 

In [24]:
jointure_oversampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990 entries, 0 to 1989
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BP                   1990 non-null   int64  
 1   ChAff                1990 non-null   float64
 2   Total TVA Janvier    1990 non-null   float64
 3   Total TVA Février    1990 non-null   float64
 4   Total TVA Mars       1990 non-null   float64
 5   Total TVA Avril      1990 non-null   float64
 6   Total TVA Mai        1990 non-null   float64
 7   Total TVA Juin       1990 non-null   float64
 8   Total TVA Juillet    1990 non-null   float64
 9   Total TVA Août       1990 non-null   float64
 10  Total TVA Septembre  1990 non-null   float64
 11  Total TVA Octobre    1990 non-null   float64
 12  Total TVA Novembre   1990 non-null   float64
 13  Total TVA Décembre   1990 non-null   float64
 14  Total TVA anunelle   1990 non-null   float64
 15  feature              1990 non-null   f

In [25]:
X = jointure_oversampled.drop(columns=['target_encoded'])  # Drop the target column to get the features
y = jointure_oversampled['target_encoded']  # Get the target column

# Standardize the features (very important for logistic regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
# For multiclass problems, 'multinomial' option should be used
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the test set
predictions = log_reg.predict(X_test)

# Evaluate the model's performance
print("Logistic Regression Classification Report:")
print(classification_report(y_test, predictions))

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        43
           1       0.69      1.00      0.82       270
           2       0.83      0.06      0.11        85

    accuracy                           0.69       398
   macro avg       0.51      0.35      0.31       398
weighted avg       0.65      0.69      0.58       398

Confusion Matrix:
[[  0  42   1]
 [  0 270   0]
 [  0  80   5]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
