# conseption et realisation d'un dashbord et d'un modele de detection de fraude des données de la Direction des Grandes Entreprise

## Preparation des données

### Importation des modules 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

### parametrages des modules 

In [2]:
sns.set_theme()
#pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

NameError: name 'sns' is not defined

### selection, affichage et filtrage des fichiers

In [None]:
TVA = pd.read_excel('VraiTVA.xlsx')
#TVA

In [None]:
ERA = pd.read_excel('VraiERA.xlsx')
#ERA

In [None]:
jointure = pd.merge(ERA, TVA, on='BP', how='inner')
#jointure = jointure[['BP', 'Wilaya', 'Code CNRC', 'Code ONS', 'Chiffre d’affaire (C.A)', 'Total TVA anuelle']]  

In [None]:
jointure.rename(columns={'Chiffre d’affaire (C.A)': 'ChAff'}, inplace=True)
jointure.rename(columns={'Total TVA anuelle': 'Total TVA anunelle'}, inplace=True)
#jointure.head()

### créer la colonne feature et cible


In [None]:
jointure['feature'] =  jointure['ChAff']  - jointure['Total TVA anunelle'] 

In [None]:
def estime(row):
    if row['ChAff'] == 0 or row['Total TVA anunelle'] ==0 :
        return 'sus'
    elif abs(row['feature']) > 1000:
        return 'fraude'
    else:
        return 'bon'

jointure['cible'] = jointure.apply(estime, axis=1)


In [None]:
jointure.head()

## Over sample de la donnée fraude dans le data set

In [None]:
# Get the counts of each class
fraud_count = jointure['cible'].value_counts()
print(fraud_count)

# Specify the class you want to oversample, in this case, 'fraude'
target_class = 'fraude'

# Define the number of samples you want for the 'fraude' class after oversampling
# Here, you can use a multiplier to specify how many times larger the 'fraude' class should be
multiplier = 2
target_sample_count = fraud_count[target_class] * multiplier

# Calculate the additional samples you need
additional_samples = target_sample_count - fraud_count[target_class]

# Filter the 'fraude' class
fraude_data = jointure[jointure['cible'] == target_class]

# Randomly sample from the 'fraude' data
oversampled_fraude = fraude_data.sample(n=additional_samples, replace=True, random_state=42)

# Append the oversampled data to the original DataFrame
jointure_oversampled = pd.concat([jointure, oversampled_fraude])

# Shuffle the dataset to mix the rows up
jointure_oversampled = jointure_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)


### Verification du nouveau data set

In [None]:
print(jointure_oversampled['cible'].value_counts())

In [None]:
print(jointure_oversampled.info())

## Encodage categoriel & Normalisation/Standardisation 

### encodage categoriel des wilaya

In [None]:
wilaya_count = jointure_oversampled['Wilaya'].value_counts()
print(wilaya_count)

In [None]:
# Clean and standardize the Wilaya names
jointure_oversampled['Standardized_Wilaya'] = jointure_oversampled['Wilaya'].str.replace('DIW ', '')

# Label Encoding
wilaya_mapping = {name: i + 1 for i, name in enumerate(jointure_oversampled['Standardized_Wilaya'].unique())}
jointure_oversampled['Wilaya_encoded'] = jointure_oversampled['Standardized_Wilaya'].map(wilaya_mapping)

# Display the DataFrame to check the new columns
print(jointure_oversampled[['Wilaya', 'Standardized_Wilaya', 'Wilaya_encoded']])

### encodage categoriel des Codes ONS et CNRC

In [None]:
# Label Encoding for 'code CNRC'
cnrc_unique = pd.unique(jointure_oversampled['Code CNRC'])
code_cnrc_mapping = {code: idx + 1 for idx, code in enumerate(cnrc_unique)}
jointure_oversampled['code CNRC_encoded'] = jointure_oversampled['Code CNRC'].map(code_cnrc_mapping)

# Label Encoding for 'code ONS'
ons_unique = pd.unique(jointure_oversampled['Code ONS'])
code_ons_mapping = {code: idx + 1 for idx, code in enumerate(ons_unique)}
jointure_oversampled['code ONS_encoded'] = jointure_oversampled['Code ONS'].map(code_ons_mapping)

# Display the DataFrame to check the new columns
print(jointure_oversampled)

### encodage categoriel de target ENFIN

In [None]:
target_mapping = {
    'bon': 0,      # Typically, the "normal" class can be zero
    'fraude': 1,   # Positive class can be one, especially if it's a binary classification
    'sus': 2       # Additional class can take the next integer
}

# Apply the mapping to the target column
jointure_oversampled['target_encoded'] = jointure_oversampled['cible'].map(target_mapping)

# Display the DataFrame to check the new column
print(jointure_oversampled[['cible', 'target_encoded']])

### verification du Data set :SOB: + supression des columns not needed 

In [None]:
jointure_oversampled

In [None]:
print(jointure_oversampled.columns)

In [None]:
jointure_oversampled = jointure_oversampled.drop(columns = ['Wilaya', 'Code CNRC', 'Code ONS','cible'])

In [None]:
jointure_oversampled = jointure_oversampled.drop(columns = ['Standardized_Wilaya'])

In [None]:
jointure_oversampled

### normalisation et standardisation 
 vu que nous allons faire a random forest and/or a regression tree, il serait preferable de standardiser les données 

In [None]:
jointure_oversampled.info()

In [None]:
columns_to_standardize = [
    'ChAff',
    'Total TVA Janvier', 'Total TVA Février', 'Total TVA Mars', 'Total TVA Avril',
    'Total TVA Mai', 'Total TVA Juin', 'Total TVA Juillet', 'Total TVA Août',
    'Total TVA Septembre', 'Total TVA Octobre', 'Total TVA Novembre', 'Total TVA Décembre',
    'Total TVA anunelle'
]

# Apply standardization
for column in columns_to_standardize:
    jointure_oversampled[column] = (jointure_oversampled[column] - jointure_oversampled[column].mean()) / jointure_oversampled[column].std()

# Now your columns are standardized
print(jointure_oversampled[columns_to_standardize].head())

## MODEL SKLEARN KNN 

In [None]:
jointure_oversampled.info()

In [None]:
X = jointure_oversampled.drop(columns=['target_encoded'])  # Drop the target column to get the features
y = jointure_oversampled['target_encoded']  # Get the target column

# Standardize the features (very important for logistic regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
# For multiclass problems, 'multinomial' option should be used
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=42)

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the test set
predictions = log_reg.predict(X_test)

# Evaluate the model's performance
print("Logistic Regression Classification Report:")
print(classification_report(y_test, predictions))

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))