In [1]:
# Importation des bibliothèques nécessaires
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import classification_report, roc_curve, auc,accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [2]:
# chargement dataset
data = pd.read_csv("../data/BenignAndMaliciousDataset.csv")


In [3]:
# Analyse exploratoire des données temporelles
# Affichage des premières lignes du jeu de données

print(data.head())

   Domain DNSRecordType  MXDnsResponse  TXTDnsResponse  HasSPFInfo   
0    4455             A          False           False       False  \
1    4456             A          False           False       False   
2    4457             A          False           False       False   
3    4458             A          False           False       False   
4    4459             A          False           False       False   

   HasDkimInfo  HasDmarcInfo     Ip  DomainInAlexaDB  CommonPorts  ...   
0        False         False  16984            False        False  ...  \
1        False         False  16984            False        False  ...   
2        False         False  16984            False        False  ...   
3        False         False  16984            False        False  ...   
4        False         False  16984            False        False  ...   

  ConsoantRatio NumericRatio  SpecialCharRatio  VowelRatio  ConsoantSequence   
0           0.6          0.1               0.0        

In [4]:
# Infos sur le jeu de données (types, valeurs manquantes, etc.)
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Domain               90000 non-null  int64  
 1   DNSRecordType        90000 non-null  object 
 2   MXDnsResponse        90000 non-null  bool   
 3   TXTDnsResponse       90000 non-null  bool   
 4   HasSPFInfo           90000 non-null  bool   
 5   HasDkimInfo          90000 non-null  bool   
 6   HasDmarcInfo         90000 non-null  bool   
 7   Ip                   90000 non-null  int64  
 8   DomainInAlexaDB      90000 non-null  bool   
 9   CommonPorts          90000 non-null  bool   
 10  CountryCode          60948 non-null  object 
 11  RegisteredCountry    12226 non-null  object 
 12  CreationDate         90000 non-null  int64  
 13  LastUpdateDate       90000 non-null  int64  
 14  ASN                  90000 non-null  int64  
 15  HttpResponseCode     90000 non-null 

In [5]:
# Statistiques descriptives

print(data.describe())


             Domain            Ip  CreationDate  LastUpdateDate   
count  90000.000000  90000.000000  90000.000000    90000.000000  \
mean   44999.500000  13479.648033      1.933611        2.365744   
std    25980.906451   4160.266410      1.997232        1.935509   
min        0.000000      0.000000      0.000000        0.000000   
25%    22499.750000  11709.750000      0.000000        0.000000   
50%    44999.500000  14626.000000      0.000000        4.000000   
75%    67499.250000  16984.000000      4.000000        4.000000   
max    89999.000000  16984.000000      4.000000        4.000000   

                 ASN  HttpResponseCode  SubdomainNumber       Entropy   
count   90000.000000      90000.000000     90000.000000  90000.000000  \
mean    23335.808167          0.667033       103.069200      2.866844   
std     37004.865724          1.203285      4243.802846      0.488291   
min        -1.000000          0.000000         0.000000      0.000000   
25%        -1.000000          0

In [6]:
# Les colonnes à garder comme features important apres selection avec l'algo RFECV  avec comme nombre de parametre fixer a neuf
features = [
  'Domain', 'Ip', 'ASN', 
  'HttpResponseCode', 'SubdomainNumber', 'Entropy', 'EntropyOfSubDomains',
  'SpecialCharSequence', 'DomainLength'
]

# La colonne cible
target = 'Class'

# Séparation en features et target
X = data[features]  
y = data[target]


In [9]:
# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

# Entraîner le modèle  
svm = SVC(probability=True).fit(X_train, y_train) 
#prediction sur les donnee de test
y_pred = svm.predict(X_test)

#evaluer la performance 
# print(classification_report(y_test, y_pred))

print(accuracy_score(y_pred,y_test))





0.992


In [10]:
# Optimisation des hyperparametre
hyperparametres_svm = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}

# Initialisation de la GridSearchCV
grid_svm = GridSearchCV(SVC(probability=True), hyperparametres_svm, cv=5, scoring='accuracy')

# Adaptation du  GridSearch aux données d'entraînement
grid_svm.fit(X_train, y_train)

# Afficher hyperparamètres selectionnees comme mailleurs
print("Meilleurs hyperparamètres pour SVM:", grid_svm.best_params_)

# Prédire sur les données de test avec le meilleur modèle
y_pred_svm = grid_svm.predict(X_test)

# Évaluer la performance
print(classification_report(y_test, y_pred_svm))

In [None]:
import pickle

# Sauvegarder le modèle entraîné
pickle.dump(svm, open('svm_model.pkl','wb')) 

In [None]:
# Charger le modèle   
svm = pickle.load(open('svm_model.pkl','rb'))


In [None]:
# Evaluer sur données test    
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred))