In [442]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.naive_bayes import GaussianNB


In [443]:
# load the dataset 
df = pd.read_csv("prepared_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Target,fblk,nblk,anchor_exact_keyword,anchor_keyword,pa_score,ref_dom,outb_dom,mon_visits,...,h3kw,alt,altkw,linkin,linkout,urllen,urlkw,txtlen,txtkw,domain_age
0,0,0,466,957,451,948,79,403,10,16000000.0,...,0,5,1,136,14,90,1,2175,55,23
1,1,0,60000,10500,40702,44594,98,5300,123,6700000000.0,...,1,8,1,1723,642,43,1,7626,137,0
2,2,0,1800,2400,634,1493,62,805,35,3900000.0,...,1,23,9,828,74,57,1,2591,217,8
3,3,0,968,1300,589,1300,85,466,1,33100000.0,...,0,3,0,62,5,45,1,1318,41,36
4,4,0,3700,1000,254,761,70,492,6,24700000.0,...,7,48,4,139,49,54,1,2045,103,33


In [444]:
df.drop('Unnamed: 0',axis=1,inplace=True) # drop the unnamed column

In [445]:
target = df['Target']
cols = df.columns.drop('Target')
features = df[cols] 

In [446]:
X_train,X_test,y_train,y_test = train_test_split(features,target,train_size=0.8,random_state=42)

In [447]:
models = {
    'SVM':make_pipeline(StandardScaler(),SVC(C=100,kernel='poly',random_state=44)),
    'DecisionTree':make_pipeline(StandardScaler(),tree.DecisionTreeClassifier()),
    'Naive Bayes':make_pipeline(StandardScaler(),GaussianNB()),
    'Random Forest':make_pipeline(StandardScaler(),RandomForestClassifier(n_estimators=300))
}

In [448]:
results = []
for model_name,model in models.items():
    result = {}
    classifier = model.fit(X_train,y_train)
    y_pred = classifier.predict(X_test)
    conf_matrix = confusion_matrix(y_test,y_pred)
    c_report = classification_report(y_test,y_pred)
    result['model']= model_name
    result['confusion_matrix'] = conf_matrix
    result['classification_report'] = c_report
    results.append(result)


In [449]:
for result in results:
    print(result['model'])
    print(result['confusion_matrix'])
    print(result['classification_report'])

SVM
[[42  5  6]
 [10 24 18]
 [15 14 22]]
              precision    recall  f1-score   support

           0       0.63      0.79      0.70        53
           1       0.56      0.46      0.51        52
           2       0.48      0.43      0.45        51

    accuracy                           0.56       156
   macro avg       0.55      0.56      0.55       156
weighted avg       0.56      0.56      0.55       156

DecisionTree
[[44  3  6]
 [ 5 19 28]
 [16 12 23]]
              precision    recall  f1-score   support

           0       0.68      0.83      0.75        53
           1       0.56      0.37      0.44        52
           2       0.40      0.45      0.43        51

    accuracy                           0.55       156
   macro avg       0.55      0.55      0.54       156
weighted avg       0.55      0.55      0.54       156

Naive Bayes
[[15 18 20]
 [ 3 43  6]
 [ 1 43  7]]
              precision    recall  f1-score   support

           0       0.79      0.28      0.42

In [450]:
#fine tune the model 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [451]:
#for the RandomForest 
param_grid = {
    'svc__C': [0.1, 1, 10, 100],               # Regularization parameter
    'svc__gamma': ['scale', 'auto', 0.1, 1],  # Kernel coefficient for 'rbf'
    'svc__kernel': ['linear', 'rbf', 'poly']  # Different kernel types
}

In [452]:
pipeline = make_pipeline(
  StandardScaler(),
  SVC()
)

grid_search = GridSearchCV(estimator=pipeline,scoring='accuracy',param_grid=param_grid,verbose=2)

In [453]:
grid_search.fit(X_train,y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.0s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.0s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.0s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.0s
[CV] END ...svc__C=0.1, svc__gamma=scale, svc__kernel=linear; total time=   0.0s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.0s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.0s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.0s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.0s
[CV] END ......svc__C=0.1, svc__gamma=scale, svc__kernel=rbf; total time=   0.0s
[CV] END .....svc__C=0.1, svc__gamma=scale, svc__kernel=poly; total time=   0.0s
[CV] END .....svc__C=0.1, svc__gamma=scale, svc

KeyboardInterrupt: 

In [None]:
grid_search.best_params_


In [378]:
y_pred = grid_search.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
grid_search.n_splits_

In [None]:
print(confusion_matrix(y_test,y_pred))