In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the dataset from the CSV file sitting next to the notebook.
data = pd.read_csv(filepath_or_buffer='Mendeley_49.csv')

In [3]:
# Quick structural overview of the loaded frame.
# A notebook cell only renders the value of its LAST expression, so the bare
# `data.head()` and `data.shape` lines were evaluated and thrown away. Emit
# them explicitly so every summary actually shows up in the cell output.
# `display` is the rich-output helper the IPython kernel exposes in notebook cells.
display(data.head())  # first rows, to eyeball the raw values
print(data.shape)     # (number of rows, number of columns)
data.info()           # prints column dtypes and non-null counts on its own
data.describe()       # last expression: rendered as the cell result

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 49 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   NumDots numeric                             10000 non-null  int64  
 1   SubdomainLevel numeric                      10000 non-null  int64  
 2   PathLevel numeric                           10000 non-null  int64  
 3   UrlLength numeric                           10000 non-null  int64  
 4   NumDash numeric                             10000 non-null  int64  
 5   NumDashInHostname numeric                   10000 non-null  int64  
 6   AtSymbol numeric                            10000 non-null  int64  
 7   TildeSymbol numeric                         10000 non-null  int64  
 8   NumUnderscore numeric                       10000 non-null  int64  
 9   NumPercent numeric                          10000 non-null  int64  
 10  NumQueryCom

Unnamed: 0,NumDots numeric,SubdomainLevel numeric,PathLevel numeric,UrlLength numeric,NumDash numeric,NumDashInHostname numeric,AtSymbol numeric,TildeSymbol numeric,NumUnderscore numeric,NumPercent numeric,...,IframeOrFrame numeric,MissingTitle numeric,ImagesOnlyInForm numeric,SubdomainLevelRT numeric,UrlLengthRT numeric,PctExtResourceUrlsRT numeric,AbnormalExtFormActionR numeric,ExtMetaScriptLinkRT numeric,PctExtNullSelfRedirectHyperlinksRT numeric,CLASS_LABEL
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2.4451,0.5868,3.3003,70.2641,1.818,0.1389,0.0003,0.0131,0.3232,0.0738,...,0.3396,0.0322,0.0304,0.9566,0.0202,0.3533,0.7932,0.1734,0.3141,0.5
std,1.346836,0.751214,1.863241,33.369877,3.106258,0.545744,0.017319,0.113709,1.11466,0.622248,...,0.473597,0.17654,0.171694,0.248037,0.820036,0.888908,0.521019,0.755771,0.897843,0.500025
min,1.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
25%,2.0,0.0,2.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.0,-1.0,1.0,0.0,-1.0,0.0
50%,2.0,1.0,3.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.5
75%,3.0,1.0,4.0,84.0,2.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,21.0,14.0,18.0,253.0,55.0,9.0,1.0,1.0,18.0,19.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Number of missing entries in each column (summing the boolean mask down the rows).
data.isnull().sum(axis=0)

NumDots numeric                               0
SubdomainLevel numeric                        0
PathLevel numeric                             0
UrlLength numeric                             0
NumDash numeric                               0
NumDashInHostname numeric                     0
AtSymbol numeric                              0
TildeSymbol numeric                           0
NumUnderscore numeric                         0
NumPercent numeric                            0
NumQueryComponents numeric                    0
NumAmpersand numeric                          0
NumHash numeric                               0
NumNumericChars numeric                       0
NoHttps numeric                               0
RandomString numeric                          0
IpAddress numeric                             0
DomainInSubdomains numeric                    0
DomainInPaths numeric                         0
HttpsInHostname numeric                       0
HostnameLength numeric                  

In [5]:
# Turn the visual "all zeros" check into an enforced invariant: the modelling
# cells further down apply no imputation, so stop here if anything is missing.
missing_per_column = data.isna().sum()
assert missing_per_column.sum() == 0, 'Dataset contains missing values'
missing_per_column  # still rendered as the cell result

NumDots numeric                               0
SubdomainLevel numeric                        0
PathLevel numeric                             0
UrlLength numeric                             0
NumDash numeric                               0
NumDashInHostname numeric                     0
AtSymbol numeric                              0
TildeSymbol numeric                           0
NumUnderscore numeric                         0
NumPercent numeric                            0
NumQueryComponents numeric                    0
NumAmpersand numeric                          0
NumHash numeric                               0
NumNumericChars numeric                       0
NoHttps numeric                               0
RandomString numeric                          0
IpAddress numeric                             0
DomainInSubdomains numeric                    0
DomainInPaths numeric                         0
HttpsInHostname numeric                       0
HostnameLength numeric                  

In [6]:
# Target vector: the CLASS_LABEL column.
y = data['CLASS_LABEL']
# Feature matrix: every remaining column.
X = data.drop(columns=['CLASS_LABEL'])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [9]:
# Linear-kernel support vector classifier fitted on the scaled training split.
svm = SVC(C=1, kernel='linear', random_state=42).fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Print each held-out metric under its heading.
svm_report = (
    ('SVM Model Accuracy:', accuracy_score),
    ('Confusion Matrix:\n', confusion_matrix),
    ('Classification Report:\n', classification_report),
)
for heading, metric in svm_report:
    print(heading, metric(y_test, y_pred_svm))

SVM Model Accuracy: 0.941
Confusion Matrix:
 [[923  65]
 [ 53 959]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       988
           1       0.94      0.95      0.94      1012

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000



In [10]:
# Single decision tree fitted on the scaled training split (seeded for repeatability).
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Print each held-out metric under its heading.
dt_report = (
    ('Decision Tree Model Accuracy:', accuracy_score),
    ('Confusion Matrix:\n', confusion_matrix),
    ('Classification Report:\n', classification_report),
)
for heading, metric in dt_report:
    print(heading, metric(y_test, y_pred_dt))

Decision Tree Model Accuracy: 0.9705
Confusion Matrix:
 [[957  31]
 [ 28 984]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97       988
           1       0.97      0.97      0.97      1012

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000



In [11]:
# Random forest of 100 trees fitted on the scaled training split (seeded for repeatability).
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Print each held-out metric under its heading.
rf_report = (
    ('Random Forest Model Accuracy:', accuracy_score),
    ('Confusion Matrix:\n', confusion_matrix),
    ('Classification Report:\n', classification_report),
)
for heading, metric in rf_report:
    print(heading, metric(y_test, y_pred_rf))

Random Forest Model Accuracy: 0.982
Confusion Matrix:
 [[970  18]
 [ 18 994]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       988
           1       0.98      0.98      0.98      1012

    accuracy                           0.98      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.98      0.98      0.98      2000



In [12]:
# Combine the predictions using majority voting.
# Stack the three prediction vectors: one row per model, one column per test sample.
ensemble_pred = np.array([y_pred_svm, y_pred_dt, y_pred_rf])

# For every sample, count how many models voted for each label that occurs and
# keep the label with the most votes. This is a single vectorized pass instead
# of a Python loop calling np.bincount per sample, it is sized from the stacked
# predictions themselves rather than from X_test, and it does not require the
# labels to be non-negative integers. np.unique returns the labels sorted and
# argmax returns the first maximum, so ties go to the smallest label - the same
# outcome np.bincount(votes).argmax() produced.
candidate_labels = np.unique(ensemble_pred)
votes_per_label = (
    ensemble_pred[np.newaxis, :, :] == candidate_labels[:, np.newaxis, np.newaxis]
).sum(axis=1)  # shape: (number of labels, number of samples)
final_pred = candidate_labels[votes_per_label.argmax(axis=0)]

# Evaluate the performance of the ensemble model
print('Ensemble Model Accuracy:', accuracy_score(y_test, final_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, final_pred))
print('Classification Report:\n', classification_report(y_test, final_pred))

Ensemble Model Accuracy: 0.9805
Confusion Matrix:
 [[965  23]
 [ 16 996]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       988
           1       0.98      0.98      0.98      1012

    accuracy                           0.98      2000
   macro avg       0.98      0.98      0.98      2000
weighted avg       0.98      0.98      0.98      2000



In [13]:
# Hard-voting ensemble of the three models: each one casts a class vote and the
# majority label wins.
# Per the scikit-learn documentation, fitting a VotingClassifier trains clones
# of the estimators passed in, so the `svm`, `dt` and `rf` objects fitted in
# the earlier cells are left as they are.
voting_clf = VotingClassifier(
    estimators=[
        ('svm', svm),
        ('dt', dt),
        ('rf', rf),
    ],
    voting='hard',
)

# Fit the voting classifier on the training data
voting_clf.fit(X_train, y_train)

# Evaluate the voting classifier on the test data
accuracy = voting_clf.score(X_test, y_test)
print('Voting Classifier Accuracy:', accuracy)

Voting Classifier Accuracy: 0.9805


In [14]:
# Stacked ensemble: the three models act as base learners and a logistic
# regression is the final estimator that combines their outputs into the
# final prediction.
stacking_clf = StackingClassifier(
    estimators=[
        ('svm', svm),
        ('dt', dt),
        ('rf', rf),
    ],
    final_estimator=LogisticRegression(),
)

# Fit the stacking classifier on the training data
stacking_clf.fit(X_train, y_train)

# Evaluate the stacking classifier on the test data
accuracy = stacking_clf.score(X_test, y_test)
print('Stacking Classifier Accuracy:', accuracy)

Stacking Classifier Accuracy: 0.9825
