In [3]:
import pandas as pd
from scipy.io import arff

In [4]:
# Function to convert byte strings in the data to numeric types
def convert_byte_strings_to_numbers(data):
    """Return the rows of ``data`` as tuples with byte-string cells parsed as numbers.

    Each ``bytes`` cell is decoded as UTF-8 and parsed as an ``int``; when the
    text is not an integer literal (e.g. ``b'1.5'``) it is parsed as a
    ``float`` instead. Cells that are not ``bytes`` are passed through
    unchanged.

    Parameters
    ----------
    data : iterable of iterables
        Row-oriented records; each row is iterated cell by cell.

    Returns
    -------
    list of tuple
        One tuple per input row, in the original order.

    Raises
    ------
    ValueError
        If a byte-string cell is neither an integer nor a float literal.
    """
    def _parse_number(raw):
        # Prefer int so integer-coded values stay integers; only fall back
        # to float when the text cannot be read as an integer.
        text = raw.decode('utf-8')
        try:
            return int(text)
        except ValueError:
            return float(text)

    return [
        tuple(
            _parse_number(value) if isinstance(value, bytes) else value
            for value in row
        )
        for row in data
    ]

# Load the ARFF file; loadarff hands back the records together with a
# metadata object (used below for the column names).
# NOTE(review): the path is relative to the notebook's working directory.
data, meta = arff.loadarff('../data/Training Dataset.arff')
# Convert byte strings to numbers (non-bytes cells are passed through as-is)
converted_data = convert_byte_strings_to_numbers(data)

# Convert to a pandas DataFrame, one column per attribute name from the metadata
df = pd.DataFrame(converted_data, columns=meta.names())

In [5]:
# Preview the first rows to sanity-check the load and the numeric conversion
df.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [20]:
# Recode the -1 label in the Result column to 0.
# NOTE(review): presumably done so the target is 0/1 for estimators that
# expect labels starting at 0 — confirm. This mutates df in place, so the
# df.head() preview shown earlier no longer reflects the Result column.
df.loc[df['Result'] == -1, 'Result'] = 0

In [21]:
# Feature matrix: every column except the Result label
X = df.drop('Result',axis=1)
# Target vector: the Result label
y = df['Result']

In [22]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; stratify on y so the split keeps the
# class proportions, and fix random_state so the split is repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=123,stratify=y,test_size=0.2)

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

In [52]:
# Candidate classifiers keyed by display name.
# The names double as lookup keys into hyperparams_classification, so they
# must match that dict's keys exactly (spelling and capitalisation);
# otherwise hyperparameter_tuning skips the model without searching.
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "KNN Classifier": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Adaboost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "CatBoost Classifier": CatBoostClassifier(verbose=False),
    "xgboost Classifier": XGBClassifier(),
    "LightGBM Classifier": LGBMClassifier(),
    "Neural Network": MLPClassifier()
}

In [53]:
from sklearn.metrics import accuracy_score

In [54]:
def evaluate_model(models,X_train,y_train,X_test,y_test):
    """Fit every candidate on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place (the same objects are returned).
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Features passed to ``predict`` and the labels the predictions are
        compared against with ``accuracy_score``.

    Returns
    -------
    dict
        ``{name: [fitted_estimator, accuracy]}`` in the iteration order of
        ``models``.
    """
    scoreboard = {}
    for name, estimator in models.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        scoreboard[name] = [estimator, accuracy_score(y_test, predictions)]
    return scoreboard

In [55]:
# Fit and score every candidate; result maps model name -> [fitted model, test accuracy].
# NOTE(review): this trains all models in `models`, so the cell may be slow to re-run.
result = evaluate_model(models,X_train,y_train,X_test,y_test)



In [56]:
def find_best_model(evaluated_models):
    """Pick the entry with the highest score.

    Parameters
    ----------
    evaluated_models : dict
        Maps a model name to a ``(model, score)`` pair (any two-item
        sequence), as produced by ``evaluate_model``.

    Returns
    -------
    tuple
        ``(best_name, best_model, best_score)``. On ties the first entry in
        iteration order wins, because only a strictly greater score replaces
        the current best.

    Raises
    ------
    ValueError
        If no entry can be selected: the mapping is empty, or no score
        compares greater than negative infinity (e.g. every score is NaN).
    """
    best_model_key = None
    best_model_obj = None
    best_score = -float('inf')
    found = False

    for key, (model, score) in evaluated_models.items():
        if score > best_score:
            best_score = score
            best_model_obj = model
            best_model_key = key
            found = True

    if not found:
        # Fail with a clear message instead of an opaque KeyError on None.
        raise ValueError("find_best_model: no model with a comparable score was provided")

    return best_model_key, best_model_obj, best_score

In [57]:
best_model_key, best_model_obj, best_score = find_best_model(result)

In [58]:
# Grid-search space for each candidate, keyed by the model's display name.
# Inner keys are estimator parameter names; values are the settings to try.
hyperparams_classification = {
    "Logistic Regression": dict(),  # no values listed to search over
    "KNN Classifier": dict(
        n_neighbors=[3, 5, 7],
        weights=['uniform', 'distance'],
    ),
    "Decision Tree Classifier": dict(
        max_depth=[3, 5, 7],
    ),
    "Random Forest Classifier": dict(
        n_estimators=[100, 500],
        max_depth=[3, 5, 7],
    ),
    "Adaboost Classifier": dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 1.0],
    ),
    "Gradient Boosting Classifier": dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 1.0],
        max_depth=[3, 5, 7],
    ),
    "xgboost Classifier": dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 1.0],
        max_depth=[3, 5, 7],
    ),
    "LightGBM Classifier": dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 1.0],
        num_leaves=[31, 127],
    ),
    "CatBoost Classifier": dict(
        iterations=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.3],
        depth=[3, 5, 7],
    ),
    "SVM": dict(
        C=[0.1, 1.0, 10.0],
        kernel=['linear', 'rbf'],
    ),
    "Naive Bayes": dict(),  # no values listed to search over
    "Neural Network": dict(
        hidden_layer_sizes=[(50,), (100,), (100, 50)],
        activation=['relu', 'tanh'],
        solver=['adam', 'sgd'],
        alpha=[0.0001, 0.001, 0.01],
    ),
}


In [61]:
from sklearn.model_selection import GridSearchCV

In [65]:
def hyperparameter_tuning(best_model_name, best_model, X_train, y_train):
    """Grid-search the selected model over its entry in ``hyperparams_classification``.

    Parameters
    ----------
    best_model_name : str
        Key used to look up the parameter grid in the module-level
        ``hyperparams_classification`` dict.
    best_model
        Estimator handed to ``GridSearchCV``.
    X_train, y_train
        Training data the search is fitted on (5-fold CV, scored by accuracy).

    Returns
    -------
    The search's ``best_estimator_`` when a grid is registered for
    ``best_model_name``; otherwise ``best_model`` itself, unchanged.

    Warns
    -----
    UserWarning
        When no grid is registered under ``best_model_name``, so a misspelled
        or missing key does not silently disable tuning.
    """
    import warnings

    if best_model_name not in hyperparams_classification:
        warnings.warn(
            f"No hyperparameter grid registered for {best_model_name!r}; "
            "returning the model without tuning.",
            stacklevel=2,
        )
        return best_model

    param_grid = hyperparams_classification[best_model_name]
    grid_search = GridSearchCV(estimator=best_model, param_grid=param_grid, scoring='accuracy', cv=5)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_
In [66]:
# Tune the winning model on the training split; if no grid is registered under
# best_model_key, hyperparameter_tuning returns best_model_obj unchanged.
final_model = hyperparameter_tuning(best_model_key, best_model_obj, X_train, y_train)

In [68]:
# Score the tuned model on the held-out split. Arguments follow the
# (y_true, y_pred) order used in evaluate_model.
# NOTE(review): the same X_test/y_test were already used to pick the best
# model, so this accuracy is likely an optimistic estimate.
y_pred = final_model.predict(X_test)
accuracy_score(y_test,y_pred)

0.9710538218000905