In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the labelled dataset and shuffle its rows with a fixed seed so the row
# order is reproducible from run to run.
data = pd.read_csv('combined_dataset.csv').sample(frac=1, random_state=42)

# 'target_url' is the model input and 'label' is the prediction target.
X, y = data['target_url'], data['label']

# Hold out 20% of the rows for the final evaluation; the seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One row per candidate: (display name, unfitted estimator, hyperparameter grid).
# Grid keys use the '<pipeline step>__<parameter>' form expected by GridSearchCV;
# an empty grid means the estimator is evaluated with its default settings.
_candidates = [
    (
        "Logistic Regression",
        LogisticRegression(max_iter=1000),
        {'logisticregression__C': [0.1, 1.0, 10.0]},
    ),
    (
        "Multinomial Naive Bayes",
        MultinomialNB(),
        {},
    ),
]

# Estimators keyed by display name (insertion order follows the table above).
models = {name: estimator for name, estimator, _ in _candidates}

# Parameter grids for hyperparameter tuning, keyed by the same display names.
param_grids = {name: grid for name, _, grid in _candidates}


# Tune, persist, and evaluate one bag-of-n-grams pipeline per candidate model,
# collecting the held-out metrics into a summary table.
results = []

for model_name, model in models.items():
    # Unigram + bigram token counts feed the classifier. make_pipeline names
    # each step after its lowercased class name, which is what the
    # '<step>__<param>' keys in param_grids refer to.
    pipeline = make_pipeline(CountVectorizer(ngram_range=(1, 2)), model)
    param_grid = param_grids[model_name]

    # 3-fold cross-validated search on the training split only, so the test
    # split stays untouched until the final scoring below.
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Persist the tuned pipeline (vectorizer + classifier) so it can be reloaded
    # without retraining. The name is derived from the display name, e.g.
    # 'Multinomial_Naive_Bayes_model.joblib', which is the file the inference
    # cell loads. Previously the fitted estimator was discarded here and that
    # file was never produced by this notebook.
    model_path = model_name.replace(' ', '_') + '_model.joblib'
    joblib.dump(best_model, model_path)

    # Score the tuned pipeline on the held-out split. 'weighted' averages the
    # per-class metrics by class support.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append({'Model': model_name, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1})


# One row per model, in the order the models were evaluated.
results_df = pd.DataFrame(results)

print(results_df)


                     Model  Accuracy  Precision    Recall  F1 Score
0      Logistic Regression  0.992539   0.992555  0.992539  0.992505
1  Multinomial Naive Bayes  0.996896   0.996907  0.996896  0.996899


In [2]:
import joblib

# Serialized estimator used for the ad-hoc prediction in the following cell.
# NOTE(review): joblib files are pickle-based, so only load artifacts from a
# trusted source, and preferably ones written by the same scikit-learn version
# as this kernel — confirm which run produced this file.
MODEL_PATH = 'Multinomial_Naive_Bayes_model.joblib'
model = joblib.load(MODEL_PATH)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [12]:
# Spot-check the loaded model on a single hand-picked URL string.
# NOTE(review): the recorded output labels this mainstream domain as 'bad';
# that looks like a false positive — check whether the training URLs share
# this format (no scheme or path) before trusting the model on bare hostnames.
sample_urls = ["www.instagram.com"]
predicted_labels = model.predict(sample_urls)
print(predicted_labels)

['bad']
