In [3]:
import pandas as pd


data = pd.read_csv('/kaggle/input/urldataset/data.csv')

# Quick exploratory overview of the URL dataset. Each entry pairs a heading
# with the summary object printed directly beneath it; headings after the
# first start with a newline so the sections are visually separated.
overview = [
    ("First few rows of the dataset:", data.head()),
    ("\nDimensions of the dataset:", data.shape),
    ("\nMissing values in the dataset:", data.isnull().sum()),
    ("\nDistribution of labels:", data['label'].value_counts()),
    ("\nStatistics of the dataset:", data.describe()),
    ("\nUnique values in the 'label' column:", data['label'].unique()),
]

for heading, summary in overview:
    print(heading)
    print(summary)


First few rows of the dataset:
                      url label
0  diaryofagameaddict.com   bad
1        espdesign.com.au   bad
2      iamagameaddict.com   bad
3           kalantzis.net   bad
4   slightlyoffcenter.net   bad

Dimensions of the dataset:
(420464, 2)

Missing values in the dataset:
url      0
label    0
dtype: int64

Distribution of labels:
label
good    344821
bad      75643
Name: count, dtype: int64

Statistics of the dataset:
                                                  url   label
count                                          420464  420464
unique                                         411247       2
top     d11m2p9mpffp32.cloudfront.net/main/web_zt.exe    good
freq                                               27  344821

Unique values in the 'label' column:
['bad' 'good']


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

data = pd.read_csv('/kaggle/input/urldataset/data.csv')

# Work on a seeded 50% subsample of the rows.
data = data.sample(frac=0.5, random_state=42)

# Raw URL strings are the features; the 'label' column is the target.
X, y = data['url'], data['label']

# Seeded hold-out split with 20% of the sampled rows reserved for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate classifiers, keyed by the display name used in the results table.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Multinomial Naive Bayes": MultinomialNB(),
}

# Hyperparameter grids, keyed like `models`. Keys use the step names that
# make_pipeline derives from the estimator class; an empty grid means the
# estimator is fitted with its defaults only.
param_grids = {
    "Logistic Regression": {'logisticregression__C': [0.1, 1.0, 10.0]},
    "Multinomial Naive Bayes": {},
}

# Metrics that are computed with average='weighted', in report column order.
weighted_metrics = {
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

results = []

for model_name, model in models.items():
    # Bag-of-n-grams features using unigrams and bigrams, followed by the classifier.
    pipeline = make_pipeline(CountVectorizer(ngram_range=(1, 2)), model)
    grid_search = GridSearchCV(
        pipeline,
        param_grids[model_name],
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Score the best pipeline found by the search on the held-out split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    row = {'Model': model_name, 'Accuracy': accuracy_score(y_test, y_pred)}
    for metric_name, metric_fn in weighted_metrics.items():
        row[metric_name] = metric_fn(y_test, y_pred, average='weighted')
    results.append(row)

# One row per model, one column per metric.
results_df = pd.DataFrame(results)

print(results_df)


                     Model  Accuracy  Precision    Recall  F1 Score
0      Logistic Regression  0.972055   0.972098  0.972055  0.971398
1  Multinomial Naive Bayes  0.974196   0.974286  0.974196  0.973615


In [17]:
import joblib
from pathlib import Path

# Persist the tuned pipelines in the directory the inference cell loads from
# ('trained_model/Logistic_Regression_model.pkl').
MODEL_DIR = Path('trained_model')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

for model_name, model in models.items():
    # Use the same feature extraction as the evaluation cell (unigrams and
    # bigrams) so the saved pipelines match the configuration whose metrics
    # were reported, rather than a unigram-only variant.
    pipeline = make_pipeline(CountVectorizer(ngram_range=(1, 2)), model)
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Replace spaces in the display name so the file name is
    # e.g. 'Logistic_Regression_model.pkl' instead of 'Logistic Regression_model.pkl'.
    model_path = MODEL_DIR / f"{model_name.replace(' ', '_')}_model.pkl"
    joblib.dump(best_model, model_path)

print("Models saved successfully!")


Models saved successfully!


In [7]:
import joblib

# Inputs to classify: predict is given a list of raw URL strings.
X_new = ["google.com"]

# Restore the persisted Logistic Regression pipeline from disk.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = joblib.load('trained_model/Logistic_Regression_model.pkl')

# Predict a label for each URL and show the result.
predictions = model.predict(X_new)
print(predictions)



['good']


In [9]:
import socket

# Define the hostname in this cell: the message below reads `ip_name`, which
# was previously never assigned here and raised NameError on a fresh kernel.
ip_name = "google.com"

# Resolve the hostname to an IPv4 address string via DNS.
ip_address = socket.gethostbyname(ip_name)
print(f"The IP address of {ip_name} is: {ip_address}")


The IP address of google.com is: 172.253.124.138


In [7]:
import socket

# Resolve the hostname to an IPv4 address string, then display it.
apple_ip = socket.gethostbyname("apple.com")
print(apple_ip)

17.253.144.10
