In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import re
import joblib

In [2]:
# Load your labeled dataset
df = pd.read_csv('dataset/urls.csv')

# Load 6000 benign domains from top-1m.csv
benign_df = pd.read_csv('dataset/top-1m.csv', nrows=6000, header=None, names=['rank', 'url'])
benign_df['type'] = 'benign'  # Mark all as benign

# If your model expects full URLs, prepend 'http://' or 'https://'
benign_df['url'] = 'http://' + benign_df['url']

# Select only the columns needed for merging
benign_df = benign_df[['url', 'type']]

# Merge with your main dataset
df = pd.concat([df, benign_df], ignore_index=True)

df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [None]:
# Separate benign and malicious samples
benign = df[df['type'] == 'benign']
malicious = df[df['type'] != 'benign']

# Find the minimum count
min_count = min(len(benign), len(malicious))

# Downsample both to the same size
benign_balanced = benign.sample(n=min_count, random_state=42)
malicious_balanced = malicious.sample(n=min_count, random_state=42)

# Combine and shuffle
df_balanced = pd.concat([benign_balanced, malicious_balanced]).sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced['type'].value_counts())

def extract_features(url):
    from urllib.parse import urlparse
    features = {}
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    path = parsed.path
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    features['has_ip'] = 1 if re.search(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b', url) else 0
    features['has_at'] = 1 if '@' in url else 0
    features['has_hyphen'] = 1 if '-' in url else 0
    features['has_https'] = 1 if url.lower().startswith('https') else 0
    features['domain_length'] = len(domain)
    features['path_length'] = len(path)
    features['num_slash'] = url.count('/')
    features['num_params'] = url.count('=')
    keywords = ['login', 'update', 'verify', 'secure', 'account', 'banking']
    features['has_suspicious_keyword'] = 1 if any(k in url.lower() for k in keywords) else 0
    return pd.Series(features)

features_df = df_balanced['url'].apply(extract_features)
X = features_df
# Map all non-benign types to 1 (malicious)
y = df_balanced['type'].apply(lambda t: 0 if t == 'benign' else 1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [6]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9420415553983217
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     86821
           1       0.93      0.89      0.91     44618

    accuracy                           0.94    131439
   macro avg       0.94      0.93      0.93    131439
weighted avg       0.94      0.94      0.94    131439



In [None]:
import os
print('Saving model to:', os.path.abspath('model.pkl'))
joblib.dump(model, 'model.pkl')
print(df_balanced['type'].value_counts())
print('Model saved successfully!')

Saving model to: c:\Users\thara\OneDrive\Desktop\project\model.pkl
type
benign        434103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64
Model saved successfully!
