In [1]:
# 📦 Install required libraries
!pip install liac-arff tldextract beautifulsoup4 requests python-whois


Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Building wheels for collected packages: liac-arff
  Building wheel for liac-arff (setup.py) ... [?25l[?25hdone
  Created wheel for liac-arff: filename=li

In [2]:
# 📚 Imports
import arff
import pandas as pd
import numpy as np
import requests
import tldextract
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import whois
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from datetime import datetime



In [3]:
# Load the ARFF training set; column names are taken from the ARFF attribute list.
with open('phishing+websites/Training Dataset.arff', 'r') as f:
    arff_data = arff.load(f)
df = pd.DataFrame(arff_data['data'], columns=[attr[0] for attr in arff_data['attributes']])

# Clean and prepare
df = df.replace('?', np.nan)
for col in df.columns:
    if col != 'Result':
        # Coerce first, then fill: values that fail to parse become 0 as well,
        # instead of surviving as NaN in the feature matrix.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

# The label is deliberately NOT filled: a missing/unparseable label stays NaN and
# is dropped by the filter below rather than being silently counted as class 0.
df['Result'] = pd.to_numeric(df['Result'], errors='coerce').replace(-1, 0)
df = df[df['Result'].isin([0, 1])].copy()
df['Result'] = df['Result'].astype(int)

X = df.drop(columns=['Result'])
y = df['Result']
feature_columns = X.columns.tolist()
# Show the first few rows
df.head()


Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,0
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,0
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,0
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,0
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [5]:
# 🧹 Preprocessing
# Replace '?' markers and missing values with 0, then force every column to numeric.
df = df.replace('?', np.nan).fillna(0)
numeric_features = {
    name: pd.to_numeric(df[name], errors='coerce')
    for name in df.columns
    if name != 'Result'
}
df = df.assign(**numeric_features)

# Map the -1 label to 0 and keep only rows whose label is 0 or 1.
df['Result'] = pd.to_numeric(df['Result'], errors='coerce').replace(-1, 0)
has_valid_label = df['Result'].isin([0, 1])
df = df.loc[has_valid_label]


In [6]:
# 📈 Train/Test split
# Separate the label column from the features, then hold out 20% of the rows.
y = df['Result']
X = df.drop('Result', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [7]:
# 🌲 Train Random Forest
# 100-tree forest with a fixed seed; the fit call is the cell's last expression,
# so the fitted estimator is displayed as the cell output.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X=X_train, y=y_train)


In [8]:
# 🔍 Evaluate
# Score the held-out split.
y_pred = rf_model.predict(X_test)
class_names = ["Phishing (0)", "Legitimate (1)"]
class_report = classification_report(y_test, y_pred, target_names=class_names)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print accuracy first, then the confusion matrix and the per-class report.
print("📈 Model Evaluation Metrics:\n")
print(f"✅ Accuracy: {accuracy:.4f}")
for heading, payload in (
    ("\n🧮 Confusion Matrix:\n", conf_matrix),
    ("\n📋 Classification Report:\n", class_report),
):
    print(heading, payload)


📈 Model Evaluation Metrics:

✅ Accuracy: 0.9670

🧮 Confusion Matrix:
 [[ 909   47]
 [  26 1229]]

📋 Classification Report:
                 precision    recall  f1-score   support

  Phishing (0)       0.97      0.95      0.96       956
Legitimate (1)       0.96      0.98      0.97      1255

      accuracy                           0.97      2211
     macro avg       0.97      0.97      0.97      2211
  weighted avg       0.97      0.97      0.97      2211



In [9]:
# 🔍 WHOIS Feature Engineering
def get_whois_info(url):
    """Look up WHOIS data for a URL's registered domain and derive two day counts.

    Parameters
    ----------
    url : str
        URL (or host name) whose registered domain is queried.

    Returns
    -------
    tuple of (int, int)
        ``(domain_age, registration_length)`` in whole days. ``domain_age`` is
        now minus the creation date; ``registration_length`` is the expiration
        date minus the creation date. A value is ``0`` when the date(s) it needs
        are unavailable, and ``(0, 0)`` is returned when the lookup fails.

    Notes
    -----
    Best-effort: ordinary exceptions raised by the lookup are swallowed so one
    unreachable domain does not abort feature extraction. ``KeyboardInterrupt``
    and ``SystemExit`` are no longer swallowed.
    """

    def _first_datetime(value):
        # WHOIS fields may hold one value or a list of values; keep the first
        # entry that really is a datetime and ignore anything else.
        candidates = value if isinstance(value, (list, tuple)) else [value]
        for candidate in candidates:
            if isinstance(candidate, datetime):
                return candidate
        return None

    try:
        domain = tldextract.extract(url).registered_domain
        if not domain:
            # Nothing to query (e.g. empty input).
            return 0, 0

        whois_info = whois.whois(domain)
        creation_date = _first_datetime(whois_info.creation_date)
        expiration_date = _first_datetime(whois_info.expiration_date)

        if creation_date is None:
            return 0, 0

        # datetime.now(tz) is naive when tz is None and aware otherwise, so "now"
        # always matches the creation date and the subtraction cannot raise
        # TypeError for timezone-aware records.
        domain_age = (datetime.now(creation_date.tzinfo) - creation_date).days

        if expiration_date is None:
            return domain_age, 0

        if (creation_date.tzinfo is None) != (expiration_date.tzinfo is None):
            # Mixed aware/naive pair: compare wall-clock values instead of failing.
            creation_date = creation_date.replace(tzinfo=None)
            expiration_date = expiration_date.replace(tzinfo=None)

        registration_length = (expiration_date - creation_date).days
        return domain_age, registration_length
    except Exception:
        return 0, 0


In [11]:
# 🌐 Your URLs
# Hand-picked URLs used as the phishing class (label 0) of the custom dataset.
phish_urls = [
    "http://toncoinsp.com",
    "http://www.paypa1.com",
    "http://www.faceb00k-login.com",
    "http://www.apple-support-security-alert.com",
    "http://www.bankofamerica-login.com",
    "http://www.g00gle-login-attempt.com",
    "http://paypal-opladen.be",
    "http://login.microsoftonline.ccisystems.us",
    "http://login-amazon-account.com",
    "http://apple-grx-support-online.com",
    "http://renew-netflix.com",
    "http://allegrolokalnie.pl-1751.cfd",
    "http://mondiai-rps.com/index.php",
]

# Hand-picked URLs used as the legitimate class (label 1) of the custom dataset.
legit_urls = [
    "https://www.google.com",
    "https://www.wikipedia.org",
    "https://www.bbc.com",
    "https://www.amazon.com",
    "https://www.stackoverflow.com",
    "https://www.nytimes.com",
    "https://www.khanacademy.org",
    "https://www.coursera.org",
    "https://www.nasa.gov",
    "https://www.who.int",
]

# 🔍 Feature extraction
def extract_features_from_url(url):
    """Build the feature dictionary for one URL (lexical checks plus WHOIS ages).

    Parameters
    ----------
    url : str
        URL to analyse. Input without ``//`` (e.g. ``example.com/login``) is
        parsed as if it started with ``//`` so the host still ends up in the
        network-location part.

    Returns
    -------
    dict
        Keys, in this order: ``URL_Length``, ``URL_Depth``, ``Redirection``,
        ``Https``, ``TinyURL``, ``Prefix/Suffix``, ``Domain_Age``,
        ``Domain_Registration_Length``. The two WHOIS-based values are ``0``
        when the lookup fails or the dates are unavailable.
    """
    # urlparse only fills netloc when the authority is introduced by "//";
    # without it the host would be treated as part of the path.
    parsed = urlparse(url if '//' in url else '//' + url)
    domain = parsed.netloc
    path = parsed.path
    domain_info = tldextract.extract(url).top_domain_under_public_suffix

    def _first_datetime(value):
        # WHOIS fields may hold one value or a list of values; keep the first
        # entry that really is a datetime and ignore anything else.
        candidates = value if isinstance(value, (list, tuple)) else [value]
        for candidate in candidates:
            if isinstance(candidate, datetime):
                return candidate
        return None

    def _whois_ages(registrable_domain):
        # Best-effort WHOIS lookup returning (domain_age, registration_length) in days.
        if not registrable_domain:
            return 0, 0
        try:
            info = whois.whois(registrable_domain)
            created = _first_datetime(info.creation_date)
            expires = _first_datetime(info.expiration_date)

            if created is None:
                return 0, 0

            # datetime.now(tz) is naive for tz=None and aware otherwise, so the
            # subtraction works for both naive and timezone-aware records.
            age = (datetime.now(created.tzinfo) - created).days

            if expires is None:
                return age, 0

            if (created.tzinfo is None) != (expires.tzinfo is None):
                # Mixed aware/naive pair: compare wall-clock values instead of failing.
                created = created.replace(tzinfo=None)
                expires = expires.replace(tzinfo=None)

            return age, (expires - created).days
        except Exception:
            # Lookup failures must not abort feature extraction; KeyboardInterrupt
            # and SystemExit still propagate.
            return 0, 0

    domain_age, reg_length = _whois_ages(domain_info)

    return {
        "URL_Length": len(url),
        "URL_Depth": path.count('/'),
        # A "//" anywhere other than the first "://" separator.
        "Redirection": int('//' in url.replace('://', '', 1)),
        "Https": int(parsed.scheme == 'https'),
        # NOTE(review): this flags any URL shorter than 20 characters; confirm
        # that a pure length check is the intended meaning of "TinyURL".
        "TinyURL": int(len(url) < 20),
        "Prefix/Suffix": int('-' in domain),
        "Domain_Age": domain_age,
        "Domain_Registration_Length": reg_length
    }

# 🔗 Extract features and build dataset
# One feature row per URL; Label 0 marks the phishing list, Label 1 the legitimate list.
dataset = []
for label, urls in ((0, phish_urls), (1, legit_urls)):
    for url in urls:
        features = extract_features_from_url(url)
        features['Label'] = label
        dataset.append(features)

# Persist the extracted features so the table can be inspected or reused.
df = pd.DataFrame(dataset)
df.to_csv("phishing_custom_features.csv", index=False)
print("✅ Dataset created and saved.")

# 🧠 Train Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Features are every column except the label.
y = df["Label"]
X = df.drop(columns=["Label"])

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# fit() returns the estimator itself, so `rf` is the fitted model.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Report accuracy, the per-class report and the confusion matrix, in that order.
for heading, metric in (
    ("✅ Accuracy:", accuracy_score),
    ("\n📊 Classification Report:\n", classification_report),
    ("\n🧮 Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))



✅ Dataset created and saved.
✅ Accuracy: 0.8571428571428571

📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.67      1.00      0.80         2

    accuracy                           0.86         7
   macro avg       0.83      0.90      0.84         7
weighted avg       0.90      0.86      0.86         7


🧮 Confusion Matrix:
 [[4 1]
 [0 2]]


In [12]:
# 🔍 Predict a new URL
def predict_url(url, model):
    """Classify a single URL with a fitted model.

    Parameters
    ----------
    url : str
        URL to classify; it is passed to ``extract_features_from_url``.
    model : estimator
        Fitted classifier exposing ``predict``. If it carries
        ``feature_names_in_``, those names fix the column order; otherwise the
        notebook-level ``X.columns`` is used, as before.

    Returns
    -------
    str
        The "Legitimate" message when the predicted label is 1, otherwise the
        "Phishing" message.

    Raises
    ------
    KeyError
        If the extracted features lack a column the model expects.
    """
    features = extract_features_from_url(url)

    # Prefer the column names stored on the model itself: the global `X` is
    # reassigned by several cells and may no longer match what `model` was fit on.
    expected_columns = getattr(model, "feature_names_in_", None)
    if expected_columns is None:
        expected_columns = X.columns

    # Single-row frame with columns in exactly the order used for training.
    df_features = pd.DataFrame([features])[list(expected_columns)]

    prediction = model.predict(df_features)[0]
    result = "🛡️ Legitimate" if prediction == 1 else "⚠️ Phishing"
    return result

# 🔗 Try it!
test_url = input("🔗 Enter a URL to check if it's phishing: ").strip()
if test_url:
    prediction_result = predict_url(test_url, rf)
    print(f"\n🔍 The URL is predicted to be: {prediction_result}")
else:
    # Empty input carries no URL; skip feature extraction (and its WHOIS lookup)
    # rather than classifying an empty string.
    prediction_result = None
    print("\n⚠️ No URL entered; nothing to classify.")



🔍 The URL is predicted to be: ⚠️ Phishing
