In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Load CSV file
df = pd.read_csv('Newdata/NewTestData.csv')
# Display first 10 rows
print(df.head(10))

                                   URL  URLLength  \
0                 https://www.dasts.dk         19   
1        https://www.applethoughts.com         28   
2      http://www.yellowleos.phpnet.us         30   
3  https://www.madeinchicagomuseum.com         34   
4         https://www.operaamerica.org         27   
5              https://www.sedaily.com         22   
6    http://www.coinbasewalletones.com         32   
7           https://www.absolar.org.br         25   
8             https://www.whschool.org         23   
9     http://www.webmail.yourturbe.org         31   

                        Domain  DomainLength  IsDomainIP  TLD  \
0                 www.dasts.dk            12           0   dk   
1        www.applethoughts.com            21           0  com   
2     www.yellowleos.phpnet.us            24           0   us   
3  www.madeinchicagomuseum.com            27           0  com   
4         www.operaamerica.org            20           0  org   
5              www.sedaily

In [2]:
columns_to_drop = ["URL", "Domain", "TLD", "Title"]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

X = df.drop(columns=["label"])
y = df["label"].astype(int)

print("Label distribution:\n", y.value_counts())

features_to_drop = ['URLSimilarityIndex', 'LineOfCode']
df_filtered = df.drop(columns=features_to_drop)
X_filtered = df_filtered.drop(columns=["label"])
y = df_filtered["label"].astype(int)

feature_columns = X_filtered.columns.tolist()

# Save for later use
with open('feature_columns.json', 'w') as f:
    json.dump(feature_columns,f)

Label distribution:
 label
1    3110
0    1895
Name: count, dtype: int64


In [19]:
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
X_filtered, y, test_size=0.4, stratify=y, random_state=42)
model_filtered = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_filtered.fit(X_train_f, y_train_f)
y_pred_f = model_filtered.predict(X_test_f)
print("Baseline Accuracy:", accuracy_score(y_test_f, y_pred_f))
print(confusion_matrix(y_test_f, y_pred_f))
print(classification_report(y_test_f, y_pred_f))


Baseline Accuracy: 0.9975024975024975
[[ 754    4]
 [   1 1243]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       758
           1       1.00      1.00      1.00      1244

    accuracy                           1.00      2002
   macro avg       1.00      1.00      1.00      2002
weighted avg       1.00      1.00      1.00      2002



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [20]:
import requests
import re
import math
import pandas as pd
import tldextract
from bs4 import BeautifulSoup

def extract_features(url, expected_columns):
    try:
        headers = {'Accept-Encoding': 'identity'}
        response = requests.get(url, headers=headers, timeout=10)
        html = response.text
        status = 1
    except Exception as e:
        print("Error fetching URL:", e)
        html = ""
        status = 0

    soup = BeautifulSoup(html, 'html.parser')
    ext = tldextract.extract(url)
    domain = ext.domain + '.' + ext.suffix if ext.suffix else ext.domain

    def safe_div(a, b):
        return a / b if b != 0 else 0

    title = soup.title.string.strip().lower() if soup.title and soup.title.string else ''
    domain_match_score = int(domain.lower() in title)
    url_match_score = int(ext.domain.lower() in url.lower() and ext.domain.lower() in title)

    features = {
        'URLLength': len(url),
        'DomainLength': len(domain),
        'IsDomainIP': int(bool(re.match(r'\d+\.\d+\.\d+\.\d+', domain))),
        'CharContinuationRate': safe_div(len(re.findall(r'[a-zA-Z]{2,}', url)), len(url)),
        'TLDLegitimateProb': 0.5,  # Placeholder
        'URLCharProb': safe_div(sum(ord(c) for c in url), len(url)),
        'TLDLength': len(ext.suffix),
        'NoOfSubDomain': url.count('.') - 2 if url.count('.') > 1 else 0,
        'HasObfuscation': int("@" in url or "-" in url),
        'NoOfObfuscatedChar': url.count('@') + url.count('-'),
        'ObfuscationRatio': safe_div(url.count('@') + url.count('-'), len(url)),
        'NoOfLettersInURL': sum(c.isalpha() for c in url),
        'LetterRatioInURL': safe_div(sum(c.isalpha() for c in url), len(url)),
        'NoOfDegitsInURL': sum(c.isdigit() for c in url),
        'DegitRatioInURL': safe_div(sum(c.isdigit() for c in url), len(url)),
        'NoOfEqualsInURL': url.count('='),
        'NoOfQMarkInURL': url.count('?'),
        'NoOfAmpersandInURL': url.count('&'),
        'NoOfOtherSpecialCharsInURL': sum(not c.isalnum() and c not in "=?&" for c in url),
        'SpacialCharRatioInURL': safe_div(sum(not c.isalnum() for c in url), len(url)),
        'IsHTTPS': int(url.lower().startswith("https")),
        'LargestLineLength': max([len(line) for line in html.split('\n')]) if html else 0,
        'HasTitle': int(bool(title)),
        'DomainTitleMatchScore': domain_match_score * 100,
        'URLTitleMatchScore': url_match_score * 100,
        'HasFavicon': int(bool(soup.find("link", rel=lambda x: x and 'icon' in x.lower()))),
        'Robots': status,
        'IsResponsive': 1,
        'NoOfURLRedirect': 0,
        'NoOfSelfRedirect': 0,
        'HasDescription': int(bool(soup.find("meta", attrs={"name": "description"}))),
        'NoOfPopup': len(soup.find_all("script", string=re.compile(r"alert\s*\("))),
        'NoOfiFrame': len(soup.find_all("iframe")),
        'HasExternalFormSubmit': int(any("http" in form.get("action", "") and ext.domain not in form.get("action", "") for form in soup.find_all("form"))),
        'HasSocialNet': int(any(site in html.lower() for site in ['facebook', 'twitter', 'linkedin', 'instagram'])),
        'HasSubmitButton': int(bool(soup.find("input", {"type": "submit"}))),
        'HasHiddenFields': int(bool(soup.find("input", {"type": "hidden"}))),
        'HasPasswordField': int(bool(soup.find("input", {"type": "password"}))),
        'Bank': int("bank" in url.lower()),
        'Pay': int("pay" in url.lower()),
        'Crypto': int("crypto" in url.lower()),
        'HasCopyrightInfo': int("copyright" in html.lower()),
        'NoOfImage': len(soup.find_all("img")),
        'NoOfCSS': len(soup.find_all("link", {"rel": "stylesheet"})),
        'NoOfJS': len(soup.find_all("script")),
        'NoOfSelfRef': len([a for a in soup.find_all("a", href=True) if ext.domain in a["href"]]),
        'NoOfEmptyRef': len([a for a in soup.find_all("a", href=True) if a["href"] in ["#", ""]]),
        'NoOfExternalRef': len([a for a in soup.find_all("a", href=True) if "http" in a["href"] and ext.domain not in a["href"]])
    }

    # Ensure all expected columns are included
    for col in expected_columns:
        if col not in features:
            features[col] = 0

    return pd.DataFrame([[features[col] for col in expected_columns]], columns=expected_columns)


In [21]:
known_legit_urls = [
    "https://www.facebook.com/",
    "https://accounts.google.com/",
    "https://twitter.com/",
    "https://www.linkedin.com/",
    "https://github.com/",
    "https://login.microsoftonline.com/",
    "https://www.paypal.com/",
    "https://www.instagram.com/",
    "https://www.apple.com/",
    "https://www.amazon.com/",
    "https://www.netflix.com/",
    "https://www.youtube.com/",
    "https://www.wikipedia.org/"
]
new_legit_samples = []
for url in known_legit_urls:
        try:
            feat = extract_features(url, feature_columns)
            feat["label"] = 1
            new_legit_samples.append(feat)
        except Exception as e:
            print(f"Failed to process {url}: {e}")
df_new_legit = pd.concat(new_legit_samples, ignore_index=True)


In [22]:
df_filtered["label"] = y
df_combined = pd.concat([df_filtered, df_new_legit], ignore_index=True)
X_new = df_combined.drop(columns=["label"])
y_new = df_combined["label"].astype(int)
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
X_new, y_new, test_size=0.25, stratify=y_new, random_state=42
)
model_improved = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_improved.fit(X_train_new, y_train_new)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [23]:
y_pred_new = model_improved.predict(X_test_new)
print("Improved Accuracy:", accuracy_score(y_test_new, y_pred_new))
print(confusion_matrix(y_test_new, y_pred_new))
print(classification_report(y_test_new, y_pred_new))


Improved Accuracy: 0.9952191235059761
[[471   3]
 [  3 778]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       474
           1       1.00      1.00      1.00       781

    accuracy                           1.00      1255
   macro avg       0.99      0.99      0.99      1255
weighted avg       1.00      1.00      1.00      1255



In [24]:
with open('feature_columns.json', 'r') as f:
    feature_columns = json.load(f)


url = input("Enter a URL to check: ")
features_df = extract_features(url, feature_columns)

print("Extracted Features:")
print(features_df.T)

pred = model_improved.predict(features_df)[0]
print("\nPrediction:", "Legitimate" if pred == 1 else "Malicious")


Extracted Features:
                                        0
URLLength                       23.000000
DomainLength                    10.000000
IsDomainIP                       0.000000
CharContinuationRate             0.173913
TLDLegitimateProb                0.500000
URLCharProb                     94.173913
TLDLength                        3.000000
NoOfSubDomain                    0.000000
HasObfuscation                   0.000000
NoOfObfuscatedChar               0.000000
ObfuscationRatio                 0.000000
NoOfLettersInURL                17.000000
LetterRatioInURL                 0.739130
NoOfDegitsInURL                  0.000000
DegitRatioInURL                  0.000000
NoOfEqualsInURL                  0.000000
NoOfQMarkInURL                   0.000000
NoOfAmpersandInURL               0.000000
NoOfOtherSpecialCharsInURL       6.000000
SpacialCharRatioInURL            0.260870
IsHTTPS                          1.000000
LargestLineLength           199244.000000
HasTitle      