In [None]:
import pandas as pd
df = pd.read_csv(r'phishing_website_detector\PhiUSIIL_Phishing_URL_Dataset.csv',encoding="utf-8")

In [12]:
df = df.drop(['Bank', 'Pay', 'Crypto'], axis=1)

In [13]:
from sklearn.preprocessing import LabelEncoder

categorical_columns = ["Domain", "TLD", "Title", "URL"]
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Save encoders for later use

In [14]:
df['FILENAME'] = df['FILENAME'].astype(str)
df['FILENAME'] = df['FILENAME'].str.extract(r'(\d+\.?\d*)')
df['FILENAME'] = pd.to_numeric(df['FILENAME'], errors='coerce') 
df['FILENAME'].fillna(0, inplace=True) 
print(df.dtypes)  
print(df)

FILENAME                      float64
URL                             int64
URLLength                       int64
Domain                          int64
DomainLength                    int64
IsDomainIP                      int64
TLD                             int64
URLSimilarityIndex            float64
CharContinuationRate          float64
TLDLegitimateProb             float64
URLCharProb                   float64
TLDLength                       int64
NoOfSubDomain                   int64
HasObfuscation                  int64
NoOfObfuscatedChar              int64
ObfuscationRatio              float64
NoOfLettersInURL                int64
LetterRatioInURL              float64
NoOfDegitsInURL                 int64
DegitRatioInURL               float64
NoOfEqualsInURL                 int64
NoOfQMarkInURL                  int64
NoOfAmpersandInURL              int64
NoOfOtherSpecialCharsInURL      int64
SpacialCharRatioInURL         float64
IsHTTPS                         int64
LineOfCode  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FILENAME'].fillna(0, inplace=True)


In [15]:
X1 = df.loc[:,:'IsHTTPS']

In [16]:
df.head(1)

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,HasHiddenFields,HasPasswordField,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.0,209013,31,187114,24,0,231,100.0,1.0,0.522907,...,1,0,1,34,20,28,119,0,124,1


In [17]:
X1 = df.loc[:,:'IsHTTPS']

In [18]:
X1.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,NoOfLettersInURL,LetterRatioInURL,NoOfDegitsInURL,DegitRatioInURL,NoOfEqualsInURL,NoOfQMarkInURL,NoOfAmpersandInURL,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS
0,521848.0,209013,31,187114,24,0,231,100.0,1.0,0.522907,...,18,0.581,0,0.0,0,0,0,1,0.032,1
1,31372.0,223747,23,204805,16,0,254,100.0,0.666667,0.03265,...,9,0.391,0,0.0,0,0,0,2,0.087,1
2,597387.0,227579,29,209611,22,0,647,100.0,0.866667,0.028555,...,15,0.517,0,0.0,0,0,0,2,0.069,1
3,554095.0,205082,26,182115,19,0,231,100.0,1.0,0.522907,...,13,0.5,0,0.0,0,0,0,1,0.038,1
4,151578.0,198761,33,173664,26,0,503,100.0,1.0,0.079963,...,20,0.606,0,0.0,0,0,0,1,0.03,1


In [19]:
X2 =df.loc[:, 'Title':] 
X2 = X2.drop('label',axis = 1)

In [20]:
X2.head()

Unnamed: 0,Title,DomainTitleMatchScore,URLTitleMatchScore,HasFavicon,Robots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,...,HasSubmitButton,HasHiddenFields,HasPasswordField,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef
0,197871,0.0,0.0,0,1,1,0,0,0,0,...,1,1,0,1,34,20,28,119,0,124
1,93363,55.555556,55.555556,1,1,0,0,0,0,0,...,1,0,0,1,50,9,8,39,0,217
2,187338,46.666667,46.666667,0,1,1,0,0,1,0,...,1,1,0,1,10,2,7,42,2,5
3,81289,0.0,0.0,0,1,1,0,0,0,1,...,1,1,0,1,3,27,15,22,1,31
4,69782,100.0,100.0,0,1,1,1,1,1,0,...,1,1,0,1,244,15,34,72,1,85


In [21]:
Y = df.label

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X1,Y,test_size= 0.2,random_state=42)

In [23]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(X_train, Y_train)

In [24]:
from sklearn.ensemble import RandomForestRegressor
RF = RandomForestRegressor(n_estimators=50, random_state=42)
RF.fit(X_train, Y_train)

In [25]:
import pickle

# Save the model
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(RF, file)


In [28]:
RF.score(X_test,Y_test)

1.0

In [29]:
linear.score(X_test,Y_test)

0.9261400412766436

In [30]:
X1.dtypes

FILENAME                      float64
URL                             int64
URLLength                       int64
Domain                          int64
DomainLength                    int64
IsDomainIP                      int64
TLD                             int64
URLSimilarityIndex            float64
CharContinuationRate          float64
TLDLegitimateProb             float64
URLCharProb                   float64
TLDLength                       int64
NoOfSubDomain                   int64
HasObfuscation                  int64
NoOfObfuscatedChar              int64
ObfuscationRatio              float64
NoOfLettersInURL                int64
LetterRatioInURL              float64
NoOfDegitsInURL                 int64
DegitRatioInURL               float64
NoOfEqualsInURL                 int64
NoOfQMarkInURL                  int64
NoOfAmpersandInURL              int64
NoOfOtherSpecialCharsInURL      int64
SpacialCharRatioInURL         float64
IsHTTPS                         int64
dtype: objec

In [None]:
from sklearn.linear_model import LogisticRegression
LOG = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=1000)

In [None]:
from sklearn.linear_model import Lasso
LAS = Lasso(alpha=1.0, max_iter=1000)

In [None]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(criterion='gini', max_depth=None)

In [None]:
from sklearn.tree import DecisionTreeRegressor
DTR = DecisionTreeRegressor(criterion='mse', max_depth=None)

In [None]:
from sklearn.ensemble import RandomForestRegressor
RFF = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=42)

In [None]:
from sklearn.svm import SVR
SVR = SVR(kernel='rbf', C=1.0, gamma='scale')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
KN = KNeighborsClassifier(n_neighbors=5, metric='minkowski')

In [None]:
LOG.fit(X_train,Y_train)

In [None]:
LAS.fit(X_train,Y_train)

In [None]:
DT.fit(X_train,Y_train)


In [None]:
RFF.fit(X_train,Y_train)

In [None]:
SVR.fit(X_train,Y_train)

In [None]:
KN.fit(X_train,Y_train)

In [None]:
RFF.score(X_test,Y_test)

In [None]:
LAS.score(X_test,Y_test)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X2,Y,test_size= 0.2,random_state=42)

In [None]:
RF.fit(x_train,y_train)

In [None]:
RF.score(x_test,y_test)

In [None]:
LAS.fit(x_train,y_train)

In [None]:
LOG.fit(x_train,y_train)

In [None]:
DT.fit(x_train,y_train)

In [None]:
KN.fit(x_train,y_train)

In [None]:
X1.head()

In [None]:
import re
import tldextract
from urllib.parse import urlparse
import string
import hashlib

def generate_filename(url):
    hash_object = hashlib.md5(url.encode())
    filename = f"{int(hash_object.hexdigest(), 16) % 10**6}.txt"t"

def extract_url_features(user_url):
    """Extract various features from the given URL and format the output for readability."""
    
    user_url = user_url.strip()  # Remove leading/trailing whitespace
    parsed_url = urlparse(user_url)
    domain = parsed_url.netloc  # Extract the domain from the URL
    
    # Extract subdomain, domain, and TLD
    tld_info = tldextract.extract(user_url)
    tld = tld_info.suffix  
    subdomain = tld_info.subdomain  

    # Generate filename based on the URL
    filename = generate_filename(user_url)

    # URL features
    url_length = len(user_url) - 1
    domain_length = len(domain)
    num_subdomains = domain.count('.')
    is_https = 1 if parsed_url.scheme == "https" else 0

    # Character-based features
    num_letters = sum(char.isalpha() for char in user_url)
    num_digits = sum(char.isdigit() for char in user_url)
    num_special_chars = sum(char in string.punctuation for char in user_url)

    # Ratios
    letter_ratio = round(num_letters / url_length, 3) if url_length > 0 else 0
    digit_ratio = round(num_digits / url_length, 3) if url_length > 0 else 0
    special_char_ratio = round(num_special_chars / url_length, 3) if url_length > 0 else 0

    # Placeholder values (hardcoded)
    placeholder_100 = 100  
    placeholder_zeros = [0] * 7  
    placeholder_values = [0.032, 1, 558, 9381, 1]

    # Format output for better readability
    output = f"""
    FILENAME: {filename}
    URL: {user_url}
    URL Length: {url_length}
    Domain: {domain}
    Domain Length: {domain_length}
    Is Domain an IP: 0
    TLD: {tld}
    URL Similarity Index: {placeholder_100}
    Character Continuation Rate: 1
    Letter Ratio in URL: {letter_ratio}
    Digit Ratio in URL: {digit_ratio}
    Number of Letters in URL: {num_letters}
    Number of Digits in URL: {num_digits}
    Number of Special Characters in URL: {num_special_chars}
    Special Character Ratio: {special_char_ratio}
    Is HTTPS: {is_https}
    Additional Placeholder Values: {placeholder_values}
    """

    return output.strip()

# Example usage
url = "https://www.uni-mainz.de"
print(extract_url_features(url))

In [None]:
import pickle

In [None]:
extract_url_features('https://www.southbankmosaics.com')

['399208.txt',
 30,
 'www.southbankmosaics.com',
 24,
 0,
 'co.uk',
 2,
 100,
 0.8,
 0.0,
 24,
 0,
 0,
 0,
 0,
 6,
 0.2,
 1]

In [None]:
X1['IsDomainIP'].uniqueque()

array([0, 1], dtype=int64)