In [13]:
import pandas as pd # Helps in data manipulation and analysis using DataFrames and Series.

In [14]:
# Load the training data
train = pd.read_csv("../data/train_encoded.csv")
test = pd.read_csv("../data/test_encoded.csv")

# Drop 'URL' column before separating features and target
X_train = train.drop(columns=['Label', 'URL'])  # <<< See: Dropping URL
y_train = train['Label']

X_test = test.drop(columns=['Label', 'URL'])    # <<< See: Dropping URL
y_test = test['Label']

# Train your model again
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Save the model again
import joblib
joblib.dump(rf_model, "../model/RandomForest_model_new.pkl")


['../model/RandomForest_model_new.pkl']

In [15]:
import joblib
import pandas as pd

# 1. Load the saved Random Forest model
rf_model = joblib.load("../model/RandomForest_model_new.pkl")

# 2. Define a function to extract features from a new link
def extract_features(url):
    features = {}
    features['URL'] = url  # <<< Add this line to fix the missing column
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['has_at'] = int('@' in url)
    features['has_https'] = int('https' in url.lower())
    features['has_ip'] = int(any(char.isdigit() for char in url.split('/')[2])) if '://' in url else 0
    features['count_suspicious_words'] = sum(word in url.lower() for word in ['login', 'secure', 'account', 'update', 'free', 'verify'])
    return features

# 3. Ask the user to input a new web address (URL)
new_url = input("Enter the web address (URL) to check: ")

# 4. Extract features from the entered URL
url_features = extract_features(new_url)

# 5. Convert the features into a DataFrame
input_df = pd.DataFrame([url_features])

# 6. Drop 'URL' column before prediction (since model expects features only)
input_df = input_df.drop(columns=['URL'])

# 7. Make a prediction
prediction = rf_model.predict(input_df)[0]

# 8. Output the result
if prediction == 1:
    print("⚠️ The URL is predicted to be **Phishing**.")
else:
    print("✅ The URL is predicted to be **Legitimate**.")


Enter the web address (URL) to check: 3souls.us/123H/b4rky/b4rky/P1.html?rAtm1d=;8dcd50af3804821120990f23500e12418dcd50af3804821120990f23500e1241
⚠️ The URL is predicted to be **Phishing**.


In [16]:
import joblib
import pandas as pd
import requests

# 1. Load the saved Random Forest model
rf_model = joblib.load("../model/RandomForest_model_new.pkl")

# 2. Function to extract features from a URL
def extract_features(url):
    features = {}
    features['URL'] = url
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['has_at'] = int('@' in url)
    features['has_https'] = int('https' in url.lower())
    features['has_ip'] = int(any(char.isdigit() for char in url.split('/')[2])) if '://' in url else 0
    features['count_suspicious_words'] = sum(word in url.lower() for word in ['login', 'secure', 'account', 'update', 'free', 'verify'])
    return features

# 3. Function to check if a URL exists
def check_url_exists(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        return False

# 4. Input from user
user_input_url = input("Enter the web address (URL) to check: ")

# Ensure the URL starts with http:// or https://
if not user_input_url.startswith(('http://', 'https://')):
    full_url = 'http://' + user_input_url
else:
    full_url = user_input_url

# 5. Extract features and predict
url_features = extract_features(full_url)
input_df = pd.DataFrame([url_features]).drop(columns=['URL'])
prediction = rf_model.predict(input_df)[0]

# 6. Check result and existence
if prediction == 1:
    print("⚠️ The URL is predicted to be **Phishing**.")
else:
    if check_url_exists(full_url):
        print("✅ The URL is **Legitimate** and **Exists**.")
    else:
        print("❌ The URL is **Not Reachable** (Does Not Exist).")


Enter the web address (URL) to check: 3souls.us/123H/b4rky/b4rky/P1.html?rAtm1d=;8dcd50af3804821120990f23500e12418dcd50af3804821120990f23500e1241
⚠️ The URL is predicted to be **Phishing**.


In [17]:
import joblib
import pandas as pd
import requests

# 1. Load the saved Random Forest model
rf_model = joblib.load("../model/RandomForest_model_new.pkl")

# 2. Function to extract features from a URL
def extract_features(url):
    features = {}
    features['URL'] = url
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['has_at'] = int('@' in url)
    features['has_https'] = int('https' in url.lower())
    features['has_ip'] = int(any(char.isdigit() for char in url.split('/')[2])) if '://' in url else 0
    features['count_suspicious_words'] = sum(word in url.lower() for word in ['login', 'secure', 'account', 'update', 'free', 'verify'])
    return features

# 3. Function to check if a URL exists
def check_url_exists(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        return False

# 4. Input from user
user_input_url = input("Enter the web address (URL) to check: ")

# Ensure the URL starts with http:// or https://
if not user_input_url.startswith(('http://', 'https://')):
    full_url = 'http://' + user_input_url
else:
    full_url = user_input_url

# 5. Extract features and predict
url_features = extract_features(full_url)
input_df = pd.DataFrame([url_features]).drop(columns=['URL'])
prediction = rf_model.predict(input_df)[0]

# 6. Check result and existence
if prediction == 1:
    print("⚠️ The URL is predicted to be **Phishing**.")
else:
    if check_url_exists(full_url):
        print("✅ The URL is **Legitimate** and **Exists**.")
    else:
        print("❌ The URL is **Legitimate** but **Not Reachable** (Does Not Exist).")


Enter the web address (URL) to check: learnai.co.in
✅ The URL is **Legitimate** and **Exists**.


In [18]:
import joblib
import pandas as pd
import requests

# 1. Load the saved Random Forest model
rf_model = joblib.load("../model/RandomForest_model_new.pkl")

# 2. Function to extract features from a URL
def extract_features(url):
    features = {}
    features['URL'] = url
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['has_at'] = int('@' in url)
    features['has_https'] = int('https' in url.lower())
    features['has_ip'] = int(any(char.isdigit() for char in url.split('/')[2])) if '://' in url else 0
    features['count_suspicious_words'] = sum(word in url.lower() for word in ['login', 'secure', 'account', 'update', 'free', 'verify'])
    return features

# 3. Function to check if a URL exists
def check_url_exists(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        return False

# 4. Input from user
user_input_url = input("Enter the web address (URL) to check: ")

# Ensure the URL starts with http:// or https://
if not user_input_url.startswith(('http://', 'https://')):
    full_url = 'http://' + user_input_url
else:
    full_url = user_input_url

# 5. Extract features and predict
url_features = extract_features(full_url)
input_df = pd.DataFrame([url_features]).drop(columns=['URL'])
prediction = rf_model.predict(input_df)[0]

# 6. Check result and existence
if prediction == 1:
    print("⚠️ The URL is predicted to be **Phishing**.")
else:
    if check_url_exists(full_url):
        print("✅ The URL is **Legitimate** and **Exists**.")
    else:
        print("❌ The URL is **Legitimate** but **Not Reachable** (Does Not Exist).")


Enter the web address (URL) to check: https://www.faccccebook.com/
⚠️ The URL is predicted to be **Phishing**.


In [21]:
import joblib
import pandas as pd
import requests

# 1. Load the saved Random Forest model
rf_model = joblib.load("../model/RandomForest_model_new.pkl")

# 2. Function to extract features from a URL
def extract_features(url):
    features = {}
    features['URL'] = url
    features['url_length'] = len(url)
    features['num_dots'] = url.count('.')
    features['num_hyphens'] = url.count('-')
    features['has_at'] = int('@' in url)
    features['has_https'] = int('https' in url.lower())
    features['has_ip'] = int(any(char.isdigit() for char in url.split('/')[2])) if '://' in url else 0
    features['count_suspicious_words'] = sum(word in url.lower() for word in ['login', 'secure', 'account', 'update', 'free', 'verify'])
    return features

# 3. Function to check if a URL exists
def check_url_exists(url):
    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        return response.status_code < 400
    except requests.RequestException:
        return False

# 4. Input from user
user_input_url = input("Enter the web address (URL) to check: ")

# Ensure the URL starts with http:// or https://
if not user_input_url.startswith(('http://', 'https://')):
    full_url = 'http://' + user_input_url
else:
    full_url = user_input_url

# 5. Extract features and predict
url_features = extract_features(full_url)
input_df = pd.DataFrame([url_features]).drop(columns=['URL'])
prediction = rf_model.predict(input_df)[0]

# 6. Check result and existence
if prediction == 1:
    print("⚠️ The URL is predicted to be **Phishing**.")
else:
    if check_url_exists(full_url):
        print("✅ The URL is **Legitimate** and **Exists**.")
    else:
        print("❌ The URL is **Not Reachable** (Does Not Exist).")


Enter the web address (URL) to check: 192.com/atoz/people/harrison/miriam/
❌ The URL is **Not Reachable** (Does Not Exist).
