<a href="https://colab.research.google.com/github/tamirhouri/phishing-detector-chrome-extension/blob/main/static-content-detector-logistic-regression/StaticContentDetector_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import csv

def extract_static_features_to_csv(json_path, csv_path, n_features=6):
    """Flatten static-content feature vectors from a JSON file into a CSV.

    The JSON file must contain a list of objects. For every object, the
    ``staticContentFeatures`` list followed by the ``label`` value is written
    as one CSV row beneath the header ``feature_0 .. feature_{n-1}, label``.

    Args:
        json_path: Path of the UTF-8 JSON file to read.
        csv_path: Path of the UTF-8 CSV file to create (overwritten if present).
        n_features: Number of ``feature_i`` header columns and the length of the
            zero vector substituted when an entry has no features. Defaults to 6.

    Notes:
        * An entry whose ``staticContentFeatures`` is missing *or* ``null`` gets
          an all-zero feature vector.
        * An entry without ``label`` is written with the label ``"Error"``.
        * A feature vector of a different length is written unchanged.
    """
    with open(json_path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        header = [f"feature_{i}" for i in range(n_features)] + ["label"]
        writer.writerow(header)

        for entry in data:
            features = entry.get("staticContentFeatures")
            # A JSON null comes back as None; the .get() default only covers a
            # missing key, so handle both cases here instead of crashing on
            # ``None + [label]``.
            if features is None:
                features = [0] * n_features
            label = entry.get("label", "Error")
            writer.writerow(list(features) + [label])

# Usage: read the JSON results file and write the feature CSV into the
# current working directory.
json_file_path = "/content/results.json"
csv_output_path = "content_features_regression.csv"
extract_static_features_to_csv(
    json_path=json_file_path,
    csv_path=csv_output_path,
)


In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

df = pd.read_csv("content_features_regression.csv")
# Remove rows whose label is 'Error'. The .copy() makes the filtered result an
# independent frame, so the column assignment below does not raise
# SettingWithCopyWarning.
df = df[df['label'] != 'Error'].copy()
# Map labels to 0 and 1; any value outside the mapping becomes NaN.
df['label'] = df['label'].map({'phishing': 1, 'benign': 0})
# Fail here with a clear message instead of later inside model fitting.
if df['label'].isna().any():
    raise ValueError(
        "label column contains values other than 'phishing', 'benign' or 'Error'"
    )
X = df[[f"feature_{i}" for i in range(0,6)]]
y = df["label"]

# Class balance of the cleaned dataset.
y.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,5847
1,3441


In [4]:
# 2a. Logistic Regression → signed per-feature weights plus an intercept
lr = LogisticRegression().fit(X, y)
print("LR bias:", float(lr.intercept_[0]))
print("LR weights:", lr.coef_[0].tolist())

# 2b. Random Forest → per-feature importances (seeded for repeatable output)
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X, y)
print("RF importances:", rf.feature_importances_)

# 2c. Mean 5-fold cross-validated accuracy for each model
for caption, estimator in (("LR CV accuracy:", lr), ("RF CV accuracy:", rf)):
    print(caption, cross_val_score(estimator, X, y, cv=5).mean())


LR bias: -2.1980296121085106
LR weights: [-0.9980825058085083, 2.6840079092211897, -0.05051873473859611, 3.1778866084080843, -0.6635635780569809, -0.00870018423531527]
RF importances: [0.09969173 0.23818796 0.08363802 0.517191   0.04098902 0.02030227]
LR CV accuracy: 0.7848822104474211
RF CV accuracy: 0.8391464988902435


In [9]:
import json, numpy as np

# Pull the fitted logistic-regression parameters out as plain Python values
# so they can be serialised to JSON.
bias = float(lr.intercept_[0])        # scalar intercept
weights = lr.coef_[0].tolist()        # 6 numbers, one per feature column

model_params = {"bias": bias, "weights": weights}
with open("static-content-detector-lr.json", "w") as out_file:
    json.dump(model_params, out_file, indent=2)

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision/recall at every candidate threshold, computed on the same X, y
# the model was fitted on.
p, r, th = precision_recall_curve(y, lr.predict_proba(X)[:,1])

# Recall is returned in decreasing order as the threshold rises, with r[0]
# (lowest threshold) being the maximum. The first index satisfying
# r >= 0.95 is therefore always 0, which would select the minimum score as the
# threshold and flag every sample as positive. Take the LAST qualifying index
# instead: the highest threshold that still reaches 95% recall.
# r has one more element than th (a trailing 0), so drop it to keep the index
# valid for th.
idx = np.where(r[:-1] >= 0.95)[0][-1]
best_thresh = th[idx]

best_thresh