## 1. Goal

To find a set of top features using multiple feature selection methods
1. RFE
2. ReliefF
3. RandomForest feature_importances_ attribute

- Readout: Top10 feature rankings

- Dataset: a random sample of 2,000 rows with an equal number of rows per label (1,000 each)

In [1]:
!pip install skrebate

Collecting skrebate
  Downloading skrebate-0.62.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: skrebate
  Building wheel for skrebate (setup.py) ... [?25l[?25hdone
  Created wheel for skrebate: filename=skrebate-0.62-py3-none-any.whl size=29255 sha256=5b4a5374f78a616e635bf2b906e7f6fa7bafbab49501d2f46e303f7a727aa210
  Stored in directory: /root/.cache/pip/wheels/dd/67/40/683074a684607162bd0e34dcf7ccdfcab5861c3b2a83286f3a
Successfully built skrebate
Installing collected packages: skrebate
Successfully installed skrebate-0.62


In [2]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from skrebate import ReliefF
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [3]:
# 0. Mount Google Drive so this Colab runtime can access files under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from sklearn.model_selection import train_test_split

# 1: load the dataset CSV from the mounted Drive
dataset = pd.read_csv('/content/drive/My Drive/00. BU BME/05. Spring 2024 (EC503)/Project/PhiUSIIL Phishing URL/PhiUSIIL_Phishing_URL_Dataset.csv')

# Candidate features are the int64/float64 columns; 'label' is the target,
# so it is dropped from the feature frame and kept separately as y
numeric_columns = dataset.select_dtypes(include=['int64', 'float64'])
numerical_df = numeric_columns.drop(columns=['label'])
y = dataset['label']

# Define a function to normalize numerical columns
def normalize_columns(df):
    """Min-max scale every column of ``df`` to the [0, 1] range.

    Each column is transformed as ``(value - column_min) / (column_max - column_min)``.

    Parameters
    ----------
    df : pandas.DataFrame (a pandas.Series also works)
        Numeric data to scale.

    Returns
    -------
    Same type as ``df``
        The scaled values. A constant column (max == min) is returned as all
        zeros instead of all NaN.
    """
    col_min = df.min()
    col_range = df.max() - col_min

    # A constant column has zero range; dividing by it would give 0/0 = NaN for
    # every row. Divide by 1 instead so the column becomes 0.0 everywhere.
    if np.ndim(col_range) == 0:
        # Series input: min/max are scalars
        safe_range = col_range if col_range != 0 else 1
    else:
        safe_range = col_range.where(col_range != 0, 1)

    return (df - col_min) / safe_range

# Scale every numeric feature with normalize_columns
normalized_numerical_df = normalize_columns(numerical_df)

# Split the scaled rows by class label
X_class_0 = normalized_numerical_df.loc[y == 0]
X_class_1 = normalized_numerical_df.loc[y == 1]

# Draw 1000 distinct row positions per class; the seed fixes the draw
np.random.seed(42)
sampled_indices_class_0 = np.random.choice(X_class_0.shape[0], size=1000, replace=False)
sampled_indices_class_1 = np.random.choice(X_class_1.shape[0], size=1000, replace=False)

# Pick the sampled rows by position
X_sampled_class_0 = X_class_0.iloc[sampled_indices_class_0]
X_sampled_class_1 = X_class_1.iloc[sampled_indices_class_1]

# Stack class-0 rows on top of class-1 rows and build the matching labels.
# Note: X keeps the original row index of the dataset while y gets a fresh
# 0..n-1 index, so the two correspond by position, not by index label.
X = pd.concat([X_sampled_class_0, X_sampled_class_1])
label_values = np.repeat([0.0, 1.0], [len(X_sampled_class_0), len(X_sampled_class_1)])
y = pd.DataFrame({'label': label_values})

In [5]:
# Display the sampled feature matrix and its labels
X, y

(        URLLength  DomainLength  IsDomainIP  URLSimilarityIndex  \
 170643   0.000986      0.084906         0.0            0.797215   
 229388   0.005753      0.349057         0.0            0.192161   
 189747   0.008383      0.094340         0.0            0.178640   
 39922    0.002301      0.160377         0.0            0.492388   
 172005   0.007561      0.273585         0.0            0.366618   
 ...           ...           ...         ...                 ...   
 46434    0.002630      0.169811         0.0            1.000000   
 129355   0.003452      0.216981         0.0            1.000000   
 110815   0.001479      0.103774         0.0            1.000000   
 77084    0.002959      0.188679         0.0            1.000000   
 172062   0.002465      0.160377         0.0            1.000000   
 
         CharContinuationRate  TLDLegitimateProb  URLCharProb  TLDLength  \
 170643              1.000000           1.000000     0.901704   0.090909   
 229388              0.323529 

## 2-1. ReliefF

In [6]:
#### Test-run of ReliefF without cross-validation
# Fit ReliefF on the whole sample; features and labels are passed as NumPy arrays
relieff_selector = ReliefF(n_features_to_select=10)
relieff_selector.fit(X.values, y.values.flatten())

# Pair each feature name with its ReliefF score, highest score first
feature_rankings = pd.DataFrame(
    {'Feature': X.columns, 'ReliefF Score': relieff_selector.feature_importances_}
).sort_values(by='ReliefF Score', ascending=False)

# First ten rows = the ten highest-scoring features
top_10_features = feature_rankings.iloc[:10]

print("Top 10 features selected by ReliefF:", top_10_features, sep="\n")

Top 10 features selected by ReliefF:
                  Feature  ReliefF Score
3      URLSimilarityIndex       0.635311
36           HasSocialNet       0.528620
43       HasCopyrightInfo       0.524465
32         HasDescription       0.422540
21                IsHTTPS       0.399265
25  DomainTitleMatchScore       0.391061
26     URLTitleMatchScore       0.350222
37        HasSubmitButton       0.272290
20  SpacialCharRatioInURL       0.226207
27             HasFavicon       0.193780


In [8]:
from sklearn.model_selection import KFold

# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold top-10 tables are collected here and concatenated once after the
# loop. Growing a DataFrame with pd.concat inside the loop (starting from an
# empty frame) copies all previous rows on every iteration and turns the
# 'Fold' column into object dtype.
top_10_per_fold = []

# Iterate over each fold; only the training part of each split is used
for fold, (train_index, _) in enumerate(kf.split(X)):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]

    # Fit a fresh ReliefF selector on this fold's training rows
    relieff_selector = ReliefF(n_features_to_select=10)
    relieff_selector.fit(X_train.values, y_train.values.flatten())

    # Score of every feature for this fold (fold numbers are reported 1-based)
    feature_rankings_fold = pd.DataFrame({
        'Fold': [fold + 1] * len(X.columns),
        'Feature': X.columns,
        'ReliefF Score': relieff_selector.feature_importances_,
    })

    # Keep the 10 highest-scoring features of this fold
    top_10_features_fold = feature_rankings_fold.sort_values(by='ReliefF Score', ascending=False).head(10)
    top_10_per_fold.append(top_10_features_fold)

# One concatenation; ignore_index gives the result a clean 0..n-1 index
feature_rankings_per_fold = pd.concat(top_10_per_fold, ignore_index=True)

print("Top 10 feature rankings per fold:")
print(feature_rankings_per_fold)

Top 10 feature rankings per fold:
   Fold                Feature  ReliefF Score
0     1     URLSimilarityIndex       0.637102
1     1           HasSocialNet       0.551844
2     1       HasCopyrightInfo       0.536056
3     1         HasDescription       0.437019
4     1  DomainTitleMatchScore       0.400686
5     1                IsHTTPS       0.382337
6     1     URLTitleMatchScore       0.357752
7     1        HasSubmitButton       0.279431
8     1  SpacialCharRatioInURL       0.228177
9     1             HasFavicon       0.216706
10    2     URLSimilarityIndex       0.636883
11    2       HasCopyrightInfo       0.560294
12    2           HasSocialNet       0.534125
13    2         HasDescription       0.432575
14    2                IsHTTPS       0.389550
15    2  DomainTitleMatchScore       0.386009
16    2     URLTitleMatchScore       0.354368
17    2        HasSubmitButton       0.281831
18    2  SpacialCharRatioInURL       0.218116
19    2             HasFavicon       0.201425


In [9]:
# Wide table: one row per cross-validation fold, one column per rank position
rank_columns = [f"Rank {position}" for position in range(1, 11)]
new_table = pd.DataFrame(columns=rank_columns)

for fold in range(1, 6):
    # Feature names stored for this fold, in their row order
    in_fold = feature_rankings_per_fold['Fold'] == fold
    fold_features = feature_rankings_per_fold.loc[in_fold, 'Feature'].tolist()

    # Right-pad with NaN so the row always has 10 entries
    shortfall = 10 - len(fold_features)
    if shortfall > 0:
        fold_features += [float('nan')] * shortfall

    # The fold number becomes the row label
    new_table.loc[fold] = fold_features

new_table.index.name = 'Cross-validation Fold'

print("New table with rows representing cross-validation folds and columns representing feature rankings:")
print(new_table)

New table with rows representing cross-validation folds and columns representing feature rankings:
                                   Rank 1            Rank 2            Rank 3  \
Cross-validation Fold                                                           
1                      URLSimilarityIndex      HasSocialNet  HasCopyrightInfo   
2                      URLSimilarityIndex  HasCopyrightInfo      HasSocialNet   
3                      URLSimilarityIndex      HasSocialNet  HasCopyrightInfo   
4                      URLSimilarityIndex      HasSocialNet  HasCopyrightInfo   
5                      URLSimilarityIndex      HasSocialNet  HasCopyrightInfo   

                               Rank 4                 Rank 5  \
Cross-validation Fold                                          
1                      HasDescription  DomainTitleMatchScore   
2                      HasDescription                IsHTTPS   
3                      HasDescription  DomainTitleMatchScore   
4            

In [10]:
# Rank = 1-based position of each row within its fold
feature_rankings_per_fold['Rank'] = feature_rankings_per_fold.groupby('Fold').cumcount() + 1

# How many folds each feature appears in
appearance_counts = feature_rankings_per_fold['Feature'].value_counts()

# A feature that is absent from some of the 5 folds receives rank 11 once for
# every fold it is missing from, so absence counts against its average
adjustment_rows = [
    (feature, 11)
    for feature, count in appearance_counts.items()
    if count < 5
    for _ in range(5 - count)
]
adjustment_df = pd.DataFrame(adjustment_rows, columns=['Feature', 'Rank'])

# Observed ranks plus the penalty rows, averaged per feature (best average first)
observed_ranks = feature_rankings_per_fold[['Feature', 'Rank']]
adjusted_rankings = pd.concat([observed_ranks, adjustment_df])
adjusted_average_rankings = adjusted_rankings.groupby('Feature')['Rank'].mean().sort_values()

print("Adjusted average rankings for each feature:")
print(adjusted_average_rankings)

# Ten best averages; Ranking is the 1-based position in that ordering
top_10_features = adjusted_average_rankings.head(10).reset_index()
top_10_features = top_10_features.assign(Ranking=top_10_features.index + 1, Method='ReliefF')

# The averaged value itself is not carried into the per-method summary
top_10_features_ReliefF = top_10_features.drop(columns='Rank')

print("Top 10 features based on adjusted average rankings:")
print(top_10_features_ReliefF)

Adjusted average rankings for each feature:
Feature
URLSimilarityIndex        1.0
HasSocialNet              2.2
HasCopyrightInfo          2.8
HasDescription            4.0
IsHTTPS                   5.4
DomainTitleMatchScore     5.6
URLTitleMatchScore        7.0
HasSubmitButton           8.0
SpacialCharRatioInURL     9.0
HasFavicon               10.2
CharContinuationRate     10.8
Name: Rank, dtype: float64
Top 10 features based on adjusted average rankings:
                 Feature  Ranking   Method
0     URLSimilarityIndex        1  ReliefF
1           HasSocialNet        2  ReliefF
2       HasCopyrightInfo        3  ReliefF
3         HasDescription        4  ReliefF
4                IsHTTPS        5  ReliefF
5  DomainTitleMatchScore        6  ReliefF
6     URLTitleMatchScore        7  ReliefF
7        HasSubmitButton        8  ReliefF
8  SpacialCharRatioInURL        9  ReliefF
9             HasFavicon       10  ReliefF


## 2-2. Feature_importances_ attribute of RandomForestClassifier

In [11]:
#### Test-run without cross-validation
# Rebuild the labels as a flat NumPy array: 0.0 for each sampled class-0 row,
# followed by 1.0 for each sampled class-1 row (same row order as X)
n_class_0, n_class_1 = len(X_sampled_class_0), len(X_sampled_class_1)
y = np.concatenate([np.zeros(n_class_0), np.ones(n_class_1)])

# Fit a 100-tree forest with a fixed seed and read its feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)
importances = rf.feature_importances_

# One row per feature with its importance
features = pd.DataFrame({'Feature': X.columns, 'Importance': importances})

# Ten most important features, re-indexed 0..9 so the index gives the order
top_10_features = (
    features.sort_values(by='Importance', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# 1-based rank derived from the new index
top_10_features['Rank'] = top_10_features.index + 1


print("Top 10 features based on importance from RandomForestClassifier:")
print(top_10_features)

Top 10 features based on importance from RandomForestClassifier:
              Feature  Importance  Rank
0  URLSimilarityIndex    0.169828     1
1     NoOfExternalRef    0.167065     2
2          LineOfCode    0.144621     3
3         NoOfSelfRef    0.124289     4
4           NoOfImage    0.088185     5
5              NoOfJS    0.061236     6
6        HasSocialNet    0.034816     7
7             NoOfCSS    0.030789     8
8    HasCopyrightInfo    0.028103     9
9      HasDescription    0.019472    10


In [13]:
# 5-fold cross-validation with a seeded shuffle, and a seeded 100-tree forest
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Running sum of the per-fold importances, one slot per feature
feature_importances = np.zeros(X.shape[1])

for train_index, test_index in kf.split(X):
    # X is sliced by position; y is indexed directly with the position arrays
    X_train, y_train = X.iloc[train_index], y[train_index]
    X_test, y_test = X.iloc[test_index], y[test_index]

    # Refit on this fold's training rows and accumulate its importances
    rf.fit(X_train, y_train)
    feature_importances += rf.feature_importances_

# Turn the sum into a mean over the folds
feature_importances /= kf.get_n_splits()

# One row per feature with its mean importance
features = pd.DataFrame({'Feature': X.columns, 'Average Importance': feature_importances})

# Ten highest mean importances, re-indexed 0..9
top_10_features = (
    features.sort_values(by='Average Importance', ascending=False)
    .head(10)
    .reset_index(drop=True)
)

# 1-based rank plus a method tag; the summary table omits the raw importance
top_10_features['Ranking'] = top_10_features.index + 1
top_10_features['Method'] = 'RandomForest'
top_10_features_RFC = top_10_features.drop(columns='Average Importance')

print("Top 10 features based on average importance from RandomForestClassifier over 5 folds:")
print(top_10_features_RFC)

Top 10 features based on average importance from RandomForestClassifier over 5 folds:
              Feature  Ranking        Method
0  URLSimilarityIndex        1  RandomForest
1     NoOfExternalRef        2  RandomForest
2          LineOfCode        3  RandomForest
3         NoOfSelfRef        4  RandomForest
4           NoOfImage        5  RandomForest
5              NoOfJS        6  RandomForest
6        HasSocialNet        7  RandomForest
7    HasCopyrightInfo        8  RandomForest
8             NoOfCSS        9  RandomForest
9      HasDescription       10  RandomForest


## 2-3. RFE

In [14]:
#### Test-run without cross-validation
# Labels as a flat NumPy array: zeros for the class-0 sample, then ones for the class-1 sample
n_class_0, n_class_1 = len(X_sampled_class_0), len(X_sampled_class_1)
y = np.concatenate([np.zeros(n_class_0), np.ones(n_class_1)])

# Seeded 100-tree forest used as the estimator inside RFE
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# RFE is asked for a single feature, removing one feature per step; the top 10
# are then taken from ranking_ below rather than from n_features_to_select
rfe = RFE(estimator=rf, n_features_to_select=1, step=1)
rfe.fit(X, y)

# One row per feature with its RFE ranking
feature_ranking = pd.DataFrame({'Feature': X.columns, 'Ranking': rfe.ranking_})

# Keep rankings 1..10, ordered by ranking, with a clean 0..9 index
is_top_10 = feature_ranking['Ranking'] < 11
top_10_features_rfe = (
    feature_ranking.loc[is_top_10]
    .sort_values(by='Ranking')
    .reset_index(drop=True)
)

print("Top 10 features selected by RFE using RandomForestClassifier:")
print(top_10_features_rfe)

Top 10 features selected by RFE using RandomForestClassifier:
                      Feature  Ranking
0          URLSimilarityIndex        1
1                  LineOfCode        2
2             NoOfExternalRef        3
3                 NoOfSelfRef        4
4                   NoOfImage        5
5                      NoOfJS        6
6  NoOfOtherSpecialCharsInURL        7
7                     NoOfCSS        8
8                HasSocialNet        9
9                   URLLength       10


In [15]:
# Initialize 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold top-10 tables are collected here and concatenated once after the
# loop. Growing a DataFrame with pd.concat inside the loop (starting from an
# empty frame) copies all previous rows on every iteration and turns the
# 'Ranking' and 'Fold' columns into object dtype.
top_10_per_fold = []

# Iterate over each fold; only the training part of each split is used
for fold, (train_index, _) in enumerate(kf.split(X)):
    # X is sliced by position; y is indexed directly with the position array
    X_train, y_train = X.iloc[train_index], y[train_index]

    # Fresh seeded forest for this fold
    rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # RFE is asked for a single feature, removing one feature per step; the
    # top 10 are taken from ranking_ below
    rfe = RFE(estimator=rf, n_features_to_select=1, step=1)
    rfe.fit(X_train, y_train)

    # One row per feature with its RFE ranking for this fold
    feature_ranking = pd.DataFrame({
        'Feature': X.columns,
        'Ranking': rfe.ranking_
    })

    # Keep rankings 1..10 in ranking order and tag them with the 1-based fold number
    top_10_features_fold = (
        feature_ranking[feature_ranking['Ranking'] < 11]
        .sort_values(by='Ranking')
        .reset_index(drop=True)
        .assign(Fold=fold + 1)
    )
    top_10_per_fold.append(top_10_features_fold)

# One concatenation; ignore_index gives the result a clean 0..n-1 index
feature_rankings_per_fold = pd.concat(top_10_per_fold, ignore_index=True)

print("Top 10 feature rankings per fold:")
print(feature_rankings_per_fold)

Top 10 feature rankings per fold:
                       Feature Ranking Fold
0           URLSimilarityIndex       1    1
1                   LineOfCode       2    1
2              NoOfExternalRef       3    1
3                  NoOfSelfRef       4    1
4                    NoOfImage       5    1
5                       NoOfJS       6    1
6                 HasSocialNet       7    1
7                      NoOfCSS       8    1
8   NoOfOtherSpecialCharsInURL       9    1
9                    URLLength      10    1
10          URLSimilarityIndex       1    2
11                  LineOfCode       2    2
12             NoOfExternalRef       3    2
13                 NoOfSelfRef       4    2
14                   NoOfImage       5    2
15                      NoOfJS       6    2
16                     NoOfCSS       7    2
17                HasSocialNet       8    2
18  NoOfOtherSpecialCharsInURL       9    2
19              HasDescription      10    2
20          URLSimilarityIndex       1    

In [16]:
# Wide table: one row per cross-validation fold, one column per rank position
rank_columns = [f"Rank {position}" for position in range(1, 11)]
new_table = pd.DataFrame(columns=rank_columns)

for fold in range(1, 6):
    # Feature names stored for this fold, in their row order
    in_fold = feature_rankings_per_fold['Fold'] == fold
    fold_features = feature_rankings_per_fold.loc[in_fold, 'Feature'].tolist()

    # Right-pad with NaN so the row always has 10 entries
    shortfall = 10 - len(fold_features)
    if shortfall > 0:
        fold_features += [float('nan')] * shortfall

    # The fold number becomes the row label
    new_table.loc[fold] = fold_features

new_table.index.name = 'Cross-validation Fold'

print("New table with rows representing cross-validation folds and columns representing feature rankings:")
print(new_table)

New table with rows representing cross-validation folds and columns representing feature rankings:
                                   Rank 1      Rank 2           Rank 3  \
Cross-validation Fold                                                    
1                      URLSimilarityIndex  LineOfCode  NoOfExternalRef   
2                      URLSimilarityIndex  LineOfCode  NoOfExternalRef   
3                      URLSimilarityIndex  LineOfCode  NoOfExternalRef   
4                      URLSimilarityIndex  LineOfCode  NoOfExternalRef   
5                      URLSimilarityIndex  LineOfCode  NoOfExternalRef   

                            Rank 4     Rank 5  Rank 6        Rank 7  \
Cross-validation Fold                                                 
1                      NoOfSelfRef  NoOfImage  NoOfJS  HasSocialNet   
2                      NoOfSelfRef  NoOfImage  NoOfJS       NoOfCSS   
3                      NoOfSelfRef  NoOfImage  NoOfJS  HasSocialNet   
4                      NoOf

In [17]:
# Rank = 1-based position of each row within its fold
feature_rankings_per_fold['Rank'] = feature_rankings_per_fold.groupby('Fold').cumcount() + 1

# How many folds each feature appears in
appearance_counts = feature_rankings_per_fold['Feature'].value_counts()

# A feature that is absent from some of the 5 folds receives rank 11 once for
# every fold it is missing from, so absence counts against its average
adjustment_rows = [
    (feature, 11)
    for feature, count in appearance_counts.items()
    if count < 5
    for _ in range(5 - count)
]
adjustment_df = pd.DataFrame(adjustment_rows, columns=['Feature', 'Rank'])

# Observed ranks plus the penalty rows, averaged per feature (best average first)
observed_ranks = feature_rankings_per_fold[['Feature', 'Rank']]
adjusted_rankings = pd.concat([observed_ranks, adjustment_df])
adjusted_average_rankings = adjusted_rankings.groupby('Feature')['Rank'].mean().sort_values()

print("Adjusted average rankings for each feature:")
print(adjusted_average_rankings)

# Ten best averages; Ranking is the 1-based position in that ordering
top_10_features = adjusted_average_rankings.head(10).reset_index()
top_10_features = top_10_features.assign(Ranking=top_10_features.index + 1, Method='RFE')

# The averaged value itself is not carried into the per-method summary
top_10_features_RFE = top_10_features.drop(columns='Rank')

print("Top 10 features based on adjusted average rankings:")
print(top_10_features_RFE)

Adjusted average rankings for each feature:
Feature
URLSimilarityIndex             1.0
LineOfCode                     2.0
NoOfExternalRef                3.0
NoOfSelfRef                    4.0
NoOfImage                      5.0
NoOfJS                         6.0
HasSocialNet                   7.4
NoOfCSS                        8.4
NoOfOtherSpecialCharsInURL     9.2
URLLength                     10.2
NoOfLettersInURL              10.4
HasCopyrightInfo              10.8
HasDescription                10.8
NoOfDegitsInURL               10.8
Name: Rank, dtype: float64
Top 10 features based on adjusted average rankings:
                      Feature  Ranking Method
0          URLSimilarityIndex        1    RFE
1                  LineOfCode        2    RFE
2             NoOfExternalRef        3    RFE
3                 NoOfSelfRef        4    RFE
4                   NoOfImage        5    RFE
5                      NoOfJS        6    RFE
6                HasSocialNet        7    RFE
7          

## 3. Combined Feature Selection
- Make a list of features selected from the three different feature selection methods
- Assign a feature similarity group number to the top features
- If more than one selected feature falls in the same feature similarity group, keep only the highest-ranked one and remove the lower-ranked ones

In [18]:
# Stack the three per-method top-10 tables (columns: Feature, Ranking, Method)
df = pd.concat([top_10_features_ReliefF, top_10_features_RFC, top_10_features_RFE])

print(df)

# Create a pivot table: one row per method, one column per rank position,
# each cell holding the feature name at that rank
Top10_combined = df.pivot_table(index='Method', columns='Ranking', values='Feature', aggfunc=lambda x: ' '.join(x))

# export the table
# The 'Method' labels live in the pivot table's index, so the index must be
# written out; with index=False the rows could not be attributed to a method.
Top10_combined.to_csv('/content/drive/My Drive/00. BU BME/05. Spring 2024 (EC503)/Project/PhiUSIIL Phishing URL/Top10_combined.csv')

# Display the pivot table
print(Top10_combined)

                      Feature  Ranking        Method
0          URLSimilarityIndex        1       ReliefF
1                HasSocialNet        2       ReliefF
2            HasCopyrightInfo        3       ReliefF
3              HasDescription        4       ReliefF
4                     IsHTTPS        5       ReliefF
5       DomainTitleMatchScore        6       ReliefF
6          URLTitleMatchScore        7       ReliefF
7             HasSubmitButton        8       ReliefF
8       SpacialCharRatioInURL        9       ReliefF
9                  HasFavicon       10       ReliefF
0          URLSimilarityIndex        1  RandomForest
1             NoOfExternalRef        2  RandomForest
2                  LineOfCode        3  RandomForest
3                 NoOfSelfRef        4  RandomForest
4                   NoOfImage        5  RandomForest
5                      NoOfJS        6  RandomForest
6                HasSocialNet        7  RandomForest
7            HasCopyrightInfo        8  Random

In [19]:
# How many of the three methods list each feature
appearance_counts = df['Feature'].value_counts()

# A feature that fewer than 3 methods selected receives rank 11 once for every
# method that did not select it
adjustment_rows = [
    (feature, 11)
    for feature, count in appearance_counts.items()
    if count < 3
    for _ in range(3 - count)
]
adjustment_df = pd.DataFrame(adjustment_rows, columns=['Feature', 'Ranking'])

# Observed rankings plus the penalty rows, averaged per feature (best average first)
observed_rankings = df[['Feature', 'Ranking']]
adjusted_rankings = pd.concat([observed_rankings, adjustment_df])
adjusted_average_rankings = adjusted_rankings.groupby('Feature')['Ranking'].mean().sort_values()

print("Adjusted average rankings for each feature:")
print(adjusted_average_rankings)

Adjusted average rankings for each feature:
Feature
URLSimilarityIndex             1.000000
HasSocialNet                   5.333333
LineOfCode                     5.333333
NoOfExternalRef                5.333333
NoOfSelfRef                    6.333333
NoOfImage                      7.000000
HasCopyrightInfo               7.333333
NoOfJS                         7.666667
HasDescription                 8.333333
IsHTTPS                        9.000000
DomainTitleMatchScore          9.333333
NoOfCSS                        9.333333
URLTitleMatchScore             9.666667
HasSubmitButton               10.000000
NoOfOtherSpecialCharsInURL    10.333333
SpacialCharRatioInURL         10.333333
HasFavicon                    10.666667
URLLength                     10.666667
Name: Ranking, dtype: float64


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer, f1_score, accuracy_score

# Load the feature-similarity clusters saved by experiment 1
cluster_features = pd.read_csv('/content/drive/My Drive/00. BU BME/05. Spring 2024 (EC503)/Project/PhiUSIIL Phishing URL/cluster_features.csv')

# Column-oriented dictionary: {column name: {row index: value}}
cluster_features_dict = cluster_features.to_dict(orient='dict')
print(cluster_features_dict)

{'Cluster': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14}, 'Features': {0: "['NoOfURLRedirect', 'NoOfSelfRedirect']", 1: "['ObfuscationRatio', 'HasObfuscation']", 2: "['URLLength', 'NoOfObfuscatedChar', 'NoOfLettersInURL', 'NoOfDegitsInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'IsDomainIP']", 3: "['NoOfiFrame', 'HasPasswordField', 'Bank', 'Pay']", 4: "['LineOfCode', 'NoOfImage', 'NoOfSelfRef', 'NoOfExternalRef']", 5: "['URLSimilarityIndex', 'CharContinuationRate', 'URLCharProb', 'DegitRatioInURL', 'SpacialCharRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'NoOfJS', 'IsHTTPS', 'HasTitle', 'HasFavicon', 'Robots', 'IsResponsive', 'HasDescription', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasCopyrightInfo']", 6: "['TLDLegitimateProb', 'TLDLength']", 7: "['DomainLength', 'NoOfSubDomain', 'LetterRatioInURL']", 8: "['HasExternalFormSubmit']", 9: "['NoOfEmptyRef']", 1

In [7]:
# Because of repeated crashes, the combined rankings and the cluster
# dictionary are typed in by hand here instead of being recomputed

# Adjusted average rankings dataframe
adjusted_average_rankings = pd.DataFrame({
    'Feature': ['URLSimilarityIndex', 'HasSocialNet', 'LineOfCode', 'NoOfExternalRef', 'NoOfSelfRef', 'NoOfImage', 'HasCopyrightInfo', 'NoOfJS', 'HasDescription', 'IsHTTPS', 'DomainTitleMatchScore', 'NoOfCSS', 'URLTitleMatchScore', 'HasSubmitButton', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'HasFavicon', 'URLLength'],
    'Average Ranking': [1.0, 5.333333, 5.333333, 5.333333, 6.333333, 7.0, 7.333333, 7.666667, 8.333333, 9.0, 9.333333, 9.333333, 9.666667, 10.0, 10.333333, 10.333333, 10.666667, 10.666667]
})

# Cluster number -> names of the features in that similarity cluster
cluster_features_dict = {
    1: ['NoOfURLRedirect', 'NoOfSelfRedirect'],
    2: ['ObfuscationRatio', 'HasObfuscation'],
    3: ['URLLength', 'NoOfObfuscatedChar', 'NoOfLettersInURL', 'NoOfDegitsInURL', 'NoOfEqualsInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL'],
    4: ['NoOfQMarkInURL'],
    5: ['IsDomainIP'],
    6: ['Bank'],
    7: ['Pay'],
    8: ['NoOfiFrame'],
    9: ['HasPasswordField'],
    10: ['NoOfSelfRef', 'NoOfExternalRef'],
    11: ['LineOfCode'],
    12: ['NoOfImage'],
    13: ['URLCharProb', 'DegitRatioInURL'],
    14: ['URLSimilarityIndex', 'HasDescription', 'HasSocialNet', 'HasCopyrightInfo'],
    15: ['CharContinuationRate', 'SpacialCharRatioInURL', 'DomainTitleMatchScore', 'URLTitleMatchScore'],
    16: ['HasSubmitButton', 'HasHiddenFields'],
    17: ['IsHTTPS'],
    18: ['IsResponsive'],
    19: ['HasTitle'],
    20: ['HasFavicon'],
    21: ['Robots'],
    22: ['NoOfJS'],
    23: ['TLDLegitimateProb'],
    24: ['TLDLength'],
    25: ['DomainLength', 'LetterRatioInURL'],
    26: ['NoOfSubDomain'],
    27: ['HasExternalFormSubmit'],
    28: ['NoOfEmptyRef'],
    29: ['Crypto'],
    30: ['NoOfCSS'],
    31: ['LargestLineLength'],
    32: ['NoOfPopup'],
}

# Order by average ranking (best first) with a fresh 0..n-1 index
adjusted_average_rankings_sorted = (
    adjusted_average_rankings
    .sort_values(by='Average Ranking')
    .reset_index(drop=True)
)

# Overall rank = 1-based position in that ordering
adjusted_average_rankings_sorted['Rank'] = adjusted_average_rankings_sorted.index + 1

# Reverse lookup: feature name -> the first cluster (in dictionary order) that lists it
cluster_of_feature = {}
for cluster_id, members in cluster_features_dict.items():
    for member in members:
        cluster_of_feature.setdefault(member, cluster_id)

# Cluster number per ranked feature; None when a feature is in no cluster
cluster_assigned = [
    cluster_of_feature.get(name)
    for name in adjusted_average_rankings_sorted['Feature']
]

# Add the cluster numbers to the adjusted average rankings dataframe
adjusted_average_rankings_sorted['Cluster'] = cluster_assigned
print(adjusted_average_rankings_sorted)


# Export the ranking table without the averaged score column
Full_rankings_top10 = adjusted_average_rankings_sorted.drop(columns='Average Ranking')
Full_rankings_top10.to_csv(
    '/content/drive/My Drive/00. BU BME/05. Spring 2024 (EC503)/Project/PhiUSIIL Phishing URL/Full_rankings_top10.csv',
    index=False,
)





                       Feature  Average Ranking  Rank  Cluster
0           URLSimilarityIndex         1.000000     1       14
1                 HasSocialNet         5.333333     2       14
2                   LineOfCode         5.333333     3       11
3              NoOfExternalRef         5.333333     4       10
4                  NoOfSelfRef         6.333333     5       10
5                    NoOfImage         7.000000     6       12
6             HasCopyrightInfo         7.333333     7       14
7                       NoOfJS         7.666667     8       22
8               HasDescription         8.333333     9       14
9                      IsHTTPS         9.000000    10       17
10       DomainTitleMatchScore         9.333333    11       15
11                     NoOfCSS         9.333333    12       30
12          URLTitleMatchScore         9.666667    13       15
13             HasSubmitButton        10.000000    14       16
14  NoOfOtherSpecialCharsInURL        10.333333    15  

In [8]:
# For every cluster, the index label of the row with the smallest average ranking
best_row_per_cluster = adjusted_average_rankings_sorted.groupby('Cluster')['Average Ranking'].idxmin()

# Keep those rows (one feature per cluster), reduced to the reporting columns
# and ordered by overall rank
top_features_per_cluster = (
    adjusted_average_rankings_sorted
    .loc[best_row_per_cluster, ['Feature', 'Rank', 'Cluster']]
    .sort_values(by='Rank')
)

print(top_features_per_cluster)

# Save both the cluster-filtered list and the full combined ranking table
top_features_per_cluster.to_csv(
    '/content/drive/My Drive/00. BU BME/05. Spring 2024 (EC503)/Project/PhiUSIIL Phishing URL/top_feature_cluster_filtered.csv',
    index=False,
)
adjusted_average_rankings_sorted.to_csv(
    '/content/drive/My Drive/00. BU BME/05. Spring 2024 (EC503)/Project/PhiUSIIL Phishing URL/top_feature_combined.csv',
    index=False,
)

                       Feature  Rank  Cluster
0           URLSimilarityIndex     1       14
2                   LineOfCode     3       11
3              NoOfExternalRef     4       10
5                    NoOfImage     6       12
7                       NoOfJS     8       22
9                      IsHTTPS    10       17
10       DomainTitleMatchScore    11       15
11                     NoOfCSS    12       30
13             HasSubmitButton    14       16
14  NoOfOtherSpecialCharsInURL    15        3
16                  HasFavicon    17       20
