IMPORTS

In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

DATASET ENCODING

In [2]:
def encoding(value):
    if value != '0':
        return 1
    else:
        return 0

In [3]:
df=pd.read_csv(r'E:\Project\1-Output\1 AFTER FEATURE EXTRACTION DATASETS\Feature_Extracted_Dataset-2.csv')
droplist= ['Sr. No.','Domain Name','Hyphenstring','Homoglyph','Vowel string','Bitsquatting','Insertion string','Omission','Repeatition','Replacement','Subdomain','Transposition','Addition string']
df = df.drop(columns=droplist)

#change boolean features into numeric features
df = df.applymap(lambda x: 1 if x is True else (0 if x is False else x))

#change string features into numeric features
df['TLD'] = df['TLD'].apply(encoding)
df['IP Address'] = df['IP Address'].apply(encoding)
df['ASN Number'] = df['ASN Number'].apply(encoding)
df['ASN Country Code'] = df['ASN Country Code'].apply(encoding)
df['ASN CIDR'] = df['ASN CIDR'].apply(encoding)
df['ASN Postal Code'] = df['ASN Postal Code'].apply(encoding)
df['ASN creation date'] = df['ASN creation date'].apply(encoding)
df['ASN updation date'] = df['ASN updation date'].apply(encoding)
X = df.loc[:,df.columns!="Label"]
y = df["Label"]

  df = df.applymap(lambda x: 1 if x is True else (0 if x is False else x))


FORWARD SELECTION

In [4]:
def forward_selection(X, y, criterion='adj_r_squared'):
    selected_features = []
    remaining_features = list(X.columns)
    current_score, best_new_score = 0.0, 0.0

    while remaining_features and current_score == best_new_score:
        scores_with_candidates = []

        for feature in remaining_features:
            model = sm.OLS(y, sm.add_constant(X[selected_features + [feature]])).fit()

            if criterion == 'adj_r_squared':
                score = model.rsquared_adj
            else:
                # Add other criteria as needed
                raise ValueError("Unsupported criterion")

            scores_with_candidates.append((score, feature))

        scores_with_candidates.sort(reverse=True)
        best_new_score, best_candidate = scores_with_candidates.pop(0)

        if current_score < best_new_score:
            remaining_features.remove(best_candidate)
            selected_features.append(best_candidate)
            current_score = best_new_score

    return selected_features

In [5]:
# Run forward selection
selected_features1 = forward_selection(X, y)

CHI-SQUARE

In [6]:
# Select top k features based on chi-square scores
k_best_features = 50 

chi2_selector = SelectKBest(chi2, k=k_best_features)
X_chi2_selected = chi2_selector.fit_transform(X, y)

# Get the selected features
selected_features2 = X.columns[chi2_selector.get_support()]


LASSO

In [7]:
# Standardize the features
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Apply LASSO regression for feature selection
lasso = LassoCV(alphas=None, cv=5)
lasso.fit(X_standardized, y)

# Get selected features
selected_features3 = X.columns[lasso.coef_ != 0]

common = list(set(selected_features1) & set(selected_features2) & set(selected_features3))

# Print the selected features
print("Common Features:")
print(common)

selected_df = df.loc[:, common]
selected_df['Label']=y
selected_df.to_csv("Common_Features_Dataset-2.csv",index=False)

print(len(common))

Common Features:
['Body tags in source', 'TLD in path', 'Numeric Character', 'Percentage Character', 'Google Search Feature', 'IP Address', 'Https in URL', 'Entropy', 'Number of parameter', 'Is English word', 'Dots', 'Levenshtein Distance', 'ASN updation date', 'Host name length', 'Total links', 'Digit to alphabet ratio', 'Is www present']
17
