# Setup Kaggle

In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files
files.upload()

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download akashkr/phishing-website-dataset

In [None]:
!unzip phishing-website-dataset.zip

***Dataset Source :*** https://www.kaggle.com/datasets/akashkr/phishing-website-dataset


# **Detect URL from text**

In [None]:
import re
import socket
from urllib.parse import urlparse


text="""Mastering Power BI is essential if you're after a career as a data analyst. In case you've missed it, our Data Analyst in Power BI career track, co-created with Microsoft has arrived and is the perfect way to supercharge your data career!
Click the link to find out more - https://lnkd.in/gQ964dE5"""

# URL detection pattern. A raw string is used so the backslash escapes reach `re` unchanged
# (in a normal string literal "\(" is an invalid escape sequence that Python only tolerates with a warning).
url_pattern=r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
urls=re.findall(url_pattern, text)

# All detected URLs, space separated, for display and for the length report below.
urls_str=" ".join(str(y) for y in urls)

# Host details are taken from the first detected URL only: parsing the space-joined string
# would fold any further URLs into the path of the first one.
first_url=urls[0] if urls else ""
hostname=urlparse(first_url).netloc

# DNS resolution can fail (no URL found, unknown host, no network); report None instead of crashing the cell.
try:
    ip_address=socket.gethostbyname(hostname) if hostname else None
except (socket.gaierror, UnicodeError):
    ip_address=None

print(f"Original string: [{text}]\n")
print(f"Urls: {urls_str}")
print(f"Host name: {hostname}")
print(f"Host length: {len(hostname)}")
print(f"URL length: {len(urls_str)}")
print(f"IP Address: {ip_address}")

# **Data Loading**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

sns.set(rc={'figure.figsize': (15, 6)})
%matplotlib inline

In [None]:
df=pd.read_csv('dataset.csv')
df.head()

# Data Preprocessing and Analysis

In [None]:
# Remove the 'index' column (presumably a row counter stored in the CSV — confirm).
# Rebinding instead of inplace=True, together with errors='ignore', keeps the cell safe to re-run.
df=df.drop(columns='index', errors='ignore')
df.head()

In [None]:
# Print, for every column, its name together with the distinct values it contains.
for col in df.columns:
  print(f"Unique values of column :\n{col, df[col].unique()}\n")

In [None]:
# One bar chart per column showing how often each value occurs.
for col in df.columns:
  plt.figure(figsize=(15, 6))
  # Pass the column explicitly as `x`: the first positional parameter of countplot is `data`
  # in current seaborn, so a bare positional Series is not interpreted as the variable to count.
  sns.countplot(x=df[col])
  plt.title("\nCount for "+col+" column values")
  plt.show()

In [None]:
# Recode the label column: every -1 in 'Result' becomes 0; all other values are left unchanged.
df['Result']=df['Result'].replace(-1,0)
# Show the recoded column as a one-column frame.
df[['Result']]

In [None]:
df.isnull().sum()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.model_selection import train_test_split

In [None]:
# Label is 'Result'; every other column is a feature, scaled by its own column maximum.
y=df['Result']
X=df.drop(columns='Result')
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

# Model Building

### ***Random Forest***

***Features From Wrapper (default parameters)***

In [None]:
# Column subset used for this section (per the heading, the features kept by the wrapper method).
wrapper_features=[
    'having_IPhaving_IP_Address',
    'URLURL_Length',
    'Shortining_Service',
    'having_At_Symbol',
    'Prefix_Suffix',
    'having_Sub_Domain',
    'SSLfinal_State',
    'Domain_registeration_length',
    'HTTPS_token',
    'Request_URL',
    'URL_of_Anchor',
    'Links_in_tags',
    'SFH',
    'Submitting_to_email',
    'Redirect',
    'popUpWidnow',
    'age_of_domain',
    'DNSRecord',
    'web_traffic',
    'Page_Rank',
    'Google_Index',
    'Links_pointing_to_page',
    'Statistical_report',
]
y=df['Result']
X=df[wrapper_features]
# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
rf=RandomForestClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

***No Feature Selection***

In [None]:
# Random forest capped at depth 10 with a fixed seed, scored by accuracy on the held-out split.
# NOTE(review): the heading says "No Feature Selection", but no new split is created in this cell,
# so it trains on the X_train/X_test left by the preceding wrapper-feature cell — confirm intent.
rf=RandomForestClassifier(max_depth=10, random_state=0).fit(X_train, y_train)
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate numbers of trees in the forest: 20 evenly spaced values from 100 to 2000
n_estimators=[int(x) for x in np.linspace(start=100, stop=2000, num=20)]

In [None]:
# Number of features to consider at every split.
# 'auto' is no longer accepted by scikit-learn (removed in 1.3); for classifiers it was an alias
# of 'sqrt', so the grid now offers two genuinely different strategies.
max_features=['sqrt', 'log2']

In [None]:
# Maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)

In [None]:
# Minimum number of samples required to split a node
min_samples_split=[2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf=[1, 2, 4]
# Method of selecting samples for training each tree
bootstrap=[True, False]

In [None]:
# Create random grid
random_grid={'n_estimators': n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf,
             'bootstrap': bootstrap}
random_grid

In [None]:
rf=RandomForestClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
# Random forest with hard-coded hyper-parameters (presumably copied from the search result above — confirm).
# max_features='sqrt' replaces 'auto': same behaviour for classifiers on older scikit-learn,
# and still accepted on releases where 'auto' has been removed.
rf=RandomForestClassifier(n_estimators=1200,
                          max_depth=30,
                          bootstrap=False,
                          max_features='sqrt',
                          min_samples_leaf=1,
                          min_samples_split=2).fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

***Random Forest Importance (Embedded Method)***

In [None]:
imp=rf.feature_importances_

fi=pd.DataFrame({'features': X.columns, 'importance': imp}).sort_values('importance', ascending=False)
plt.figure(figsize=(15, 8))
sns.barplot(x='importance', y='features', data=fi)
plt.show()

In [None]:
# Keep the first 21 rows of the importance table (sorted by descending importance where it was built).
imp_feat=list(fi['features'].head(21))
y=df['Result']
X=df[imp_feat]
# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
rf=RandomForestClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
rf=RandomForestClassifier(n_estimators=1700, 
                          max_depth=60, 
                          bootstrap=False, 
                          max_features='sqrt', 
                          min_samples_leaf=2, 
                          min_samples_split=5).fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

***Correlation Coefficient (Filter Method)***

In [None]:
plt.figure(figsize=(30, 20))
sns.heatmap(df.corr(), annot=True, linewidths=.5, fmt='.2g')
plt.show()

In [None]:
corr=df.corr()
filter_corr=corr[(corr >= .80) & (corr != 1)]
plt.figure(figsize=(30, 20))
sns.heatmap(filter_corr, annot=True, cmap='Reds', linewidths=.5, fmt='.2g')
plt.show()

In [None]:
print(list(df.columns))

In [None]:
obs_cols=df[['Shortining_Service', 'double_slash_redirecting', 'Favicon', 'port', 'popUpWidnow', 'Result']]
print(obs_cols.corr()['Result'].sort_values(ascending=False)) 

In [None]:
df.shape

In [None]:
# Drop three of the columns inspected in the correlation check above.
# Rebinding with errors='ignore' (instead of inplace=True) lets the cell be re-run without a KeyError.
df=df.drop(columns=['Favicon', 'popUpWidnow', 'Shortining_Service'], errors='ignore')
df.shape

In [None]:
corr=df.corr()
filter_corr=corr[(corr >= .80) & (corr != 1)]
plt.figure(figsize=(30, 20))
sns.heatmap(filter_corr, annot=True, cmap='Reds', linewidths=.5, fmt='.2g')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

In [None]:
# All remaining columns except the label are features, scaled by their own column maximum.
y=df['Result']
X=df.drop(columns='Result')
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf=RandomForestClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
print(rf.get_params())

In [None]:
# Create random grid
random_grid={'n_estimators': n_estimators,
             'max_features': max_features,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf,
             'bootstrap': bootstrap}
random_grid

In [None]:
rf=RandomForestClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
# Random forest with hard-coded hyper-parameters (presumably copied from the search result above — confirm).
# max_features='sqrt' replaces 'auto': identical meaning for classifiers, and valid on current scikit-learn.
rf=RandomForestClassifier(n_estimators=1900,
                          max_depth=80,
                          bootstrap=True,
                          max_features='sqrt',
                          min_samples_leaf=1,
                          min_samples_split=5).fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
import joblib
model=r"randomforest.pkl"
joblib.dump(rf, model)

In [None]:
files.download('randomforest.pkl')

***Step-wise Feature selection (Wrapper)***

In [None]:
rf=RandomForestClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
# NOTE(review): this search is fitted on the full X, y rather than X_train, y_train, so the rows
# later used for testing take part in tuning — confirm this is intended.
rand_search.fit(X, y)
print(rand_search.best_params_)

In [None]:
import joblib
import sys
sys.modules['sklearn.externals.joblib']=joblib
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
X.columns

In [None]:
X_fi=X.values
y_fi=y.values

In [None]:
# Sequential floating forward selection around a random forest, growing up to all columns of X.
# max_features='sqrt' replaces 'auto' (same meaning for classifiers; 'auto' is rejected by current scikit-learn).
# NOTE(review): cv=0 appears to disable cross-validation in mlxtend, and the selector is fitted on
# all rows (X_fi, y_fi), so the subset scores are presumably training-set accuracy — confirm.
sffs=SFS(RandomForestClassifier(n_estimators=1200,
                          max_depth=30,
                          bootstrap=False,
                          max_features='sqrt',
                          min_samples_leaf=1,
                          min_samples_split=2),
         k_features=X.shape[1], forward=True, floating=True, scoring='accuracy', cv=0)
sffs.fit(X_fi, y_fi, custom_feature_names=X.columns)

In [None]:
sffs_df=pd.DataFrame(sffs.subsets_).transpose()
sffs_df

In [None]:
sffs_df.avg_score.sort_values(ascending=False)

In [None]:
sffs_df['feature_names'].loc[23]

In [None]:
sffs_df.to_csv('Important_features.csv', index=False)
files.download('Important_features.csv')

In [None]:
type(sffs_df['feature_names'].loc[23])

In [None]:
# Feature names stored in the row labelled 23 of the selection table, as a plain list.
# NOTE(review): 23 is hard-coded; rows appear to be keyed by subset size and this one was presumably
# picked by reading the sorted scores above — confirm it still matches the best row after a re-run.
IF=list(sffs_df['feature_names'].loc[23])
IF

In [None]:
type(np.array(IF))

In [None]:
# Restrict the features to the subset chosen from the selection table; the label is 'Result'.
y=df['Result']
X=df[IF]
# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
rf=RandomForestClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
# Random forest with hard-coded hyper-parameters (presumably copied from the search result above — confirm).
# max_features='sqrt' replaces 'auto': identical meaning for classifiers, and valid on current scikit-learn.
rf=RandomForestClassifier(n_estimators=1200,
                          max_depth=30,
                          bootstrap=False,
                          max_features='sqrt',
                          min_samples_leaf=1,
                          min_samples_split=2).fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

***Information Gain***

In [None]:
# Start again from every column except the label, scaled by its own column maximum.
y=df['Result']
X=df.drop(columns='Result')
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
# Mutual information between each feature column of X and the label.
imp=mutual_info_classif(X, y)
# Name the scores with X's own columns so names and scores stay aligned regardless of where
# 'Result' sits in df (the previous positional slice assumed it was the last column).
fi=pd.DataFrame({'features': X.columns, 'importance': imp}).sort_values('importance', ascending=False)

In [None]:
plt.figure(figsize=(15, 8))
sns.barplot(x='importance', y='features', data=fi)
plt.show()

In [None]:
imp_feat=list(fi['features'].head(18))

In [None]:
# Restrict the features to the names collected in imp_feat; the label is 'Result'.
y=df['Result']
X=df[imp_feat]
# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
rf=RandomForestClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
rf=RandomForestClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
# Random forest with hard-coded hyper-parameters (presumably copied from the search result above — confirm).
# max_features='sqrt' replaces 'auto': identical meaning for classifiers, and valid on current scikit-learn.
rf=RandomForestClassifier(n_estimators=1200,
                          max_depth=30,
                          bootstrap=False,
                          max_features='sqrt',
                          min_samples_leaf=1,
                          min_samples_split=2).fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

***Chi Square***

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
X=df.drop('Result', axis=1)
y=df['Result']

In [None]:
# Recode -1 as 2 in every column at once (the chi-square scorer used below needs non-negative inputs).
X=X.replace(-1, 2)

In [None]:
X.shape

In [None]:
chi_t=SelectKBest(score_func=chi2, k=X.shape[1])
chi_t.fit(X, y)

In [None]:
fi=pd.DataFrame({'features': X.columns, 'importance': chi_t.scores_}).sort_values('importance', ascending=False)
plt.figure(figsize=(15, 8))
sns.barplot(x='importance', y='features', data=fi)
plt.show()

In [None]:
# Keep the first 16 rows of the chi-square score table (sorted by descending score where it was built).
imp_feat=list(fi['features'].head(16))
y=df['Result']
# Recode -1 as 2 with a single frame-level replace. This returns a new frame, so nothing is
# assigned column-by-column into a selection of df (which triggers SettingWithCopyWarning).
X=df[imp_feat].replace(-1, 2)

# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG explicitly, then let the stratified 70/30 split draw from it
# (the previous random_state=np.random.seed(10) passed None and relied on the same side effect).
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, stratify=y, test_size=0.30, random_state=None)
X_train.shape, X_test.shape

In [None]:
rf=RandomForestClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
rf=RandomForestClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
# Random forest with hard-coded hyper-parameters (presumably copied from the search result above — confirm).
# max_features='sqrt' replaces 'auto': identical meaning for classifiers, and valid on current scikit-learn.
rf=RandomForestClassifier(n_estimators=1200,
                          max_depth=30,
                          bootstrap=False,
                          max_features='sqrt',
                          min_samples_leaf=1,
                          min_samples_split=2).fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
# Second hard-coded configuration fitted on the same split (it is the model that gets saved below).
# max_features='sqrt' replaces 'auto': identical meaning for classifiers, and valid on current scikit-learn.
rf=RandomForestClassifier(n_estimators=1900,
                          max_depth=80,
                          bootstrap=True,
                          max_features='sqrt',
                          min_samples_leaf=1,
                          min_samples_split=5).fit(X_train, y_train)

In [None]:
print(f"Training score: {rf.score(X_train, y_train).round(2)}\n")
print(f"Test score: {rf.score(X_test, y_test).round(2)}")

In [None]:
pred=rf.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
plt.figure(figsize=(16, 6))
sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt='g')
plt.show()

In [None]:
print(classification_report(y_test, pred, target_names=['Phishing', 'Real']))

In [None]:
import joblib
model=r"randomforest.pkl"
joblib.dump(rf, model)

files.download('randomforest.pkl')

### ***Light Gradient Boosted Machine***

In [None]:
from lightgbm import LGBMClassifier

In [None]:
lgb=LGBMClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
# Score the LightGBM model fitted above; predicting with `rf` here would report the accuracy
# of the random forest from the previous section instead.
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
X_test.shape

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate numbers of boosted trees: 20 evenly spaced values from 100 to 2000
n_estimators=[int(x) for x in np.linspace(start=100, stop=2000, num=20)]

# Candidate maximum tree depths: 10, 20, ..., 110, plus None
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Candidate learning rates
learning_rate=[0.01, 0.1, 1]
# Search space for the LightGBM random search; this rebinds random_grid, replacing the random-forest grid
random_grid={'n_estimators': n_estimators,
             'max_depth': max_depth,
             'learning_rate': learning_rate}
random_grid

***No Feature Selection***

In [None]:
lgb=LGBMClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=lgb, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
lgb=LGBMClassifier(max_depth=110, n_estimators=700, learning_rate=0.1).fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

***LGBM Feature Importance (Embedded Method)***

In [None]:
imp=lgb.feature_importances_

fi=pd.DataFrame({'features': X.columns, 'importance': imp}).sort_values('importance', ascending=False)
plt.figure(figsize=(15, 8))
sns.barplot(x='importance', y='features', data=fi)
plt.show()

In [None]:
# Keep up to 21 rows from the top of the LightGBM importance table (sorted descending where it was built).
imp_feat=list(fi['features'].head(21))
y=df['Result']
X=df[imp_feat]
# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
lgb=LGBMClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=lgb, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
X_train.shape

In [None]:
lgb=LGBMClassifier(max_depth=110, n_estimators=700, learning_rate=0.1).fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

***Correlation Coefficient***

In [None]:
# LightGBM with default parameters.
# NOTE(review): no feature selection or train/test split is rebuilt under this heading, so the fit
# reuses whatever X_train/y_train the last executed split cell produced — confirm that the
# correlation-filtered features were the intended input.
lgb=LGBMClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
X_test.shape

In [None]:
lgb=LGBMClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=lgb, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
lgb=LGBMClassifier(max_depth=110, n_estimators=900, learning_rate=0.1).fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
X_test.shape

In [None]:
plt.figure(figsize=(16, 6))
sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt='g')
plt.show()

In [None]:
print(classification_report(y_test, pred, target_names=['Phishing', 'Real']))

In [None]:
import joblib
model=r"lightgbm.pkl"
joblib.dump(lgb, model)

files.download('lightgbm.pkl')

***Step wise Feature Selection (Wrapper Method)***

In [None]:
lgb=LGBMClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=lgb, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
# NOTE(review): this search is fitted on the full X, y rather than X_train, y_train, so the rows
# later used for testing take part in tuning — confirm this is intended.
rand_search.fit(X, y)
print(rand_search.best_params_)

In [None]:
X_fi=X.values
y_fi=y.values

In [None]:
sffs=SFS(LGBMClassifier(max_depth=50, n_estimators=400, learning_rate=0.1), 
         k_features=X.shape[1], forward=True, floating=True, scoring='accuracy', cv=0)
sffs.fit(X_fi, y_fi, custom_feature_names=X.columns)

In [None]:
sffs_df=pd.DataFrame(sffs.subsets_).transpose()
sffs_df

In [None]:
sffs_df.avg_score.sort_values(ascending=False)

In [None]:
# Take the feature names of the best-scoring subset instead of a fixed row label.
# The selector above was capped at k_features=X.shape[1]; the rows appear to be keyed by subset
# size, so the hard-coded label 23 raises KeyError whenever X has fewer than 23 columns.
best_k=sffs_df['avg_score'].astype(float).idxmax()
IF=list(sffs_df.loc[best_k, 'feature_names'])
IF

In [None]:
# Restrict the features to the subset chosen from the selection table; the label is 'Result'.
y=df['Result']
X=df[IF]
# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
lgb=LGBMClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=lgb, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
lgb=LGBMClassifier(max_depth=110, n_estimators=700, learning_rate=0.1).fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

***Information Gain (Filter Method)***

In [None]:
X_train.shape

In [None]:
# LightGBM with default parameters.
# NOTE(review): no information-gain selection or split is rebuilt under this heading, so the fit
# reuses the X_train/y_train left by the last executed split cell — confirm the intended features.
lgb=LGBMClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
lgb=LGBMClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=lgb, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
lgb=LGBMClassifier(max_depth=70, n_estimators=800, learning_rate=0.1).fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

***Chi Square Test (Filter Method)***

In [None]:
X_train.shape

In [None]:
# LightGBM with default parameters.
# NOTE(review): no chi-square selection or split is rebuilt under this heading, so the fit
# reuses the X_train/y_train left by the last executed split cell — confirm the intended features.
lgb=LGBMClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
lgb=LGBMClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=lgb, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
lgb=LGBMClassifier(max_depth=110, n_estimators=1400, learning_rate=0.1).fit(X_train, y_train)

In [None]:
print(f"Training score: {lgb.score(X_train, y_train).round(2)}\n")
print(f"Test score: {lgb.score(X_test, y_test).round(2)}")

In [None]:
pred=lgb.predict(X_test)
accuracy_score(y_test, pred)

### ***Category Boosting***

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
cat=CatBoostClassifier().fit(X_train,y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
X_test.shape

In [None]:
plt.figure(figsize=(16, 6))
sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt='g')
plt.show()

In [None]:
print(classification_report(y_test, pred, target_names=['Phishing', 'Real']))

In [None]:
import joblib
model=r"catboost.pkl"
joblib.dump(cat, model)

files.download('catboost.pkl')

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate numbers of boosting iterations: 10, 20, ..., 100
iterations=[int(x) for x in np.linspace(start=10, stop=100, num=10)]

# Candidate tree depths. CatBoost rejects depths above 16, so the former 10..110 grid
# (copied from the other models) made every candidate deeper than 10 fail to fit.
# The range now matches the CatBoost grid defined again further down the notebook.
depth=[4, 5, 6, 7, 8, 9, 10]
depth.append(None)

# Candidate learning rates
learning_rate=[0.01, 0.1, 1]
# Search space for the CatBoost random search; this rebinds random_grid, replacing the LightGBM grid
random_grid={'iterations': iterations,
             'depth': depth,
             'learning_rate': learning_rate}
random_grid

In [None]:
X_train.shape

***No Feature Selection***

In [None]:
cat=CatBoostClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=cat, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
cat=CatBoostClassifier(depth=10, iterations=90, learning_rate=0.1).fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
X_test.shape

***CatBoost Feature Importance (Embedded Method)***

In [None]:
imp=cat.feature_importances_

fi=pd.DataFrame({'features': X.columns, 'importance': imp}).sort_values('importance', ascending=False)
plt.figure(figsize=(15, 8))
sns.barplot(x='importance', y='features', data=fi)
plt.show()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of iterations
iterations=[int(x) for x in np.linspace(start=10, stop=100, num=10)]

# Maximum number of levels in tree
depth=[4,5,6,7,8,9, 10]
depth.append(None)

#learning rate
learning_rate=[0.01, 0.1, 1]
# Create the random grid
random_grid={'iterations': iterations,
             'depth': depth,
             'learning_rate': learning_rate}
random_grid

In [None]:
# Keep the first 12 rows of the CatBoost importance table (sorted descending where it was built).
imp_feat=list(fi['features'].head(12))
y=df['Result']
X=df[imp_feat]
# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
cat=CatBoostClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=cat, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
cat=CatBoostClassifier(depth=10, iterations=90, learning_rate=1).fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

***Correlation Coefficient (Filter Method)***

In [None]:
# CatBoost with default parameters.
# NOTE(review): no correlation-based selection or split is rebuilt under this heading, so the fit
# reuses the X_train/y_train left by the last executed split cell — confirm the intended features.
cat=CatBoostClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
X_train.shape

In [None]:
cat=CatBoostClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=cat, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
cat=CatBoostClassifier(depth=8, iterations=80, learning_rate=1).fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

***Wrapper***

In [None]:
X_fi=X.values
y_fi=y.values

In [None]:
sffs=SFS(CatBoostClassifier(depth=10, iterations=90, learning_rate=1), 
         k_features=X.shape[1], forward=True, floating=True, scoring='accuracy', cv=0)
sffs.fit(X_fi, y_fi, custom_feature_names=X.columns)

In [None]:
sffs_df=pd.DataFrame(sffs.subsets_).transpose()
sffs_df

In [None]:
sffs_df.avg_score.sort_values(ascending=False)

In [None]:
# Take the feature names of the best-scoring subset instead of a fixed row label.
# The selector above was capped at k_features=X.shape[1]; the rows appear to be keyed by subset
# size, so the hard-coded label 23 raises KeyError whenever X has fewer than 23 columns.
best_k=sffs_df['avg_score'].astype(float).idxmax()
IF=list(sffs_df.loc[best_k, 'feature_names'])
IF

In [None]:
# Restrict the features to the subset chosen from the selection table; the label is 'Result'.
y=df['Result']
X=df[IF]
# Scale each column by its own maximum.
X=X/X.max()

# Seed NumPy's global RNG, then leave random_state unset so the stratified 70/30 split draws from it.
np.random.seed(10)
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.30, stratify=y, random_state=None)
X_train.shape, X_test.shape

In [None]:
cat=CatBoostClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=cat, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
cat=CatBoostClassifier(depth=9, iterations=90, learning_rate=1).fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

***Information Gain (Filter Method)***

In [None]:
X_train.shape

In [None]:
# CatBoost with default parameters.
# NOTE(review): no information-gain selection or split is rebuilt under this heading, so the fit
# reuses the X_train/y_train left by the last executed split cell — confirm the intended features.
cat=CatBoostClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
cat=CatBoostClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=cat, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
cat=CatBoostClassifier(depth=9, iterations=90, learning_rate=1).fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

***Chi Square Method (Filter Method)***

In [None]:
X_test.shape

In [None]:
# CatBoost with default parameters.
# NOTE(review): no chi-square selection or split is rebuilt under this heading, so the fit
# reuses the X_train/y_train left by the last executed split cell — confirm the intended features.
cat=CatBoostClassifier().fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
cat=CatBoostClassifier()
# Random search of parameters, using 10 fold cross validation and search across 30 different combinations, and use all available cores
rand_search=RandomizedSearchCV(estimator=cat, param_distributions=random_grid, n_iter=30, cv=10, verbose=2, random_state=42, n_jobs=-1)
rand_search.fit(X_train, y_train)

In [None]:
rand_search.best_params_

In [None]:
cat=CatBoostClassifier(depth=7, iterations=80, learning_rate=1).fit(X_train, y_train)

In [None]:
print(f"Training score: {cat.score(X_train, y_train).round(2)}\n")
print(f"Test score: {cat.score(X_test, y_test).round(2)}")

In [None]:
pred=cat.predict(X_test)
accuracy_score(y_test, pred)