![](../img/results-7.png)

## Load Data

In [1]:
import re
import time
import sys
import warnings
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from IPython.core.display_functions import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.utils.fixes import loguniform
from sklearn.utils import shuffle

dataframe = pd.read_csv('../URL_Classification.csv', header=None)
dataframe.columns = ["index", "url", "label"]

## Preprocessing

In [2]:
dataframe['url'] = dataframe['url'].apply(lambda x: np.str_(x))

## Prepare labels

In [3]:
labels = set(dataframe['label'])
label2id = {l: n for n, l in enumerate(labels)}
dataframe['label'] = dataframe['label'].map(label2id)

## Transform with count vectorizer

In [4]:
count_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5))
X = count_vectorizer.fit_transform(dataframe['url'])
y = dataframe['label'].values

X, y = shuffle(X, y)

## Optimize alpha parameter with RandomizedSearchCV

In [5]:
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses


sgd = sklearn.linear_model.SGDClassifier(class_weight='balanced', loss='hinge')

dist = {'alpha': loguniform(1e-7, 1e-1)}
clf = RandomizedSearchCV(sgd, param_distributions=dist, random_state=67, n_jobs=5, n_iter=150)

start = time.time()
search = clf.fit(X, y)
print(f'\nCV Time: {time.time()-start}')
search.best_params_


CV Time: 26771.25333404541


{'alpha': 3.495077991813885e-06}

## Report results

In [7]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(pd.DataFrame(clf.cv_results_).sort_values('rank_test_score'))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
16,221.688238,1.35284,0.769872,0.057439,3e-06,{'alpha': 3.495077991813885e-06},0.604656,0.607957,0.593712,0.597534,0.600339,0.60084,0.005044,1
24,238.889447,1.329765,0.763134,0.042669,2e-06,{'alpha': 2.4938910461383064e-06},0.600001,0.60178,0.606236,0.597284,0.592818,0.599624,0.004479,2
105,201.79725,2.312472,0.739241,0.025024,5e-06,{'alpha': 5.4530449168258554e-06},0.598843,0.599749,0.606511,0.591763,0.595086,0.59839,0.004957,3
139,232.598924,1.958237,0.768748,0.026864,3e-06,{'alpha': 2.8702333958770317e-06},0.598181,0.59772,0.599553,0.596356,0.597815,0.597925,0.001023,4
57,237.033462,1.879183,0.779471,0.083629,3e-06,{'alpha': 2.7467787855796015e-06},0.598888,0.599118,0.596342,0.598928,0.593922,0.59744,0.002035,5
33,194.326194,1.45604,0.765626,0.022107,6e-06,{'alpha': 6.107326737919852e-06},0.598075,0.60526,0.586984,0.595573,0.600493,0.597277,0.006059,6
84,235.90308,0.596298,0.813675,0.052737,3e-06,{'alpha': 2.681750245958141e-06},0.60074,0.59631,0.596614,0.591721,0.600694,0.597216,0.003344,7
146,169.423537,1.223019,0.716202,0.055096,1e-05,{'alpha': 1.0480099931173604e-05},0.603795,0.59837,0.586789,0.593845,0.59818,0.596196,0.005664,8
137,213.737783,0.925648,0.812521,0.055017,4e-06,{'alpha': 4.149991063166486e-06},0.595152,0.597477,0.592055,0.604459,0.590656,0.59596,0.00487,9
77,237.79735,1.75683,0.757086,0.050924,3e-06,{'alpha': 2.6660403749505833e-06},0.604803,0.594755,0.599294,0.590844,0.58786,0.595511,0.006026,10
