![](../img/results-6.png)

## Load Data

In [7]:
import re
import time
import sys
import warnings
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from IPython.core.display_functions import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.utils.fixes import loguniform
from sklearn.utils import shuffle

# The CSV is read without a header row, so the three column names are
# attached by hand right after loading.
column_names = ["index", "url", "label"]
dataframe = pd.read_csv('../URL_Classification.csv', header=None)
dataframe.columns = column_names

## Preprocessing

In [8]:
# Coerce every entry of the url column to a string, element by element
# (presumably so the vectorizer below only ever receives text).
dataframe['url'] = dataframe['url'].apply(np.str_)

## Prepare labels

In [9]:
# Build the label -> integer id mapping from the *sorted* distinct labels.
# Enumerating a bare set depends on hash order, which for strings changes
# from one interpreter run to the next, so the ids (and any per-class
# results keyed by them) would not be stable across kernel restarts.
# NOTE(review): sorting assumes the label values are mutually orderable
# (e.g. all strings, no missing values) — confirm against the CSV.
labels = sorted(set(dataframe['label']))
label2id = {label: idx for idx, label in enumerate(labels)}

# Replace the label column with its integer ids; `y` is taken from this
# column in the vectorization cell.
dataframe['label'] = dataframe['label'].map(label2id)

## Transform with count vectorizer

In [10]:
# Represent each URL as a sparse count vector of character 5-grams
# (analyzer='char_wb', ngram_range=(5, 5)).
count_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5))
X = count_vectorizer.fit_transform(dataframe['url'])
y = dataframe['label'].values

# Shuffle features and targets together. The shuffle is seeded so that the
# row order is identical on every run; without a fixed random_state the
# ordering, and every score computed from it, changes each time the cell
# is executed. 67 is the same seed value the randomized search uses.
X, y = shuffle(X, y, random_state=67)

## Optimize alpha parameter with RandomizedSearchCV

In [11]:
# Silence all warnings unless warning options were supplied explicitly
# (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses


# Linear model trained with SGD on the hinge loss, with class weights
# balanced. random_state is fixed so that repeated runs produce the same
# fits; the search below was already seeded, but the estimator itself was not.
sgd = SGDClassifier(class_weight='balanced', loss='hinge', random_state=67)

# Sample the regularisation strength alpha log-uniformly over [1e-7, 1e-1].
# NOTE(review): `loguniform` is imported from sklearn.utils.fixes at the top
# of the notebook; newer scikit-learn versions may require
# `from scipy.stats import loguniform` instead — confirm for your environment.
dist = {'alpha': loguniform(1e-7, 1e-1)}
clf = RandomizedSearchCV(sgd, param_distributions=dist, random_state=67, n_jobs=5, n_iter=15)

# perf_counter() is a monotonic clock meant for measuring elapsed intervals,
# unlike time.time(), which follows the (adjustable) system wall clock.
start = time.perf_counter()
search = clf.fit(X, y)  # fit() returns the search object itself
print(f'\nCV Time: {time.perf_counter()-start}')
search.best_params_


CV Time: 3666.119060754776


{'alpha': 7.770699994117932e-06}

## Report results

In [12]:
# Tabulate every sampled candidate, best-ranked first.
ranked_results = pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

# Lift pandas' row/column display limits only while rendering this table.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(ranked_results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,235.930036,2.405631,0.711817,0.066934,8e-06,{'alpha': 7.770699994117932e-06},0.590471,0.596089,0.603795,0.597137,0.596324,0.596763,0.004238,1
12,292.163019,5.310945,0.731848,0.055272,3e-06,{'alpha': 3.2540861791745958e-06},0.600488,0.598917,0.594397,0.586497,0.599722,0.596004,0.005203,2
6,335.047672,5.145472,0.706333,0.060668,2e-06,{'alpha': 1.9006368801574431e-06},0.595667,0.593629,0.594083,0.592668,0.595838,0.594377,0.001214,3
14,361.120756,12.733088,0.563524,0.11331,1e-06,{'alpha': 1.3397634743642556e-06},0.593587,0.587995,0.585385,0.597997,0.586353,0.590264,0.004798,4
3,227.537132,2.086969,0.639314,0.088997,1e-05,{'alpha': 9.762272497212736e-06},0.578853,0.590081,0.598495,0.587946,0.590448,0.589164,0.006283,5
5,184.642176,2.0462,0.734686,0.009862,2.1e-05,{'alpha': 2.078101167061151e-05},0.585746,0.588306,0.591159,0.562987,0.570962,0.579832,0.010932,6
9,569.036506,10.609288,0.71623,0.032374,0.0,{'alpha': 1.911489162287619e-07},0.574806,0.575116,0.573078,0.580268,0.578157,0.576285,0.002577,7
4,545.368271,10.167325,0.727081,0.009108,0.0,{'alpha': 2.2908021253208054e-07},0.575961,0.580897,0.57034,0.578269,0.575243,0.576142,0.003511,8
0,121.704034,2.007659,0.622915,0.114333,0.000188,{'alpha': 0.00018841200866151887},0.542899,0.537771,0.545218,0.547053,0.543675,0.543323,0.003119,9
11,107.330365,2.540575,0.730925,0.008134,0.000356,{'alpha': 0.00035580905457201467},0.53946,0.521488,0.542652,0.548144,0.522775,0.534904,0.0108,10
