# Summary
* Very coarse alpha grid search
* np.logspace(-7, 2, 10) (10 values between 1e-7 and 100)
* Shows that alpha values around 1e-5 are best

![Alpha search results graph](../img/results-5.png)

## Load Data

In [1]:
import re
import time
import sys
import warnings
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from IPython.core.display_functions import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.utils.fixes import loguniform
from sklearn.utils import shuffle

# header=None makes pandas treat the first line of the file as data; the three
# column names are attached right after the read.
dataframe = (
    pd.read_csv('../URL_Classification.csv', header=None)
    .set_axis(["index", "url", "label"], axis="columns")
)

## Preprocessing

In [2]:
# Convert every entry of the url column to a NumPy string (np.str_), element by element.
dataframe['url'] = dataframe['url'].map(np.str_)

## Prepare labels

In [3]:
# Unique labels in sorted order; a label's position in this list is its integer id.
labels = sorted(set(dataframe['label']))
label2id = dict(zip(labels, range(len(labels))))

# Replace the label column with the corresponding integer ids.
dataframe['label'] = dataframe['label'].map(label2id)

## Transform with count vectorizer

In [4]:
# Represent each URL as counts of character 5-grams ('char_wb' builds the n-grams
# only from characters inside word boundaries).
count_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5))
X = count_vectorizer.fit_transform(dataframe['url'])
y = dataframe['label'].values

# Shuffle the rows of X and y in unison. The fixed seed keeps the row order identical
# across "Restart & Run All"; without it, anything downstream that depends on row
# order (e.g. unshuffled CV splits) changes from run to run.
X, y = shuffle(X, y, random_state=67)

## Optimize alpha parameter with grid search

In [6]:
# Linear model with hinge loss trained by SGD; class_weight='balanced' reweights
# classes inversely to their frequency. random_state pins SGD's internal data
# shuffling so that the reported scores can be reproduced.
sgd = SGDClassifier(class_weight='balanced', loss='hinge', random_state=67)

# 10 log-spaced candidates from 1e-7 to 1e2.
alphas = np.logspace(-7, 2, 10)
dist = {'alpha': alphas}

# n_iter equals the number of candidates, so every alpha is evaluated exactly once
# (the search samples without replacement when all parameters are given as lists).
clf = RandomizedSearchCV(
    sgd,
    param_distributions=dist,
    n_iter=len(alphas),
    random_state=67,
    n_jobs=4,
    cv=2,
)

# perf_counter is a monotonic clock, which is the appropriate choice for elapsed time.
start = time.perf_counter()
search = clf.fit(X, y)  # fit() returns the search object itself, so `search is clf`
print(f'\nCV Time: {time.perf_counter()-start}')
search.best_params_


CV Time: 631.3058078289032


{'alpha': 1e-05}

## Report results

In [7]:
# Tabulate the per-candidate CV results, best rank first.
cv_results_ranked = pd.DataFrame(clf.cv_results_).sort_values('rank_test_score')

# Temporarily lift pandas' row/column display limits so the full table is rendered.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(cv_results_ranked)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
2,115.807221,0.593454,1.23272,0.038837,1e-05,{'alpha': 1e-05},0.582418,0.585391,0.583905,0.001487,1
1,182.4151,0.461442,1.260706,0.051676,1e-06,{'alpha': 1e-06},0.570481,0.570346,0.570414,6.8e-05,2
0,303.375145,0.086031,1.269047,0.025495,0.0,{'alpha': 1e-07},0.549634,0.559133,0.554383,0.004749,3
3,66.300714,1.012127,1.308877,0.017563,0.0001,{'alpha': 0.0001},0.540281,0.551657,0.545969,0.005688,4
4,48.522202,0.137385,1.296312,0.024154,0.001,{'alpha': 0.001},0.511815,0.502994,0.507404,0.004411,5
5,47.461583,0.03869,1.327417,0.018899,0.01,{'alpha': 0.01},0.477142,0.466046,0.471594,0.005548,6
6,42.717528,0.22152,1.317294,0.005594,0.1,{'alpha': 0.1},0.460271,0.448409,0.45434,0.005931,7
7,43.858468,0.471256,1.286627,0.051133,1.0,{'alpha': 1.0},0.326023,0.329278,0.32765,0.001628,8
9,42.167166,0.027986,0.863762,0.098098,100.0,{'alpha': 100.0},0.234728,0.259058,0.246893,0.012165,9
8,52.209119,0.231095,1.208762,0.095846,10.0,{'alpha': 10.0},0.258194,0.202163,0.230179,0.028016,10
