# Summary

* `C` sampled from loguniform(1e-5, 1) (20 random draws between 1e-5 and 1)
* 3-fold CV
* Best mean_test_score 60.87 % for C = 0.041124
* Very similar to Csearch2

![](../img/results-3.png)

## Load Data

In [8]:
import os
import re
import sys
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
# NOTE(review): newer scikit-learn releases appear to have dropped this shim;
# scipy.stats.loguniform is the equivalent -- confirm against the installed version.
from sklearn.utils.fixes import loguniform

# The CSV ships without a header row, so read it header-less and name the
# three columns explicitly afterwards.
column_names = ["index", "url", "label"]
dataframe = pd.read_csv('../URL_Classification.csv', header=None)
dataframe.columns = column_names

## Preprocessing

In [9]:
# Coerce every URL cell to a NumPy string, element by element, so the
# vectorizer only ever sees text values.
dataframe['url'] = dataframe['url'].map(np.str_)

## Prepare labels

In [10]:
# Encode the class labels as integer ids.
#
# Ids are assigned by enumerating the *sorted* distinct labels. Iterating a
# bare set of strings follows hash order, which can differ from one kernel to
# the next, so the label -> id mapping would not be reproducible.
# Sorting also makes a second run of this cell harmless: the already-mapped
# ids 0..k-1 sort to themselves and map back to the same values.
# Assumes the label values are mutually comparable (e.g. all strings).
labels = set(dataframe['label'])
label2id = {l: n for n, l in enumerate(sorted(labels))}
dataframe['label'] = dataframe['label'].map(label2id)

## Transform with count vectorizer

In [11]:
# Bag of character 5-grams, built only from text inside word boundaries
# (analyzer='char_wb').
count_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(5, 5))
X = count_vectorizer.fit_transform(dataframe['url'])
y = dataframe['label'].values

# Shuffle rows and labels together. The seed is fixed so the row order, and
# therefore the CV fold membership and scores, are the same on every run.
X, y = shuffle(X, y, random_state=67)

## Optimize C parameter with RandomizedSearchCV

In [12]:
# Silence warnings unless the interpreter was started with explicit -W
# options. The environment variable is exported as well so that child
# processes inherit the same setting.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses


# Use the explicitly imported LinearSVC: `import sklearn` on its own does not
# guarantee that the `sklearn.svm` submodule has been loaded.
# The base estimator gets its own name so `clf` only ever refers to the search.
base_svc = LinearSVC(class_weight='balanced')

# Draw 20 values of the regularisation strength C log-uniformly from [1e-5, 1]
# and score each with 3-fold cross-validation, using 6 parallel jobs.
dist = {'C': loguniform(1e-5, 1)}
clf = RandomizedSearchCV(base_svc, param_distributions=dist, random_state=67, n_jobs=6, n_iter=20, cv=3, verbose=1)

start = time.time()
search = clf.fit(X, y)  # fit() returns the search object itself
print(f'\nCV Time: {time.time()-start}')
search.best_params_

Fitting 3 folds for each of 20 candidates, totalling 60 fits

CV Time: 8950.783334970474


{'C': 0.04112363485239494}

## Report results

In [13]:
# Tabulate every sampled candidate and list them best-first by CV rank.
results_table = pd.DataFrame(clf.cv_results_)
results_table.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,1426.160846,115.616811,1.22891,0.014326,0.041124,{'C': 0.04112363485239494},0.607471,0.609757,0.608819,0.608682,0.000938,1
2,1204.059535,111.948135,1.307985,0.153467,0.026882,{'C': 0.026882083938628135},0.606782,0.608857,0.607879,0.607839,0.000848,2
10,2009.770576,161.26128,1.259174,0.065205,0.127807,{'C': 0.12780688408174093},0.601634,0.603413,0.60257,0.602539,0.000727,3
11,766.186696,123.523448,1.220826,0.027182,0.009106,{'C': 0.009106416221317432},0.597077,0.599294,0.59798,0.598117,0.00091,4
1,2690.843116,117.662147,1.46971,0.142566,0.196917,{'C': 0.19691692845736392},0.596797,0.598465,0.597349,0.597537,0.000694,5
0,688.08198,35.673828,1.271856,0.053809,0.005361,{'C': 0.005361140014825926},0.588789,0.590682,0.589189,0.589553,0.000814,6
7,3813.966674,32.559817,1.256428,0.133,0.459762,{'C': 0.4597621874459045},0.584217,0.586639,0.585187,0.585348,0.000995,7
19,312.64193,5.442985,0.710076,0.227305,0.001152,{'C': 0.0011515220476982474},0.551695,0.553919,0.552145,0.552586,0.00096,8
18,344.386584,12.78985,0.966202,0.197003,0.001117,{'C': 0.001116715513266136},0.55085,0.55303,0.551289,0.551723,0.000942,9
5,338.53252,16.995659,1.259716,0.043064,0.000854,{'C': 0.0008538643965272543},0.542687,0.544627,0.543102,0.543472,0.000834,10
