# Hiperparameter tuning

In [1]:
# It is important to have the same version in Kaggle and in local
# to save and import the models properly
!pip install scikit-learn==1.5.2

Collecting scikit-learn==1.5.2
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 0.22.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.10.0, but you have google-cloud-bigquery 2.34.4 which is incompatible.
bigframes 0.22.0 requires google-cloud-storage>=2.0.0, but you have google-c

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import re
import string
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, make_scorer
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("/kaggle/input/webtexts-en/all-codes-small.csv")

In [4]:
text_stream = [nltk.word_tokenize(d.lower()) for d in df.text.tolist()]

In [5]:
labels = df["label"].tolist()

In [6]:
!unzip -n /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [7]:
def token_is_error_status_code(token):
    try:
        n = int(token)
        return 400 <= n <= 425 or 500 <= n <= 511
    except ValueError:
        return False

def token_is_punctuation(token):
    for c in token:
        if c not in string.punctuation:
            return False
    return True

def is_token_just_word(token):
    return re.match("^[a-z'_]+$", token)

def clean_term_sentence(ts):
    for token in ts:
        token = token.strip()
        token = token.replace("’", "'")
        if token and not token_is_punctuation(token) \
            and (is_token_just_word(token) or token_is_error_status_code(token)):
            yield WordNetLemmatizer().lemmatize(token)

clean_streams = [list(clean_term_sentence(ts)) for ts in text_stream]

In [8]:
documents = [" ".join(ts) for ts in clean_streams]

In [9]:
my_vectorizer = TfidfVectorizer(
    analyzer='word', 
    max_features=30000,
    ngram_range=(1, 3)
)

X = my_vectorizer.fit_transform(documents)

doc_vectors = X.toarray()

In [10]:
X_train, X_test, y_train, y_test  = train_test_split(
        doc_vectors, 
        labels,
        train_size=0.80,
        random_state=1234)

In [14]:
param_grid = {
    # "tol": [1e-5, 1e-4, 1e-3], # Best = 1e-5
    # 'C': [0.1, 1, 10, 100, 1000], # Best C = 1
    # "fit_intercept": [True, False], # Best fit_intercept = True
    # "intercept_scaling": [0.1, 1.0, 10], # Best = 0.1, but same as 1
    # "max_iter": [1000, 2000, 3000],
}

# Create an SVM classifier
svm = LinearSVC()

# Create a GridSearchCV object
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid, 
    cv=5, 
    scoring=make_scorer(f1_score, pos_label="error"),
    verbose=10,
)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and corresponding accuracy score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy Score: {}".format(grid_search.best_score_))

# Evaluate the model on the test set
best_svm = grid_search.best_estimator_
test_accuracy = best_svm.score(X_test, y_test)
print("Test Accuracy: {}".format(test_accuracy))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START intercept_scaling=0.1.......................................
[CV 1/5; 1/3] END ........intercept_scaling=0.1;, score=0.980 total time=   7.3s
[CV 2/5; 1/3] START intercept_scaling=0.1.......................................
[CV 2/5; 1/3] END ........intercept_scaling=0.1;, score=0.982 total time=   7.1s
[CV 3/5; 1/3] START intercept_scaling=0.1.......................................
[CV 3/5; 1/3] END ........intercept_scaling=0.1;, score=0.981 total time=   7.0s
[CV 4/5; 1/3] START intercept_scaling=0.1.......................................
[CV 4/5; 1/3] END ........intercept_scaling=0.1;, score=0.983 total time=   7.0s
[CV 5/5; 1/3] START intercept_scaling=0.1.......................................
[CV 5/5; 1/3] END ........intercept_scaling=0.1;, score=0.984 total time=   7.1s
[CV 1/5; 2/3] START intercept_scaling=1.0.......................................
[CV 1/5; 2/3] END ........intercept_scaling=1.0;,



[CV 1/5; 3/3] END .........intercept_scaling=10;, score=0.980 total time=  15.7s
[CV 2/5; 3/3] START intercept_scaling=10........................................




[CV 2/5; 3/3] END .........intercept_scaling=10;, score=0.982 total time=  15.6s
[CV 3/5; 3/3] START intercept_scaling=10........................................




[CV 3/5; 3/3] END .........intercept_scaling=10;, score=0.981 total time=  15.6s
[CV 4/5; 3/3] START intercept_scaling=10........................................




[CV 4/5; 3/3] END .........intercept_scaling=10;, score=0.983 total time=  15.8s
[CV 5/5; 3/3] START intercept_scaling=10........................................




[CV 5/5; 3/3] END .........intercept_scaling=10;, score=0.984 total time=  15.8s
Best Hyperparameters:  {'intercept_scaling': 0.1}
Best Accuracy Score: 0.9820038009056908
Test Accuracy: 0.9838474617439883
