In [1]:
from pprint import pprint
from time import time
import pathlib
import pandas as pd


import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
# Load pre-processed train and test data
data_path = pathlib.Path().cwd().parent / "data"
print(f"Logging info - Loading data from {data_path}")
train_df = pd.read_csv(f"{data_path}/train.csv", keep_default_na=False)
test_df = pd.read_csv(f"{data_path}/test.csv", keep_default_na=False)

Logging info - Loading data from /Users/1150704/Documents/mywork/zip_product_classification/data


In [3]:
# build a pipeline 
pipeline = Pipeline([('vect', TfidfVectorizer(ngram_range=(1,3), stop_words='english', 
                                              sublinear_tf=True, max_features=50000, min_df=2)),
                     ('clf', LogisticRegression(n_jobs=4, solver='lbfgs', 
                                                random_state=17, verbose=0, 
                                                multi_class='multinomial',
                                                penalty="l2", max_iter=1000,
                                                fit_intercept=True))
                    ])

## Hyper-parameter tuning

In [4]:
# Tune the weight that controls the strength of penalty (Smaller values specify stronger regularization)
parameters = {
    "clf__C": [0.5, 1, 10, 100] 
}

In [5]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=5, scoring="accuracy")

print("Logging info - Performing grid search")
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(train_df["desc"], train_df["cat0_cat1_cat2"])
print("done in %0.3fs" % (time() - t0))

Logging info - Performing grid search
parameters:
{'clf__C': [0.5, 1, 10, 100]}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
done in 1001.541s


In [6]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.970541012216405
{'clf__C': 100}


In [7]:
grid_search.cv_results_

{'mean_fit_time': array([396.01367183, 454.93509941, 624.29208021, 473.66431875]),
 'std_fit_time': array([ 11.75480228,   8.9559064 ,   5.29120517, 132.66540977]),
 'mean_score_time': array([9.54740748, 7.70492458, 5.2909636 , 2.76289558]),
 'std_score_time': array([0.78533107, 0.67507471, 0.7762471 , 0.48931357]),
 'param_clf__C': masked_array(data=[0.5, 1, 10, 100],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__C': 0.5}, {'clf__C': 1}, {'clf__C': 10}, {'clf__C': 100}],
 'split0_test_score': array([0.95826477, 0.96355024, 0.97043131, 0.97137871]),
 'split1_test_score': array([0.95646971, 0.9632012 , 0.97137871, 0.97172775]),
 'split2_test_score': array([0.95771628, 0.96315133, 0.96993269, 0.96948392]),
 'split3_test_score': array([0.95557218, 0.96085764, 0.97013214, 0.97023186]),
 'split4_test_score': array([0.95761655, 0.96364996, 0.96993269, 0.96988282]),
 'mean_test_score': array([0.9571279 , 0.96288207, 0.9

In [None]:
# May consider use MLflow to track hyper-parameter tuning process, 
# so that we can keep each result associated with corresponding parameters. 