In [21]:
from google.colab import drive
drive.mount('/content/gdrive') 
# !unzip src.zip

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import json
import numpy as np
from pprint import pprint
from collections import defaultdict

# from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.decomposition import LatentDirichletAllocation as lda
import pickle

import warnings
warnings.filterwarnings('ignore')

In [5]:
vectors_save_path = "/content/gdrive/MyDrive/nlp/count.npy"
vectorizer_save_path = "/content/gdrive/MyDrive/nlp/count.pickle"

vectors = np.load(open(vectors_save_path, "rb"), allow_pickle=True).item()
vectorizer = pickle.load(open(vectorizer_save_path, "rb"))


In [6]:
print(f"Vocabulary len: {len(vectorizer.get_feature_names())}")
print("Vectors: ", end="")
vectors

Vocabulary len: 48480
Vectors: 

<123915x48480 sparse matrix of type '<class 'numpy.int64'>'
	with 35186442 stored elements in Compressed Sparse Row format>

In [11]:
search_params = {
    'n_components'  : list(range(5, 33, 3))
}
search_params

{'n_components': [5, 8, 11, 14, 17, 20, 23, 26, 29, 32]}

In [12]:
topic_model = lda()
# model = GridSearchCV(topic_model, param_grid=search_params,  n_jobs=-1, cv=3, verbose=2)
model = HalvingGridSearchCV(topic_model, 
                            param_grid=search_params, 
                            min_resources="exhaust", 
                            factor=3, n_jobs=-1, 
                            cv=3, verbose=2)
model.fit(vectors)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 13768
max_resources_: 123915
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 10
n_resources: 13768
Fitting 3 folds for each of 10 candidates, totalling 30 fits
----------
iter: 1
n_candidates: 4
n_resources: 41304
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 2
n_candidates: 2
n_resources: 123912
Fitting 3 folds for each of 2 candidates, totalling 6 fits


HalvingGridSearchCV(cv=3, estimator=LatentDirichletAllocation(), n_jobs=-1,
                    param_grid={'n_components': [5, 8, 11, 14, 17, 20, 23, 26,
                                                 29, 32]},
                    verbose=2)

In [20]:
best_lda_model = model.best_estimator_
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)

Best Model's Params:  {'n_components': 14}
Best Log Likelihood Score:  -237546477.92098168


In [14]:
pickle.dump(model.best_estimator_, open("IAC_exp_seed_minf_10_max_50%.pk", "wb"))