In [1]:
from pathlib import Path
from sklearn.cluster import KMeans
import pandas as pd
from joblib import load, dump
import time

%load_ext jupyternotify

<IPython.core.display.Javascript object>

# Train $k$-Means

Dimensionality reduction technique to preprocess the PubMed embeddings.
  - 1-3 clusterings over PubMed columns (depending on if categories are valuable)
    - k means clustering grid search should be made more extensive at some point
    - params can be chosen based on resulting tree models' feature importance values
    - explore sample weighting for this
    
K-fold cross-validation (KCV) is not yet implemented, so $k$-Means is fitted on the validation data as well. This is probably not a serious problem, because the clustering is unsupervised and never sees the labels, so for now only the test set is held out here (not the validation set).

- produce k folds of indices in train-test-split.ipynb and save to file
- use these indices to split off the validation set each loop

`for fold, indices in enumerate(k_folds)`:
- fit the model on the training folds, then predict cluster labels for the held-out fold
- save predictions to folder

`when done:`
concatenate the per-fold predictions to produce a full set of cluster labels in which every row was labelled by a $k$-Means model that did not see it during training

another option: learn the best $k$-Means parameters ($k$) by doing supervised learning. Then replace PM columns entirely with either the learned label or the clustering.


In [2]:
# Training split only: the test set is held out of the clustering.
data = pd.read_parquet('data/split/train.parquet')

# The embedding features are the columns whose name starts with 'PM'.
pm_mask = data.columns.str.startswith('PM')
cols = data.columns[pm_mask]

# Fitted estimators are persisted here. The directory is called 'knn' even
# though the models dumped into it later in this notebook are KMeans objects.
models_path = Path('models/knn')
models_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Cluster counts to sweep over. Only k=4 is active; the wider candidate
# grid noted for later is [4, 6, 8, 10].
clust_grid = [4]

# Number of random restarts per k, and the iteration cap for each restart.
n_init, max_iter = 10, 100

In [4]:
for clust_count in clust_grid:
#     subdir = Path(f'n{clust_count}') # kcv
    start_time = time.time()
    
    embeddings = data.loc(axis=1)[cols]
    clust_name = f'{clust_count}_{n_init}x{max_iter}'

    km = KMeans(n_clusters=clust_count, n_init=n_init, max_iter=max_iter, random_state=1, verbose=1)

    km.fit(embeddings)
    clustering = km.predict(embeddings)
    
    clust_df = pd.DataFrame(clustering, index=data.index, columns=[f'knn_{clust_count}'])
    clust_df.to_parquet(f'data\\features\\PubMed_{clust_name}.parquet')
    
    dump(km, models_path / f'km_n{clust_count}.joblib') 
    
    print(time.time() - start_time)
    
    %notify -m f"Completed testing for {clust_count}. Time remaining to test all k: {rem_time}"
    
%notify -m f"finished training kNN for k= {', '.join([str(c) for c in clust_grid])}"

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 19460256.546391312
start iteration
done sorting
end inner loop
Iteration 1, inertia 18799444.139010645
start iteration
done sorting
end inner loop
Iteration 2, inertia 18381468.22819186
start iteration
done sorting
end inner loop
Iteration 3, inertia 18291739.985818926
start iteration
done sorting
end inner loop
Iteration 4, inertia 18272311.481779687
start iteration
done sorting
end inner loop
Iteration 5, inertia 18261454.21530041
start iteration
done sorting
end inner loop
Iteration 6, inertia 18229028.696881436
start iteration
done sorting
end inner loop
Iteration 7, inertia 18199741.232194297
start iteration
done sorting
end inner loop
Iteration 8, inertia 18184032.736645877
start iteration
done sorting
end inner loop
Iteration 9, inertia 18178195.96734036
start iteration
done sorting
end inner loop
Iteration 10, inertia 18173329.15307683
start iteration
done sorting
end inner loop
Iteration 1

end inner loop
Iteration 23, inertia 18530433.97309566
start iteration
done sorting
end inner loop
Iteration 24, inertia 18529997.91348289
start iteration
done sorting
end inner loop
Iteration 25, inertia 18529257.9368202
start iteration
done sorting
end inner loop
Iteration 26, inertia 18528345.36272183
start iteration
done sorting
end inner loop
Iteration 27, inertia 18527399.83320911
start iteration
done sorting
end inner loop
Iteration 28, inertia 18526919.322962508
start iteration
done sorting
end inner loop
Iteration 29, inertia 18526842.970174864
start iteration
done sorting
end inner loop
Iteration 30, inertia 18526834.18425539
start iteration
done sorting
end inner loop
Iteration 31, inertia 18526831.01230553
start iteration
done sorting
end inner loop
Iteration 32, inertia 18526826.193889134
start iteration
done sorting
end inner loop
Iteration 33, inertia 18526819.07758485
start iteration
done sorting
end inner loop
Iteration 34, inertia 18526814.52158819
start iteration
don

Iteration 28, inertia 18031872.576662764
start iteration
done sorting
end inner loop
Iteration 29, inertia 18031856.82960462
start iteration
done sorting
end inner loop
Iteration 30, inertia 18031840.816061437
start iteration
done sorting
end inner loop
Iteration 31, inertia 18031834.10073285
start iteration
done sorting
end inner loop
Iteration 32, inertia 18031833.15033743
center shift 4.104527e-06 within tolerance 8.270496e-06
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 19724921.18910381
start iteration
done sorting
end inner loop
Iteration 1, inertia 19083924.04076101
start iteration
done sorting
end inner loop
Iteration 2, inertia 18634727.45847438
start iteration
done sorting
end inner loop
Iteration 3, inertia 18465936.511947703
start iteration
done sorting
end inner loop
Iteration 4, inertia 18392430.302121144
start iteration
done sorting
end inner loop
Iteration 5, inertia 18362403.79273276
start iteration
done sorting
end inner loo

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The cluster labels can also be generated from a saved model that is loaded back from disk. Implementing this later would be useful for KCV.

In [5]:
# clustering = km.predict(embeddings)
# clust_df.to_csv(DATA / 'features' / f'PubMed_{clust_name}.parquet')


# Data Visualization

In [6]:
# [...]