In [7]:
from pathlib import Path
from sklearn.cluster import KMeans
import pandas as pd
from joblib import load, dump
import time

%load_ext jupyternotify

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


# Train $k$-Means

Dimensionality reduction technique to preprocess the PubMed embeddings.
  - 1-3 clusterings over PubMed columns (depending on if categories are valuable)
    - k means clustering grid search should be made more extensive at some point
    - params can be chosen based on resulting tree models' feature importance values
    - explore sample weighting for this
    
K-fold cross-validation (KCV) is not implemented yet, so $k$-Means currently sees the validation-set data. This may not be a serious problem, since the model is unsupervised and never sees the labels, so for now only the test set is held out here (not the validation set).

- produce k folds of indices in train-test-split.ipynb and save to file
- use these indices to split off the validation set each loop

`for fold, indices in enumerate(k_folds)`:
- train the model on the fold's training rows (X), then generate cluster predictions for its held-out validation rows (y)
- save predictions to folder

`when done:`
concatenate the per-fold predictions to produce a full set of cluster labels, each assigned by a $k$-Means model that did not see that row during training

another option: learn the best $k$-Means parameters ($k$) by doing supervised learning. Then replace PM columns entirely with either the learned label or the clustering.


In [2]:
# Where the fitted clustering models are stored; create the folder up front.
# (The folder is named "knn" although the models saved there are k-Means.)
models_path = Path('models/knn')
models_path.mkdir(parents=True, exist_ok=True)

# Only the training split is loaded; the test split is held out of clustering.
data = pd.read_parquet('data/split/train.parquet')

# Columns whose name starts with 'PM' hold the PubMed embeddings.
pubmed_mask = data.columns.str.startswith('PM')
cols = data.columns[pubmed_mask]

In [13]:
# Grid of cluster counts (k): one k-Means model is fitted per entry.
clust_grid = [4, 6, 8, 10]

# Iteration cap passed to KMeans(max_iter=...).
# TODO: currently a quick-run value; set to 100.
max_iter = 10

# Passed to KMeans(n_init=...).
n_init = 1

In [12]:
for clust_count in clust_grid:
#     subdir = Path(f'n{clust_count}') # kcv
    start_time = time.time()
    
    embeddings = data.loc(axis=1)[cols]
    clust_name = f'{clust_count}_{n_init}x{max_iter}'

    km = KMeans(n_clusters=clust_count, n_init=n_init, max_iter=max_iter, random_state=1, verbose=1)

    km.fit(embeddings)
    clustering = km.predict(embeddings)
    
    clust_df = pd.DataFrame(clustering, index=data.index, columns=[f'knn_{clust_count}'])
    clust_df.to_parquet(f'data\\features\\PubMed_{clust_name}.parquet')
    
    dump(km, models_path / f'km_n{clust_count}.joblib') 
    
    print(time.time() - start_time)
    
    %notify -m f"Completed testing for {clust_count}. Time remaining to test all k: {rem_time}"
    
%notify -m f"finished training kNN for k= {', '.join([str(c) for c in clust_grid])}"

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 19460256.546391312
start iteration
done sorting
end inner loop
Iteration 1, inertia 18799444.139010645
start iteration
done sorting
end inner loop
Iteration 2, inertia 18381468.22819186
start iteration
done sorting
end inner loop
Iteration 3, inertia 18291739.985818926
start iteration
done sorting
end inner loop
Iteration 4, inertia 18272311.481779687
start iteration
done sorting
end inner loop
Iteration 5, inertia 18261454.21530041
start iteration
done sorting
end inner loop
Iteration 6, inertia 18229028.696881436
start iteration
done sorting
end inner loop
Iteration 7, inertia 18199741.232194297
start iteration
done sorting
end inner loop
Iteration 8, inertia 18184032.736645877
start iteration
done sorting
end inner loop
Iteration 9, inertia 18178195.96734036
59.0931670665741


<IPython.core.display.Javascript object>

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 19022390.830042213
start iteration
done sorting
end inner loop
Iteration 1, inertia 18283507.038289074
start iteration
done sorting
end inner loop
Iteration 2, inertia 17991913.703929264
start iteration
done sorting
end inner loop
Iteration 3, inertia 17795172.065348335
start iteration
done sorting
end inner loop
Iteration 4, inertia 17725634.789178934
start iteration
done sorting
end inner loop
Iteration 5, inertia 17695058.695082717
start iteration
done sorting
end inner loop
Iteration 6, inertia 17668855.310889814
start iteration
done sorting
end inner loop
Iteration 7, inertia 17653143.961627904
start iteration
done sorting
end inner loop
Iteration 8, inertia 17639764.484418806
start iteration
done sorting
end inner loop
Iteration 9, inertia 17623155.013436016
62.723193407058716


<IPython.core.display.Javascript object>

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 19172479.383056466
start iteration
done sorting
end inner loop
Iteration 1, inertia 18398197.26501189
start iteration
done sorting
end inner loop
Iteration 2, inertia 17928908.644420758
start iteration
done sorting
end inner loop
Iteration 3, inertia 17605348.515223578
start iteration
done sorting
end inner loop
Iteration 4, inertia 17285546.72618464
start iteration
done sorting
end inner loop
Iteration 5, inertia 17152722.39342602
start iteration
done sorting
end inner loop
Iteration 6, inertia 17101972.8268741
start iteration
done sorting
end inner loop
Iteration 7, inertia 17072254.462679654
start iteration
done sorting
end inner loop
Iteration 8, inertia 17056020.57304715
start iteration
done sorting
end inner loop
Iteration 9, inertia 17048232.846712124
67.32711911201477


<IPython.core.display.Javascript object>

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 18498501.211123515
start iteration
done sorting
end inner loop
Iteration 1, inertia 17886851.276586056
start iteration
done sorting
end inner loop
Iteration 2, inertia 17538090.709796946
start iteration
done sorting
end inner loop
Iteration 3, inertia 17276745.079629637
start iteration
done sorting
end inner loop
Iteration 4, inertia 17065854.785879448
start iteration
done sorting
end inner loop
Iteration 5, inertia 16872757.774030287
start iteration
done sorting
end inner loop
Iteration 6, inertia 16828920.99285817
start iteration
done sorting
end inner loop
Iteration 7, inertia 16801358.725725416
start iteration
done sorting
end inner loop
Iteration 8, inertia 16791291.256766297
start iteration
done sorting
end inner loop
Iteration 9, inertia 16788851.684604745
69.86560487747192


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Cluster labels can also be generated from a saved model loaded from disk; implementing this later would be useful for KCV.

In [None]:
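# TODO (KCV): regenerate labels from a saved model instead of refitting, e.g.
# km = load(models_path / f'km_n{clust_count}.joblib')
# clustering = km.predict(embeddings)
# clust_df.to_parquet(Path('data') / 'features' / f'PubMed_{clust_name}.parquet')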


# Data Visualization

In [None]:
# [...]