In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import reload_recursive

import common
import preprocess_common
import preprocess_lda
import train_common
import train_lda
import user_profile
import preprocess_bert
import train_bert

%reload common
%reload preprocess_common
%reload preprocess_lda
%reload train_common
%reload train_lda
%reload user_profile
%reload preprocess_bert
%reload train_bert

import common
import preprocess_common
import preprocess_lda
import train_common
import train_lda
import user_profile
import preprocess_bert
import train_bert

import logging
from pprint import pprint
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

# Logging: only warnings and above are shown. force=True makes basicConfig
# replace any handlers already attached to the root logger, so re-running
# this cell actually re-applies the format and level.
logging.basicConfig(
    level=logging.WARN,
    force=True,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Shared plotting defaults reused by the visualisation code.
figsize = (16, 16)
wc_figsize = (32, 32)
colormap = ListedColormap(["yellow", "limegreen", "turquoise"])
plt.rc('font', size=24)

# Load the three raw tables used throughout the notebook.
publications, users, authors_raw = common.load_raw_datasets()
# Split authorship rows by their 'state' value.
# NOTE(review): 'validatAcceptat' / 'validatRefuzat' presumably mean
# "validated, accepted" / "validated, refused" -- confirm against the data source.
authors = authors_raw[authors_raw['state'] == 'validatAcceptat']
authors_negative = authors_raw[authors_raw['state'] == 'validatRefuzat']

# Cache switches: when True, the corresponding preprocessed dataframe is loaded
# via common.load_dataframe instead of being recomputed.
common_dataset_cached = True
lda_dataset_cached = True  # not read in this part of the file
bert_dataset_cached = True

# Visualisation switches (not read in this part of the file).
lda_visualize_results = True
lda_visualize_test_results = True
bert_visualize_test_results = True
bert_visualize_results = True

# Model cache switches: bert_model_cached=False makes the BERT section below
# retrain and re-save instead of loading saved artefacts.
lda_model_cached = True  # not read in this part of the file
bert_model_cached = False


# Split configuration shared by the train / cross-validation / test splits.
# NOTE(review): cv_size and test_size are presumably fractions of the dataset -- confirm.
split_cfg = train_common.TrainConfig(cv_size=0.2, test_size=0.2)

# Model-independent preprocessing: reuse the stored "publications_en"
# dataframe when the cache flag is set, otherwise rebuild it from the raw
# publications with a freshly initialised NLP pipeline.
if common_dataset_cached:
    publications_en = common.load_dataframe("publications_en")
else:
    nlp = preprocess_common.init_nlp()
    publications_en = preprocess_common.preprocess_publications_common(publications, nlp)

# BERT-specific preprocessing, with the same cache-or-recompute pattern.
if bert_dataset_cached:
    publications_bert = common.load_dataframe("publications_bert")
else:
    publications_bert = preprocess_bert.preprocess_bert(publications_en)

# Split the BERT publications into train / cross-validation / test parts
# according to split_cfg.
(
    publications_bert_train,
    publications_bert_cv,
    publications_bert_test,
) = train_common.split_train_cv_test_simple(publications_bert, split_cfg)

# The same three publication splits drive both author splits below.
_bert_publication_splits = (
    publications_bert_train,
    publications_bert_cv,
    publications_bert_test,
)

# Authorship rows from `authors`, partitioned by publication split.
(
    authors_bert_train,
    authors_bert_cv,
    authors_bert_test,
) = train_common.split_authors_by_publications(*_bert_publication_splits, authors)

# Authorship rows from `authors_negative`, partitioned the same way.
(
    authors_negative_bert_train,
    authors_negative_bert_cv,
    authors_negative_bert_test,
) = train_common.split_authors_by_publications(*_bert_publication_splits, authors_negative)

import time

# Wall-clock timestamp, kept under its original name in case other cells read it.
start = time.time()
# Monotonic reference for the elapsed-time printouts below: unlike time.time(),
# time.perf_counter() cannot jump when the system clock is adjusted mid-run.
timer_start = time.perf_counter()

if not bert_model_cached:
    # --- Train from scratch -------------------------------------------------
    bert_conf = train_bert.BERTConfig()
    bert_conf.embedding_model = "all-MiniLM-L6-v2"
    bert_conf.device = "cuda"  # NOTE(review): hard-coded; presumably requires a GPU -- confirm
    bert_conf.n_components = 12
    bert_conf.batch_size = 32
    bert_conf.fpr_samples_from = 0
    bert_conf.fpr_samples_to = 1
    bert_conf.fpr_samples_count = 1000
    bert_conf.normalize_features = False
    bert_conf.metric = "cos"
    # NOTE(review): the umap_* settings are presumably unused while
    # reducer == "pca" -- confirm in train_bert.
    bert_conf.umap_metric = "cosine"
    bert_conf.reducer = "pca"
    bert_conf.use_scaler = False
    bert_conf.umap_min_dist = 0
    bert_conf.umap_n_neighbors = 10
    bert_conf.clustering_algorithm = "kmeans"
    bert_conf.n_clusters = 10
    bert_conf.random_state = 42
    bert_conf.score = "phi"

    # The call rebinds the train / cv frames and the cv author frames to the
    # versions it returns.
    (bert, publications_bert_train, publications_bert_cv, authors_bert_cv,
     authors_negative_bert_cv, users_features_bert, performance_report) = \
        train_bert.train_and_evaluate_bert(
            publications_bert_train, publications_bert_cv, authors_bert_train, authors_bert_cv,
            authors_negative_bert_cv, users, bert_conf, save_model=True, plot=True,
            random_negative_examples=False, recalculate_embeddings=False, progress=True,
            figsize=figsize)
    print(performance_report)
    # Seconds elapsed up to the end of training + cross-validation.
    print(time.perf_counter() - timer_start)

    publications_bert_test = train_bert.eval_bert(bert, publications_bert_test,
                                                  recalculate_embeddings=False, progress=True)
    # Seconds elapsed including the test-set evaluation.
    print(time.perf_counter() - timer_start)

    # Persist everything the cached branch below expects to find.
    common.save_dataframe(publications_bert_train, "publications_bert_train2")
    common.save_dataframe(publications_bert_cv, "publications_bert_cv2")
    common.save_dataframe(publications_bert_test, "publications_bert_test2")

    common.save_dataframe(authors_bert_cv, "authors_bert_cv")
    common.save_dataframe(authors_negative_bert_cv, "authors_negative_bert_cv")
    common.save_dataframe(users_features_bert, "users_features_bert")
else:
    # --- Reuse saved artefacts ----------------------------------------------
    # These loads replace the freshly computed publication splits and the cv
    # author frames with the versions saved by an earlier training run.
    publications_bert_train = common.load_dataframe("publications_bert_train2")
    publications_bert_cv = common.load_dataframe("publications_bert_cv2")
    publications_bert_test = common.load_dataframe("publications_bert_test2")

    authors_bert_cv = common.load_dataframe("authors_bert_cv")
    authors_negative_bert_cv = common.load_dataframe("authors_negative_bert_cv")
    users_features_bert = common.load_dataframe("users_features_bert")

    bert = train_bert.load_bert_model()

