In [None]:
import os
import random
import json
import pandas as pd
import pickle
from src.run_postprocessing_predictions import calculate_true_positives
from src.postprocessing import remove_seedword, lemmatize

def upperbound_baseline(seed_words, gold_clusters, 
                        save_results_path,
                        test_indexes_path=None,
                        k=50,
                        seed=42):
    """Build "upper bound" predictions by sampling from the gold clusters.

    For each seed word, up to ``k`` items are drawn at random (without
    replacement) from the corresponding entry of
    ``remove_seedword(seed_words, gold_clusters)``. Entries with fewer than
    ``k`` items are padded with the placeholder string ``"NO"`` so that every
    prediction list has exactly ``k`` elements.

    The predictions are pickled to
    ``<save_results_path>/final_predictions.pkl`` and then handed to
    ``calculate_true_positives`` together with the gold clusters.

    Args:
        seed_words: Sequence of seed-word strings, one per gold cluster.
        gold_clusters: Sequence of gold clusters, same length as
            ``seed_words``.
        save_results_path: Output directory; created (including missing
            parent directories) if it does not exist.
        test_indexes_path: Optional path to a JSON file. Its content is used
            with ``DataFrame.loc`` to select rows. The selection is skipped
            when the number of loaded indexes equals the number of gold
            clusters.
        k: Number of predictions produced per seed word.
        seed: Seed passed to ``random.seed`` before sampling.

    Returns:
        A list with one list of ``k`` predictions per seed word.

    Raises:
        AssertionError: If ``seed_words`` and ``gold_clusters`` differ in
            length.
    """
    assert len(seed_words) == len(gold_clusters)
    random.seed(seed)

    if test_indexes_path is not None:
        df = pd.DataFrame()
        df['gold_clusters'] = gold_clusters
        df['seed_words'] = seed_words

        # Read the indexes first so the file is closed before any slicing.
        with open(test_indexes_path, 'r') as f:
            test_indexes = json.load(f)

        # Only subset when the lengths differ; equal lengths leave the data
        # untouched.
        if len(gold_clusters) != len(test_indexes):
            df = df.loc[test_indexes].copy().reset_index(drop=True)

        seed_words = df['seed_words']
        gold_clusters = df['gold_clusters']

    seed_words = [w.lower() for w in seed_words]
    # NOTE: lemmatization of the seed words is currently disabled.
    # seed_words = lemmatize(seed_words)
    gold_clusters_preds = remove_seedword(seed_words, gold_clusters)

    predictions = []
    for gd in gold_clusters_preds:
        # Draw as many gold items as are available, capped at k ...
        preds = random.sample(gd, min(len(gd), k))
        # ... and pad short clusters so each list has exactly k entries.
        preds.extend(["NO"] * (k - len(preds)))
        predictions.append(preds)

    print('Saving final predictions...', len(predictions))

    # makedirs handles nested output paths and an already-existing directory.
    os.makedirs(save_results_path, exist_ok=True)

    with open(os.path.join(save_results_path, 'final_predictions.pkl'), 'wb') as f:
        pickle.dump(predictions, f)

    # The original passed an undefined name (`save_dir_path`) here, which
    # raised NameError on every call.
    calculate_true_positives(predictions,
                             gold_clusters,
                             save_results_path,
                             k=k)

    return predictions



## Verbs

In [None]:
# Verbs: load the gold dataset and run the upper-bound baseline on it.
gold_df = pd.read_pickle("workdir/data/swv_gold_dataset.pkl")

test_indexes_path = 'workdir/data/swv_gold_dataset_test_split.json'
save_results_path = "workdir/upperbound_results/paper_verbs_st2"

final_preds = upperbound_baseline(
    seed_words=gold_df['luName'],
    gold_clusters=gold_df['gold_cluster_processed'],
    save_results_path=save_results_path,
    test_indexes_path=test_indexes_path,
    k=50,
)



## Nouns

In [None]:
# Nouns: load the gold dataset and run the upper-bound baseline on it.
gold_df = pd.read_pickle("workdir/data/swn_gold_dataset.pkl")

test_indexes_path = 'workdir/data/swn_gold_dataset_test_split.json'
save_results_path = "workdir/upperbound_results/paper_nouns_st"

final_preds = upperbound_baseline(
    seed_words=gold_df['luName'],
    gold_clusters=gold_df['gold_cluster_processed'],
    save_results_path=save_results_path,
    test_indexes_path=test_indexes_path,
    k=50,
)



## Roles

In [None]:
# Roles: load the gold dataset and run the upper-bound baseline on it.
# Unlike the other cells, this one reads the `feText` and
# `gold_cluster_patternlemmatized` columns.
save_results_path = "workdir/upperbound_results/paper_roles_st"
test_indexes_path = 'workdir/data/swr_gold_dataset_test_split.json'
gold_df = pd.read_pickle("workdir/data/swr_gold_dataset.pkl")

final_preds = upperbound_baseline(
    seed_words=gold_df['feText'],
    gold_clusters=gold_df['gold_cluster_patternlemmatized'],
    save_results_path=save_results_path,
    test_indexes_path=test_indexes_path,
    k=50,
)


## Evaluate

In [None]:
# Run the `src.run_evaluate` module through the shell, pointing it at the
# parent directory of the per-dataset result folders used in the cells above.
!python -m src.run_evaluate --results_path=workdir/upperbound_results
