# Подключение Google Диска

In [1]:
from google.colab import drive

# Mount Google Drive under /content/gdrive; force_remount=True asks Colab to
# mount again even when a mount is already present.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
%cd gdrive/MyDrive/3_year/WSI

/content/gdrive/MyDrive/3_year/WSI


# Установки и импорты

In [3]:
import sys
from pathlib import Path
# Make the lexsubgen checkout on Drive importable. The membership check keeps
# sys.path free of duplicate entries when this cell is executed more than once.
lexsubgen_dir = '/content/gdrive/MyDrive/3_year/WSI/lexsubgen/'
if lexsubgen_dir not in sys.path:
    sys.path.append(lexsubgen_dir)

In [4]:
# Display the module search path to check that the lexsubgen directory is on it.
sys.path

['/content',
 '/env/python',
 '/usr/lib/python310.zip',
 '/usr/lib/python3.10',
 '/usr/lib/python3.10/lib-dynload',
 '',
 '/usr/local/lib/python3.10/dist-packages',
 '/usr/lib/python3/dist-packages',
 '/usr/local/lib/python3.10/dist-packages/IPython/extensions',
 '/root/.ipython',
 '/content/gdrive/MyDrive/3_year/WSI/lexsubgen/']

In [5]:
import importlib
import numpy as np
import pandas as pd
from itertools import product, combinations

# Кластеризация

In [52]:
# Folders that hold the substitutes, the clusterization output and the computed metrics.
# NOTE: `cluter_results_folder` (sic) is referenced by later cells, so the spelling is kept.

clust_folder = Path('./semeval-2010/clusterization').resolve()
subst_folder = Path('./semeval-2010/substitutes').resolve()
cluter_results_folder = Path('./semeval-2010/clusterization_results').resolve()
clust_folder, subst_folder, cluter_results_folder

(PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/clusterization'),
 PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes'),
 PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/clusterization_results'))

In [53]:
# Paths to the substitute files, grouped by file-name suffix.
# NOTE(review): glob() does not promise a particular order, yet later cells index
# these lists by position (element 0 is treated as the English file) — confirm.

pos_excl_context_lang_substs = [f.absolute() for f in Path(subst_folder).glob('*pos_excl.json')]
dummy_context_lang_substs = [f.absolute() for f in Path(subst_folder).glob('*dummy.json')]
pos_excl_context_lang_substs, dummy_context_lang_substs

([PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/en_substitutes_pos_excl.json'),
  PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/fr_substitutes_pos_excl.json'),
  PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/es_substitutes_pos_excl.json'),
  PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/de_substitutes_pos_excl.json'),
  PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/ru_substitutes_pos_excl.json')],
 [PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/en_substitutes_dummy.json'),
  PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/fr_substitutes_dummy.json'),
  PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/es_substitutes_dummy.json'),
  PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/de_substitutes_dummy.json'),
  PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/subst

In [54]:
# Template for the name of a file with clusterization results.

fname_tmp = '{langs}__{n_subst}__{vectorizer}__{subst_file_suffix}.json'

In [55]:
from lexsubgen.clusterizer import SubstituteClusterizer
from lexsubgen.evaluation import WSIEvaluator

## Подбор лучших параметров на английском

In [56]:
# Every on/off combination of the two vectorizer switches, together with the
# short label that identifies each combination in result file names.

vectorizer_params = [
    {'weighted_tfidf': weighted, 'use_idf': idf}
    for weighted, idf in product((False, True), repeat=2)
]
vectorizer_params2name = dict(zip(
    product((False, True), repeat=2),
    ('tf', 'tf-idf', 'tf-weighted', 'tf-idf-weighted'),
))

In [44]:
from typing import Iterable, Dict, Union
import os

In [25]:
def file_langs(paths):
    """Join the language prefixes of the given files with dashes.

    The prefix is the part of each file's stem that precedes the first
    underscore, e.g. ``en_substitutes_dummy.json`` contributes ``en``.
    """
    prefixes = (path.stem.partition('_')[0] for path in paths)
    return '-'.join(prefixes)

In [57]:
def run_experiments(clust_folder: Union[str, os.PathLike],
                    filepaths: Iterable[Iterable[os.PathLike]], 
                    n_subst: Iterable[int] = [5], 
                    vectorizer_params: Iterable[Dict[str, bool]] = [{}]):
    """Cluster substitutes for every combination of inputs and save each result.

    One clusterization is run per element of the cartesian product
    ``filepaths x vectorizer_params x n_subst``. Each result is saved into
    ``clust_folder`` under a name built from the module-level ``fname_tmp``
    template.

    Args:
        clust_folder: Folder handed to the clusterizer's save routine.
        filepaths: Groups of substitute files; each group is clustered together.
            The file-name suffix is taken from the first file of the group.
        n_subst: Values passed as ``n_subst`` to ``cluster_all``.
        vectorizer_params: Keyword-argument dicts for ``SubstituteClusterizer``.
            A non-empty dict must contain ``'weighted_tfidf'`` and ``'use_idf'``;
            an empty dict (clusterizer defaults) is labelled ``'default'``.

    Returns:
        Dict mapping each saved file name to the mean of the first element of
        every value in the clusterization result (used here as the silhouette
        score; presumably one entry per target word — confirm against
        ``SubstituteClusterizer.cluster_all``).

    Raises:
        KeyError: If a non-empty params dict lacks one of the two flags, or the
            flag combination is missing from ``vectorizer_params2name``.
    """
    sil_scores = {}
    for paths, vec_params, n in product(filepaths, vectorizer_params, n_subst):
        # Materialise the group so it can be indexed even if it is a generator.
        paths = list(paths)

        # Resolve the label BEFORE the expensive clustering so a bad
        # configuration fails immediately. The flags are read by name, so the
        # label no longer depends on the insertion order of the dict.
        if vec_params:
            name_key = (vec_params['weighted_tfidf'], vec_params['use_idf'])
            vectorizer_name = vectorizer_params2name[name_key]
        else:
            vectorizer_name = 'default'

        clust = SubstituteClusterizer(**vec_params)
        clust_res = clust.cluster_all(paths, n_subst=n)
        params = {
            # Everything after "<lang>_substitutes_" in the first file's stem.
            'subst_file_suffix': '_'.join(paths[0].stem.split('_')[2:]),
            'langs': file_langs(paths),
            'vectorizer': vectorizer_name,
            'n_subst': n
        }
        filename = fname_tmp.format(**params)
        clust._save_clusterization(clust_res, params, clust_folder, filename)
        sil_scores[filename] = np.mean([v[0] for v in clust_res.values()])
    return sil_scores

In [58]:
# Pick the English files by their language prefix rather than by list position,
# so the selection does not depend on the order in which glob() listed the files.
eng_paths = [
    [path for path in dummy_context_lang_substs if path.stem.split('_')[0] == 'en'],
    [path for path in pos_excl_context_lang_substs if path.stem.split('_')[0] == 'en'],
]
eng_paths

[[PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/en_substitutes_dummy.json')],
 [PosixPath('/content/gdrive/MyDrive/3_year/WSI/semeval-2010/substitutes/en_substitutes_pos_excl.json')]]

In [59]:
# Experiment 1: English only, 5 substitutes, each group in eng_paths combined
# with every entry of vectorizer_params.
exp1_sil_scores = run_experiments(clust_folder=clust_folder, 
                                  filepaths=eng_paths, 
                                  n_subst=[5], 
                                  vectorizer_params=vectorizer_params)
exp1_sil_scores

100%|██████████| 100/100 [00:10<00:00,  9.17it/s]
100%|██████████| 100/100 [00:11<00:00,  8.74it/s]
100%|██████████| 100/100 [00:10<00:00,  9.99it/s]
100%|██████████| 100/100 [00:10<00:00,  9.52it/s]
100%|██████████| 100/100 [00:11<00:00,  8.66it/s]
100%|██████████| 100/100 [00:10<00:00,  9.47it/s]
100%|██████████| 100/100 [00:09<00:00, 10.18it/s]
100%|██████████| 100/100 [00:11<00:00,  8.54it/s]


{'en__5__tf__dummy.json': 0.3075525767343736,
 'en__5__tf-idf__dummy.json': 0.21147292223771483,
 'en__5__tf-weighted__dummy.json': 0.3548858961707397,
 'en__5__tf-idf-weighted__dummy.json': 0.26387304538285883,
 'en__5__tf__pos_excl.json': 0.25470104565512974,
 'en__5__tf-idf__pos_excl.json': 0.19289360206223932,
 'en__5__tf-weighted__pos_excl.json': 0.3042249299697472,
 'en__5__tf-idf-weighted__pos_excl.json': 0.24259146203632284}

### Подсчет метрик

In [60]:
# The evaluator needs the dataset, so load it first.

dataset_path = Path('./semeval-2010/dataset_wsi_2010.csv').resolve()
dataset_2010 = pd.read_csv(dataset_path)
dataset_2010.head()

Unnamed: 0,context_id,group_by,target_lemma,pos_tag,sentence,target_id
0,threat.n.1,threat.n,threat,n,"['However', ',', 'history', 'has', 'proven', '...",58
1,threat.n.2,threat.n,threat,n,"['This', 'report', 'predicted', 'that', 'by', ...",75
2,threat.n.3,threat.n,threat,n,"['A', 'department', 'spokesman', 'said', 'the'...",42
3,threat.n.4,threat.n,threat,n,"['``', 'But', 'the', 'question', 'is', 'whethe...",27
4,threat.n.5,threat.n,threat,n,"['According', 'to', 'the', 'transcript', ',', ...",38


In [62]:
def compute_exp_metrics(clust_files: Iterable[str],
                        clust_folder: Union[str, os.PathLike],
                        clust_results_folder: Union[str, os.PathLike],
                        dataset: pd.DataFrame, 
                        semeval_data_path='/content/gdrive/MyDrive/3_year/WSI/semeval-2010'):
    """Evaluate saved clusterizations and collect their mean metrics.

    For every file name a ``WSIEvaluator`` is built and ``compute_metrics()``
    is called. The full metrics table is written to ``clust_results_folder``
    as ``<file stem>.csv``, and its ``'mean'`` row is kept for the summary.

    Args:
        clust_files: Names of clusterization files located in ``clust_folder``.
            Passing a dict iterates over its keys.
        clust_folder: Folder that contains the clusterization files.
        clust_results_folder: Folder for the per-file metric tables; it is
            created if it does not exist yet.
        dataset: Dataset forwarded to ``WSIEvaluator``.
        semeval_data_path: Forwarded to ``WSIEvaluator`` unchanged.

    Returns:
        DataFrame with one row per file (indexed by file name) holding the
        ``'mean'`` row of that file's metrics.
    """
    # Create the output folder up front, so a missing directory cannot make
    # to_csv fail after the metrics have already been computed.
    results_dir = Path(clust_results_folder)
    results_dir.mkdir(parents=True, exist_ok=True)

    exp_names = []
    exp_results = []
    for filename in clust_files:
        # Named `evaluator`, not `eval`, to avoid shadowing the builtin.
        evaluator = WSIEvaluator(dataset=dataset,
                                 clust_res_path=Path(clust_folder, filename),
                                 semeval_data_path=semeval_data_path)
        metrics = evaluator.compute_metrics()
        exp_names.append(filename)
        exp_results.append(metrics.loc['mean'])
        metrics.to_csv(results_dir / Path(filename).with_suffix('.csv'))
    return pd.DataFrame(exp_results, index=exp_names)

In [63]:
# Iterating the score dict yields its keys, i.e. the saved clusterization file names.
exp1_metrics = compute_exp_metrics(
    clust_files=exp1_sil_scores,
    clust_folder=clust_folder,
    clust_results_folder=cluter_results_folder,
    dataset=dataset_2010)
exp1_metrics

Unnamed: 0,fscore,precision,recall,vmeasure,homogenity,completeness,(fs * vm) ** 0.5
en__5__tf__dummy.json,51.816,71.888,47.93,12.892,21.405,14.484,20.719195
en__5__tf-idf__dummy.json,40.87,47.004,49.575,18.343,20.059,25.958,23.180972
en__5__tf-weighted__dummy.json,48.491,64.675,49.121,14.836,22.006,18.861,22.241313
en__5__tf-idf-weighted__dummy.json,43.593,51.53,49.506,18.537,21.766,24.116,24.388285
en__5__tf__pos_excl.json,47.96,63.83,48.077,13.822,19.432,17.353,20.904363
en__5__tf-idf__pos_excl.json,38.068,41.439,50.491,19.311,19.07,27.72,23.72491
en__5__tf-weighted__pos_excl.json,44.489,54.053,48.656,15.967,19.083,19.936,22.71989
en__5__tf-idf-weighted__pos_excl.json,38.188,39.925,50.081,19.634,18.676,28.423,23.857848


In [67]:
# Persist the summary table of the English-only experiment.
exp1_metrics.to_csv(Path(cluter_results_folder, 'exp_en_only_context_and_vectorizer.csv'))

Для дальнейших экспериментов я выберу подстановки, полученные по контексту, собранному dummy-способом. Для векторизации контекстов буду использовать tf-idf-weighted и обычный tf-idf.

# Эксперименты

## Разное количество подстановок
Буду брать все языки и пробовать разное количество подстановок.

In [64]:
# Substitute counts to try, and the two idf-based vectorizer settings used from here on.
ns = [5, 10, 15, 20, 25]
vectorizer_params = [
    {'weighted_tfidf': False, 'use_idf': True},
    {'weighted_tfidf': True, 'use_idf': True},
]
# Same label mapping as defined earlier in the notebook, repeated here.
vectorizer_params2name = {
    (False, False): 'tf',
    (False, True): 'tf-idf',
    (True, False): 'tf-weighted', 
    (True, True): 'tf-idf-weighted'
}

In [65]:
# Experiment 2: all dummy-context files clustered as one group, once per
# substitute count in `ns` and per vectorizer setting.
exp2_sil_scores = run_experiments(clust_folder=clust_folder, 
                                  filepaths=[dummy_context_lang_substs], 
                                  n_subst=ns, 
                                  vectorizer_params=vectorizer_params)
exp2_sil_scores

100%|██████████| 100/100 [01:38<00:00,  1.02it/s]
100%|██████████| 100/100 [01:37<00:00,  1.03it/s]
100%|██████████| 100/100 [01:41<00:00,  1.02s/it]
100%|██████████| 100/100 [01:37<00:00,  1.02it/s]
100%|██████████| 100/100 [01:41<00:00,  1.01s/it]
100%|██████████| 100/100 [01:36<00:00,  1.04it/s]
100%|██████████| 100/100 [01:40<00:00,  1.01s/it]
100%|██████████| 100/100 [01:42<00:00,  1.02s/it]
100%|██████████| 100/100 [01:42<00:00,  1.03s/it]
100%|██████████| 100/100 [01:49<00:00,  1.10s/it]


{'en-fr-es-de-ru__5__tf-idf__dummy.json': 0.09518465775960867,
 'en-fr-es-de-ru__10__tf-idf__dummy.json': 0.10976898392996277,
 'en-fr-es-de-ru__15__tf-idf__dummy.json': 0.11978248333464543,
 'en-fr-es-de-ru__20__tf-idf__dummy.json': 0.12675430559436213,
 'en-fr-es-de-ru__25__tf-idf__dummy.json': 0.13474182037488933,
 'en-fr-es-de-ru__5__tf-idf-weighted__dummy.json': 0.11214295212013878,
 'en-fr-es-de-ru__10__tf-idf-weighted__dummy.json': 0.13381838224052447,
 'en-fr-es-de-ru__15__tf-idf-weighted__dummy.json': 0.14958741927392963,
 'en-fr-es-de-ru__20__tf-idf-weighted__dummy.json': 0.1645645892306271,
 'en-fr-es-de-ru__25__tf-idf-weighted__dummy.json': 0.17777642086519083}

In [66]:
# Iterating the score dict yields its keys, i.e. the saved clusterization file names.
exp2_metrics = compute_exp_metrics(
    clust_files=exp2_sil_scores,
    clust_folder=clust_folder,
    clust_results_folder=cluter_results_folder,
    dataset=dataset_2010)
exp2_metrics

Unnamed: 0,fscore,precision,recall,vmeasure,homogenity,completeness,(fs * vm) ** 0.5
en-fr-es-de-ru__5__tf-idf__dummy.json,55.937,82.072,47.875,11.063,22.681,11.355,19.353319
en-fr-es-de-ru__10__tf-idf__dummy.json,54.963,79.546,48.158,12.053,23.061,12.646,19.983363
en-fr-es-de-ru__15__tf-idf__dummy.json,56.432,83.676,47.669,11.089,22.279,11.396,18.921604
en-fr-es-de-ru__20__tf-idf__dummy.json,56.196,83.9,47.008,9.923,21.679,10.091,17.940595
en-fr-es-de-ru__25__tf-idf__dummy.json,56.626,83.477,47.407,10.895,21.997,10.816,19.1049
en-fr-es-de-ru__5__tf-idf-weighted__dummy.json,53.109,75.311,48.048,12.42,20.819,14.444,19.181693
en-fr-es-de-ru__10__tf-idf-weighted__dummy.json,55.687,81.352,48.553,12.486,24.678,12.955,20.649188
en-fr-es-de-ru__15__tf-idf-weighted__dummy.json,56.095,83.121,47.642,10.841,22.92,11.256,19.191007
en-fr-es-de-ru__20__tf-idf-weighted__dummy.json,56.995,86.528,46.739,9.266,22.342,8.893,17.632665
en-fr-es-de-ru__25__tf-idf-weighted__dummy.json,57.478,87.508,46.974,8.843,21.84,8.419,17.19215


In [68]:
# Persist the summary table of the substitute-count experiment.
# NOTE(review): the file name says "n_clusters" although this experiment varies
# the number of substitutes — confirm the name is intended.
exp2_metrics.to_csv(Path(cluter_results_folder, 'exp_all_langs_n_clusters.csv'))

## Разные комбинации языков

В этом эксперименте буду брать по 10 подстановок

In [69]:
# The previous slices ([:4], [:6], [:4]) kept exactly the combinations that
# contain the first file, which is English only if glob() happens to list it
# first and there are exactly five files. Anchor on the English file
# explicitly and combine it with 1, 2 or 3 of the remaining files instead.
english_file = next(path for path in dummy_context_lang_substs
                    if path.stem.split('_')[0] == 'en')
other_files = [path for path in dummy_context_lang_substs if path != english_file]
pairs = [(english_file, *rest) for rest in combinations(other_files, 1)]
triples = [(english_file, *rest) for rest in combinations(other_files, 2)]
quarters = [(english_file, *rest) for rest in combinations(other_files, 3)]

In [73]:
# English alone, then the pairs / triples / quarters built in the previous cell,
# then all files together. English is selected by its file-name prefix, not by
# list position, so the result does not depend on the order glob() returned.
english_only = [path for path in dummy_context_lang_substs
                if path.stem.split('_')[0] == 'en']
paths = [english_only, *pairs, *triples, *quarters, dummy_context_lang_substs]
vectorizer_params = [
    {'weighted_tfidf': False, 'use_idf': True},
    {'weighted_tfidf': True, 'use_idf': True},
]
# Labels for the two settings above, keyed by (weighted_tfidf, use_idf).
vectorizer_params2name = {
    (False, True): 'tf-idf',
    (True, True): 'tf-idf-weighted'
}

In [74]:
# Experiment 3: 10 substitutes, one run per file group in `paths` and per vectorizer setting.
exp3_sil_scores = run_experiments(clust_folder=clust_folder, 
                                  filepaths=paths, 
                                  n_subst=[10], 
                                  vectorizer_params=vectorizer_params)
exp3_sil_scores

100%|██████████| 100/100 [00:11<00:00,  8.62it/s]
100%|██████████| 100/100 [00:09<00:00, 10.25it/s]
100%|██████████| 100/100 [00:25<00:00,  3.91it/s]
100%|██████████| 100/100 [00:23<00:00,  4.27it/s]
100%|██████████| 100/100 [00:18<00:00,  5.31it/s]
100%|██████████| 100/100 [00:18<00:00,  5.45it/s]
100%|██████████| 100/100 [00:24<00:00,  4.13it/s]
100%|██████████| 100/100 [00:24<00:00,  4.05it/s]
100%|██████████| 100/100 [01:06<00:00,  1.51it/s]
100%|██████████| 100/100 [01:06<00:00,  1.51it/s]
100%|██████████| 100/100 [00:30<00:00,  3.28it/s]
100%|██████████| 100/100 [00:31<00:00,  3.22it/s]
100%|██████████| 100/100 [00:35<00:00,  2.79it/s]
100%|██████████| 100/100 [00:39<00:00,  2.50it/s]
100%|██████████| 100/100 [01:17<00:00,  1.29it/s]
100%|██████████| 100/100 [01:18<00:00,  1.28it/s]
100%|██████████| 100/100 [00:31<00:00,  3.13it/s]
100%|██████████| 100/100 [00:34<00:00,  2.92it/s]
100%|██████████| 100/100 [01:12<00:00,  1.38it/s]
100%|██████████| 100/100 [01:10<00:00,  1.41it/s]


{'en__10__tf-idf__dummy.json': 0.2021695374506412,
 'en__10__tf-idf-weighted__dummy.json': 0.2596352716454208,
 'en-fr__10__tf-idf__dummy.json': 0.1393118527847119,
 'en-fr__10__tf-idf-weighted__dummy.json': 0.17966228329028616,
 'en-es__10__tf-idf__dummy.json': 0.11966519187944513,
 'en-es__10__tf-idf-weighted__dummy.json': 0.1512712675902858,
 'en-de__10__tf-idf__dummy.json': 0.13875539270942738,
 'en-de__10__tf-idf-weighted__dummy.json': 0.17721614274944872,
 'en-ru__10__tf-idf__dummy.json': 0.14427973695771829,
 'en-ru__10__tf-idf-weighted__dummy.json': 0.17981588991710779,
 'en-fr-es__10__tf-idf__dummy.json': 0.111259696369165,
 'en-fr-es__10__tf-idf-weighted__dummy.json': 0.13947012821099114,
 'en-fr-de__10__tf-idf__dummy.json': 0.12608739014146933,
 'en-fr-de__10__tf-idf-weighted__dummy.json': 0.15778212637412956,
 'en-fr-ru__10__tf-idf__dummy.json': 0.12744478486705194,
 'en-fr-ru__10__tf-idf-weighted__dummy.json': 0.15932791331057647,
 'en-es-de__10__tf-idf__dummy.json': 0.113

In [75]:
# Iterating the score dict yields its keys, i.e. the saved clusterization file names.
exp3_metrics = compute_exp_metrics(
    clust_files=exp3_sil_scores,
    clust_folder=clust_folder,
    clust_results_folder=cluter_results_folder,
    dataset=dataset_2010)
exp3_metrics

Unnamed: 0,fscore,precision,recall,vmeasure,homogenity,completeness,(fs * vm) ** 0.5
en__10__tf-idf__dummy.json,49.936,67.694,48.817,13.288,20.674,16.284,21.072244
en__10__tf-idf-weighted__dummy.json,51.384,67.099,50.184,15.225,22.876,17.644,23.247277
en-fr__10__tf-idf__dummy.json,52.028,76.214,46.742,11.258,22.001,12.393,19.473266
en-fr__10__tf-idf-weighted__dummy.json,54.018,77.326,48.393,12.601,22.878,13.902,20.612135
en-es__10__tf-idf__dummy.json,50.028,68.833,49.133,13.484,21.467,17.154,20.720586
en-es__10__tf-idf-weighted__dummy.json,52.888,72.577,49.963,13.704,21.748,16.486,20.94783
en-de__10__tf-idf__dummy.json,52.171,75.003,48.222,12.455,21.901,14.794,20.149983
en-de__10__tf-idf-weighted__dummy.json,52.91,76.613,47.698,13.48,25.868,15.389,21.147695
en-ru__10__tf-idf__dummy.json,52.905,74.41,48.597,12.543,21.76,14.421,20.439043
en-ru__10__tf-idf-weighted__dummy.json,53.666,77.183,48.441,13.242,25.524,14.54,21.713252


In [76]:
# Persist the summary table of the language-combination experiment.
exp3_metrics.to_csv(Path(cluter_results_folder, 'exp_lang_combinations.csv'))