In [2]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
tqdm.pandas()
import os

import argparse
import logging
import os
import sys

from torch.utils.tensorboard import SummaryWriter
import pandas as pd
import torch.optim
from accelerate import Accelerator
from sentence_transformers import SentenceTransformer, models
from sentence_transformers import losses
from sentence_transformers.datasets import NoDuplicatesDataLoader
from sentence_transformers.readers import InputExample
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

import os
os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/cluster/scratch/yakram/conda_env/'

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [3]:
from pandarallel import pandarallel
import random, ast
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 36 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
LANGUAGE = 'german'
model_dir_language = 'de'

data_dir = f'/cluster/scratch/yakram/sbert-copy/turtle_files/{LANGUAGE}_data/'

In [5]:
model = SentenceTransformer('symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli')

In [6]:
MemConcept_description = pd.read_csv(os.path.join(data_dir, 'query_MemConcept_description.rq'), sep='\t', index_col=[0])
SkillsHier_description = pd.read_csv(os.path.join(data_dir, 'query_SkillsHier_description.rq'), sep='\t', index_col=[0])

MemConcept_prefLabel = pd.read_csv(os.path.join(data_dir, 'query_MemConcept_prefLabel.rq'), sep='\t', index_col=[0])
SkillsHier_prefLabel = pd.read_csv(os.path.join(data_dir, 'query_SkillsHier_prefLabel.rq'), sep='\t', index_col=[0])

MemConcept_altLabel = pd.read_csv(os.path.join(data_dir, 'query_MemConcept_altLabel.rq'), sep='\t', index_col=[0])
SkillsHier_altLabel = pd.read_csv(os.path.join(data_dir, 'query_SkillsHier_altLabel.rq'), sep='\t', index_col=[0])

MemConcept_hiddenLabel = pd.read_csv(os.path.join(data_dir, 'query_MemConcept_hiddenLabel.rq'), sep='\t', index_col=[0])
SkillsHier_hiddenLabel = pd.read_csv(os.path.join(data_dir, 'query_SkillsHier_hiddenLabel.rq'), sep='\t', index_col=[0])


concept_schemes = pd.read_csv(os.path.join(data_dir, 'query_concept_schemes_combined.rq'), sep='\t', index_col=[0])

In [7]:
MemConcept_description.columns = MemConcept_description.columns
SkillsHier_description.columns = MemConcept_description.columns

MemConcept_altLabel.columns = MemConcept_description.columns
SkillsHier_altLabel.columns = MemConcept_description.columns
MemConcept_hiddenLabel.columns = MemConcept_description.columns
SkillsHier_hiddenLabel.columns = MemConcept_description.columns

In [8]:
MemConcept_SkillsHier_prefLabel = pd.concat([MemConcept_prefLabel, SkillsHier_prefLabel])
MemConcept_SkillsHier_description = pd.concat([MemConcept_altLabel, SkillsHier_altLabel])

pref_desc_data = MemConcept_SkillsHier_prefLabel.merge(MemConcept_SkillsHier_description, on=['?a'], how='left')

skill_pref_desc = concept_schemes.merge(pref_desc_data, on=['?a'], how='left')
skill_pref_desc.dropna(inplace=True)

skill_pref_desc.shape

(5159, 3)

In [9]:
skill_pref_desc.head()

Unnamed: 0,?a,?prefLabel,?desc
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,rohe Lebensmittel lagern
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,rohe Lebensmittel aufbewahren
2,http://data.europa.eu/esco/skill/1a61a520-f4a5...,geschriebenes Irisch verstehen,Irisch lesen
16,http://data.europa.eu/esco/skill/1889010b-f796...,Dämmstoff zuschneiden,Isolierstoff zuschneiden
31,http://data.europa.eu/esco/skill/7f1a7abb-9208...,Anrufrouting,Call Routing


### Check statistics of term-desc intersections in matching pairs

In [10]:
from nltk.corpus import stopwords

stop_words = stopwords.words(LANGUAGE)

In [11]:
def check_subset(label, description):
    label = label.lower()
    description = description.lower()
    
    clean_label = re.sub(r'\W+', ' ',label).split() # for german, only \W+
    clean_desc = re.sub(r'\W+', ' ', description).split()
    
    labels_sw_removed = [label_term for label_term in clean_label if label_term not in stop_words]
    desc_sw_removed = [desc_term for desc_term in clean_desc if desc_term not in stop_words]
    
    if len(desc_sw_removed) > 0:
        intersections = len(set(labels_sw_removed).intersection(desc_sw_removed))/len(desc_sw_removed) # remove stopwords, and also set a threshold for intersection before removal
    
        return intersections
    return 0
        

In [12]:
skill_pref_desc.columns = ['?a', 'prefLabel', 'desc']

In [13]:
skill_pref_desc['pref_desc_intesection'] = skill_pref_desc.parallel_apply(lambda x: check_subset(x.prefLabel, x.desc), axis=1)
skill_pref_desc['desc_pref_intesection'] = skill_pref_desc.parallel_apply(lambda x: check_subset(x.desc, x.prefLabel), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=144), Label(value='0 / 144'))), HB…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=144), Label(value='0 / 144'))), HB…

In [14]:
skill_pref_desc.head()

Unnamed: 0,?a,prefLabel,desc,pref_desc_intesection,desc_pref_intesection
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,rohe Lebensmittel lagern,0.333333,0.5
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,rohe Lebensmittel aufbewahren,0.0,0.0
2,http://data.europa.eu/esco/skill/1a61a520-f4a5...,geschriebenes Irisch verstehen,Irisch lesen,0.5,0.333333
16,http://data.europa.eu/esco/skill/1889010b-f796...,Dämmstoff zuschneiden,Isolierstoff zuschneiden,0.5,0.5
31,http://data.europa.eu/esco/skill/7f1a7abb-9208...,Anrufrouting,Call Routing,0.0,0.0


In [15]:
100*len(skill_pref_desc[skill_pref_desc['pref_desc_intesection'] == 0])/skill_pref_desc.shape[0]

28.552044969955418

###### NOTE: follow correct pipeline for german

### ENGLISH: only 10.23% of the label data is not available in the term descriptions, and so we can use nearest neighbours using term description as the anchor and the other descriptions as from the search space


### GERMAN:  for german, the insterseciotn from preflabels into desc is 32.61%, still small, so also apply nn search to min negatives

In [16]:
# skill_pref_desc.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/positive_intersections.csv', sep='\t')

### Check statistics of term desc intersections in negative pairs

In [17]:
negatives_file = pd.read_csv(os.path.join(data_dir, '../', 'negative_100.csv'), sep='\t', index_col=[0])

In [18]:
negatives_file.rename(columns={'main_uri': '?a'}, inplace=True)

negatives_file_prefLabels = MemConcept_SkillsHier_prefLabel.merge(negatives_file, on=['?a'], how='left')
negatives_file_prefLabels.dropna(inplace=True)

negatives_file_prefLabels['negative_uris'] = negatives_file_prefLabels['negative_uris'].apply(lambda x: ast.literal_eval(x))

negatives_with_main_uri = negatives_file_prefLabels.explode('negative_uris')

negatives_with_main_uri.columns = ['main_uri', 'prefLabel', '?a']

In [19]:
skills_desc = skill_pref_desc[['?a', 'desc']]
skills_desc.rename(columns={'desc': 'negative_desc'}, inplace=True)

skills_desc.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skills_desc.rename(columns={'desc': 'negative_desc'}, inplace=True)


Unnamed: 0,?a,negative_desc
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,rohe Lebensmittel lagern
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,rohe Lebensmittel aufbewahren
2,http://data.europa.eu/esco/skill/1a61a520-f4a5...,Irisch lesen
16,http://data.europa.eu/esco/skill/1889010b-f796...,Isolierstoff zuschneiden
31,http://data.europa.eu/esco/skill/7f1a7abb-9208...,Call Routing


In [20]:
negative_file_desc_merged = negatives_with_main_uri.merge(skills_desc, on=['?a'], how='left')

negative_file_desc_merged.dropna(inplace=True)

negative_file_desc_merged.head()

Unnamed: 0,main_uri,prefLabel,?a,negative_desc
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,http://data.europa.eu/esco/skill/815d1585-1706...,Holz bearbeiten
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,http://data.europa.eu/esco/skill/5e433def-210d...,Abfall entsorgen
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,http://data.europa.eu/esco/skill/8b6f2996-218b...,Strabologie
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,http://data.europa.eu/esco/skill/8b6f2996-218b...,Schielheilkunde
8,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,http://data.europa.eu/esco/skill/112bf219-beb2...,Grundlagen der Didaktik vermitteln


In [21]:
negative_file_desc_merged.shape, skills_desc.shape

((513329, 4), (5159, 2))

In [22]:
negative_file_desc_merged['pref_desc_intersection'] = negative_file_desc_merged.parallel_apply(lambda x: check_subset(x.prefLabel, x.negative_desc), axis=1)
negative_file_desc_merged['desc_pref_intersection'] = negative_file_desc_merged.parallel_apply(lambda x: check_subset(x.negative_desc, x.prefLabel), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14260), Label(value='0 / 14260')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14260), Label(value='0 / 14260')))…

In [29]:
negative_file_desc_merged.head(10)

Unnamed: 0,main_uri,prefLabel,?a,negative_desc,pref_desc_intersection,desc_pref_intersection
386,http://data.europa.eu/esco/skill/7e11b04d-c6b5...,Patienten/Patientinnen zur Verbesserung des Sp...,http://data.europa.eu/esco/skill/d8d261bd-da63...,Patienten/Patientin an Ophthalmologen/Ophthalm...,0.2,0.2
1303,http://data.europa.eu/esco/skill/3838a64c-78dc...,Systeme zur internen Kommunikation warten,http://data.europa.eu/esco/skill/2c5fbce2-cd76...,landwirtschaftliches Gerät warten,0.333333,0.25
2056,http://data.europa.eu/esco/skill/83f713fb-2650...,Signalgeräte für Züge bedienen,http://data.europa.eu/esco/skill/770b473a-52db...,Nagelungsmaschinen bedienen,0.5,0.333333
3698,http://data.europa.eu/esco/skill/c349b18e-5038...,Foliendruckmaschine bedienen,http://data.europa.eu/esco/skill/d18408c1-c2ba...,Wiegeanlage bedienen,0.5,0.5
3742,http://data.europa.eu/esco/skill/c349b18e-5038...,Foliendruckmaschine bedienen,http://data.europa.eu/esco/skill/e8c9361b-9fbc...,Kuvertmaschine bedienen,0.5,0.5
4223,http://data.europa.eu/esco/skill/bc54eee1-a858...,Feldarbeit durchführen,http://data.europa.eu/esco/skill/8ab0d202-a908...,Kabelprüfung durchführen,0.5,0.5
6504,http://data.europa.eu/esco/skill/19da5526-0a6f...,Wartungsarbeiten an Gleisen durchführen,http://data.europa.eu/esco/skill/7802ce4b-2b70...,Nahrungsmitteltests durchführen,0.5,0.333333
6566,http://data.europa.eu/esco/skill/19da5526-0a6f...,Wartungsarbeiten an Gleisen durchführen,http://data.europa.eu/esco/skill/ba6d3a97-6737...,Aromatisierung von Wein durchführen,0.333333,0.333333
8926,http://data.europa.eu/esco/skill/0516cfc3-eec7...,Betongießmaschine bedienen,http://data.europa.eu/esco/skill/865d6fbc-c430...,Insektizide-Mischanlagen bedienen,0.333333,0.5
10761,http://data.europa.eu/esco/skill/fb1203d0-7a3e...,Feuerlöscher bedienen,http://data.europa.eu/esco/skill/a0626f30-22e2...,Abkanntmaschine bedienen,0.5,0.5


In [24]:
100*len(negative_file_desc_merged[negative_file_desc_merged['pref_desc_intersection'] == 0])/negative_file_desc_merged.shape[0]

99.51609981123217

In [25]:
negative_file_desc_merged.shape

(513329, 6)

In [26]:
negative_file_desc_merged[negative_file_desc_merged['pref_desc_intersection'] > 0]['?a'].nunique(), negative_file_desc_merged[negative_file_desc_merged['pref_desc_intersection'] > 0]['?a'].shape

(949, (2484,))

In [27]:
negative_file_desc_merged = negative_file_desc_merged[negative_file_desc_merged['pref_desc_intersection'] > 0]

In [28]:
negative_file_desc_merged.shape

(2484, 6)

### merging original and the negative pairs with both their scores

In [29]:
negative_file_desc_merged.columns

Index(['main_uri', 'prefLabel', '?a', 'negative_desc',
       'pref_desc_intersection', 'desc_pref_intersection'],
      dtype='object')

In [30]:
skill_pref_desc.columns

Index(['?a', 'prefLabel', 'desc', 'pref_desc_intesection',
       'desc_pref_intesection'],
      dtype='object')

In [31]:
skill_pref_desc.columns = ['?a', 'prefLabel', 'desc_positive', 'positive_pref_desc_intesection',
       'positive_desc_pref_intesection']

In [32]:
negative_file_filtered = negative_file_desc_merged[['main_uri', 'prefLabel', 'negative_desc', 'pref_desc_intersection',
                                                   'desc_pref_intersection']]

negative_file_filtered.columns = ['?a', 'prefLabel', 'desc_negative', 'negative_pref_desc_intesection',
       'negative_desc_pref_intesection']

In [35]:
negative_file_filtered[negative_file_filtered['?a'] == negative_file_filtered['?a'].iloc[0]]

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection
208,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25
209,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,food intended for animal consumption,0.25,0.25
211,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal products and food safety,0.25,0.25
212,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food product,0.333333,0.25
270,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,rubber materials preparing,0.333333,0.25
271,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,rubber materials organising,0.333333,0.25
272,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,organising rubber materials,0.333333,0.25
273,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,preparing rubber materials,0.333333,0.25
274,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,prepare rubber materials,0.333333,0.25
275,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,organise rubber materials,0.333333,0.25


In [36]:
triplets_with_intersections_full = negative_file_filtered.merge(skill_pref_desc, on=['?a', 'prefLabel'], how='left')

In [37]:
triplets_with_intersections_full.head()

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection,desc_positive,positive_pref_desc_intesection,positive_desc_pref_intesection
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,stored raw food materials,0.75,0.75
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,store raw food materials,1.0,1.0
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,stock raw prime ingredients,0.25,0.25
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,store materials of raw food,1.0,1.0
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,stock raw food materials,0.75,0.75


In [38]:
triplets_with_intersections_full.shape

(852796, 8)

In [39]:
triplets_with_intersections_full.shape

(852796, 8)

In [40]:
triplets_with_intersections_full.dropna(inplace=True)

In [41]:
triplets_with_intersections_full.shape

(847115, 8)

In [43]:
triplets_with_intersections_full.head()

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection,desc_positive,positive_pref_desc_intesection,positive_desc_pref_intesection
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,stored raw food materials,0.75,0.75
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,store raw food materials,1.0,1.0
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,stock raw prime ingredients,0.25,0.25
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,store materials of raw food,1.0,1.0
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,stock raw food materials,0.75,0.75


In [32]:
# triplets_with_intersections_full.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/triplets_with_intersection_only.csv', sep='\t')

### get nearest neighbours for prefLabel and positive descriptions

In [44]:
model.to('cuda')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [45]:
negative_pref_data = negative_file_filtered[['?a', 'prefLabel', 'desc_negative']]

In [46]:
negative_pref_data.shape

(120421, 3)

In [98]:
negative_pref_data_grouped = negative_pref_data.groupby(by=['?a','prefLabel'])['desc_negative'].apply(list).reset_index()

In [99]:
negative_pref_data_grouped.head()

Unnamed: 0,?a,prefLabel,desc_negative
0,http://data.europa.eu/esco/isced-f/00,generic programmes and qualifications,"[support implementation of health programmes, ..."
1,http://data.europa.eu/esco/isced-f/000,generic programmes and qualifications not furt...,"[coordinate programmes of audio systems, coord..."
2,http://data.europa.eu/esco/isced-f/0000,generic programmes and qualifications not furt...,"[expand sports programmes, produce sports prog..."
3,http://data.europa.eu/esco/isced-f/001,basic programmes and qualifications,[prepare training programmes for outreach]
4,http://data.europa.eu/esco/isced-f/0011,basic programmes and qualifications,"[provide instruction on basic numeracy skills,..."


In [100]:
def find_negatives_by_nn(main_uri, df_copy):
    
    df_copy_grouped = df_copy[df_copy['?a'] == main_uri]
    negative_corpus = df_copy_grouped['desc_negative'].iloc[0]
    main_term = df_copy_grouped['prefLabel'].unique().tolist()
    
    negative_embeddings = model.encode(negative_corpus, convert_to_tensor=True)
    query_embedding = model.encode(main_term, convert_to_tensor=True)
    
    hits = util.semantic_search(query_embedding, negative_embeddings, score_function=util.cos_sim, top_k=1000)
    
    return str(hits[0])

In [103]:
# negative_pref_data_grouped['hits'] = negative_pref_data_grouped.progress_apply(lambda x: find_negatives_by_nn(x['?a'], negative_pref_data_grouped), axis=1)

100%|██████████| 10967/10967 [04:27<00:00, 41.07it/s]


In [104]:
negative_pref_data_grouped.head() 

Unnamed: 0,?a,prefLabel,desc_negative,hits
0,http://data.europa.eu/esco/isced-f/00,generic programmes and qualifications,"[support implementation of health programmes, ...","[{'corpus_id': 0, 'score': 0.3955739140510559}..."
1,http://data.europa.eu/esco/isced-f/000,generic programmes and qualifications not furt...,"[coordinate programmes of audio systems, coord...","[{'corpus_id': 0, 'score': 0.22147676348686218..."
2,http://data.europa.eu/esco/isced-f/0000,generic programmes and qualifications not furt...,"[expand sports programmes, produce sports prog...","[{'corpus_id': 5, 'score': 0.4673820436000824}..."
3,http://data.europa.eu/esco/isced-f/001,basic programmes and qualifications,[prepare training programmes for outreach],"[{'corpus_id': 0, 'score': 0.4921908378601074}]"
4,http://data.europa.eu/esco/isced-f/0011,basic programmes and qualifications,"[provide instruction on basic numeracy skills,...","[{'corpus_id': 0, 'score': 0.2726180851459503}..."


In [105]:
negative_pref_data_grouped.shape

(10967, 4)

In [106]:
def append_scores(main_uri, df_large, all_data_combined):
    
    df = df_large[df_large['?a'] == main_uri]
    exploded_df = df.explode('desc_negative')
    
    hits_list = ast.literal_eval(exploded_df['hits'].iloc[0])
    hits_df = pd.DataFrame(hits_list)
    hits_df.sort_values(by='corpus_id', ascending=True, inplace=True)
    
    hits_df.reset_index(drop=True, inplace=True)
    exploded_df.reset_index(drop=True, inplace=True)
    
    new_df = pd.concat([exploded_df, hits_df[['score']]], axis='columns')
    
    all_data_combined.append(new_df)
    
    return all_data_combined

In [107]:
df_list = list()

scores_appended_df = negative_pref_data_grouped.progress_apply(
    lambda x: append_scores(x['?a'],negative_pref_data_grouped, df_list )
, axis=1)

100%|██████████| 10967/10967 [01:00<00:00, 180.62it/s]


In [108]:
len(df_list)

10967

In [109]:
negatives_pref_data_nn_scores = pd.DataFrame()

for df in tqdm(df_list):
    negatives_pref_data_nn_scores = pd.concat([negatives_pref_data_nn_scores, df])

100%|██████████| 10967/10967 [00:18<00:00, 585.59it/s]


In [110]:
negatives_pref_data_nn_scores.drop(columns='hits', inplace=True)

In [113]:
negatives_pref_data_nn_scores.tail(100)

Unnamed: 0,?a,prefLabel,desc_negative,score
0,http://data.europa.eu/esco/skill/ffe198e3-3f51...,operate forestry equipment,repairing refuse collection equipment,0.238179
1,http://data.europa.eu/esco/skill/ffe198e3-3f51...,operate forestry equipment,maintaining refuse collection equipment,0.250609
2,http://data.europa.eu/esco/skill/ffe198e3-3f51...,operate forestry equipment,maintain refuse collection equipment,0.237229
3,http://data.europa.eu/esco/skill/ffe198e3-3f51...,operate forestry equipment,refuse collection equipment maintaining,0.229739
4,http://data.europa.eu/esco/skill/ffe198e3-3f51...,operate forestry equipment,repair refuse collection equipment,0.255841
...,...,...,...,...
12,http://data.europa.eu/esco/skill/fff5bc45-b506...,coordinate construction activities,sport activities,0.372424
13,http://data.europa.eu/esco/skill/fff5bc45-b506...,coordinate construction activities,sporting activities,0.352582
14,http://data.europa.eu/esco/skill/fff5bc45-b506...,coordinate construction activities,perform activities to ensure airside safety,0.289907
15,http://data.europa.eu/esco/skill/fff5bc45-b506...,coordinate construction activities,undertake activities to ensure airside safety,0.292276


In [114]:
negatives_pref_data_nn_scores.iloc[-1]['prefLabel']

'http://data.europa.eu/esco/skill/fff5bc45-b506-4466-8977-4869079c1cb2'

In [116]:
negatives_pref_data_nn_scores.columns= ['?a', 'prefLabel', 'desc_negative', 'negative_desc_nn_score']

In [117]:
negative_file_filtered.shape, negatives_pref_data_nn_scores.shape

((120421, 5), (120421, 4))

In [118]:
negative_pref_desc_with_scores = negative_file_filtered.merge(negatives_pref_data_nn_scores,
                                                              on=['?a', 'prefLabel', 'desc_negative'], how='left')

#### checking number of terms in skills_desc

In [124]:
negative_pref_desc_with_scores

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection,negative_desc_nn_score
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,food intended for animal consumption,0.250000,0.25,0.385450
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal products and food safety,0.250000,0.25,0.465179
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food product,0.333333,0.25,0.410487
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,rubber materials preparing,0.333333,0.25,0.241972
...,...,...,...,...,...,...
120428,http://data.europa.eu/esco/isced-f/099,health and welfare not elsewhere classified,monitor tree health,0.333333,0.25,0.189794
120429,http://data.europa.eu/esco/isced-f/099,health and welfare not elsewhere classified,check health of trees,0.333333,0.25,0.255430
120430,http://data.europa.eu/esco/isced-f/099,health and welfare not elsewhere classified,tree health checking,0.333333,0.25,0.219507
120431,http://data.europa.eu/esco/isced-f/099,health and welfare not elsewhere classified,checking tree health,0.333333,0.25,0.256832


In [125]:
skill_pref_desc.head()

Unnamed: 0,?a,prefLabel,desc_positive,positive_pref_desc_intesection,positive_desc_pref_intesection
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stored raw food materials,0.75,0.75
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,store raw food materials,1.0,1.0
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stock raw prime ingredients,0.25,0.25
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,store materials of raw food,1.0,1.0
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stock raw food materials,0.75,0.75


In [128]:
alt_label_counts = skill_pref_desc.groupby(by=['?a'])['desc_positive'].count().reset_index()

In [129]:
alt_label_counts

Unnamed: 0,?a,desc_positive
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,5
1,http://data.europa.eu/esco/skill/00064735-8fad...,7
2,http://data.europa.eu/esco/skill/000709ed-2be5...,8
3,http://data.europa.eu/esco/skill/0007bdc2-dd15...,10
4,http://data.europa.eu/esco/skill/00090cc1-1f27...,9
...,...,...
13806,http://data.europa.eu/esco/skill/ffef5eb3-a15e...,6
13807,http://data.europa.eu/esco/skill/fff0b074-5a76...,7
13808,http://data.europa.eu/esco/skill/fff0e2cd-d0bd...,2
13809,http://data.europa.eu/esco/skill/fff5bc45-b506...,11


In [131]:
negative_pref_desc_with_scores.head()

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection,negative_desc_nn_score
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,food intended for animal consumption,0.25,0.25,0.38545
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal products and food safety,0.25,0.25,0.465179
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food product,0.333333,0.25,0.410487
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,rubber materials preparing,0.333333,0.25,0.241972


In [136]:
negative_pref_desc_with_counts = negative_pref_desc_with_scores.merge(alt_label_counts, on=['?a'], how='left')

In [149]:
negative_pref_desc_with_counts.dropna(inplace=True)

In [150]:
negative_pref_desc_with_counts.head()

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection,negative_desc_nn_score,desc_positive
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,5.0
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,food intended for animal consumption,0.25,0.25,0.38545,5.0
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal products and food safety,0.25,0.25,0.465179,5.0
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food product,0.333333,0.25,0.410487,5.0
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,rubber materials preparing,0.333333,0.25,0.241972,5.0


In [141]:
def create_random_groups(negative_df, count, main_uri):
    
    negative_subset = negative_df[negative_df['?a'] == main_uri]
    
    negative_subset.sort_values(by='negative_desc_nn_score', ascending=False, inplace=True)
    
    top_negatives = negative_subset.head(int(count))
    
    distinct_values = top_negatives['desc_negative'].unique().tolist()
    
    return distinct_values

In [151]:
negative_pref_desc_with_counts['negative_labels'] = negative_pref_desc_with_counts.parallel_apply(
lambda x: create_random_groups(negative_pref_desc_with_counts, x['desc_positive'], x['?a']), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3188), Label(value='0 / 3188'))), …

In [154]:
negative_pref_desc_with_counts.head(10)

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection,negative_desc_nn_score,desc_positive,negative_labels
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,5.0,"[animal products and food safety, animal food ..."
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,food intended for animal consumption,0.25,0.25,0.38545,5.0,"[animal products and food safety, animal food ..."
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal products and food safety,0.25,0.25,0.465179,5.0,"[animal products and food safety, animal food ..."
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food product,0.333333,0.25,0.410487,5.0,"[animal products and food safety, animal food ..."
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,rubber materials preparing,0.333333,0.25,0.241972,5.0,"[animal products and food safety, animal food ..."
5,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,rubber materials organising,0.333333,0.25,0.20789,5.0,"[animal products and food safety, animal food ..."
6,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,organising rubber materials,0.333333,0.25,0.229141,5.0,"[animal products and food safety, animal food ..."
7,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,preparing rubber materials,0.333333,0.25,0.245423,5.0,"[animal products and food safety, animal food ..."
8,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,prepare rubber materials,0.333333,0.25,0.293334,5.0,"[animal products and food safety, animal food ..."
9,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,organise rubber materials,0.333333,0.25,0.242342,5.0,"[animal products and food safety, animal food ..."


In [155]:
negative_pref_list_subset = negative_pref_desc_with_counts[['?a', 'prefLabel', 'negative_labels']]

In [159]:
negative_pref_list_exploded = negative_pref_list_subset.explode('negative_labels')

In [162]:
negative_pref_list_exploded.iloc[0]['?a']

'http://data.europa.eu/esco/skill/29b326ea-a9c8-4963-b35a-8541d0cdff77'

In [164]:
negative_pref_list_exploded.drop_duplicates(inplace=True)

In [165]:
negative_pref_list_exploded

Unnamed: 0,?a,prefLabel,negative_labels
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal products and food safety
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food product
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,food intended for animal consumption
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,prepare rubber materials
...,...,...,...
120308,http://data.europa.eu/esco/skill/T1.2,working with numbers and measures,working in bad weather
120314,http://data.europa.eu/esco/skill/T1.3,working with digital devices and applications,digital accessibility standards
120314,http://data.europa.eu/esco/skill/T1.3,working with digital devices and applications,call in at working locations
120314,http://data.europa.eu/esco/skill/T1.3,working with digital devices and applications,arrive at working locations


In [63]:
triplets_with_nn_scores_full = negative_pref_desc_with_scores.merge(skill_pref_desc, on=['?a', 'prefLabel'], how='left')

In [64]:
triplets_with_nn_scores_full.head()

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection,negative_desc_nn_score,desc_positive,positive_pref_desc_intesection,positive_desc_pref_intesection
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,stored raw food materials,0.75,0.75
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,store raw food materials,1.0,1.0
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,stock raw prime ingredients,0.25,0.25
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,store materials of raw food,1.0,1.0
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,stock raw food materials,0.75,0.75


In [65]:
triplets_with_nn_scores_full.dropna(inplace=True)

In [66]:
triplets_with_nn_scores_full.shape

(847197, 9)

In [67]:
# triplets_with_nn_scores_full.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/triplets_with_nn_scores_full.csv', sep='\t')

In [68]:
triplets_with_nn_scores_full['?a'].nunique()

10469

In [69]:
triplets_with_nn_scores_full[triplets_with_nn_scores_full['negative_pref_desc_intesection'] > 0]['?a'].nunique()

10469

### a total of 79.23% of the nearest neightbours have an intersection term as well, let's now select the instersection terms based on nn criteria

In [70]:
triplets_with_nn_scores_full.head()

Unnamed: 0,?a,prefLabel,desc_negative,negative_pref_desc_intesection,negative_desc_pref_intesection,negative_desc_nn_score,desc_positive,positive_pref_desc_intesection,positive_desc_pref_intesection
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,stored raw food materials,0.75,0.75
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,store raw food materials,1.0,1.0
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,stock raw prime ingredients,0.25,0.25
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,store materials of raw food,1.0,1.0
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products,0.333333,0.25,0.425389,stock raw food materials,0.75,0.75


In [71]:
triplet_data_with_scores = triplets_with_nn_scores_full[['?a', 'prefLabel', 'desc_negative', 
                                                         'negative_pref_desc_intesection', 'negative_desc_nn_score']]

In [72]:
def create_grouped_data(df, column_name):
    
    df_subset = df[['?a', column_name]]
    
    df_subset_grouped = df_subset.groupby(by=['?a'])[column_name].apply(list).reset_index()
    
    return df_subset_grouped

In [73]:
triplet_grouped_desc = create_grouped_data(triplet_data_with_scores, 'desc_negative')
triplet_grouped_negative_pref_desc_score = create_grouped_data(triplet_data_with_scores, 'negative_pref_desc_intesection')
triplet_grouped_negative_desc_nn_score = create_grouped_data(triplet_data_with_scores, 'negative_desc_nn_score')

In [74]:
triplet_data_appended = triplet_grouped_desc.merge(
    triplet_grouped_negative_pref_desc_score, on=['?a'], how='left').merge(
    triplet_grouped_negative_desc_nn_score, on=['?a'], how='left')

In [75]:
triplet_data_appended.head()

Unnamed: 0,?a,desc_negative,negative_pref_desc_intesection,negative_desc_nn_score
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,"[training new staff, training new staff, train...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.4757101237773895, 0.4757101237773895, 0.475..."
1,http://data.europa.eu/esco/skill/00064735-8fad...,"[supervise cleaning staff work, supervise clea...","[0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.2...","[0.5259576439857483, 0.5259576439857483, 0.525..."
2,http://data.europa.eu/esco/skill/000709ed-2be5...,"[apply a different roasting method, apply a di...","[0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.2...","[0.09511102735996246, 0.09511102735996246, 0.0..."
3,http://data.europa.eu/esco/skill/0007bdc2-dd15...,"[burner control adjusting, burner control adju...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.1796051561832428, 0.1796051561832428, 0.179..."
4,http://data.europa.eu/esco/skill/00090cc1-1f27...,"[rail company services, rail company services,...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.43253225088119507, 0.43253225088119507, 0.4..."


In [77]:
triplet_data_appended.iloc[1]['?a']

'http://data.europa.eu/esco/skill/00064735-8fad-454b-90c7-ed858cc993f2'

In [95]:
triplet_data_appended.iloc[1]['desc_negative']

['supervise cleaning staff work',
 'supervise cleaning staff work',
 'supervise cleaning staff work',
 'supervise cleaning staff work',
 'supervise cleaning staff work',
 'supervise cleaning staff work',
 'supervise cleaning staff work',
 'supervise the work of cleaning staff',
 'supervise the work of cleaning staff',
 'supervise the work of cleaning staff',
 'supervise the work of cleaning staff',
 'supervise the work of cleaning staff',
 'supervise the work of cleaning staff',
 'supervise the work of cleaning staff',
 'implement procedures to address airport hazards',
 'implement procedures to address airport hazards',
 'implement procedures to address airport hazards',
 'implement procedures to address airport hazards',
 'implement procedures to address airport hazards',
 'implement procedures to address airport hazards',
 'implement procedures to address airport hazards',
 'warehouse hazard prevention procedures',
 'warehouse hazard prevention procedures',
 'warehouse hazard preven

In [66]:
# triplet_data_appended.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/triplet_data_without_negatives.csv', sep='\t')

In [78]:
def compute_hard_negatives(main_uri, df):
    
    df_filtered = df[df['?a'] == main_uri]
    df_subset = df_filtered.explode(['desc_negative','negative_pref_desc_intesection', 'negative_desc_nn_score'])
    df_subset.reset_index(drop=True, inplace=True)
    
    df_intersected = df_subset[df_subset['negative_pref_desc_intesection'] > 0]
    
    if len(df_intersected) == 0:
        df_subset.sort_values(by='negative_desc_nn_score', ascending=False, inplace=True)
        return df_subset['desc_negative'].iloc[0]
    else:
        df_intersected.sort_values(by='negative_desc_nn_score', ascending=False, inplace=True)

        return df_intersected['desc_negative'].iloc[0]

In [79]:
triplet_data_appended['hard_negative'] = triplet_data_appended.parallel_apply(
    lambda x: compute_hard_negatives(x['?a'], triplet_data_appended), axis=1
                                                                                             )

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=291), Label(value='0 / 291'))), HB…

In [80]:
triplet_data_appended.head(10)

Unnamed: 0,?a,desc_negative,negative_pref_desc_intesection,negative_desc_nn_score,hard_negative
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,"[training new staff, training new staff, train...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.4757101237773895, 0.4757101237773895, 0.475...",train new staff
1,http://data.europa.eu/esco/skill/00064735-8fad...,"[supervise cleaning staff work, supervise clea...","[0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.2...","[0.5259576439857483, 0.5259576439857483, 0.525...",supervise cleaning staff work
2,http://data.europa.eu/esco/skill/000709ed-2be5...,"[apply a different roasting method, apply a di...","[0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.2...","[0.09511102735996246, 0.09511102735996246, 0.0...",apply psychoeducation
3,http://data.europa.eu/esco/skill/0007bdc2-dd15...,"[burner control adjusting, burner control adju...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.1796051561832428, 0.1796051561832428, 0.179...",principles of railway operations planning
4,http://data.europa.eu/esco/skill/00090cc1-1f27...,"[rail company services, rail company services,...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.43253225088119507, 0.43253225088119507, 0.4...",services supplied by rail companies
5,http://data.europa.eu/esco/skill/000bb1e4-89f0...,[perform tasks according to written instructio...,"[0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, ...","[0.08533264696598053, 0.08533264696598053, 0.0...",virological studies
6,http://data.europa.eu/esco/skill/000c94d2-2a2e...,[ensure medical images' diagnostic suitability...,"[0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, ...","[0.1618170440196991, 0.1618170440196991, 0.161...",ensure uniformity of veterinary inspection sta...
7,http://data.europa.eu/esco/skill/001d46db-035e...,"[perform train maintenance activities, perform...","[0.25, 0.25, 0.25, 0.25, 0.3333333333333333, 0...","[0.4436647295951843, 0.4436647295951843, 0.443...",perform train maintenance activities
8,http://data.europa.eu/esco/skill/0023e7a5-43da...,"[apply fertiliser, apply fertiliser, apply fer...","[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, ...","[0.03303065896034241, 0.03303065896034241, 0.0...",apply procedures to ensure that cargo complies...
9,http://data.europa.eu/esco/skill/002b2e58-35ab...,"[examine fish stock, examine fish stock, exami...","[0.3333333333333333, 0.3333333333333333, 0.333...","[0.4823446273803711, 0.4823446273803711, 0.482...",harvested fish transporting


In [81]:
# triplet_data_appended.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/triplets_hard_negatives.csv', sep='\t')

# creating final classification_data

In [30]:
negative_file_desc_merged.head()

Unnamed: 0,main_uri,prefLabel,?a,negative_desc,pref_desc_intersection,desc_pref_intersection
386,http://data.europa.eu/esco/skill/7e11b04d-c6b5...,Patienten/Patientinnen zur Verbesserung des Sp...,http://data.europa.eu/esco/skill/d8d261bd-da63...,Patienten/Patientin an Ophthalmologen/Ophthalm...,0.2,0.2
1303,http://data.europa.eu/esco/skill/3838a64c-78dc...,Systeme zur internen Kommunikation warten,http://data.europa.eu/esco/skill/2c5fbce2-cd76...,landwirtschaftliches Gerät warten,0.333333,0.25
2056,http://data.europa.eu/esco/skill/83f713fb-2650...,Signalgeräte für Züge bedienen,http://data.europa.eu/esco/skill/770b473a-52db...,Nagelungsmaschinen bedienen,0.5,0.333333
3698,http://data.europa.eu/esco/skill/c349b18e-5038...,Foliendruckmaschine bedienen,http://data.europa.eu/esco/skill/d18408c1-c2ba...,Wiegeanlage bedienen,0.5,0.5
3742,http://data.europa.eu/esco/skill/c349b18e-5038...,Foliendruckmaschine bedienen,http://data.europa.eu/esco/skill/e8c9361b-9fbc...,Kuvertmaschine bedienen,0.5,0.5


In [31]:
skill_pref_desc.head()

Unnamed: 0,?a,prefLabel,desc,pref_desc_intesection,desc_pref_intesection
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,rohe Lebensmittel lagern,0.333333,0.5
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,Lebensmittelrohstoffe lagern,rohe Lebensmittel aufbewahren,0.0,0.0
2,http://data.europa.eu/esco/skill/1a61a520-f4a5...,geschriebenes Irisch verstehen,Irisch lesen,0.5,0.333333
16,http://data.europa.eu/esco/skill/1889010b-f796...,Dämmstoff zuschneiden,Isolierstoff zuschneiden,0.5,0.5
31,http://data.europa.eu/esco/skill/7f1a7abb-9208...,Anrufrouting,Call Routing,0.0,0.0


In [35]:
LANGUAGE

'german'

In [37]:
if LANGUAGE == 'german':
    negative_file_subset = negative_file_desc_merged[['main_uri', 'prefLabel', 'negative_desc']]
    negative_file_subset.columns = ['?a', 'term1', 'term2']
    
    positive_file_subset = skill_pref_desc[['?a', 'prefLabel', 'desc']]
    positive_file_subset.columns = ['?a', 'term1', 'term2']
    
    negative_file_subset.dropna(inplace=True)
    positive_file_subset.dropna(inplace=True)

    positive_file_subset['labels'] = 1
    negative_file_subset['labels'] = 0
    
    classification_data_full = pd.concat([positive_file_subset, negative_file_subset])

    classification_data_full.reset_index(drop=True, inplace=True)
    
    
    shuffled_classification_data = classification_data_full.sample(frac=1)
    
    first_half = shuffled_classification_data[: shuffled_classification_data.shape[0]]
    second_half = shuffled_classification_data[shuffled_classification_data.shape[0]:]

    first_half_shuffled = first_half[['?a', 'term2', 'term1', 'labels']]

    classification_data_shuffled = pd.concat([first_half_shuffled, second_half])

    classification_data_shuffled.reset_index(inplace=True, drop=True)

    classification_data_shuffled.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/shuffled_classification_data_term_term.csv', sep='\t')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_file_subset.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_file_subset.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_file_subset['labels'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [38]:
classification_data_shuffled

Unnamed: 0,?a,term2,term1,labels
0,http://data.europa.eu/esco/skill/f01fe651-4270...,Wasserfluss messen,Wasserfluss bestimmen,1
1,http://data.europa.eu/esco/skill/fe42b42e-0cad...,Tunnelbohrergeschwindigkeit bestimmen,Geschwindigkeit des Tunnelbohrers festlegen,1
2,http://data.europa.eu/esco/skill/442be689-4c4e...,Flaschschleifmaschinenteile,Flachschleifmaschinenbauteile,1
3,http://data.europa.eu/esco/skill/fc651e3f-c163...,Steinofen bedienen,Rettungsausrüstung auf Schiffen bedienen,0
4,http://data.europa.eu/esco/skill/2f48c606-a936...,Parameter der Wasserqualität messen,Parameter der Wasserqualität bestimmen,1
...,...,...,...,...
7638,http://data.europa.eu/esco/skill/9e06bac6-6b91...,IT-Systemprobleme lösen,IKT-Systemprobleme lösen,1
7639,http://data.europa.eu/esco/skill/0f8b7a8d-371b...,Attraktionen von Erlebnisparks ankündigen,Attraktionen von Themenparks ankündigen,1
7640,http://data.europa.eu/esco/skill/bf675dff-ce7b...,Tabellenkalkulationssoftware verwenden,Farbdruckprogramme verwenden,0
7641,http://data.europa.eu/esco/skill/a6dcd5bd-d498...,Hydropumpen bedienen,hydraulische Pumpen bedienen,1


In [32]:
negative_file_desc_merged.columns

Index(['main_uri', 'prefLabel', '?a', 'negative_desc',
       'pref_desc_intersection', 'desc_pref_intersection'],
      dtype='object')

In [169]:
negative_subset = negative_pref_list_exploded

In [170]:
positive_subset = skill_pref_desc[['?a', 'prefLabel', 'desc_positive']]

In [171]:
positive_subset.head()

Unnamed: 0,?a,prefLabel,desc_positive
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stored raw food materials
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,store raw food materials
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stock raw prime ingredients
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,store materials of raw food
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stock raw food materials


In [172]:
negative_subset.head()

Unnamed: 0,?a,prefLabel,negative_labels
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal products and food safety
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food products
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,animal food product
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,food intended for animal consumption
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,prepare rubber materials


In [185]:
negative_subset.columns = ['?a', 'term1', 'term2']
positive_subset.columns = ['?a', 'term1', 'term2']

In [186]:
negative_subset.dropna(inplace=True)
positive_subset.dropna(inplace=True)

positive_subset['labels'] = 1
negative_subset['labels'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_subset.dropna(inplace=True)


In [188]:
classification_data_full = pd.concat([positive_subset, negative_subset])

classification_data_full.reset_index(drop=True, inplace=True)

In [190]:
classification_data_full

Unnamed: 0,?a,term1,term2,labels
0,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stored raw food materials,1
1,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,store raw food materials,1
2,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stock raw prime ingredients,1
3,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,store materials of raw food,1
4,http://data.europa.eu/esco/skill/29b326ea-a9c8...,store raw food materials,stock raw food materials,1
...,...,...,...,...
154609,http://data.europa.eu/esco/skill/T1.2,working with numbers and measures,working in bad weather,0
154610,http://data.europa.eu/esco/skill/T1.3,working with digital devices and applications,digital accessibility standards,0
154611,http://data.europa.eu/esco/skill/T1.3,working with digital devices and applications,call in at working locations,0
154612,http://data.europa.eu/esco/skill/T1.3,working with digital devices and applications,arrive at working locations,0


#### jump to shuffle

In [88]:
classficiation_data_triplets = negative_subset.merge(positive_subset, on=['?a'], how='left')

In [89]:
classficiation_data_triplets.shape

(75899, 4)

In [90]:
classficiation_data_triplets.head()

Unnamed: 0,?a,hard_negative,prefLabel,desc_positive
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,train new staff,manage musical staff,manage staff of music
1,http://data.europa.eu/esco/skill/0005c151-5b5a...,train new staff,manage musical staff,coordinate duties of musical staff
2,http://data.europa.eu/esco/skill/0005c151-5b5a...,train new staff,manage musical staff,manage music staff
3,http://data.europa.eu/esco/skill/0005c151-5b5a...,train new staff,manage musical staff,direct musical staff
4,http://data.europa.eu/esco/skill/0005c151-5b5a...,train new staff,manage musical staff,manage musical staff


In [91]:
positive_classification_data = classficiation_data_triplets[['?a', 'prefLabel', 'desc_positive']]
negative_classification_data = classficiation_data_triplets[['?a', 'prefLabel', 'hard_negative']]

In [92]:
positive_classification_data.dropna(inplace=True)
negative_classification_data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_classification_data.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negative_classification_data.dropna(inplace=True)


In [93]:
positive_classification_data.head()

Unnamed: 0,?a,prefLabel,desc_positive
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,manage staff of music
1,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,coordinate duties of musical staff
2,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,manage music staff
3,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,direct musical staff
4,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,manage musical staff


In [94]:
negative_classification_data.head()

Unnamed: 0,?a,prefLabel,hard_negative
0,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,train new staff
1,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,train new staff
2,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,train new staff
3,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,train new staff
4,http://data.europa.eu/esco/skill/0005c151-5b5a...,manage musical staff,train new staff


In [89]:
positive_classification_data['labels'] = 1
negative_classification_data['labels'] = 0

In [90]:
positive_classification_data.columns = ['?a', 'term1', 'term2', 'labels']
negative_classification_data.columns = ['?a', 'term1', 'term2', 'labels']

In [91]:
classification_data_full = pd.concat([positive_classification_data, negative_classification_data])

In [92]:
classification_data_full.reset_index(drop=True, inplace=True)

In [93]:
classification_data_full.head()

Unnamed: 0,?a,term1,term2,labels
0,http://data.europa.eu/esco/skill/05aa7c09-46e7...,accept criticism and guidance,feedback,1
1,http://data.europa.eu/esco/skill/05aa7c09-46e7...,accept criticism and guidance,criticism,1
2,http://data.europa.eu/esco/skill/12c82224-6394...,optical character recognition software,SimpleOCR,1
3,http://data.europa.eu/esco/skill/12c82224-6394...,optical character recognition software,Microsoft Office Document Imaging,1
4,http://data.europa.eu/esco/skill/12c82224-6394...,optical character recognition software,Tesseract,1


In [119]:
classification_data_full.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/classification_term_desc.csv', sep='\t')

### shuffle classification data

In [191]:
shuffled_classification_data = classification_data_full.sample(frac=1)

In [192]:
first_half = shuffled_classification_data[: shuffled_classification_data.shape[0]]
second_half = shuffled_classification_data[shuffled_classification_data.shape[0]:]

first_half_shuffled = first_half[['?a', 'term2', 'term1', 'labels']]

classification_data_shuffled = pd.concat([first_half_shuffled, second_half])

classification_data_shuffled.reset_index(inplace=True, drop=True)

classification_data_shuffled.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/shuffled_classification_data_term_term.csv', sep='\t')

In [193]:
first_half_shuffled = first_half[['?a', 'term2', 'term1', 'labels']]

In [195]:
first_half_shuffled[first_half_shuffled.labels == 0].sample(n=5)

Unnamed: 0,?a,term2,term1,labels
101029,http://data.europa.eu/esco/skill/6962058d-9808...,implement horticultural standards and practices,implement sustainable procurement,0
125675,http://data.europa.eu/esco/skill/ce84a0b1-5217...,manufacturer's information for audio visual eq...,select well equipment,0
107814,http://data.europa.eu/esco/skill/86d8a19f-1694...,manage plastic machinery,manage bioreactors,0
143161,http://data.europa.eu/esco/skill/2e5f8663-f123...,apply dental hygiene measures,apply clinical chiropractic competencies in sport,0
122078,http://data.europa.eu/esco/skill/68828bd3-4137...,maintain a router machine,tend belt branding machine,0


In [196]:
classification_data_shuffled = pd.concat([first_half_shuffled, second_half])

classification_data_shuffled.reset_index(inplace=True, drop=True)

In [197]:
classification_data_shuffled.to_csv(f'/cluster/scratch/yakram/sbert-copy/turtle_files/classification_data/{model_dir_language}/shuffled_classification_data_term_term.csv', sep='\t')

In [198]:
classification_data_shuffled.shape

(154614, 4)