In [1]:
#Code related to loading and filtering datasets
import pandas as pd
import re
import os
import numpy as np
from tqdm import tqdm
import seaborn as sns
from collections import defaultdict
from sklearn.model_selection import train_test_split
import random
random.seed(101)
import os
import unicodedata

# Read the sense-annotated corpus and cast the word_index column to int.
wsd_path = "wsd_data/slo_sense_1.1.csv"
sense = pd.read_csv(wsd_path)
sense["word_index"] = sense["word_index"].astype(int)

def filter_impact(df, name):
  """Print summary counts of ``df`` under the heading ``name``.

  Used after each filtering step to see how much data the step removed.

  Args:
    df: DataFrame with ``senseID``, ``lemma`` and ``sent`` columns.
    name: Label printed on the first line to identify the filtering step.

  Returns:
    None. The counts are written to stdout.
  """
  print(name)
  print("word sense number {}".format(df.senseID.nunique()))
  # The original format string had no space before the placeholder.
  print("lemma number {}".format(df.lemma.nunique()))
  print("unique sents number {}".format(df.sent.nunique()))
  # Prints the full (rows, columns) shape, not only the row count.
  print("all sents number: {}".format(df.shape))


In [2]:
#general filtering
# Each step below narrows `filt`; filter_impact prints the counts after the step
# so the effect of every filter can be read from the cell output.
filter_impact(sense, "original")
# Keep only rows whose sentence is at most 540 characters long.
filt = sense[sense["sent"].apply(lambda x: len(x) <= 540)].copy() # 200 at 75%, std = 170
filter_impact(filt, "len_filter")

#Manual corrections
# NOTE(review): `.at` assigns by index label; if label 83899 had been removed by the
# length filter this would add a new row instead of fixing one - confirm it is present.
filt.at[83899, 'instance'] = 'multimedijske' #wrong inflection
# Index labels of rows that are removed by hand.
drop_list = [113682, 13226, 23644, 42813, 49063, 49078, 105551, 113676, 128790, 149384,153792, 174480]
filt = filt.drop(index=drop_list) #duplicate examples with differing indices.

#MWU removal
# Drop rows whose instance, lemma or POS tag contains a space.
filt = filt[filt.instance.apply(lambda x: " " not in x)].copy()
filt = filt[filt.lemma.apply(lambda x: " " not in x)].copy()
filt = filt[~filt.pos.apply(lambda x: " " in x)] #spaces in pos tags
filter_impact(filt, "mwu_filter")

# Collapse the POS tag set: PROPN is merged into NOUN, and every tag other than
# ADJ/ADV/NOUN/VERB becomes "OTHR".
filt.pos = filt.pos.apply(lambda x: "NOUN" if x == "PROPN" else x)
filt.pos = filt.pos.apply(lambda x: x if x in ["ADJ", "ADV", "NOUN", "VERB"] else "OTHR")

#examples with multiple tags for the same target lemma
# Build a temporary "lemma + sentence" key, find keys that carry more than one
# distinct senseID, and drop every row that has such a key.
filt["temp_text"] = filt.lemma + " " + filt.sent
group_obj = filt.groupby("temp_text", as_index = False).senseID.nunique()
multi_tagged = group_obj[group_obj.senseID > 1]
multi_tagged_list = set(multi_tagged.temp_text)
multi_tagged_ind  = filt[filt["temp_text"].apply(lambda x: x in multi_tagged_list)]
filt = filt.drop(multi_tagged_ind.index)
# The helper key column is no longer needed once the ambiguous rows are gone.
del filt['temp_text']
filter_impact(filt, "multitag_filter")

## Slossbert specific filtering
def compliant(df):
    """Print whether ``df`` meets the two dataset constraints, then its summary.

    The constraints are: every lemma has at least two distinct senseIDs, and
    every senseID has at least four (non-null) sentences.

    Args:
        df: DataFrame with ``lemma``, ``senseID`` and ``sent`` columns.

    Returns:
        None. The results are written to stdout.
    """
    # Compare the minima with ">=" rather than "==": a dataset in which no lemma
    # has exactly two senses (or no sense exactly four sentences) still complies.
    # This matches the checks in split_compliance.
    at_least_two = df.groupby("lemma").senseID.nunique().min() >= 2
    sent_num_cond = df.groupby("senseID").sent.count().min() >= 4
    print("two sense per lemma: {}".format(at_least_two))
    print("four examples per sense: {}".format(sent_num_cond))
    filter_impact(df, "describe filtered df")

def elim_single_senses(df, min_senses=2):
    """Keep only the rows of lemmas that have enough distinct senses.

    Args:
        df: DataFrame with ``lemma`` and ``senseID`` columns.
        min_senses: Minimum number of distinct senseIDs a lemma must have
            for its rows to be kept (default 2).

    Returns:
        The subset of ``df`` whose lemma has at least ``min_senses`` senseIDs.
    """
    senses_per_lemma = df.groupby("lemma").senseID.nunique()
    kept_lemmas = senses_per_lemma[senses_per_lemma >= min_senses].index
    # isin() does a hashed lookup; the previous per-row "x in list" test was
    # O(rows * lemmas).
    return df[df.lemma.isin(kept_lemmas)]

def elim_too_few_senses(df, min_examples=4): #!+ misleading name
    """Keep only the rows of senses that have enough example sentences.

    Despite the name, this removes senses with too few *examples*, not lemmas
    with too few senses.

    Args:
        df: DataFrame with ``senseID`` and ``sent`` columns.
        min_examples: Minimum number of non-null sentences a senseID must have
            for its rows to be kept (default 4).

    Returns:
        The subset of ``df`` whose senseID has at least ``min_examples`` sentences.
    """
    examples_per_sense = df.groupby("senseID").sent.count()
    kept_senses = examples_per_sense[examples_per_sense >= min_examples].index
    # isin() does a hashed lookup; the previous per-row "x in list" test was
    # O(rows * senses).
    return df[df.senseID.isin(kept_senses)]

# Run the two filters twice in alternation: dropping under-represented senses
# can leave a lemma with a single sense, so the lemma filter has to run again.
for _ in range(2):
    filt = elim_too_few_senses(elim_single_senses(filt))


#Sentence formatting
#Strips punctuation and double spaces, then wraps the target word in apostrophes ('...') as weak supervision (GlossBERT idea).
#Cases such as "4-metil ..." were already removed in an earlier step (probably the MWU removal).

def strip_punctuation(text):
    """Return ``text`` with every punctuation character removed.

    A character counts as punctuation when its Unicode general category is
    one of Pc, Pd, Ps, Pe, Pi, Pf or Po. All other characters are kept as-is.
    """
    punctuation_cats = frozenset(("Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"))
    kept_chars = []
    for char in text:
        if unicodedata.category(char) in punctuation_cats:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

def mark_target_word(old_string, target_w):
    """Normalise spacing in a sentence and wrap the target word in apostrophes.

    Args:
        old_string: The sentence text.
        target_w: The word form to mark; it is treated as literal text.

    Returns:
        The sentence with surrounding spaces stripped, runs of spaces collapsed
        to a single space, and every occurrence of ``target_w`` replaced by
        ``'target_w'``.
    """
    new_string = old_string.strip(" ")
    # Collapse whole runs of spaces; replacing only "  " left double spaces
    # behind whenever three or more spaces occurred in a row.
    new_string = re.sub(" {2,}", " ", new_string)
    # Literal replacement: the target used to be passed to re.sub as a pattern,
    # so characters such as "+" or "." were interpreted as regex syntax.
    # NOTE(review): occurrences inside longer words are still marked, as before.
    new_string = new_string.replace(target_w, "'" + target_w + "'")
    return new_string

# Remove punctuation first, then mark the target word form in each sentence.
filt["sent"] = filt["sent"].map(strip_punctuation)
filt["sent"] = [
    mark_target_word(sentence, target)
    for sentence, target in zip(filt["sent"], filt["instance"])
]

# Remove duplicates: without the old_index column, rows that differ only in
# that column become exact duplicates and are dropped.
filt = filt.drop(columns="old_index").drop_duplicates()

# Give every remaining example a running sentence id and verify the constraints.
clean_df = filt.copy()
clean_df["sentID"] = list(range(len(clean_df)))
compliant(clean_df)

original
število besednih pomenov 11069
število lem 5604
število unikatnih stavkov 196655
skupno število primerov: (202240, 7)
len_filter
število besednih pomenov 11068
število lem 5603
število unikatnih stavkov 194649
skupno število primerov: (200188, 7)
mwu_filter
število besednih pomenov 10331
število lem 4927
število unikatnih stavkov 193536
skupno število primerov: (198789, 7)
multitag_filter
število besednih pomenov 10318
število lem 4927
število unikatnih stavkov 193388
skupno število primerov: (198438, 7)
two sense per lemma: True
four examples per sense: True
describe filtered df
število besednih pomenov 4633
število lem 1597
število unikatnih stavkov 139445
skupno število primerov: (139445, 7)


In [5]:
#more POS fixes
# Manual POS overrides keyed by senseID ("<lemma>%<sense number>").
# Each entry lists a lemma, the sense numbers to override, and the POS tag to use.
# Note that "pritlikav" lists sense numbers 0, 1 and 3 (there is no entry for 2).
fixes = {
    "{}%{}".format(lemma, sense_no): pos_tag
    for lemma, sense_nos, pos_tag in (
        ("cepljiv", (0, 1), "ADJ"),
        ("dovolj", (0, 1), "ADV"),
        ("edin", (0, 1), "ADJ"),
        ("kad", (0, 1), "NOUN"),
        ("mnogo", (0, 1, 2), "ADV"),
        ("obročast", (0, 1), "ADJ"),
        ("pritlikav", (0, 1, 3), "ADJ"),
        ("vas", (0, 1, 2), "NOUN"),
        ("veliko", (0, 1, 2), "ADV"),
        ("zvezdaš", (0, 1), "NOUN"),
    )
    for sense_no in sense_nos
}

# Use the override where the senseID has one, otherwise keep the existing tag.
clean_df["pos"] = [
    fixes.get(sense_id, current_pos)
    for sense_id, current_pos in zip(clean_df["senseID"], clean_df["pos"])
]

In [9]:
#splits

#test set
# at least 2 and max 8 examples per word sense (Stratified sample with a cap)
# Keep test set zero-shot-like
#Why 8? Why not. Median of sents per sense is 14
# just experimenting with test size split till min 2 sents per sense is achieved

#validation set
#from remaining examples, applies sense freq filter (8) to keep train covering all labels
#filter for combination compliance
#sample 4 examples from all

def split_compliance(df):
    """Print whether a split has >= 2 senses per lemma and >= 2 sentences per sense.

    Args:
        df: DataFrame with ``lemma``, ``senseID`` and ``sent`` columns.

    Returns:
        None. Both results are written to stdout.
    """
    senses_per_lemma = df.groupby("lemma")["senseID"].nunique()
    examples_per_sense = df.groupby("senseID")["sent"].count()
    lemma_ok = senses_per_lemma.min() >= 2
    sense_ok = examples_per_sense.min() >= 2
    print("two sense per lemma: {}".format(lemma_ok))
    print("two examples per sense: {}".format(sense_ok))
    
# Stratified 60% draw (by senseID) that serves as the pool for the test set;
# the complementary 40% returned first is discarded here.
_ , test_df = train_test_split(clean_df, test_size= 0.6, random_state=101, stratify = clean_df["senseID"])
#print(test_df.groupby("senseID").sent.count().describe())

# Senses with 8 or more sentences in the pool are capped at 8 randomly sampled rows;
# senses with fewer rows are kept in full.
over_eight = test_df.groupby("senseID").filter(lambda x: x.sent.count() >= 8).copy()
capped_senses = over_eight.groupby("senseID", group_keys=False).apply(lambda x: x.sample(n = 8, random_state = 101))
low_senses = test_df.drop(over_eight.index) # Only keep low ones

# Final test set; everything in clean_df that is not in it remains for train/validation.
test_df = pd.concat([low_senses, capped_senses])
remain_df = clean_df.drop(test_df.index)

# Validation candidates: senses with at least 8 remaining sentences, re-filtered with
# the lemma/sense constraints, then 4 sampled rows per sense.
proto_val = remain_df.groupby("senseID").filter(lambda x: x.sent.count() >= 8).copy()
proto_val = elim_single_senses(proto_val)
proto_val = elim_too_few_senses(proto_val)
val_df = proto_val.groupby("senseID", group_keys=False).apply(lambda x: x.sample(n = 4, random_state = 101))
# Training set is whatever is left after removing the validation rows.
train_df = remain_df.drop(val_df.index)

# Report constraint checks and summary counts for each split.
split_compliance(train_df)
split_compliance(val_df)
split_compliance(test_df)

filter_impact(train_df, "train")
filter_impact(val_df, "val")
filter_impact(test_df, "test")

two sense per lemma: True
two examples per sense: True
two sense per lemma: True
two examples per sense: True
two sense per lemma: True
two examples per sense: True
train
število besednih pomenov 4633
število lem 1597
število unikatnih stavkov 104316
skupno število primerov: (104316, 7)
val
število besednih pomenov 1743
število lem 691
število unikatnih stavkov 6972
skupno število primerov: (6972, 7)
test
število besednih pomenov 4633
število lem 1597
število unikatnih stavkov 28157
skupno število primerov: (28157, 7)


In [4]:
#combo experiments: 
#- save intermediate results in a separate folder
#- make it easy to start from some point
#change sampling strategy to just take 50% neg pairs?
from itertools import combinations
from itertools import product
import math

# Folder that receives one intermediate CSV of sentence pairs per lemma.
save_intermed = "wsd_data/combo_train_staging/"
# Column layout of the sentence-pair frames built in this cell.
column_format = [
    "lemma",
    "text",
    "label",
    "senseID1",
    "senseID2",
    "sentID1",
    "sentID2",
]

def df_to_combos(df):
    """Return one two-row selection of ``df`` per unordered pair of index labels.

    Only used for positive examples; negative pairs are built differently.

    Args:
        df: DataFrame whose rows are to be paired.

    Returns:
        A list with the ``.loc`` selection for every 2-combination of
        ``df.index``, in the order produced by ``itertools.combinations``.
    """
    pair_frames = []
    for label_pair in combinations(df.index, 2):
        pair_frames.append(df.loc[label_pair, :])
    return pair_frames
    
def mod_rows(pair_df, label):
    """Turn a two-row frame into a single sentence-pair row.

    Args:
        pair_df: DataFrame with exactly two rows and the columns ``sent``,
            ``lemma``, ``senseID`` and ``sentID``.
        label: Value stored in the ``label`` column of the result.

    Returns:
        A one-row DataFrame with the columns lemma, text, label, senseID1,
        senseID2, sentID1 and sentID2. The lemma is taken from the first row.
    """
    # Unpacking fails unless the frame has exactly two rows.
    first, second = (row for _, row in pair_df.iterrows())
    record = {
        "lemma": first["lemma"],
        "text": first["sent"] + " [SEP] " + second["sent"],
        "label": label,
        "senseID1": first["senseID"],
        "senseID2": second["senseID"],
        "sentID1": first["sentID"],
        "sentID2": second["sentID"],
    }
    return pd.DataFrame([list(record.values())], columns=list(record))

def harvest_positive_pairs(sense_df):
    """Combine the examples of a single sense into positive (label 1) pairs.

    Args:
        sense_df: DataFrame holding the examples of one senseID.

    Returns:
        A DataFrame with one row per unordered pair of examples and the columns
        lemma, text, label, senseID1, senseID2, sentID1 and sentID2. It is
        empty (but has these columns) when there are fewer than two examples.
    """
    pair_columns = ["lemma", "text", "label", "senseID1", "senseID2", "sentID1", "sentID2"]
    # A sense with fewer than two examples yields no pairs; pd.concat would
    # raise on the resulting empty list, so return an empty frame instead.
    if len(sense_df) < 2:
        return pd.DataFrame([], columns=pair_columns)
    df_with_two = df_to_combos(sense_df)
    df_list_ones = [mod_rows(i, 1) for i in df_with_two]
    positive_pairs_df = pd.concat(df_list_ones).reset_index(drop = True)
    return positive_pairs_df
    
#generating negative combos
#positives can be generated within a senseID groupby without any problem
#negatives need access at the lemma level, so they probably need a separate loop

def get_neg_goal(df):
    """Compute how many negative sentences to sample for each sense of a lemma.

    Args:
        df: Lemma-level DataFrame with ``senseID`` and ``sent`` columns.

    Returns:
        A list with one integer per sense, in the order of
        ``df.senseID.unique()``: the number of positive pairs of that sense
        (n choose 2 over its unique sentences) divided by its number of unique
        sentences, rounded up.
    """
    sents_per_sense = df.groupby("senseID").sent.nunique()
    required_negatives = []
    for sense in df.senseID.unique():
        sent_count = sents_per_sense[sense]
        positive_pairs = math.comb(sent_count, 2)
        required_negatives.append(math.ceil(positive_pairs / sent_count))
    return required_negatives

def check_add(candidate_pair, save_df, keytracker):
    """Append a pair to ``save_df`` unless the same sentence pair was seen before.

    Both ``save_df`` and ``keytracker`` are modified in place; nothing is returned.

    Args:
        candidate_pair: One-row DataFrame with ``sentID1`` and ``sentID2``
            columns; only its first row is used.
        save_df: DataFrame collecting the accepted pairs.
        keytracker: Set of frozensets of sentence ids already stored, so that
            (a, b) and (b, a) count as the same pair.
    """
    first_row_ids = candidate_pair[["sentID1", "sentID2"]].values[0]
    pair_key = frozenset(first_row_ids)
    if pair_key in keytracker:
        return
    keytracker.add(pair_key)
    # Append as a new row labelled with the current row count.
    save_df.loc[len(save_df.index)] = candidate_pair.values[0]

def combine_pos_neg(df_pos, df_neg, save_df, keytracker):
    """Pair every row of ``df_pos`` with every row of ``df_neg`` as label-0 examples.

    Each candidate pair is passed to ``check_add``, which stores it in
    ``save_df`` unless the same sentence pair is already recorded in
    ``keytracker``. Both are modified in place; nothing is returned.

    Args:
        df_pos: Rows of the sense being processed; needs the columns
            ``lemma``, ``sent``, ``senseID`` and ``sentID``.
        df_neg: Rows of other senses, with the same columns.
        save_df: DataFrame collecting the accepted pairs.
        keytracker: Set of sentence-id pairs already stored.
    """
    pair_columns = ["lemma", "text", "label", "senseID1", "senseID2", "sentID1", "sentID2"]
    # Read the fields by column name (as mod_rows does) instead of fixed
    # positions 0/3/4/6, which silently depended on the frame's column order.
    fields = ["lemma", "sent", "senseID", "sentID"]
    pos_rows = df_pos[fields].values
    neg_rows = df_neg[fields].values
    for example1, example2 in product(pos_rows, neg_rows):
        lemma, sent1, senseID1, sentID1 = example1
        _, sent2, senseID2, sentID2 = example2
        txt_pair = sent1 + " [SEP] " + sent2
        row_data = [lemma, txt_pair, 0, senseID1, senseID2, sentID1, sentID2]
        temp_neg = pd.DataFrame([row_data], columns = pair_columns)
        check_add(temp_neg, save_df, keytracker)
    
def harvest_negative_pairs(df):
    """Build negative (label 0) sentence pairs for one lemma.

    For every sense of the lemma, a sample of sentences from the other senses
    is drawn and paired with all sentences of that sense.

    Args:
        df: Lemma-level DataFrame with ``senseID`` values of the form
            ``"<lemma>%<number>"`` plus the columns used by ``get_neg_goal``
            and ``combine_pos_neg``. Assumed to contain a single lemma.

    Returns:
        A DataFrame of unique negative pairs with the columns lemma, text,
        label, senseID1, senseID2, sentID1 and sentID2.
    """
    collector = pd.DataFrame([], columns =  ["lemma", "text", "label", "senseID1", "senseID2", "sentID1", "sentID2"])
    keys = set()
    # get_neg_goal returns its goals in the order of df.senseID.unique(), so
    # key them by senseID instead of indexing them by sense number.
    sense_ids = list(df.senseID.unique())
    goal_by_sense = dict(zip(sense_ids, get_neg_goal(df)))
    # Iterate over the senses that are actually present (in sense-number order).
    # The previous range(nunique) loop assumed the numbers were exactly
    # 0..n-1, so after filtering a sense such as "%3" was never processed.
    for sense_id in sorted(sense_ids, key=lambda s: int(s.split("%")[1])):
        is_target = df["senseID"] == sense_id
        anti_df = df[~is_target]
        # Never ask for more samples than there are distinct other-sense sentences.
        samp_target = min(goal_by_sense[sense_id], anti_df.sent.nunique())
        anti_samples = anti_df.sample(random_state = 101, n = samp_target)
        pos_samples = df[is_target]
        combine_pos_neg(pos_samples, anti_samples, collector, keys)
    return collector

In [148]:
#Loop for pair generation over the train_df
"""
lemma_groups = train_df.sort_values('lemma').groupby("lemma")
for lemma, lemma_group in tqdm(lemma_groups):
    positive_lemma = [harvest_positive_pairs(sense) for _, sense in lemma_group.groupby("senseID")]
    pos_pairs = pd.concat(positive_lemma)
    neg_pairs = harvest_negative_pairs(lemma_group)
    joined = pd.concat([pos_pairs, neg_pairs])
    joined.to_csv(save_intermed + "{}.csv".format(lemma))
"""

#loading the temp files into a unified df
"""
def mass_load(file_list):
    df_collection = [pd.read_csv(save_intermed + "/" + i) for i in file_list if ".csv" in i]
    return pd.concat(df_collection)
combo_df = mass_load(file_list)
"""

#uniform index (senseID1xsenseID2) for combo df
def senseID_order(senseID1, senseID2):
    """Return an order-independent key ``"<lower>_<higher>"`` for two senseIDs.

    The IDs are ordered by the integer after the ``%`` sign; when the numbers
    are equal, ``senseID1`` comes first.
    """
    suffix1 = senseID1.split("%")[1]
    suffix2 = senseID2.split("%")[1]
    if int(suffix2) < int(suffix1):
        return "{}_{}".format(senseID2, senseID1)
    return "{}_{}".format(senseID1, senseID2)

# NOTE(review): combo_df is not created by any active code in this cell (the
# loader above is commented out), so it must already exist in the kernel.
combo_df["senseID2x"] = [
    senseID_order(first_sense, second_sense)
    for first_sense, second_sense in zip(combo_df["senseID1"], combo_df["senseID2"])
]

In [None]:
#sampling 10% and 20% train datasets
import pandas as pd
from sklearn.model_selection import train_test_split

combo_big = pd.read_csv("wsd_data/combo_dfs/combo_ready/combo_train_fin.csv")
prop = 0.2 #required % of the main combo df

# Number of rows the sampled dataset should have in total.
target_size = len(combo_big)*prop
# Sense-pair keys that occur only once cannot be stratified, so they are always kept.
keeper = combo_big.groupby("senseID2x").filter(lambda x: x.text.count() == 1)

big_wo_keeper = combo_big.drop(keeper.index)
# Fraction of the remaining rows needed so that keeper + sample reaches target_size.
# The denominator was the hard-coded constant 3683255; deriving it from the frame
# keeps the computation correct for any input file or value of `prop`.
target_prop = (target_size - len(keeper))/len(big_wo_keeper)

_, combo_20 = train_test_split(big_wo_keeper, stratify = big_wo_keeper[["senseID2x"]], test_size = target_prop, random_state=101)

df_20 = pd.concat([keeper, combo_20])
#df_20.to_csv("wsd_data/combo_dfs/combo_ready/combo_train_20s.csv")

In [6]:
#balancing (non)-matching sent pairs
import pandas as pd
# NOTE(review): absolute, user-specific path - this cell only runs on that machine.
pth = "/home/fijavzz/workspace/wsd_data/combo_dfs/"

# Load the sentence-pair frames (read in the order val, train, test, train_mini).
val_df, train_df, test_df, train_mini_df = (
    pd.read_csv(pth + file_name)
    for file_name in (
        'combo_val_df.csv',
        'combo_train.csv',
        'combo_test_df.csv',
        'combo_train_mini.csv',
    )
)

def senseID_order(senseID1, senseID2):
    """Return an order-independent key ``"<lower>_<higher>"`` for two senseIDs.

    The IDs are ordered by the integer after the ``%`` sign; when the numbers
    are equal, ``senseID1`` comes first. This repeats the definition from the
    pair-generation cell so that this cell can run on its own.
    """
    suffix1 = senseID1.split("%")[1]
    suffix2 = senseID2.split("%")[1]
    swap_needed = int(suffix2) < int(suffix1)
    low, high = (senseID2, senseID1) if swap_needed else (senseID1, senseID2)
    return "{}_{}".format(low, high)

# Add the order-independent sense-pair key to the validation and test pair frames.
val_df["senseID2x"] = [
    senseID_order(first_sense, second_sense)
    for first_sense, second_sense in zip(val_df["sense1"], val_df["sense2"])
]
test_df["senseID2x"] = [
    senseID_order(first_sense, second_sense)
    for first_sense, second_sense in zip(test_df["sense1"], test_df["sense2"])
]

def sep_sents(df, label): #text for train, "text_pair" for rest
    """Split a ``[SEP]``-joined text column into ``sent1`` and ``sent2`` columns.

    The columns are added to ``df`` in place and the same frame is returned.
    The text is split on the literal ``[SEP]`` only, so whitespace next to the
    separator stays attached to the sentences.

    Args:
        df: DataFrame holding the paired text.
        label: Name of the column that contains the paired text.

    Returns:
        ``df`` itself, with the two new columns.
    """
    pieces = df[label].apply(lambda pair_text: pair_text.split('[SEP]'))
    for position, column in enumerate(("sent1", "sent2")):
        df[column] = pieces.apply(lambda parts, position=position: parts[position])
    return df

# The paired text lives in "text_pair" for val/test and in "text" for train.
val_df = sep_sents(df=val_df, label="text_pair")
test_df = sep_sents(df=test_df, label="text_pair")
train_df = sep_sents(df=train_df, label="text")

def balance_plz(df):
    """Downsample the negative pairs so their number roughly matches the positives.

    Negatives are sampled per ``senseID2x`` group with the same fraction
    (positives / negatives, rounded to three decimals), using a fixed seed.

    Args:
        df: Pair DataFrame with ``label`` (1 = positive, 0 = negative) and
            ``senseID2x`` columns.

    Returns:
        A DataFrame with all positive rows followed by the sampled negatives.
        If there are no negatives, or no more negatives than positives, all
        rows with label 0 or 1 are returned.
    """
    #downsamples examples with negative class
    pos = df[df.label == 1].copy()
    neg = df[df.label == 0].copy()
    # Nothing to downsample; this also avoids dividing by zero below.
    if len(neg) == 0:
        return pd.concat([pos, neg])
    # Cap the fraction at 1.0: sampling without replacement cannot return more
    # rows than a group has, so a larger fraction would raise.
    req_prop = min(round(len(pos)/len(neg), 3), 1.0)
    neg_down = neg.groupby("senseID2x", group_keys = False).apply(lambda x: x.sample(frac = req_prop, random_state=101))
    return pd.concat([pos, neg_down])


In [None]:
#Alternative approach for sentence combination dfs
#works for smaller dfs (test, val)
#abandoned for lack of control and long computation time for train df
### see solution above with intermittent saving after each lemma and limited negative examples

from collections import defaultdict
from itertools import combinations

def sent_combos(target_df):
  """Build every sentence pair within each lemma of ``target_df``.

  Collects all combinations in the frame, which can take quite a long time on
  large inputs because the number of pairs grows quadratically per lemma.

  Args:
    target_df: DataFrame with ``lemma``, ``sent``, ``senseID`` and ``sentID`` columns.

  Returns:
    A DataFrame with one row per unordered pair of examples of the same lemma
    and the columns text_pair, sense1, sense2, sent1_ind, sent2_ind, lemma and
    label, where label is 1 when both examples share a senseID and 0 otherwise.
  """
  pair_columns = ['text_pair','sense1','sense2', "sent1_ind", "sent2_ind", "lemma",'label']
  # Collect plain records and build the frame once at the end; appending rows
  # to a DataFrame one by one copies data on every insert.
  records = []
  sense_groups = target_df.groupby("lemma")
  for lemma, body in tqdm(sense_groups):
    examples = zip(body.sent, body.senseID, body.sentID)
    for (t1, sense1, sent_ind1), (t2, sense2, sent_ind2) in combinations(examples, 2):
      text_pair = t1 + " [SEP] " + t2
      label = 1 if (sense1 == sense2) else 0
      records.append([text_pair, sense1, sense2, sent_ind1, sent_ind2, lemma, label])
  return pd.DataFrame(records, columns=pair_columns)

"""
test_combos = sent_combos(test_df)
val_combos = sent_combos(val_df)
val_combos.to_csv("wsd_data/combo_val_df.csv", index = False)
train_combos = sent_combos(train_df)
train_combos.to_csv("wsd_data/combo_train_df.csv", index = False)
"""