In [None]:
import numpy as np
import pandas as pd
import json
from datasets import Dataset
from datasets import concatenate_datasets
from tqdm.notebook import tqdm
import hashlib

def get_reference_df(raw_df):
    """Flatten per-document token/label lists into one row per labelled token.

    Args:
        raw_df: DataFrame with a ``document`` column and list-like ``tokens``
            and ``labels`` columns of equal length per row.

    Returns:
        DataFrame with columns ``row_id``, ``document``, ``token``,
        ``token_str`` and ``label`` containing only tokens whose label is not
        ``'O'``. ``token`` is the position of the token inside its document
        and ``row_id`` is the row position in the fully exploded frame.
    """
    flat = (
        raw_df[['document', 'tokens', 'labels']]
        .explode(['tokens', 'labels'])
        .reset_index(drop=True)
        .rename(columns={'tokens': 'token_str', 'labels': 'label'})
    )
    # Running counter per document == index of the token inside that document.
    flat['token'] = flat.groupby('document').cumcount()

    labelled = flat.loc[flat['label'] != 'O']
    labelled = labelled.reset_index().rename(columns={'index': 'row_id'})
    return labelled[['row_id', 'document', 'token', 'token_str', 'label']].copy()

def pii_fbeta_score(pred_df, gt_df, beta=5):
    """Micro-averaged F-beta between predicted and ground-truth token labels.

    Both frames are outer-joined on ``(document, token)`` and must each carry
    a ``label`` column. A token present on both sides with different labels
    counts as one false positive *and* one false negative.

    Args:
        pred_df: predictions with ``document``, ``token`` and ``label``.
        gt_df: ground truth with ``document``, ``token`` and ``label``.
        beta: recall weight of the F-beta score.

    Returns:
        The micro F-beta score.
    """
    merged = pred_df.merge(gt_df, how="outer", on=["document", "token"], suffixes=("_pred", "_gt"))

    pred_missing = merged.label_pred.isna()
    gt_missing = merged.label_gt.isna()
    on_both_sides = ~pred_missing & ~gt_missing
    labels_agree = merged.label_gt == merged.label_pred

    hit = on_both_sides & labels_agree
    clash = on_both_sides & ~labels_agree

    TP = hit.sum()
    FP = ((gt_missing & ~pred_missing) | clash).sum()
    FN = (pred_missing | clash).sum()

    beta_sq = beta ** 2
    return (1 + beta_sq) * TP / ((1 + beta_sq) * TP + beta_sq * FN + FP)

def parse_predictions(predictions, ds, config, threshold=0.9):
    """Convert raw token-classification logits into a frame of predicted PII tokens.

    Args:
        predictions: array of logits indexed as (sample, sub-token, class).
            Class index 12 is treated as the "O" class and indices 0-11 as the
            entity classes (hard-coded below) -- TODO confirm this matches
            every config passed in.
        ds: mapping/dataset providing per-sample ``token_map``,
            ``offset_mapping``, ``tokens`` and ``document`` sequences.
            ``token_map`` is indexed with character offsets and yields a word
            token index, or -1 (presumably for whitespace) -- verify against
            the tokenisation code.
        config: mapping with an ``id2label`` dict keyed by the class id as a string.
        threshold: the "O" class is only accepted when its softmax
            probability is at least this value; otherwise the best non-"O"
            class is used.

    Returns:
        DataFrame with columns ``document``, ``token``, ``label``,
        ``token_str``, ``probability`` and ``row_id``; at most one row per
        ``(document, token)`` pair (the first sub-token wins).
    """
    id2label = config["id2label"]

    # Max-shifted softmax: mathematically identical to exp(x) / sum(exp(x)),
    # but cannot overflow to inf/NaN for large or low-precision logits.
    shifted = predictions - predictions.max(axis=2, keepdims=True)
    exp_logits = np.exp(shifted)
    pred_softmax = exp_logits / exp_logits.sum(axis=2, keepdims=True)

    preds = predictions.argmax(-1)
    preds_without_O = pred_softmax[:, :, :12].argmax(-1)
    O_preds = pred_softmax[:, :, 12]
    preds_final = np.where(O_preds < threshold, preds_without_O, preds)

    pairs = set()
    document, token, label, token_str, probabilities = [], [], [], [], []
    for p, token_map, offsets, tokens, doc, probs in zip(
        preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"], pred_softmax
    ):
        for token_pred, (start_idx, end_idx), prob in zip(p, offsets, probs):
            label_pred = id2label[str(token_pred)]

            # (0, 0) offsets mark sub-tokens without any source text.
            if start_idx + end_idx == 0:
                continue

            # Skip a leading unmapped character of the sub-token.
            if token_map[start_idx] == -1:
                start_idx += 1

            # Ignore whitespace-only word tokens.
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map):
                break

            token_id = token_map[start_idx]

            # Was `label_pred in ("O")`, i.e. a substring test on the string "O".
            if label_pred == "O" or token_id == -1:
                continue

            pair = (doc, token_id)

            # Only the first sub-token of a word contributes a prediction.
            if pair in pairs:
                continue

            document.append(doc)
            token.append(token_id)
            label.append(label_pred)
            token_str.append(tokens[token_id])
            probabilities.append(prob.max(axis=-1))
            pairs.add(pair)

    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str,
        "probability": probabilities,
    })
    df["row_id"] = list(range(len(df)))
    return df

def make_document_id(full_text):
    """Return the first 32 hex characters of the SHA-256 digest of ``full_text`` (UTF-8)."""
    digest = hashlib.sha256(full_text.encode('utf-8'))
    return digest.hexdigest()[:32]

def correct_label_after_new_line_label(dataframe):
    """Turn ``I-NAME_STUDENT`` into ``B-NAME_STUDENT`` when the previous token holds a newline.

    Documents are matched to ``data/raw/train.json`` through
    ``make_document_id(full_text)``. Documents that cannot be matched are
    returned unchanged (previously the tokens of an unrelated sample were
    used silently).

    Args:
        dataframe: predictions with ``document``, ``token`` and ``label`` columns.

    Returns:
        A copy of the input, grouped by document in order of first appearance,
        with corrected labels. An empty input is returned as an empty copy.
    """
    df = dataframe.copy()
    if df.empty:
        return df

    with open('data/raw/train.json') as fh:
        train_data = json.load(fh)

    # Hash every sample once; keep the first sample for a given id.
    tokens_by_doc = {}
    for sample in train_data:
        tokens_by_doc.setdefault(make_document_id(sample["full_text"]), sample["tokens"])

    parts = []
    for doc in df['document'].unique():
        sub = df[df['document'] == doc].copy()
        doc_tokens = tokens_by_doc.get(doc)
        if doc_tokens is not None and 'I-NAME_STUDENT' in sub['label'].values:
            sub['label'] = [
                'B-NAME_STUDENT'
                # `0 < tok` stops token 0 from wrapping around to the last token.
                if lab == 'I-NAME_STUDENT' and 0 < tok <= len(doc_tokens) and '\n' in doc_tokens[tok - 1]
                else lab
                for tok, lab in zip(sub['token'], sub['label'])
            ]
        parts.append(sub)
    return pd.concat(parts)


In [None]:
from spacy.lang.en import English
import re
# Blank English spaCy pipeline; only its tokenizer is used (in regex_predictions).
nlp = English()

# Honorifics / bylines (with and without a trailing dot) that must not be kept
# as NAME_STUDENT predictions; matched against `token_str` during filtering.
subtitles = ['Dr', 'Mr', 'Ms', 'Mrs', 'By', 'Dr.', 'Dr .', 'Mr.', 'Mr .', 'Ms.', 'Ms .', 'Mrs.', 'Mrs .', 'By.', 'By .']

def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    """Find every non-overlapping occurrence of ``target`` inside ``document``.

    Args:
        target: token sequence to look for.
        document: token sequence to search in.

    Returns:
        One list of consecutive document indices per occurrence, in order of
        appearance. An empty ``target`` yields no spans.
    """
    pattern = list(target)
    tokens = list(document)
    width = len(pattern)
    if width == 0:
        return []

    spans = []
    start = 0
    last_start = len(tokens) - width
    while start <= last_start:
        # Every position is tried as a start, so a failed partial match can
        # no longer hide a match beginning at the token that broke it.
        if tokens[start:start + width] == pattern:
            spans.append(list(range(start, start + width)))
            start += width
        else:
            start += 1
    return spans

def regex_predictions(data):
    """Rule-based e-mail, phone-number and id-number predictions.

    Args:
        data: iterable of samples, each a mapping with ``document``,
            ``tokens`` (list of token strings) and ``full_text``.

    Returns:
        DataFrame with ``document``, ``token``, ``label`` and ``token_str``;
        e-mail rows first, then phone-number rows, then id-number rows.

    Fixes compared with the previous version:
        * id numbers are scanned for every document, not only for documents
          that also contain a phone number;
        * every distinct phone match is emitted, not only the last one.
    """
    email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
    # Raw string: the pattern is unchanged, but no longer relies on invalid
    # string escapes such as "\w" or "\d".
    id_num_regex = re.compile(r"[\w\.\:\-\_\|]*\d{6,}")
    emails = []
    phone_nums = []
    id_nums = []

    for _data in data:
        doc_id = _data["document"]
        tokens = _data["tokens"]

        for token_idx, token in enumerate(tokens):
            # e-mail: the whole token must be an address
            if email_regex.fullmatch(token) is not None:
                emails.append(
                    {"document": doc_id, "token": token_idx, "label": "B-EMAIL", "token_str": token}
                )
            # id number: a prefix match is enough (same as before)
            if id_num_regex.match(token) is not None:
                id_nums.append(
                    {"document": doc_id, "token": token_idx, "label": "B-ID_NUM", "token_str": token}
                )

        # phone number: match on the raw text, then locate the tokens.
        # dict.fromkeys de-duplicates repeated numbers while keeping order.
        for match in dict.fromkeys(phone_num_regex.findall(_data["full_text"])):
            target = [t.text for t in nlp.tokenizer(match)]
            for matched_span in find_span(target, tokens):
                for intermediate, token_idx in enumerate(matched_span):
                    prefix = "I" if intermediate else "B"
                    phone_nums.append(
                        {"document": doc_id, "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": tokens[token_idx]}
                    )

    return pd.DataFrame(emails + phone_nums + id_nums)


def filter_student_preds(row):
    """Decide whether a predicted row should be kept.

    Rows whose label is not a ``NAME_STUDENT`` label are always kept. A
    ``NAME_STUDENT`` row is kept only when its token is title-cased (or is a
    bare newline / hyphen) and contains no digit.

    Args:
        row: mapping-like object with ``label`` and ``token_str``.

    Returns:
        ``True`` to keep the row, ``False`` to drop it. Tokens that cannot be
        inspected (e.g. NaN) are dropped.
    """
    if "NAME_STUDENT" not in row["label"]:
        return True

    try:
        token = row["token_str"]
        name_like = token.istitle() or token in ("\n", "-")
        return bool(name_like and not any(ch.isdigit() for ch in token))
    except Exception:
        # Best effort: a non-string / missing token is simply not a name.
        # (A bare `except:` would also swallow KeyboardInterrupt.)
        return False

def postprocess_id_phone(df, DEBUG = False):
    """Re-label ID_NUM / PHONE_NUM predictions using simple format rules.

    The frame is modified in place and also returned. Its index must be
    0..len(df)-1, sorted by ``(document, token)``, because neighbouring rows
    are addressed by index.

    Single-token rules (first match wins):
        * 9+ digit number labelled PHONE_NUM           -> ``B-ID_NUM``
        * ``ddd.ddd.dddd`` or dotted number with "x"   -> ``B-PHONE_NUM``
        * ``ddd.dddd.dddd``                            -> ``B-ID_NUM``
        * ``d,d,d,d`` style comma groups               -> ``B-ID_NUM``
        * PHONE_NUM containing letters but no x/X ext. -> ``B-ID_NUM``

    Five-token rule: an isolated run of exactly five consecutive predicted
    tokens of one document is joined; ``ddd-dd-dddd`` becomes
    ``B-/I-ID_NUM`` and ``ddd-ddd-dddd`` becomes ``B-/I-PHONE_NUM``.

    Labels are written with ``DataFrame.at`` instead of chained assignment
    (``sub.label[i] = ...``), which is not guaranteed to update the frame.

    Args:
        df: predictions with ``document``, ``token``, ``label``, ``token_str``.
        DEBUG: print every label change as ``text old_label new_label``.

    Returns:
        The same frame, with ``row_id`` reset to the index.
    """
    sub = df

    digit_pat = r'^\d+$'
    phone_dot_pat = r'^\d{3}\.\d{3}\.\d{4}$'
    id_dot_pat = r'^\d{3}\.\d{4}\.\d{4}$'
    all_dot_pat = r'\d+\.\d+\.\d+'

    ssn_id_num_pat = r'^\d{3}-\d{2}-\d{4}$'
    phone_hyphen_pat = r'^\d{3}-\d{3}-\d{4}$'

    id_comma_pat = r'^\d{1,2}\,\d{1,2}\,\d{1,2},\d{1,2}$'
    alphabet_pattern = r'[a-zA-Z]'
    extension_markers = ("x", "X", "Ext", "ext", "EXT")

    def relabel(row, new_label, text):
        """Write ``new_label`` to ``row`` and report the change when DEBUG is on."""
        old_label = sub.at[row, "label"]
        sub.at[row, "label"] = new_label
        if DEBUG and old_label != new_label:
            print(text, old_label, new_label)

    n_rows = len(sub)
    for i in range(n_rows):
        token_text = sub.at[i, "token_str"]
        label = sub.at[i, "label"]
        # Non-string labels/tokens (e.g. NaN) are treated as "rule does not
        # apply", which is what the former bare `except: pass` blocks did.
        is_phone = isinstance(label, str) and "PHONE_NUM" in label

        # ---------------- single-token rules ----------------
        if isinstance(token_text, str):
            if re.match(digit_pat, token_text) and len(token_text) >= 9 and is_phone:
                relabel(i, "B-ID_NUM", token_text)
                continue

            if re.match(all_dot_pat, token_text):
                if re.match(phone_dot_pat, token_text) or "x" in token_text:
                    relabel(i, "B-PHONE_NUM", token_text)
                elif re.match(id_dot_pat, token_text):
                    relabel(i, "B-ID_NUM", token_text)
                continue

            if re.match(id_comma_pat, token_text):
                relabel(i, "B-ID_NUM", token_text)
                continue

            if (
                is_phone
                and re.search(alphabet_pattern, token_text)
                and not any(marker in token_text for marker in extension_markers)
            ):
                relabel(i, "B-ID_NUM", token_text)
                continue

        # ---------------- five-token rule ----------------
        if i + 4 >= n_rows:
            continue

        doc = sub.at[i, "document"]
        tok = sub.at[i, "token"]

        # Only an isolated run of exactly five tokens qualifies: row i must
        # start the run and the run must not continue into a sixth token.
        continues_previous = (
            i >= 1 and sub.at[i - 1, "document"] == doc and sub.at[i - 1, "token"] + 1 == tok
        )
        has_sixth_token = (
            i + 5 < n_rows and sub.at[i + 5, "document"] == doc and sub.at[i + 5, "token"] - 5 == tok
        )
        if continues_previous or has_sixth_token:
            continue

        window = range(i, i + 5)
        is_contiguous = all(
            sub.at[j, "document"] == doc and sub.at[j, "token"] == tok + (j - i) for j in window
        )
        if not is_contiguous:
            continue

        joined = "".join(sub.at[j, "token_str"] for j in window)
        if re.match(ssn_id_num_pat, joined):
            entity = "ID_NUM"
        elif re.match(phone_hyphen_pat, joined):
            entity = "PHONE_NUM"
        else:
            continue

        for j in window:
            relabel(j, ("B-" if j == i else "I-") + entity, joined)

    sub['row_id'] = sub.index
    return sub


def postprocess_street_address(df):
    """Fill small gaps inside predicted street-address spans.

    For every ``B-STREET_ADDRESS`` row the directly following
    ``I-STREET_ADDRESS`` rows of the same document (at most 12 tokens away
    from the start) form a span. When at most two token positions are missing
    inside that span, a row is appended for every position of the span; the
    appended rows use ``"\\n"`` as ``token_str`` and come after the original
    rows, so a later ``drop_duplicates(keep="first")`` keeps the originals.

    Args:
        df: predictions indexed 0..len(df)-1 with ``document``, ``token``
            and ``label`` columns.

    Returns:
        A new frame (original rows followed by appended rows) with a fresh
        index and ``row_id`` equal to that index.
    """
    sub = df
    labels, docs, toks = sub.label, sub.document, sub.token
    n_rows = len(sub)
    filled_rows = []

    for start in range(n_rows):
        if labels[start] != "B-STREET_ADDRESS":
            continue

        # Extend the span over the following I- rows.
        last = start
        while (
            last + 1 < n_rows
            and labels[last + 1] == "I-STREET_ADDRESS"
            and docs[last + 1] == docs[start]
            and toks[last + 1] - toks[start] <= 12
        ):
            last += 1

        # Number of token positions in the span that have no row of their own.
        missing = (toks[last] - toks[start]) - (last - start)
        if not (0 <= missing <= 2):
            continue

        first_token = toks[start]
        for position in range(first_token, toks[last] + 1):
            tag = "B-STREET_ADDRESS" if position == first_token else "I-STREET_ADDRESS"
            filled_rows.append([docs[start], position, tag, "\n", 0])

    extra = pd.DataFrame(filled_rows, columns = ["document", "token", "label", "token_str", "row_id"])
    sub = pd.concat([sub, extra]).reset_index(drop=True)
    sub['row_id'] = sub.index
    return sub

def remove_false_positives(df):
    """Drop implausible ID_NUM and URL_PERSONAL predictions.

    Removed rows:
        * ``B-ID_NUM`` whose token is longer than 25 characters;
        * ``B-URL_PERSONAL`` whose token is shorter than 10 characters.

    The input frame is no longer modified. The rule is evaluated with
    vectorised masks instead of the chained assignment ``sub.valid[i] = False``,
    which is not guaranteed to write into the frame.

    Args:
        df: predictions with ``label`` and ``token_str`` columns.

    Returns:
        A new frame with the surviving rows, a fresh index, a ``valid``
        column (all ``True``, kept for backward compatibility) and ``row_id``
        equal to the index.
    """
    # Missing tokens yield NaN lengths, which fail both comparisons and are kept.
    lengths = df["token_str"].map(len, na_action="ignore")
    id_too_long = (df["label"] == "B-ID_NUM") & (lengths > 25)
    url_too_short = (df["label"] == "B-URL_PERSONAL") & (lengths < 10)

    kept = df.loc[~(id_too_long | url_too_short)].assign(valid=True).reset_index(drop=True)
    kept['row_id'] = kept.index
    return kept

def postprocess_username(df):
    """Extend a ``B-USERNAME`` prediction over a following ``.``/``-`` and the next token.

    When the document token right after a ``B-USERNAME`` token is ``"."`` or
    ``"-"`` (and one more token exists), rows for the username token and the
    two following token positions are prepended, so that a later
    ``drop_duplicates(keep="first")`` prefers them.

    Fix: the added rows now address token positions ``token + 1`` and
    ``token + 2`` of the *same document*. Previously rows ``i + 1`` / ``i + 2``
    of the prediction frame were read, which are unrelated predictions and
    raised near the end of the frame.

    Relies on a module-level ``doc2tokens`` mapping (``str(document)`` ->
    list of tokens) that is not defined in the visible part of the notebook --
    TODO confirm it exists before re-enabling this step.

    Args:
        df: predictions indexed 0..len(df)-1 with ``document``, ``token``,
            ``label`` columns.

    Returns:
        A new frame (added rows first) with a fresh index and ``row_id``.
    """
    sub = df
    new_usernames = []
    for i in range(len(sub)):
        if sub.label[i] != "B-USERNAME":
            continue

        doc = sub.document[i]
        tok = sub.token[i]
        doc_tokens = doc2tokens[str(doc)]
        if tok + 2 < len(doc_tokens) and doc_tokens[tok + 1] in [".", "-"]:
            # "\n" is the placeholder token_str used for generated rows.
            new_usernames.append([doc, tok, "B-USERNAME", "\n", 0])
            new_usernames.append([doc, tok + 1, "I-USERNAME", "\n", 0])
            new_usernames.append([doc, tok + 2, "I-USERNAME", "\n", 0])

    sub = pd.concat([pd.DataFrame(new_usernames, columns = ["document", "token", "label", "token_str", "row_id"]), sub]).reset_index(drop=True)
    sub['row_id'] = sub.index

    return sub

def postprocess_url(df):
    """Merge a digit-free ``B-ID_NUM`` that directly follows a personal URL into that URL.

    If the row after a ``B-URL_PERSONAL`` row belongs to the same document,
    is the very next token, is labelled ``B-ID_NUM`` and contains no digit,
    it is re-labelled ``I-URL_PERSONAL``.

    The frame is modified in place (via ``DataFrame.at`` rather than the
    unreliable chained assignment ``sub.label[i+1] = ...``) and returned.

    Args:
        df: predictions indexed 0..len(df)-1 with ``document``, ``token``,
            ``label`` and ``token_str`` columns.

    Returns:
        The same frame with ``row_id`` reset to the index.
    """
    sub = df
    for i in range(len(sub) - 1):
        if sub.at[i, "label"] != "B-URL_PERSONAL":
            continue

        nxt = i + 1
        if (
            sub.at[nxt, "label"] == "B-ID_NUM"
            and sub.at[nxt, "document"] == sub.at[i, "document"]
            and sub.at[nxt, "token"] == sub.at[i, "token"] + 1
            and not any(ch.isdigit() for ch in sub.at[nxt, "token_str"])
        ):
            sub.at[nxt, "label"] = "I-URL_PERSONAL"

    sub['row_id'] = sub.index
    return sub

def all_postprocess(df):
    """Run the enabled rule-based post-processing steps in order.

    Steps: ``postprocess_id_phone`` -> ``postprocess_street_address`` ->
    ``remove_false_positives``. After each step the frame is de-duplicated on
    ``(document, token)`` (first row wins), sorted by those keys and
    re-indexed, because the steps address neighbouring rows by index.

    ``postprocess_username`` and ``postprocess_url`` are intentionally not
    part of the pipeline (they were disabled in the original code).
    """
    def _tidy(frame):
        frame = frame.drop_duplicates(subset=["document", "token"], keep="first")
        frame = frame.sort_values(by = ["document", "token"], ascending=True)
        return frame.reset_index(drop=True)

    enabled_steps = (
        postprocess_id_phone,
        postprocess_street_address,
        # postprocess_username,
        # postprocess_url,
        remove_false_positives,
    )
    for step in enabled_steps:
        df = _tidy(step(df))

    return df


In [60]:
datasets = ['original'] #  'original', 'mpware'
dataset = datasets[0]

# Tokenised validation datasets, one entry per fold, grouped by the
# tokenisation/experiment family they belong to.
exp052_ds = []
exp073_ds = []
exp000_ds = []
exp086_ds = []
exp069_ds = []

folds = [0,]

# The raw texts are read once instead of once per fold.
raw_df = pd.read_parquet("data/tune_ens_weights2/raw_data.parquet")
text_columns = ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels']

fold_frames = []
for fold in folds:
    # exp052 used to hard-code fold 0 in its path; it now follows `fold` like
    # the other experiments (identical for folds == [0]).
    exp052_ds.append(Dataset.from_parquet(f"data/tune_ens_weights2/exp052_{fold}_{dataset}_valid.parquet"))
    exp073_ds.append(Dataset.from_parquet(f"data/tune_ens_weights2/exp073_{fold}_{dataset}_valid.parquet"))
    exp000_ds.append(Dataset.from_parquet(f"data/tune_ens_weights2/bestteam_{fold}_{dataset}_valid.parquet"))
    exp086_ds.append(Dataset.from_parquet(f"data/tune_ens_weights2/exp086_{fold}_{dataset}_valid.parquet"))
    exp069_ds.append(Dataset.from_parquet(f"data/tune_ens_weights2/exp069_{fold}_{dataset}_valid.parquet"))

    # Validation documents of a fold are those with document % 4 == fold.
    fold_frames.append(raw_df.loc[raw_df['document'] % 4 == fold, text_columns])

# Raw validation texts, and the ground-truth frame (one row per labelled token).
valid_texts = pd.concat(fold_frames, ignore_index=True)
data = get_reference_df(valid_texts)

with open("data/tune_ens_weights2/exp031_config.json") as fh:
    exp031_config = json.load(fh)
with open("data/tune_ens_weights2/config.json") as fh:
    exp068_config = json.load(fh)
with open("data/tune_ens_weights2/bestteam_config.json") as fh:
    bestteam_config = json.load(fh)

In [61]:

# Models taking part in the ensemble. The order matters: `dfs` is built in
# this order and later cells address models by their position in this list.
models_names = [
    'exp052', 
    'exp069',
    'bestteam',
    'exp073',
    'exp075',
    'exp076',
    # 'exp086',
    'exp087',
]

# ("/kaggle/input/pii-detect-exp073-0", 7.712525886688761), # 963
#     ("/kaggle/input/pii-detect-exp075-0", 1.006986515616388), # 958
#     ("/kaggle/input/pii-detect-exp076-0", 1.2647161934501114), # 961
#     ("/kaggle/input/pii-detect-exp086/exp086/exp086_train_42_0", 3.73951430819615), # ?
#     ("/kaggle/input/pii-detect-exp087/exp087/exp087_train_42_0", 5.087217435807587), # ?
#     ("/kaggle/input/pii-detection-models/deberta-v3-large-fulltrain-02-20240215T081012Z-001/deberta-v3-large-fulltrain-02/", 7.9858516815807725), # ?

# weights =  {
#     'bestteam': 1,
#     'exp052': 7.837598953336008,
#     'exp069': 1.4675304620960052,
#     'exp073': 9.627042443386674,
#     'exp075': 1.8042350145106174,
#     'exp076': 6.931660327090153,
#     'exp086': 6.931660327090153,
#     'exp087': 6.931660327090153,
#     'thr': 10.814593658362838
#  } 

# weights =  {
#     'bestteam': 7.9858516815807725,
#     'exp052': 7.837598953336008,
#     'exp069': 1.4675304620960052,
#     'exp073': 7.712525886688761,
#     'exp075': 1.006986515616388,
#     'exp076': 1.2647161934501114,
#     'exp086': 3.73951430819615,
#     'exp087': 5.087217435807587,
#     'thr': 11.608788075932253
#  } # 9763 4 folds - 0.9810 0 fold - 0.969 lb
# thr = 11.608788075932253


# Per-model voting weights plus the voting threshold ('thr').
weights =  {
    'bestteam': 9.262006548700064,
    'exp052': 7.837598953336008,
    'exp069': 1.4675304620960052,
    'exp073': 9.558133102436543,
    'exp075': 5.384845708592142,
    'exp076': 5.0057680970954195,
    'exp086': 3.73951430819615,
    'exp087': 1.3949429027781592,
    'thr': 12.582708675708965
 } # 0.9702 4 folds - 0.9794 0 fold - 0.972 lb

# Minimum summed weight a (document, token, label) needs to be kept.
# Derived from `weights` so the two values cannot drift apart.
thr = weights['thr']

# Softmax probability the "O" class must reach to be accepted by parse_predictions.
threshold = 0.9

# Decode every model's saved logits into a prediction frame (one per model).
dfs = []
for model_name in tqdm(models_names):
    preds = []
    for i, fold in enumerate(folds):
        with open(f'data/tune_ens_weights2/{model_name}_{fold}_{dataset}_preds.npy', 'rb') as file:
            predictions = np.load(file)

        # NOTE: a former line here rebound `dataset` to a parquet path built
        # from an undefined name `model`; it raised on a fresh kernel and
        # corrupted the prediction path of the next iteration, so it is gone.

        # Each model must be decoded with the tokenised dataset and label
        # config it was trained with.
        if model_name in ['exp031', 'exp050', 'exp052', 'exp055', 'exp056']:
            df = parse_predictions(predictions, exp052_ds[i], exp031_config, threshold=threshold)
        elif model_name in ['bestteam']:
            df = parse_predictions(predictions, exp000_ds[i], bestteam_config, threshold=threshold)
        elif model_name in ['exp086']:
            df = parse_predictions(predictions, exp086_ds[i], exp068_config, threshold=threshold)
        elif model_name in ['exp069']:
            df = parse_predictions(predictions, exp069_ds[i], exp068_config, threshold=threshold)
        else:
            df = parse_predictions(predictions, exp073_ds[i], exp068_config, threshold=threshold)

        preds.append(df)

    df = pd.concat(preds)
    df['weight'] = weights[model_name]
    df['model_name'] = model_name
    dfs.append(df)

# Weighted vote: sum the weights of all models agreeing on a token label and
# keep the labels whose total reaches `thr`.
df = pd.concat(dfs)
df = df.groupby(['document', 'token', 'label', 'token_str']).agg({'weight': 'sum', 'probability': 'mean'}).reset_index()
df = df[df['weight'] >= thr]

# One label per token: the one with the largest summed weight.
df = df.sort_values(['document', 'token', 'weight'], ascending=[True, True, False])
df = df.drop_duplicates(['document', 'token'], keep='first')
# df = correct_label_after_new_line_label(df)

score = pii_fbeta_score(pred_df=df, gt_df=data) # 076 - 0.964
print(score)

# Error analysis: tag every token as TP / FP / FN / FNFP (wrong label).
df = df.merge(data, how="outer", on=["document", "token"], suffixes=("_pred", "_gt"))
df["cm"] = ""

df.loc[df.label_gt.isna(), "cm"] = "FP"
df.loc[df.label_pred.isna(), "cm"] = "FN"

df.loc[(df.label_gt.notna() & df.label_pred.notna()) & (df.label_gt != df.label_pred), "cm"] = "FNFP" # CHANGED

df.loc[
    (df.label_pred.notna()) & (df.label_gt.notna()) & (df.label_gt == df.label_pred), "cm"
] = "TP"

df[df['cm'] == 'FNFP']


  0%|          | 0/7 [00:00<?, ?it/s]

0.9790104947526237


Unnamed: 0,document,token,label_pred,token_str_pred,weight,probability,row_id,token_str_gt,label_gt,cm
432,9460,17,I-NAME_STUDENT,Jose,25.672516,0.750294,368041.0,Jose,B-NAME_STUDENT,FNFP
653,11900,241,I-NAME_STUDENT,Benjamin,22.81122,0.925231,994202.0,Benjamin,B-NAME_STUDENT,FNFP
861,19280,55,B-ID_NUM,30407059,39.910826,0.887224,570746.0,30407059,I-ID_NUM,FNFP


In [None]:
# 

In [None]:
# Score a single model on its own (no voting threshold: weight >= 0 keeps everything).
i = 1
print(models_names[i])

df = (
    dfs[i]
    .groupby(['document', 'token', 'label', 'token_str'])
    .agg({'weight': 'sum', 'probability': 'mean'})
    .reset_index()
)
df = df[df['weight'] >= 0]

# One label per token: highest weight first, then keep the first row.
df = (
    df.sort_values(['document', 'token', 'weight'], ascending=[True, True, False])
    .drop_duplicates(['document', 'token'], keep='first')
)

# df = correct_label_after_new_line_label(df)

score = pii_fbeta_score(pred_df=df, gt_df=data) # 076 - 0.964
print(score)

In [None]:
# Score one model on its own. The model is selected by name: the former
# `dfs[7]` is out of range for the seven models in `models_names`; with the
# eight-model list (exp086 enabled) index 7 was 'exp087'.
single_model = 'exp087'
df = dfs[models_names.index(single_model)]
df = df.groupby(['document', 'token', 'label', 'token_str']).agg({'weight': 'sum', 'probability': 'mean'}).reset_index()
df = df[df['weight'] >= 1]

# One label per token: highest weight first, then keep the first row.
df = df.sort_values(['document', 'token', 'weight'], ascending=[True, True, False])
df = df.drop_duplicates(['document', 'token'], keep='first')

# df = correct_label_after_new_line_label(df)

score = pii_fbeta_score(pred_df=df, gt_df=data) # 076 - 0.964
print(score)

In [None]:
# Alternative five-model weight set (plus voting threshold 'thr').
# NOTE: running this cell replaces the `weights` mapping defined earlier.
# The trailing number is presumably the validation score of this set -- TODO confirm.
weights =  {
    'exp052': 7.837598953336008,
    'exp069': 1.4675304620960052,
    'exp073': 9.627042443386674,
    'exp075': 1.8042350145106174,
    'exp076': 6.931660327090153,
    'thr': 10.814593658362838
 } # 0.956619


In [None]:
# Inference-side configuration: "O"-class probability threshold, voting
# threshold, and (model directory, voting weight) pairs. Commented-out
# entries are models excluded from this configuration. The trailing numbers
# look like per-model scores -- TODO confirm.
threshold = 0.9
voting_thr = 12.582708675708965

models_paths = [
#     ("/kaggle/input/pii-detect-exp052-0", 7.867644526268889), # 958
#     ("/kaggle/input/pii-detect-exp069-0", 1.5544158378090782), # 958
    ("/kaggle/input/pii-detect-exp073-0", 9.558133102436543), # 963
    ("/kaggle/input/pii-detect-exp075-0", 5.384845708592142), # 958
    ("/kaggle/input/pii-detect-exp076-0", 5.0057680970954195), # 961
    ("/kaggle/input/pii-detect-exp087/exp087/exp087_train_42_0", 1.3949429027781592), # ?
    ("/kaggle/input/pii-detection-models/deberta-v3-large-fulltrain-02-20240215T081012Z-001/deberta-v3-large-fulltrain-02/", 9.262006548700064), # ?
]

In [62]:
# Search interval [low, high] for each model's voting weight; read by
# `objective` when it calls trial.suggest_float for that model.
# Every name in `models_names` needs an entry here.
bounds = {
    'exp052': [1., 10],
    'exp069': [1., 10],
    'exp073': [9., 10],
    'exp075': [4.8, 5.8],
    'exp076': [4.5, 5.5],
    'exp087': [1., 1.9],
    'bestteam': [8.7, 9.7],
}

In [None]:
# Raw training samples (list of dicts); later passed to regex_predictions.
with open("data/raw/train.json", "r") as f:
    rdata = json.load(f)

In [None]:
# SettingWithCopyWarning
# NOTE(review): this silences *every* warning for the rest of the session,
# not only pandas' SettingWithCopyWarning -- other problems will be hidden too.
import warnings
warnings.filterwarnings("ignore")

In [63]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial):
    """Optuna objective: F-beta of the weighted vote for one set of weights.

    Suggests one weight per model (within ``bounds``) and a voting threshold
    ``thr`` in [8, 15], writes the weights into the global per-model frames
    ``dfs``, sums the weights per ``(document, token, label, token_str)``,
    keeps groups reaching ``thr`` and, per token, the label with the largest
    sum.

    Returns:
        The ``pii_fbeta_score`` of the resulting predictions against ``data``.
    """
    suggested = [
        trial.suggest_float(name, bounds[name][0], bounds[name][1])
        for name in models_names
    ]
    # weights = [trial.suggest_float(f'{models_names[i]}', 1, 10) for i in range(len(models_names))]
    for idx, value in enumerate(suggested):
        dfs[idx]['weight'] = value

    votes = (
        pd.concat(dfs)
        .groupby(['document', 'token', 'label', 'token_str'])['weight']
        .sum()
        .reset_index()
    )

    thr = trial.suggest_float('thr', 8, 15)
    votes = votes[votes['weight'] >= thr]

    votes = votes.sort_values(['document', 'token', 'weight'], ascending=[True, True, False])
    votes = votes.drop_duplicates(['document', 'token'], keep='first')
    return pii_fbeta_score(pred_df=votes, gt_df=data) # 076 - 0.964

# Fixed sampler seed so that re-running the notebook reproduces the same
# trials and therefore the same best parameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=1000, show_progress_bar=True)
study.best_params


  0%|          | 0/1000 [00:00<?, ?it/s]

{'exp052': 2.0696284831356238,
 'exp069': 3.0511467213118255,
 'bestteam': 8.7434000461426,
 'exp073': 9.942166951500617,
 'exp075': 4.958400473808574,
 'exp076': 4.595871408325738,
 'exp087': 1.8136772671200951,
 'thr': 14.720990487155191}

In [None]:
# {'exp052': 2.0696284831356238,
#  'exp069': 3.0511467213118255,
#  'bestteam': 8.7434000461426,
#  'exp073': 9.942166951500617,
#  'exp075': 4.958400473808574,
#  'exp076': 4.595871408325738,
#  'exp087': 1.8136772671200951,
#  'thr': 14.720990487155191} # 0.982039

In [None]:
# {'exp052': 2.813943002836644,
#  'exp069': 3.7811549154743695,
#  'bestteam': 9.262006548700064,
#  'exp073': 9.558133102436543,
#  'exp075': 5.384845708592142,
#  'exp076': 5.0057680970954195,
#  'exp087': 1.3949429027781592,
#  'thr': 14.640233169125622} # 0.981529

In [None]:
# {'bestteam': 7.9858516815807725,
#  'exp073': 7.712525886688761,
#  'exp075': 1.006986515616388,
#  'exp076': 1.2647161934501114,
#  'exp086': 3.73951430819615,
#  'exp087': 5.087217435807587,
#  'thr': 11.608788075932253} # 0.974591

In [None]:
# {'bestteam': 9.262006548700064,
#  'exp073': 9.558133102436543,
#  'exp075': 5.384845708592142,
#  'exp076': 5.0057680970954195,
#  'exp087': 1.3949429027781592,
#  'thr': 12.582708675708965} # 0.96866

In [None]:
# {'bestteam': 7.072471951042985,
#  'exp073': 8.727378556526062,
#  'exp075': 6.9400453833274085,
#  'exp076': 5.952426630708058,
#  'exp087': 4.306940313509255,
#  'thr': 12.904023649423127} # 0.968468

In [None]:
# {'bestteam': 8.48018610272431,
#  'exp073': 8.950022042462034,
#  'exp075': 4.478238782781635,
#  'exp076': 6.318450213262733,
#  'exp087': 1.3751125535991877,
#  'thr': 12.344683355092172} # 0.96866

In [None]:
# {'bestteam': 9.81682450753376,
#  'exp073': 7.622067460846842,
#  'exp075': 1.7757265883150035,
#  'exp076': 7.906246869241722,
#  'thr': 11.133467500824786}

In [None]:
# Six-model weight set and its voting threshold, applied by the next cell.
# NOTE(review): the keys differ from `models_names` ('exp052' and 'exp069'
# have no weight here, 'exp086' is not in `models_names`) -- confirm which
# model list this set is meant for.
weights = {'bestteam': 7.9858516815807725,
 'exp073': 7.712525886688761,
 'exp075': 1.006986515616388,
 'exp076': 1.2647161934501114,
 'exp086': 3.73951430819615,
 'exp087': 5.087217435807587,}

thr = 11.608788075932253

In [None]:
# Pair each model with its weight BY NAME. The previous positional pairing
# (weights.values()[i] -> dfs[i]) gave models the wrong weights whenever the
# dict order differed from `models_names`, and raised when the lengths
# differed. Models without an entry in `weights` take no part in the vote.
# `weights` is left untouched, so the cell can be re-run.
weighted_frames = [
    model_df.assign(weight=weights[name])
    for name, model_df in zip(models_names, dfs)
    if name in weights
]
df = pd.concat(weighted_frames)
df = df.groupby(['document', 'token', 'label', 'token_str'])['weight'].sum().reset_index()

# Keep labels whose summed weight reaches the voting threshold.
df = df[df['weight'] >= thr]

# One label per token: highest summed weight wins.
df = df.sort_values(['document', 'token', 'weight'], ascending=[True, True, False])
df = df.drop_duplicates(['document', 'token'], keep='first')

In [None]:
# 1) Drop NAME_STUDENT predictions that do not look like names.
keep_mask = df.apply(filter_student_preds, axis=1)
df = df[keep_mask].reset_index(drop=True)

# 2) One row per token; drop e-mails without "@" and honorifics tagged as names.
df = df.drop_duplicates(subset=["document", "token"], keep="first")
df = df[~((df["label"].str.contains("EMAIL")) & (~df["token_str"].str.contains("@")))]
df = df[~((df["label"].str.contains("NAME_STUDENT")) & (df["token_str"].isin(subtitles)))]

# The rule-based steps address neighbouring rows by index, so they need a
# frame sorted by (document, token) with a clean 0..n-1 index.
df = df.sort_values(by = ["document", "token"], ascending=True).reset_index(drop=True)

df = all_postprocess(df)

# 3) Regex predictions -- only for the validation documents. Running them on
# all of `rdata` adds predictions for documents that have no ground truth in
# `data`, and every one of those is scored as a false positive.
# NOTE(review): assumes `document` ids in train.json and in `valid_texts`
# share the same type/values -- confirm.
valid_docs = set(valid_texts["document"])
regex_df = regex_predictions([sample for sample in rdata if sample["document"] in valid_docs])
# display(df)
# display(regex_df)

# Model predictions come first, so they win over regex predictions on ties.
df = pd.concat([df, regex_df]).drop_duplicates(subset=["document", "token"], keep="first")
df = df.sort_values(by = ["document", "token"], ascending=True).reset_index(drop=True)

# df = correct_label_after_new_line_label(df)

In [None]:
# F-beta (beta=5 by default) of the current predictions `df` against the ground truth `data`.
score = pii_fbeta_score(pred_df=df, gt_df=data)

In [None]:
# Display the score computed in the previous cell.
score