In [1]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score, recall_score, precision_score
import torch
from transformers import AutoTokenizer
from nltk.tokenize import sent_tokenize

In [2]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration: id lists, label/token lookup tables and the
# regex / replacement pairs used with polars `str.replace_all`.
#
# All regex patterns are raw strings so that `\[`, `\]` and `\.` reach the
# regex engine verbatim without relying on Python's handling of invalid
# escape sequences (a SyntaxWarning on Python 3.12+). Where a pattern must
# contain real newline characters, they are appended as a normal literal so
# the runtime value is unchanged.
# ---------------------------------------------------------------------------

# Essay ids that are filtered out of `tp_df` when the `masked` frame is built.
train_dropped = ['e9be80d', '6017fea']

# Earlier 8-class variant of `dt_map` (included 'Unannotated'); kept for reference.
# dt_map_8 = {
#     'Unannotated': 0,
#     'Lead': 1,
#     'Position': 2,
#     'Claim': 3,
#     'Evidence': 4,
#     'Concluding Statement': 5,
#     'Counterclaim': 6,
#     'Rebuttal': 7,
#     'Repeated': -1
# }

# Discourse type name -> integer class id ('Repeated' is given the sentinel -1).
dt_map = {
    'Lead': 0,
    'Position': 1,
    'Claim': 2,
    'Evidence': 3,
    'Concluding Statement': 4,
    'Counterclaim': 5,
    'Rebuttal': 6,
    'Repeated': -1
}

# Token id -> discourse token text.
# NOTE(review): ids 128002-128009 presumably correspond to the discourse tokens
# added to the DeBERTa-v3 tokenizer later in this notebook (128000 is the id
# the notebook treats as [MASK]) - confirm if the tokenizer changes.
token_map = {
    128000: '',
    128001: '',
    128002: '[UNANNOTATED]',
    128003: '[LEAD]',
    128004: '[POSITION]',
    128005: '[CLAIM]',
    128006: '[EVIDENCE]',
    128007: '[CONCLUDE]',
    128008: '[COUNTER]',
    128009: '[REBUTTAL]',
    -1: '',
}


# Integer class id -> discourse type name (inverse of `dt_map`).
rev_dt_map = {v: k for k, v in dt_map.items()}

# Integer class id -> regex that matches (and captures) that class's token.
count_dt = {
    0: r'\[(LEAD)\]',
    1: r'\[(POSITION)\]',
    2: r'\[(CLAIM)\]',
    3: r'\[(EVIDENCE)\]',
    4: r'\[(CONCLUDE)\]',
    5: r'\[(COUNTER)\]',
    6: r'\[(REBUTTAL)\]',
}

# Discourse type name -> token text inserted into the essay ('Repeated' -> nothing).
token_dt_map = {
    'Unannotated': '[UNANNOTATED]',
    'Lead': '[LEAD]',
    'Position': '[POSITION]',
    'Claim': '[CLAIM]',
    'Evidence': '[EVIDENCE]',
    'Concluding Statement': '[CONCLUDE]',
    'Counterclaim': '[COUNTER]',
    'Rebuttal': '[REBUTTAL]',
    'Repeated': '',
}

# Essay-level columns of interest.
selected_cols = ['essay_id', 'full_text', 'prompt_name', 'score', 'kaggle_only']

# Pattern / replacement pairs. `${1}` in a replacement refers to the first
# capture group of the matching pattern.

# A discourse token directly followed by a blank line -> blank line, then the token.
pat_1 = r'\[(UNANNOTATED|LEAD|POSITION|CLAIM|EVIDENCE|CONCLUDE|COUNTER|REBUTTAL)\]' + '\n\n'
pat_2 = '\n\n[${1}]'

# '[SEP].' -> '.[SEP]'
pat_3 = r'\[(SEP)\]\.'
pat_4 = '.[${1}]'

# Any discourse token -> '[MASK]'.
mlm_pat_1 = r'\[(UNANNOTATED|LEAD|POSITION|CLAIM|EVIDENCE|CONCLUDE|COUNTER|REBUTTAL)\]'
mlm_pat_2 = '[MASK]'

# '[MASK][PARAGRAPH]' -> '[PARAGRAPH][MASK]'
mlm_pat_3 = r'\[(MASK)\]\[PARAGRAPH\]'
mlm_pat_4 = '[PARAGRAPH][${1}]'

# '[MASK]' directly followed by a blank line -> blank line, then '[MASK]'.
mlm_pat_3_2 = r'\[(MASK)\]' + '\n\n'
mlm_pat_4_2 = '\n\n[${1}]'

# Create the discourse training dataset

In [3]:
# Load the four input tables used throughout the notebook.
# tp_df: competition training essays with a prompt name attached.
tp_df = pl.read_csv(
    source='/kaggle/input/lal-aes2-create-prompt-data/train_df_with_prompt.csv',
)
# t_df: the raw competition training set.
t_df = pl.read_csv(
    source='/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv',
)
# p1_df: PERSUADE 1.0 corpus (one row per discourse element).
p1_df = pl.read_csv(
    source='/kaggle/input/persuade-data/persuade_data/persuade_corpus_1.0.csv',
)
# p2_df: PERSUADE 2.0 essay-level table.
p2_df = pl.read_csv(
    source='/kaggle/input/persuade-data/persuade_data/persuade_2.0_human_scores_demo_id_github.csv',
)

In [4]:
# Average number of discourse elements of each type per essay, grouped by
# essay score, rendered as a colour-graded pandas table.
t = (
    p1_df
    .join(t_df, on='full_text')
    # one row per essay, one column per discourse type, cell = element count
    .pivot(
        index='full_text',
        columns='discourse_type',
        values='discourse_type_num',
        aggregate_function='len',
    )
    .fill_null(0)
    # bring the score back in, then average the counts per score
    .join(t_df, on='full_text')
    .group_by('score')
    .mean()
    .sort('score')
    .to_pandas()
    .style
    .background_gradient()
)
t

Unnamed: 0,score,full_text,Lead,Unannotated,Position,Claim,Evidence,Concluding Statement,Counterclaim,Rebuttal,essay_id
0,1,,0.312169,0.515344,0.612698,0.675132,1.601058,0.344974,0.065608,0.022222,
1,2,,0.423491,1.100754,0.997037,2.054688,2.255388,0.734375,0.207974,0.11611,
2,3,,0.566529,1.370306,1.003039,3.051226,2.997395,0.91209,0.316258,0.215759,
3,4,,0.692425,1.588123,1.002665,3.778455,3.323944,0.966121,0.398934,0.323944,
4,5,,0.802145,1.589988,1.004768,4.070322,3.526818,0.990465,0.638856,0.551847,
5,6,,0.858156,1.460993,1.014184,4.234043,3.836879,1.014184,1.113475,1.056738,


In [5]:
# Discourse-element rows for essays present in all three tables
# (matched on the essay text), reduced to the columns needed downstream.
sen_discourse = (
    p1_df
    .join(t_df, on='full_text')
    .join(
        p2_df.select(['essay_id_comp', 'full_text', 'prompt_name']),
        on='full_text',
    )
    .select([
        'essay_id',
        'full_text',
        'discourse_start',
        'discourse_text',
        'discourse_type',
        'score',
        'prompt_name',
    ])
)
sen_discourse

essay_id,full_text,discourse_start,discourse_text,discourse_type,score,prompt_name
str,str,i64,str,str,i64,str
"""d5b6859""","""Cars have been…",0,"""Cars have been…","""Lead""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",164,"""ut, ""","""Unannotated""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",168,"""there are many…","""Position""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",210,"""uch as ""","""Unannotated""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",217,"""improving our …","""Claim""",5,"""Car-free citie…"
…,…,…,…,…,…,…
"""ac8fa95""","""This goes out …",204,"""Because whats …","""Evidence""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",440,"""Now that I've …","""Evidence""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",823,"""Can voters con…","""Claim""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",885,"""Do voters some…","""Claim""",2,"""Does the elect…"


In [6]:
# Give every discourse row a unique id of the form '<essay_id>_<k>', where k is
# the 0-based position of the row among the rows of the same essay.
#
# The ids are built in the frame's own row order, so they stay aligned with the
# rows even if the rows of one essay are not contiguous (ids built in
# `group_by` order would silently be attached to the wrong rows in that case).
# NOTE: this cell overwrites `essay_id` in place; re-running it appends a
# second suffix.
rows_seen_per_essay = {}
new_id = []
for essay_id in sen_discourse['essay_id'].to_list():
    position = rows_seen_per_essay.get(essay_id, 0)
    new_id.append(f'{essay_id}_{position}')
    rows_seen_per_essay[essay_id] = position + 1

sen_discourse = sen_discourse.with_columns(essay_id=pl.Series(new_id))

In [7]:
# Keep only rows whose discourse type is an actual annotation.
sen_discourse = sen_discourse.filter(
    pl.col('discourse_type').ne('Unannotated')
)
sen_discourse

essay_id,full_text,discourse_start,discourse_text,discourse_type,score,prompt_name
str,str,i64,str,str,i64,str
"""d5b6859_0""","""Cars have been…",0,"""Cars have been…","""Lead""",5,"""Car-free citie…"
"""d5b6859_2""","""Cars have been…",168,"""there are many…","""Position""",5,"""Car-free citie…"
"""d5b6859_4""","""Cars have been…",217,"""improving our …","""Claim""",5,"""Car-free citie…"
"""d5b6859_5""","""Cars have been…",251,"""improving safe…","""Claim""",5,"""Car-free citie…"
"""d5b6859_7""","""Cars have been…",273,"""decreasing pol…","""Claim""",5,"""Car-free citie…"
…,…,…,…,…,…,…
"""ac8fa95_2""","""This goes out …",204,"""Because whats …","""Evidence""",2,"""Does the elect…"
"""ac8fa95_3""","""This goes out …",440,"""Now that I've …","""Evidence""",2,"""Does the elect…"
"""ac8fa95_4""","""This goes out …",823,"""Can voters con…","""Claim""",2,"""Does the elect…"
"""ac8fa95_5""","""This goes out …",885,"""Do voters some…","""Claim""",2,"""Does the elect…"


In [8]:
# Persist the discourse-level rows (unannotated spans removed) to the working directory.
sen_discourse.write_csv('sen_discourse.csv')

In [9]:
# Same three-way join on the essay text as before, this time keeping
# `discourse_end` as well.
ol_df = (
    p1_df
    .join(t_df, on='full_text')
    .join(
        p2_df.select(['essay_id_comp', 'full_text', 'prompt_name']),
        on='full_text',
    )
    .select([
        'essay_id',
        'full_text',
        'discourse_start',
        'discourse_end',
        'discourse_text',
        'discourse_type',
        'score',
        'prompt_name',
    ])
)
# ol_df_8 keeps every discourse type (including 'Unannotated') ...
ol_df_8 = ol_df.clone()
# ... while ol_df drops the 'Unannotated' rows.
ol_df = ol_df.filter(pl.col('discourse_type').ne("Unannotated"))
ol_df

essay_id,full_text,discourse_start,discourse_end,discourse_text,discourse_type,score,prompt_name
str,str,i64,i64,str,str,i64,str
"""d5b6859""","""Cars have been…",0,163,"""Cars have been…","""Lead""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",168,209,"""there are many…","""Position""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",217,250,"""improving our …","""Claim""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",251,268,"""improving safe…","""Claim""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",273,294,"""decreasing pol…","""Claim""",5,"""Car-free citie…"
…,…,…,…,…,…,…,…
"""ac8fa95""","""This goes out …",204,439,"""Because whats …","""Evidence""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",440,822,"""Now that I've …","""Evidence""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",823,884,"""Can voters con…","""Claim""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",885,980,"""Do voters some…","""Claim""",2,"""Does the elect…"


## `nko_sen`: essays tagged with every discourse type (DT) except `Unannotated`

In [10]:
# Trim each discourse text, discard rows that are empty or a lone '.',
# and attach the token string that marks the row's discourse type.
nko_sen = (
    ol_df
    .with_columns(pl.col('discourse_text').str.strip_chars())
    .filter(
        (pl.col('discourse_text') != '') & (pl.col('discourse_text') != '.')
    )
    .with_columns(
        s_token=pl.col('discourse_type').replace(token_dt_map)
    )
)
nko_sen

essay_id,full_text,discourse_start,discourse_end,discourse_text,discourse_type,score,prompt_name,s_token
str,str,i64,i64,str,str,i64,str,str
"""d5b6859""","""Cars have been…",0,163,"""Cars have been…","""Lead""",5,"""Car-free citie…","""[LEAD]"""
"""d5b6859""","""Cars have been…",168,209,"""there are many…","""Position""",5,"""Car-free citie…","""[POSITION]"""
"""d5b6859""","""Cars have been…",217,250,"""improving our …","""Claim""",5,"""Car-free citie…","""[CLAIM]"""
"""d5b6859""","""Cars have been…",251,268,"""improving safe…","""Claim""",5,"""Car-free citie…","""[CLAIM]"""
"""d5b6859""","""Cars have been…",273,294,"""decreasing pol…","""Claim""",5,"""Car-free citie…","""[CLAIM]"""
…,…,…,…,…,…,…,…,…
"""ac8fa95""","""This goes out …",204,439,"""Because whats …","""Evidence""",2,"""Does the elect…","""[EVIDENCE]"""
"""ac8fa95""","""This goes out …",440,822,"""Now that I've …","""Evidence""",2,"""Does the elect…","""[EVIDENCE]"""
"""ac8fa95""","""This goes out …",823,884,"""Can voters con…","""Claim""",2,"""Does the elect…","""[CLAIM]"""
"""ac8fa95""","""This goes out …",885,980,"""Do voters some…","""Claim""",2,"""Does the elect…","""[CLAIM]"""


In [11]:
# Insert each discourse token into its essay's full text, producing one
# token-annotated text per essay (`essay_id_list[k]` <-> `tokened_text[k]`).
#
# All position checks use bounded slices / `str.startswith(..., pos)`, so a
# position at or past the end of the text no longer raises IndexError and the
# tail of the essay is not copied for every check.
essay_id_list = []
tokened_text = []
for name, data in nko_sen.group_by(['essay_id', 'full_text'], maintain_order=True):
    essay_id, full_text = name[0], name[1]

    # Running shift applied to the original `discourse_start` values: it grows
    # by the length of every token already inserted, plus one for each leading
    # space / period skipped below.
    # NOTE(review): this assumes the rows of an essay arrive in ascending
    # `discourse_start` order - confirm against the source data.
    offset = 0
    starts = data['discourse_start'].to_list()
    tokens = data['s_token'].to_list()
    for start_pos, s_token in zip(starts, tokens):
        pos = start_pos + offset

        # One-off shift for this insertion only (never added to `offset`).
        temp_offset = 0

        # An odd number of newlines before `pos` presumably means `pos` sits
        # between the two newlines of a paragraph break; step past the second.
        if full_text.count('\n', 0, pos) % 2 == 1:
            temp_offset += 1

        # Annotation starts right on a paragraph break: put the token after it.
        if full_text.startswith('\n\n', pos):
            temp_offset += 2

        # Skip one leading space ...
        if full_text[pos:pos + 1] == ' ':
            offset += 1

        # ... and then one leading period (checked at the updated position).
        if full_text[start_pos + offset:start_pos + offset + 1] == '.':
            offset += 1

        insert_at = start_pos + offset + temp_offset
        full_text = full_text[:insert_at] + s_token + full_text[insert_at:]
        offset += len(s_token)

    essay_id_list.append(essay_id)
    tokened_text.append(full_text)

In [12]:
# Attach the token-annotated text to the essay-level table; the inner join
# keeps only essays for which a tokened text was produced.
nko_sen = tp_df.join(
    pl.DataFrame({
        'essay_id': essay_id_list,
        'tokened_text': tokened_text,
    }),
    on='essay_id',
    how='inner',
)
nko_sen

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,bool,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[LEAD]Many peo…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""Dear, State Se…"
"""0030e86""","""If I were to c…","""Does the elect…",4,false,"""[POSITION]If I…"
"""0033bf4""","""What is the Se…","""""A Cowboy Who …",3,false,"""[LEAD]What is …"
"""0036253""","""The challenge …","""Exploring Venu…",2,false,"""The challenge …"
…,…,…,…,…,…
"""ffc11a8""","""You should joi…","""""A Cowboy Who …",3,false,"""[POSITION]You …"
"""ffc9095""","""Venus, an extr…","""Exploring Venu…",3,false,"""[LEAD]Venus, a…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""[LEAD]Technolo…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""[LEAD]If you d…"


## `nko_sen_8`: essays tagged with all eight discourse types (DT), including `Unannotated`

In [13]:
# Same preparation as for the 7-type frame, but starting from the frame that
# still contains the 'Unannotated' rows: trim, drop empty / lone-'.' texts,
# and attach the discourse token string.
nko_sen_8 = (
    ol_df_8
    .with_columns(pl.col('discourse_text').str.strip_chars())
    .filter(
        (pl.col('discourse_text') != '') & (pl.col('discourse_text') != '.')
    )
    .with_columns(
        s_token=pl.col('discourse_type').replace(token_dt_map)
    )
)
nko_sen_8

essay_id,full_text,discourse_start,discourse_end,discourse_text,discourse_type,score,prompt_name,s_token
str,str,i64,i64,str,str,i64,str,str
"""d5b6859""","""Cars have been…",0,163,"""Cars have been…","""Lead""",5,"""Car-free citie…","""[LEAD]"""
"""d5b6859""","""Cars have been…",164,167,"""ut,""","""Unannotated""",5,"""Car-free citie…","""[UNANNOTATED]"""
"""d5b6859""","""Cars have been…",168,209,"""there are many…","""Position""",5,"""Car-free citie…","""[POSITION]"""
"""d5b6859""","""Cars have been…",210,216,"""uch as""","""Unannotated""",5,"""Car-free citie…","""[UNANNOTATED]"""
"""d5b6859""","""Cars have been…",217,250,"""improving our …","""Claim""",5,"""Car-free citie…","""[CLAIM]"""
…,…,…,…,…,…,…,…,…
"""ac8fa95""","""This goes out …",204,439,"""Because whats …","""Evidence""",2,"""Does the elect…","""[EVIDENCE]"""
"""ac8fa95""","""This goes out …",440,822,"""Now that I've …","""Evidence""",2,"""Does the elect…","""[EVIDENCE]"""
"""ac8fa95""","""This goes out …",823,884,"""Can voters con…","""Claim""",2,"""Does the elect…","""[CLAIM]"""
"""ac8fa95""","""This goes out …",885,980,"""Do voters some…","""Claim""",2,"""Does the elect…","""[CLAIM]"""


In [14]:
# Insert each discourse token (all eight types, including [UNANNOTATED]) into
# its essay's full text, producing one token-annotated text per essay
# (`essay_id_list[k]` <-> `tokened_text[k]`).
#
# All position checks use bounded slices / `str.startswith(..., pos)`, so a
# position at or past the end of the text no longer raises IndexError and the
# tail of the essay is not copied for every check.
essay_id_list = []
tokened_text = []
for name, data in nko_sen_8.group_by(['essay_id', 'full_text'], maintain_order=True):
    essay_id, full_text = name[0], name[1]

    # Running shift applied to the original `discourse_start` values: it grows
    # by the length of every token already inserted, plus one for each leading
    # space / period skipped below.
    # NOTE(review): this assumes the rows of an essay arrive in ascending
    # `discourse_start` order - confirm against the source data.
    offset = 0
    starts = data['discourse_start'].to_list()
    tokens = data['s_token'].to_list()
    for start_pos, s_token in zip(starts, tokens):
        pos = start_pos + offset

        # One-off shift for this insertion only (never added to `offset`).
        temp_offset = 0

        # An odd number of newlines before `pos` presumably means `pos` sits
        # between the two newlines of a paragraph break; step past the second.
        if full_text.count('\n', 0, pos) % 2 == 1:
            temp_offset += 1

        # Annotation starts right on a paragraph break: put the token after it.
        if full_text.startswith('\n\n', pos):
            temp_offset += 2

        # Skip one leading space ...
        if full_text[pos:pos + 1] == ' ':
            offset += 1

        # ... and then one leading period (checked at the updated position).
        if full_text[start_pos + offset:start_pos + offset + 1] == '.':
            offset += 1

        insert_at = start_pos + offset + temp_offset
        full_text = full_text[:insert_at] + s_token + full_text[insert_at:]
        offset += len(s_token)

    essay_id_list.append(essay_id)
    tokened_text.append(full_text)

In [15]:
# Attach the 8-type token-annotated text to the essay-level table; the inner
# join keeps only essays for which a tokened text was produced.
nko_sen_8 = tp_df.join(
    pl.DataFrame({
        'essay_id': essay_id_list,
        'tokened_text': tokened_text,
    }),
    on='essay_id',
    how='inner',
)
nko_sen_8

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,bool,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[LEAD]Many peo…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""[UNANNOTATED]D…"
"""0030e86""","""If I were to c…","""Does the elect…",4,false,"""[POSITION]If I…"
"""0033bf4""","""What is the Se…","""""A Cowboy Who …",3,false,"""[LEAD]What is …"
"""0036253""","""The challenge …","""Exploring Venu…",2,false,"""[UNANNOTATED]T…"
…,…,…,…,…,…
"""ffc11a8""","""You should joi…","""""A Cowboy Who …",3,false,"""[POSITION]You …"
"""ffc9095""","""Venus, an extr…","""Exploring Venu…",3,false,"""[LEAD]Venus, a…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""[LEAD]Technolo…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""[LEAD]If you d…"


## `mlm_bare`: masked text and labels for MLM predictions

In [16]:
# Add a `masked` column: the tokened text with every discourse token
# replaced by the literal '[MASK]'.
mlm_bare = nko_sen_8.with_columns(
    masked=pl.col('tokened_text').str.replace_all(mlm_pat_1, mlm_pat_2)
)
mlm_bare

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text,masked
str,str,str,i64,bool,str,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[LEAD]Many peo…","""[MASK]Many peo…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""[UNANNOTATED]D…","""[MASK]Dear, St…"
"""0030e86""","""If I were to c…","""Does the elect…",4,false,"""[POSITION]If I…","""[MASK]If I wer…"
"""0033bf4""","""What is the Se…","""""A Cowboy Who …",3,false,"""[LEAD]What is …","""[MASK]What is …"
"""0036253""","""The challenge …","""Exploring Venu…",2,false,"""[UNANNOTATED]T…","""[MASK]The chal…"
…,…,…,…,…,…,…
"""ffc11a8""","""You should joi…","""""A Cowboy Who …",3,false,"""[POSITION]You …","""[MASK]You shou…"
"""ffc9095""","""Venus, an extr…","""Exploring Venu…",3,false,"""[LEAD]Venus, a…","""[MASK]Venus, a…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""[LEAD]Technolo…","""[MASK]Technolo…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""[LEAD]If you d…","""[MASK]If you d…"


In [17]:
# Discourse tokens to register with the tokenizer. The order matters: added
# tokens receive consecutive ids in the order they are listed.
deberta_v3_tokenizer_addition_tokens = [
    '[UNANNOTATED]',
    '[LEAD]',
    '[POSITION]',
    '[CLAIM]',
    '[EVIDENCE]',
    '[CONCLUDE]',
    '[COUNTER]',
    '[REBUTTAL]',
]

deberta_v3_tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')

# Register '[PARAGRAPH]' first, followed by the discourse tokens. The call's
# return value is left as the cell's last expression so it is displayed.
deberta_v3_tokenizer.add_special_tokens(
    {
        'additional_special_tokens': [
            '[PARAGRAPH]',
            *deberta_v3_tokenizer_addition_tokens,
        ]
    }
)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



9

In [18]:
# Fixed sequence length used for every tokenizer call in this cell; longer
# texts are truncated, so masks beyond this length are silently dropped.
_DEBERTA_V3_MAX_LENGTH = 4096


def _deberta_v3_input_ids(txt):
    """Tokenize one text with `deberta_v3_tokenizer` and return its input ids.

    The text is padded / truncated to `_DEBERTA_V3_MAX_LENGTH` tokens and the
    batch dimension is squeezed away, so the result is a 1-D tensor for a
    single input string.
    """
    tokened = deberta_v3_tokenizer(
        txt,
        add_special_tokens=True,
        max_length=_DEBERTA_V3_MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors='pt',
    )
    return tokened['input_ids'].squeeze()


def get_deberta_v3_tokenized_labels_idx(txt):
    """Return the token positions of every mask token in `txt`.

    Args:
        txt: a single text containing '[MASK]' placeholders.

    Returns:
        A list of int positions (into the padded/truncated token sequence)
        where the token id equals the tokenizer's mask token id. The id is
        read from `deberta_v3_tokenizer.mask_token_id` rather than being
        hard-coded, so the function stays correct if the tokenizer changes.
    """
    input_ids = _deberta_v3_input_ids(txt)
    mask_positions = torch.argwhere(input_ids == deberta_v3_tokenizer.mask_token_id)
    return torch.flatten(mask_positions).tolist()


def get_deberta_v3_tokenized_labels(txt, idx):
    """Return the token ids found at positions `idx` in the tokenized `txt`.

    Args:
        txt: a single text (here: the essay with its discourse tokens intact).
        idx: list of token positions, typically produced by
            `get_deberta_v3_tokenized_labels_idx` on the masked version of
            the same text.

    Returns:
        A list of int token ids, one per position in `idx`.
    """
    labels = _deberta_v3_input_ids(txt)[idx]
    return labels.tolist()

In [19]:
# 1) Positions of the mask tokens in the tokenized `masked` text.
mlm_bare = mlm_bare.with_columns(
    pl.col('masked')
    .map_elements(
        get_deberta_v3_tokenized_labels_idx,
        return_dtype=pl.List(pl.Int64),
    )
    .alias('deberta_labels_idx')
)
# 2) Token ids found at those positions in the tokenized `tokened_text`,
#    i.e. the discourse-token label for each mask.
mlm_bare = mlm_bare.with_columns(
    pl.struct('tokened_text', 'deberta_labels_idx')
    .map_elements(
        lambda row: get_deberta_v3_tokenized_labels(
            row['tokened_text'], row['deberta_labels_idx']
        ),
        return_dtype=pl.List(pl.Int64),
    )
    .alias('deberta_labels')
)
mlm_bare

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text,masked,deberta_labels_idx,deberta_labels
str,str,str,i64,bool,str,str,list[i64],list[i64]
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[LEAD]Many peo…","""[MASK]Many peo…","[1, 29, … 549]","[128003, 128002, … 128006]"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""[UNANNOTATED]D…","""[MASK]Dear, St…","[1, 6, … 415]","[128002, 128004, … 128009]"
"""0030e86""","""If I were to c…","""Does the elect…",4,false,"""[POSITION]If I…","""[MASK]If I wer…","[1, 23, … 424]","[128004, 128005, … 128007]"
"""0033bf4""","""What is the Se…","""""A Cowboy Who …",3,false,"""[LEAD]What is …","""[MASK]What is …","[1, 62, … 332]","[128003, 128004, … 128007]"
"""0036253""","""The challenge …","""Exploring Venu…",2,false,"""[UNANNOTATED]T…","""[MASK]The chal…","[1, 7, … 338]","[128002, 128003, … 128007]"
…,…,…,…,…,…,…,…,…
"""ffc11a8""","""You should joi…","""""A Cowboy Who …",3,false,"""[POSITION]You …","""[MASK]You shou…","[1, 9, … 321]","[128004, 128002, … 128007]"
"""ffc9095""","""Venus, an extr…","""Exploring Venu…",3,false,"""[LEAD]Venus, a…","""[MASK]Venus, a…","[1, 80, … 374]","[128003, 128004, … 128007]"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""[LEAD]Technolo…","""[MASK]Technolo…","[1, 66, … 611]","[128003, 128004, … 128007]"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""[LEAD]If you d…","""[MASK]If you d…","[1, 27, … 177]","[128003, 128004, … 128007]"


## `masked` (formerly `masked_ko`): sentence-masked essays for inference

In [20]:
# Previous approach (kaggle-only essays, NLTK sentence splitting), kept for reference:
# masked_ko = tp_df.filter(~pl.col('essay_id').is_in(train_dropped)).filter(pl.col('kaggle_only') == True)
# masked_ko = masked_ko.with_columns(pl.col('full_text').str.replace_all('\n\n', '[PARAGRAPH]').alias('masked'))
# masked_ko = masked_ko.with_columns(pl.col('masked').map_elements(sent_tokenize, return_dtype=pl.List(pl.String))).explode('masked')
################################################################ replace with dt steps'

# Build, for every essay not in `train_dropped`, a text in which each
# '.'-terminated chunk is prefixed with '[MASK]', then locate the mask tokens.
masked = (
    tp_df
    .filter(~pl.col('essay_id').is_in(train_dropped))
    # split after every '.', keeping the '.' with the chunk it ends
    .with_columns(
        pl.col('full_text').str.split('.', inclusive=True).alias('masked')
    )
    .explode('masked')
    # drop trailing whitespace, then chunks that are empty or a lone '.'
    .with_columns(pl.col('masked').str.strip_chars_end())
    .filter(pl.col('masked') != "")
    .filter(pl.col('masked') != '.')
    # prefix every remaining chunk with the mask placeholder
    .with_columns(
        pl.concat_str([pl.lit('[MASK]'), pl.col('masked')]).alias('masked')
    )
    # stitch the chunks of each essay back together, separated by a space
    .group_by(
        ['essay_id', 'full_text', 'prompt_name', 'score', 'kaggle_only'],
        maintain_order=True,
    )
    .agg([
        pl.col('masked').str.concat(' ')
    ])
    # a mask that landed in front of a blank line is moved behind it
    .with_columns(
        pl.col('masked').str.replace_all(mlm_pat_3_2, mlm_pat_4_2)
    )
    # token positions of the masks in the tokenized text
    .with_columns(
        pl.col('masked')
        .map_elements(
            get_deberta_v3_tokenized_labels_idx,
            return_dtype=pl.List(pl.Int64),
        )
        .alias('deberta_labels_idx')
    )
)
masked

essay_id,full_text,prompt_name,score,kaggle_only,masked,deberta_labels_idx
str,str,str,i64,bool,str,list[i64]
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[MASK]Many peo…","[1, 10, … 596]"
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""[MASK]I am a s…","[1, 18, … 379]"
"""001ab80""","""People always …","""Driverless car…",4,true,"""[MASK]People a…","[1, 31, … 637]"
"""001bdc0""","""We all heard a…","""Exploring Venu…",4,true,"""[MASK]We all h…","[1, 72, … 537]"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""[MASK]Dear, St…","[1, 20, … 455]"
…,…,…,…,…,…,…
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true,"""[MASK]the stor…","[1, 29, … 185]"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""[MASK]Technolo…","[1, 14, … 682]"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""[MASK]If you d…","[1, 27, … 249]"
"""fffb49b""","""In ""The Challe…","""Exploring Venu…",1,false,"""[MASK]In ""The …","[1, 46, … 267]"


In [21]:
# Persist both result frames to the working directory:
# - mlm_bare: tokened / masked essays with mask positions and label ids
# - masked:   sentence-masked essays with mask positions
mlm_bare.write_parquet('train_mlm_bare.parquet')
masked.write_parquet('masked.parquet')