In [1]:
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score, recall_score, precision_score
import torch
from transformers import AutoTokenizer
from nltk.tokenize import sent_tokenize
import pickle
from scipy.special import softmax


In [2]:
train_dropped = ['e9be80d', '6017fea']
# dt_map_8 = {
#     'Unannotated': 0,
#     'Lead': 1,
#     'Position': 2,
#     'Claim': 3,
#     'Evidence': 4,
#     'Concluding Statement': 5,
#     'Counterclaim': 6,
#     'Rebuttal': 7,
#     'Repeated': -1
# }
dt_map = {
    'Lead': 0,
    'Position': 1,
    'Claim': 2,
    'Evidence': 3,
    'Concluding Statement': 4,
    'Counterclaim': 5,
    'Rebuttal': 6,
    'Repeated': -1
}

rev_dt_map = {v: k for k, v in dt_map.items()}

token_dt_map = {
    'Unannotated': '<Unannotated>',
    'Lead': '<Lead>',
    'Position': '<Position>',
    'Claim': '<Claim>',
    'Evidence': '<Evidence>',
    'Concluding Statement': '<Concluding>',
    'Counterclaim': '<Counterclaim>',
    'Rebuttal': '<Rebuttal>',
    'Repeated': '',
}

mlm_token_map = {
    128000: '',
    128001: '',
    128002: '<Unannotated>',
    128003: '<Lead>',
    128004: '<Position>',
    128005: '<Claim>',
    128006: '<Evidence>',
    128007: '<Concluding>',
    128008: '<Counterclaim>',
    128009: '<Rebuttal>',
    -1: '',
}

count_dt = {
    0: '<Lead>',
    1: '<Position>',
    2: '<Claim>',
    3: '<Evidence>',
    4: '<Concluding>',
    5: '<Counterclaim>',
    6: '<Rebuttal>',
}


selected_cols = ['essay_id', 'full_text', 'prompt_name', 'score', 'kaggle_only']

pat_1 = '<(Unannotated|Lead|Position|Claim|Evidence|Concluding|Counterclaim|Rebuttal)>\n\n'
pat_2 = '\n\n<${1}>'

mlm_pat_1 = '<(Unannotated|Lead|Position|Claim|Evidence|Concluding|Counterclaim|Rebuttal)>'
mlm_pat_2 = '[MASK]'

mlm_pat_3 = '\[(MASK)\]\[PARAGRAPH\]'
mlm_pat_4 = '[PARAGRAPH][${1}]'

# create discourse training dataset

In [3]:
tp_df = pl.read_csv('/kaggle/input/lal-aes2-create-prompt-data/train_df_with_prompt.csv')
t_df = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
p1_df = pl.read_csv('/kaggle/input/persuade-data/persuade_data/persuade_corpus_1.0.csv')
p2_df = pl.read_csv('/kaggle/input/persuade-data/persuade_data/persuade_2.0_human_scores_demo_id_github.csv')

In [4]:
t = p1_df.join(t_df, on='full_text').pivot(index='full_text', columns='discourse_type',values='discourse_type_num', aggregate_function='len').fill_null(0)
t = t.join(t_df, on='full_text').group_by('score').mean().sort('score').to_pandas().style.background_gradient()
t

Unnamed: 0,score,full_text,Lead,Unannotated,Position,Claim,Evidence,Concluding Statement,Counterclaim,Rebuttal,essay_id
0,1,,0.312169,0.515344,0.612698,0.675132,1.601058,0.344974,0.065608,0.022222,
1,2,,0.423491,1.100754,0.997037,2.054688,2.255388,0.734375,0.207974,0.11611,
2,3,,0.566529,1.370306,1.003039,3.051226,2.997395,0.91209,0.316258,0.215759,
3,4,,0.692425,1.588123,1.002665,3.778455,3.323944,0.966121,0.398934,0.323944,
4,5,,0.802145,1.589988,1.004768,4.070322,3.526818,0.990465,0.638856,0.551847,
5,6,,0.858156,1.460993,1.014184,4.234043,3.836879,1.014184,1.113475,1.056738,


In [5]:
ol_df = p1_df.join(t_df, on='full_text').join(p2_df.select(['essay_id_comp', 'full_text', 'prompt_name']), on='full_text')
ol_df = ol_df.select(['essay_id', 'full_text', 'discourse_start', 'discourse_end', 'discourse_text', 'discourse_type', 'score', 'prompt_name'])
ol_df = ol_df.filter(pl.col('discourse_type') != "Unannotated")
ol_df

essay_id,full_text,discourse_start,discourse_end,discourse_text,discourse_type,score,prompt_name
str,str,i64,i64,str,str,i64,str
"""d5b6859""","""Cars have been…",0,163,"""Cars have been…","""Lead""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",168,209,"""there are many…","""Position""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",217,250,"""improving our …","""Claim""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",251,268,"""improving safe…","""Claim""",5,"""Car-free citie…"
"""d5b6859""","""Cars have been…",273,294,"""decreasing pol…","""Claim""",5,"""Car-free citie…"
…,…,…,…,…,…,…,…
"""ac8fa95""","""This goes out …",204,439,"""Because whats …","""Evidence""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",440,822,"""Now that I've …","""Evidence""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",823,884,"""Can voters con…","""Claim""",2,"""Does the elect…"
"""ac8fa95""","""This goes out …",885,980,"""Do voters some…","""Claim""",2,"""Does the elect…"


## nko_sen contains all DT except unannotated

In [6]:
nko_sen = ol_df.with_columns(pl.col('discourse_text').str.strip_chars()).filter((pl.col('discourse_text') != '')).filter(pl.col('discourse_text') != '.')
nko_sen = nko_sen.with_columns(pl.col('discourse_type').replace(token_dt_map).alias('s_token'))
nko_sen

essay_id,full_text,discourse_start,discourse_end,discourse_text,discourse_type,score,prompt_name,s_token
str,str,i64,i64,str,str,i64,str,str
"""d5b6859""","""Cars have been…",0,163,"""Cars have been…","""Lead""",5,"""Car-free citie…","""<Lead>"""
"""d5b6859""","""Cars have been…",168,209,"""there are many…","""Position""",5,"""Car-free citie…","""<Position>"""
"""d5b6859""","""Cars have been…",217,250,"""improving our …","""Claim""",5,"""Car-free citie…","""<Claim>"""
"""d5b6859""","""Cars have been…",251,268,"""improving safe…","""Claim""",5,"""Car-free citie…","""<Claim>"""
"""d5b6859""","""Cars have been…",273,294,"""decreasing pol…","""Claim""",5,"""Car-free citie…","""<Claim>"""
…,…,…,…,…,…,…,…,…
"""ac8fa95""","""This goes out …",204,439,"""Because whats …","""Evidence""",2,"""Does the elect…","""<Evidence>"""
"""ac8fa95""","""This goes out …",440,822,"""Now that I've …","""Evidence""",2,"""Does the elect…","""<Evidence>"""
"""ac8fa95""","""This goes out …",823,884,"""Can voters con…","""Claim""",2,"""Does the elect…","""<Claim>"""
"""ac8fa95""","""This goes out …",885,980,"""Do voters some…","""Claim""",2,"""Does the elect…","""<Claim>"""


In [7]:
essay_id_list = []
tokened_text = []
for name, data in nko_sen.group_by(['essay_id', 'full_text'], maintain_order=True):
    essay_id = name[0]
    full_text = name[1]
    dt_len = data.shape[0]
    offset = 0
    n_offset = 0
    for i in range(dt_len):
        
        start_pos = data['discourse_start'][i]
        s_token = data['s_token'][i]
        n_offset = full_text[:start_pos + offset].count('\n')
        temp_offset = 0

        if (n_offset % 2 == 1):
            temp_offset += 1
            
        if full_text[start_pos + offset:][:2] == '\n\n':
            temp_offset += 2
            
        if (full_text[start_pos + offset:][0] == ' '):
            offset += 1
            
        if full_text[start_pos + offset:][0] == '.':
            offset += 1
            
        full_text = full_text[:start_pos + offset + temp_offset] + s_token + full_text[start_pos + offset + temp_offset:]
        offset += len(s_token)
            
    essay_id_list.append(essay_id)
    tokened_text.append(full_text)

In [8]:
nko_sen = tp_df.join(pl.DataFrame({'essay_id': essay_id_list, 'tokened_text': tokened_text}), on='essay_id', how='inner')
nko_sen

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,bool,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""<Lead>Many peo…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""Dear, State Se…"
"""0030e86""","""If I were to c…","""Does the elect…",4,false,"""<Position>If I…"
"""0033bf4""","""What is the Se…","""""A Cowboy Who …",3,false,"""<Lead>What is …"
"""0036253""","""The challenge …","""Exploring Venu…",2,false,"""The challenge …"
…,…,…,…,…,…
"""ffc11a8""","""You should joi…","""""A Cowboy Who …",3,false,"""<Position>You …"
"""ffc9095""","""Venus, an extr…","""Exploring Venu…",3,false,"""<Lead>Venus, a…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""<Lead>Technolo…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""<Lead>If you d…"


## Non-Overlapping Persuade2.0

In [9]:
nol_df = p1_df.join(t_df, on='full_text', how='anti').join(p2_df.select(['essay_id_comp', 'full_text', 'prompt_name', 'holistic_essay_score']), on='full_text')
nol_df = nol_df.rename({'essay_id_comp': 'essay_id', 'holistic_essay_score': 'score'})
nol_df = nol_df.select(['essay_id', 'full_text', 'discourse_start', 'discourse_end', 'discourse_text', 'discourse_type', 'score', 'prompt_name'])
nol_df = nol_df.filter(pl.col('discourse_type') != "Unannotated")
nol_df

essay_id,full_text,discourse_start,discourse_end,discourse_text,discourse_type,score,prompt_name
str,str,i64,i64,str,str,i64,str
"""423A1CA112E2""","""Phones Modern…",8,229,"""Modern humans …","""Lead""",3,"""Phones and dri…"
"""423A1CA112E2""","""Phones Modern…",230,312,"""They are some …","""Position""",3,"""Phones and dri…"
"""423A1CA112E2""","""Phones Modern…",313,400,"""Some certain a…","""Evidence""",3,"""Phones and dri…"
"""423A1CA112E2""","""Phones Modern…",401,756,"""When people ha…","""Evidence""",3,"""Phones and dri…"
"""423A1CA112E2""","""Phones Modern…",757,884,"""Driving is one…","""Claim""",3,"""Phones and dri…"
…,…,…,…,…,…,…,…
"""DF920E0A7337""","""Have you ever …",513,558,"""it informs you…","""Claim""",4,"""Seeking multip…"
"""DF920E0A7337""","""Have you ever …",610,1569,"""One opinion of…","""Evidence""",4,"""Seeking multip…"
"""DF920E0A7337""","""Have you ever …",1621,2395,"""One person can…","""Evidence""",4,"""Seeking multip…"
"""DF920E0A7337""","""Have you ever …",2452,3263,"""Having more th…","""Evidence""",4,"""Seeking multip…"


In [10]:
nol_sen_ = nol_df.with_columns(pl.col('discourse_text').str.strip_chars()).filter((pl.col('discourse_text') != '')).filter(pl.col('discourse_text') != '.')
nol_sen_ = nol_sen_.with_columns(pl.col('discourse_type').replace(token_dt_map).alias('s_token'))
nol_sen_

essay_id,full_text,discourse_start,discourse_end,discourse_text,discourse_type,score,prompt_name,s_token
str,str,i64,i64,str,str,i64,str,str
"""423A1CA112E2""","""Phones Modern…",8,229,"""Modern humans …","""Lead""",3,"""Phones and dri…","""<Lead>"""
"""423A1CA112E2""","""Phones Modern…",230,312,"""They are some …","""Position""",3,"""Phones and dri…","""<Position>"""
"""423A1CA112E2""","""Phones Modern…",313,400,"""Some certain a…","""Evidence""",3,"""Phones and dri…","""<Evidence>"""
"""423A1CA112E2""","""Phones Modern…",401,756,"""When people ha…","""Evidence""",3,"""Phones and dri…","""<Evidence>"""
"""423A1CA112E2""","""Phones Modern…",757,884,"""Driving is one…","""Claim""",3,"""Phones and dri…","""<Claim>"""
…,…,…,…,…,…,…,…,…
"""DF920E0A7337""","""Have you ever …",513,558,"""it informs you…","""Claim""",4,"""Seeking multip…","""<Claim>"""
"""DF920E0A7337""","""Have you ever …",610,1569,"""One opinion of…","""Evidence""",4,"""Seeking multip…","""<Evidence>"""
"""DF920E0A7337""","""Have you ever …",1621,2395,"""One person can…","""Evidence""",4,"""Seeking multip…","""<Evidence>"""
"""DF920E0A7337""","""Have you ever …",2452,3263,"""Having more th…","""Evidence""",4,"""Seeking multip…","""<Evidence>"""


In [11]:
essay_id_list = []
tokened_text = []
for name, data in nol_sen_.group_by(['essay_id', 'full_text'], maintain_order=True):
    essay_id = name[0]
    full_text = name[1]
    dt_len = data.shape[0]
    offset = 0
    n_offset = 0
    for i in range(dt_len):
        
        start_pos = data['discourse_start'][i]
        s_token = data['s_token'][i]
        n_offset = full_text[:start_pos + offset].count('\n')
        temp_offset = 0

        if (n_offset % 2 == 1):
            temp_offset += 1
            
        if full_text[start_pos + offset:][:2] == '\n\n':
            temp_offset += 2
            
        if (full_text[start_pos + offset:][0] == ' '):
            offset += 1
            
        if full_text[start_pos + offset:][0] == '.':
            offset += 1
            
        full_text = full_text[:start_pos + offset + temp_offset] + s_token + full_text[start_pos + offset + temp_offset:]
        offset += len(s_token)
            
    essay_id_list.append(essay_id)
    tokened_text.append(full_text)

In [12]:
nol_sen = nol_sen_.join(pl.DataFrame({'essay_id': essay_id_list, 'tokened_text': tokened_text}), on='essay_id')
nol_sen = nol_sen.with_columns(kaggle_only=None)
nol_sen = nol_sen.select(['essay_id', 'full_text', 'prompt_name', 'score', 'kaggle_only', 'tokened_text']).unique().sort('essay_id')
nol_sen

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,null,str
"""000A58BC095E""","""To Whom It May…","""Community serv…",2,,"""To Whom It May…"
"""000BAD50D026""","""Do you think s…","""Distance learn…",3,,"""<Lead>Do you t…"
"""000E6DE9E817""","""Dear: Principa…","""Grades for ext…",3,,"""Dear: Principa…"
"""0016926B079C""","""I think that s…","""Distance learn…",3,,"""<Position>I th…"
"""00203C45FC55""","""It is every st…","""Distance learn…",6,,"""<Lead>It is ev…"
…,…,…,…,…,…
"""FFE4B98E0B1E""","""Although teach…","""Summer project…",4,,"""<Counterclaim>…"
"""FFE91DA2A101""","""Students shoul…","""Distance learn…",4,,"""<Position>Stud…"
"""FFF1442D6698""","""Every student …","""Summer project…",6,,"""<Lead>Every st…"
"""FFF868E06176""","""Every summer b…","""Summer project…",3,,"""<Lead>Every su…"


In [13]:
nol_sen.write_csv('nol_combined_tokened.csv')

# post-processing predicted discourse type

In [14]:
dt_pred = pl.read_csv('/kaggle/input/lal-aes2-infer-discourse-ds2/dt_pred.csv')
pred = softmax(dt_pred.select([f'pred_class_{c}' for c in range(7)]).to_numpy(), axis=-1).argmax(-1)
dt_pred = pl.concat([dt_pred, pl.DataFrame({'preds': pred})], how='horizontal')
dt_pred

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6,preds
str,str,str,i64,bool,str,f64,f64,f64,f64,f64,f64,f64,i64
"""000fe60_0""","""I am a scienti…","""The Face on Ma…",3,true,"""I am a scienti…",4.4566317,1.634598,1.1193498,-0.803673,-1.611443,-2.436806,-2.911818,0
"""000fe60_1""","""I am a scienti…","""The Face on Ma…",3,true,""" I will be exp…",1.2987336,3.5842304,2.4188561,-0.358999,-1.653093,-2.031184,-2.989719,1
"""000fe60_2""","""I am a scienti…","""The Face on Ma…",3,true,""" By sharing my…",1.2966378,3.019608,1.8124192,-0.30381,-1.293672,-2.232578,-2.454375,1
"""000fe60_3""","""I am a scienti…","""The Face on Ma…",3,true,""" First off, h…",-0.510491,-0.285109,4.8061514,0.0678743,-2.514192,0.083378,-2.089296,2
"""000fe60_4""","""I am a scienti…","""The Face on Ma…",3,true,""" There is no p…",-1.368594,0.071298,3.123221,2.021754,-1.797388,-1.81876,-2.286569,2
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e_7""","""Venus is worth…","""Exploring Venu…",2,true,""" if a human is…",-3.155074,-2.27939,3.6671963,3.3792126,-1.769132,-2.329039,-0.179665,2
"""fffed3e_8""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028398,-1.810841,3.6980405,3.4158745,-1.974055,-1.822579,-0.801626,2
"""fffed3e_9""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.1227262,1.032075,-1.675161,0.400315,3.4489958,6
"""fffed3e_10""","""Venus is worth…","""Exploring Venu…",2,true,""" Now they are …",-3.581977,-0.506038,2.8079379,1.1595439,-0.863774,-0.936936,1.0462234,2


## Merge consecutive dicsourse type (only position and claim)

In [15]:
new_repeated = []
for name, data in dt_pred.group_by(['full_text'], maintain_order=True):
    dtype = data['preds'].to_list()
    hold = -1
    for i in range(1, data.shape[0]):
        if dtype[i] not in [1, 2]:
            hold = dtype[i]
            continue
        if dtype[i] == dtype[i - 1] or dtype[i] == hold:
            hold = dtype[i]
            dtype[i] = -1
    
    new_repeated.append(dtype)

In [16]:
dt_map

{'Lead': 0,
 'Position': 1,
 'Claim': 2,
 'Evidence': 3,
 'Concluding Statement': 4,
 'Counterclaim': 5,
 'Rebuttal': 6,
 'Repeated': -1}

In [17]:
dt_pred = dt_pred.with_columns(pl.Series(np.concatenate(new_repeated, axis=0)).alias('m_preds'))
dt_pred = dt_pred.with_columns(pl.col('m_preds').replace(rev_dt_map))
dt_pred = dt_pred.with_columns(pl.col('m_preds').replace(token_dt_map).alias('s_token'))
dt_pred = dt_pred.with_columns(pl.col('essay_id').str.replace_all('(_.*)', ''))
dt_pred = dt_pred.with_columns(pl.col('discourse_text').str.strip_chars(' ')).filter(pl.col('discourse_text') != '')
dt_pred

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6,preds,m_preds,s_token
str,str,str,i64,bool,str,f64,f64,f64,f64,f64,f64,f64,i64,str,str
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""I am a scienti…",4.4566317,1.634598,1.1193498,-0.803673,-1.611443,-2.436806,-2.911818,0,"""Lead""","""<Lead>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""I will be expl…",1.2987336,3.5842304,2.4188561,-0.358999,-1.653093,-2.031184,-2.989719,1,"""Position""","""<Position>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""By sharing my …",1.2966378,3.019608,1.8124192,-0.30381,-1.293672,-2.232578,-2.454375,1,"""Repeated""",""""""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,""" First off, h…",-0.510491,-0.285109,4.8061514,0.0678743,-2.514192,0.083378,-2.089296,2,"""Claim""","""<Claim>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""There is no pl…",-1.368594,0.071298,3.123221,2.021754,-1.797388,-1.81876,-2.286569,2,"""Repeated""",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""if a human is …",-3.155074,-2.27939,3.6671963,3.3792126,-1.769132,-2.329039,-0.179665,2,"""Claim""","""<Claim>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028398,-1.810841,3.6980405,3.4158745,-1.974055,-1.822579,-0.801626,2,"""Repeated""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.1227262,1.032075,-1.675161,0.400315,3.4489958,6,"""Rebuttal""","""<Rebuttal>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""Now they are t…",-3.581977,-0.506038,2.8079379,1.1595439,-0.863774,-0.936936,1.0462234,2,"""Claim""","""<Claim>"""


## Make sure there is only one Lead and Conclude

In [18]:
new_s_token = []

for name, data in dt_pred.group_by(['essay_id', 'full_text'], maintain_order=True):

    rev_s_token = ','.join(data['s_token'].reverse().to_list())
    t_len = data.group_by("s_token").len()
    
    for k in t_len.to_dicts():
        if k['s_token'] == '<Lead>':
            if k['len'] > 1:
                lead_occur = rev_s_token.count('<Lead>')
                rev_s_token = rev_s_token.replace('<Lead>', '', lead_occur - 1)

    new_s_token.extend(reversed(rev_s_token.split(',')))

In [19]:
dt_pred = dt_pred.with_columns(new_s_token=pl.Series(new_s_token))
dt_pred

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6,preds,m_preds,s_token,new_s_token
str,str,str,i64,bool,str,f64,f64,f64,f64,f64,f64,f64,i64,str,str,str
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""I am a scienti…",4.4566317,1.634598,1.1193498,-0.803673,-1.611443,-2.436806,-2.911818,0,"""Lead""","""<Lead>""","""<Lead>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""I will be expl…",1.2987336,3.5842304,2.4188561,-0.358999,-1.653093,-2.031184,-2.989719,1,"""Position""","""<Position>""","""<Position>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""By sharing my …",1.2966378,3.019608,1.8124192,-0.30381,-1.293672,-2.232578,-2.454375,1,"""Repeated""","""""",""""""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,""" First off, h…",-0.510491,-0.285109,4.8061514,0.0678743,-2.514192,0.083378,-2.089296,2,"""Claim""","""<Claim>""","""<Claim>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""There is no pl…",-1.368594,0.071298,3.123221,2.021754,-1.797388,-1.81876,-2.286569,2,"""Repeated""","""""",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""if a human is …",-3.155074,-2.27939,3.6671963,3.3792126,-1.769132,-2.329039,-0.179665,2,"""Claim""","""<Claim>""","""<Claim>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028398,-1.810841,3.6980405,3.4158745,-1.974055,-1.822579,-0.801626,2,"""Repeated""","""""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.1227262,1.032075,-1.675161,0.400315,3.4489958,6,"""Rebuttal""","""<Rebuttal>""","""<Rebuttal>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""Now they are t…",-3.581977,-0.506038,2.8079379,1.1595439,-0.863774,-0.936936,1.0462234,2,"""Claim""","""<Claim>""","""<Claim>"""


In [20]:
new_s_token = []

for name, data in dt_pred.group_by(['essay_id', 'full_text'], maintain_order=True):

    rev_s_token = ','.join(data['new_s_token'].to_list())
    t_len = data.group_by("new_s_token").len()
    
    for k in t_len.to_dicts():
        if k['new_s_token'] == '<Concluding>':
            if k['len'] > 1:
                conclude_occur = rev_s_token.count('<Concluding>')
                rev_s_token = rev_s_token.replace('<Concluding>', '', conclude_occur - 1)

    new_s_token.extend(rev_s_token.split(','))

In [21]:
dt_pred = dt_pred.with_columns(new_s_token=pl.Series(new_s_token))
dt_pred

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6,preds,m_preds,s_token,new_s_token
str,str,str,i64,bool,str,f64,f64,f64,f64,f64,f64,f64,i64,str,str,str
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""I am a scienti…",4.4566317,1.634598,1.1193498,-0.803673,-1.611443,-2.436806,-2.911818,0,"""Lead""","""<Lead>""","""<Lead>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""I will be expl…",1.2987336,3.5842304,2.4188561,-0.358999,-1.653093,-2.031184,-2.989719,1,"""Position""","""<Position>""","""<Position>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""By sharing my …",1.2966378,3.019608,1.8124192,-0.30381,-1.293672,-2.232578,-2.454375,1,"""Repeated""","""""",""""""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,""" First off, h…",-0.510491,-0.285109,4.8061514,0.0678743,-2.514192,0.083378,-2.089296,2,"""Claim""","""<Claim>""","""<Claim>"""
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""There is no pl…",-1.368594,0.071298,3.123221,2.021754,-1.797388,-1.81876,-2.286569,2,"""Repeated""","""""",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""if a human is …",-3.155074,-2.27939,3.6671963,3.3792126,-1.769132,-2.329039,-0.179665,2,"""Claim""","""<Claim>""","""<Claim>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028398,-1.810841,3.6980405,3.4158745,-1.974055,-1.822579,-0.801626,2,"""Repeated""","""""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.1227262,1.032075,-1.675161,0.400315,3.4489958,6,"""Rebuttal""","""<Rebuttal>""","""<Rebuttal>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""Now they are t…",-3.581977,-0.506038,2.8079379,1.1595439,-0.863774,-0.936936,1.0462234,2,"""Claim""","""<Claim>""","""<Claim>"""


In [22]:
ko_sen = dt_pred.with_columns(pl.concat_str([pl.col('new_s_token'), pl.col('discourse_text')], separator="").alias('tokened_text'))
ko_sen = ko_sen.group_by(selected_cols, maintain_order=True).agg([
    pl.col('tokened_text').str.concat(' ')
])
ko_sen

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,bool,str
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""<Lead>I am a s…"
"""001ab80""","""People always …","""Driverless car…",4,true,"""<Claim>People …"
"""001bdc0""","""We all heard a…","""Exploring Venu…",4,true,"""<Lead>We all h…"
"""0033037""","""The posibilty …","""Facial action …",2,true,"""<Position>The …"
"""0065bd6""","""Driverless car…","""Driverless car…",3,true,"""<Claim>Driverl…"
…,…,…,…,…,…
"""ffbd0b4""","""Do you think y…","""Exploring Venu…",2,true,"""<Lead>Do you t…"
"""ffcb061""","""Becoming a Sea…","""""A Cowboy Who …",3,true,"""<Position>Beco…"
"""ffcb264""","""Using technolo…","""Facial action …",2,true,"""<Claim>Using t…"
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true,"""<Lead>the stor…"


In [23]:
combined_token = pl.concat([ko_sen, nko_sen]).sort('essay_id')
combined_token = combined_token.with_columns(pl.col('tokened_text').str.replace_all(pat_1, pat_2))
combined_token # un-filtered

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,bool,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""<Lead>Many peo…"
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""<Lead>I am a s…"
"""001ab80""","""People always …","""Driverless car…",4,true,"""<Claim>People …"
"""001bdc0""","""We all heard a…","""Exploring Venu…",4,true,"""<Lead>We all h…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""Dear, State Se…"
…,…,…,…,…,…
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true,"""<Lead>the stor…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""<Lead>Technolo…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""<Lead>If you d…"
"""fffb49b""","""In ""The Challe…","""Exploring Venu…",1,false,"""<Position>In ""…"


In [24]:
tt = combined_token.with_columns([pl.col('tokened_text').str.count_matches(count_dt[i]).alias(f'{rev_dt_map[i]}' +'_count') for i in range(7)]).group_by(['score']).mean().sort('score')
tt.to_pandas().style.background_gradient()

Unnamed: 0,score,essay_id,full_text,prompt_name,kaggle_only,tokened_text,Lead_count,Position_count,Claim_count,Evidence_count,Concluding Statement_count,Counterclaim_count,Rebuttal_count
0,1,,,,0.245208,,0.320288,0.623802,1.186901,1.888179,0.34984,0.096645,0.076677
1,2,,,,0.214059,,0.403769,1.043405,2.206013,2.238408,0.683676,0.225916,0.142071
2,3,,,,0.266168,,0.522619,1.141765,3.233992,2.992673,0.818732,0.337369,0.25008
3,4,,,,0.330871,,0.62812,1.242741,4.153591,3.525981,0.845899,0.464595,0.382323
4,5,,,,0.135052,,0.775258,1.128866,4.386598,3.806186,0.942268,0.641237,0.580412
5,6,,,,0.096154,,0.833333,1.108974,4.480769,3.916667,0.980769,1.128205,1.108974


In [25]:
tt = combined_token.filter(pl.col('kaggle_only') == True).with_columns([pl.col('tokened_text').str.count_matches(count_dt[i]).alias(f'{rev_dt_map[i]}' +'_count') for i in range(7)]).group_by(['score']).mean().sort('score')
tt.to_pandas().style.background_gradient()

Unnamed: 0,score,essay_id,full_text,prompt_name,kaggle_only,tokened_text,Lead_count,Position_count,Claim_count,Evidence_count,Concluding Statement_count,Counterclaim_count,Rebuttal_count
0,1,,,,1.0,,0.345277,0.65798,2.762215,2.771987,0.364821,0.192182,0.2443
1,2,,,,1.0,,0.331355,1.21365,2.761622,2.176063,0.497527,0.29179,0.237389
2,3,,,,1.0,,0.401556,1.524237,3.737882,2.979653,0.561341,0.395572,0.344704
3,4,,,,1.0,,0.498075,1.728253,4.91224,3.934565,0.602771,0.597383,0.500385
4,5,,,,1.0,,0.603053,1.923664,6.412214,5.59542,0.633588,0.656489,0.763359
5,6,,,,1.0,,0.6,2.0,6.8,4.666667,0.666667,1.266667,1.6


In [26]:
dt_map

{'Lead': 0,
 'Position': 1,
 'Claim': 2,
 'Evidence': 3,
 'Concluding Statement': 4,
 'Counterclaim': 5,
 'Rebuttal': 6,
 'Repeated': -1}

In [27]:
combined_token.write_csv('train_combined_tokened.csv')

# ALL DT Predcited from D012

In [28]:
all_dt_pred = pl.read_parquet('/kaggle/input/lal-aes2-infer-discourse-ds/all_dt_pred.parquet')
all_pred = softmax(all_dt_pred.select([f'pred_class_{c}' for c in range(7)]).to_numpy(), axis=-1).argmax(-1)
all_dt_pred = pl.concat([all_dt_pred, pl.DataFrame({'preds': all_pred})], how='horizontal')
all_dt_pred

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6,preds
str,str,str,i64,bool,str,f32,f32,f32,f32,f32,f32,f32,i64
"""000d118_0""","""Many people ha…","""Car-free citie…",3,false,"""Many people ha…",1.348422,-2.03921,2.770183,2.257207,-2.593693,-1.103083,-2.095595,2
"""000d118_1""","""Many people ha…","""Car-free citie…",3,false,""" The thing the…",-2.072755,-2.387809,0.129768,3.764541,-1.821408,-0.223503,0.909058,3
"""000d118_2""","""Many people ha…","""Car-free citie…",3,false,""" Street parkig…",-1.69301,-1.473693,1.431878,3.76982,-1.916697,-2.047137,-1.346176,3
"""000d118_3""","""Many people ha…","""Car-free citie…",3,false,""" You probaly w…",-1.512254,-3.646935,-0.515993,6.05601,-1.528462,-1.853261,-0.455074,3
"""000d118_4""","""Many people ha…","""Car-free citie…",3,false,""" The vauban pe…",-2.831292,-1.682352,0.927727,3.703798,-1.5447,-1.42729,0.498887,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e_7""","""Venus is worth…","""Exploring Venu…",2,true,""" if a human is…",-3.155074,-2.27939,3.667197,3.379212,-1.769132,-2.329039,-0.179665,2
"""fffed3e_8""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028397,-1.810841,3.698041,3.415874,-1.974055,-1.822579,-0.801626,2
"""fffed3e_9""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.122726,1.032075,-1.675161,0.400315,3.448996,6
"""fffed3e_10""","""Venus is worth…","""Exploring Venu…",2,true,""" Now they are …",-3.581976,-0.506038,2.807937,1.159544,-0.863772,-0.936937,1.046222,2


In [29]:
new_repeated = []
for name, data in all_dt_pred.group_by(['full_text'], maintain_order=True):
    dtype = data['preds'].to_list()
    hold = -1
    for i in range(1, data.shape[0]):
#         if dtype[i] not in [1, 2]:
#             hold = dtype[i]
#             continue
        if dtype[i] == dtype[i - 1] or dtype[i] == hold:
            hold = dtype[i]
            dtype[i] = -1
    
    new_repeated.append(dtype)

In [30]:
all_dt_pred = all_dt_pred.with_columns(pl.Series(np.concatenate(new_repeated, axis=0)).alias('m_preds'))
all_dt_pred = all_dt_pred.with_columns(pl.col('m_preds').replace(rev_dt_map))
all_dt_pred = all_dt_pred.with_columns(pl.col('m_preds').replace(token_dt_map).alias('s_token'))
all_dt_pred = all_dt_pred.with_columns(pl.col('essay_id').str.replace_all('(_.*)', ''))
all_dt_pred = all_dt_pred.with_columns(pl.col('discourse_text').str.strip_chars(' ')).filter(pl.col('discourse_text') != '')
all_dt_pred

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6,preds,m_preds,s_token
str,str,str,i64,bool,str,f32,f32,f32,f32,f32,f32,f32,i64,str,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""Many people ha…",1.348422,-2.03921,2.770183,2.257207,-2.593693,-1.103083,-2.095595,2,"""Claim""","""<Claim>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""The thing they…",-2.072755,-2.387809,0.129768,3.764541,-1.821408,-0.223503,0.909058,3,"""Evidence""","""<Evidence>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""Street parkig …",-1.69301,-1.473693,1.431878,3.76982,-1.916697,-2.047137,-1.346176,3,"""Repeated""",""""""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""You probaly wo…",-1.512254,-3.646935,-0.515993,6.05601,-1.528462,-1.853261,-0.455074,3,"""Repeated""",""""""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""The vauban peo…",-2.831292,-1.682352,0.927727,3.703798,-1.5447,-1.42729,0.498887,3,"""Repeated""",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""if a human is …",-3.155074,-2.27939,3.667197,3.379212,-1.769132,-2.329039,-0.179665,2,"""Repeated""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028397,-1.810841,3.698041,3.415874,-1.974055,-1.822579,-0.801626,2,"""Repeated""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.122726,1.032075,-1.675161,0.400315,3.448996,6,"""Rebuttal""","""<Rebuttal>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""Now they are t…",-3.581976,-0.506038,2.807937,1.159544,-0.863772,-0.936937,1.046222,2,"""Repeated""",""""""


In [31]:
new_s_token = []

for name, data in all_dt_pred.group_by(['essay_id', 'full_text'], maintain_order=True):

    rev_s_token = ','.join(data['s_token'].reverse().to_list())
    t_len = data.group_by("s_token").len()
    
    for k in t_len.to_dicts():
        if k['s_token'] == '<Lead>':
            if k['len'] > 1:
                lead_occur = rev_s_token.count('<Lead>')
                rev_s_token = rev_s_token.replace('<Lead>', '', lead_occur - 1)

    new_s_token.extend(reversed(rev_s_token.split(',')))

In [32]:
all_dt_pred = all_dt_pred.with_columns(new_s_token=pl.Series(new_s_token))
all_dt_pred

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6,preds,m_preds,s_token,new_s_token
str,str,str,i64,bool,str,f32,f32,f32,f32,f32,f32,f32,i64,str,str,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""Many people ha…",1.348422,-2.03921,2.770183,2.257207,-2.593693,-1.103083,-2.095595,2,"""Claim""","""<Claim>""","""<Claim>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""The thing they…",-2.072755,-2.387809,0.129768,3.764541,-1.821408,-0.223503,0.909058,3,"""Evidence""","""<Evidence>""","""<Evidence>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""Street parkig …",-1.69301,-1.473693,1.431878,3.76982,-1.916697,-2.047137,-1.346176,3,"""Repeated""","""""",""""""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""You probaly wo…",-1.512254,-3.646935,-0.515993,6.05601,-1.528462,-1.853261,-0.455074,3,"""Repeated""","""""",""""""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""The vauban peo…",-2.831292,-1.682352,0.927727,3.703798,-1.5447,-1.42729,0.498887,3,"""Repeated""","""""",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""if a human is …",-3.155074,-2.27939,3.667197,3.379212,-1.769132,-2.329039,-0.179665,2,"""Repeated""","""""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028397,-1.810841,3.698041,3.415874,-1.974055,-1.822579,-0.801626,2,"""Repeated""","""""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.122726,1.032075,-1.675161,0.400315,3.448996,6,"""Rebuttal""","""<Rebuttal>""","""<Rebuttal>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""Now they are t…",-3.581976,-0.506038,2.807937,1.159544,-0.863772,-0.936937,1.046222,2,"""Repeated""","""""",""""""


In [33]:
new_s_token = []

for name, data in all_dt_pred.group_by(['essay_id', 'full_text'], maintain_order=True):

    rev_s_token = ','.join(data['new_s_token'].to_list())
    t_len = data.group_by("new_s_token").len()
    
    for k in t_len.to_dicts():
        if k['new_s_token'] == '<Concluding>':
            if k['len'] > 1:
                conclude_occur = rev_s_token.count('<Concluding>')
                rev_s_token = rev_s_token.replace('<Concluding>', '', conclude_occur - 1)

    new_s_token.extend(rev_s_token.split(','))

In [34]:
all_dt_pred = all_dt_pred.with_columns(new_s_token=pl.Series(new_s_token))
all_dt_pred

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6,preds,m_preds,s_token,new_s_token
str,str,str,i64,bool,str,f32,f32,f32,f32,f32,f32,f32,i64,str,str,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""Many people ha…",1.348422,-2.03921,2.770183,2.257207,-2.593693,-1.103083,-2.095595,2,"""Claim""","""<Claim>""","""<Claim>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""The thing they…",-2.072755,-2.387809,0.129768,3.764541,-1.821408,-0.223503,0.909058,3,"""Evidence""","""<Evidence>""","""<Evidence>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""Street parkig …",-1.69301,-1.473693,1.431878,3.76982,-1.916697,-2.047137,-1.346176,3,"""Repeated""","""""",""""""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""You probaly wo…",-1.512254,-3.646935,-0.515993,6.05601,-1.528462,-1.853261,-0.455074,3,"""Repeated""","""""",""""""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""The vauban peo…",-2.831292,-1.682352,0.927727,3.703798,-1.5447,-1.42729,0.498887,3,"""Repeated""","""""",""""""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""if a human is …",-3.155074,-2.27939,3.667197,3.379212,-1.769132,-2.329039,-0.179665,2,"""Repeated""","""""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028397,-1.810841,3.698041,3.415874,-1.974055,-1.822579,-0.801626,2,"""Repeated""","""""",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.122726,1.032075,-1.675161,0.400315,3.448996,6,"""Rebuttal""","""<Rebuttal>""","""<Rebuttal>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""Now they are t…",-3.581976,-0.506038,2.807937,1.159544,-0.863772,-0.936937,1.046222,2,"""Repeated""","""""",""""""


In [35]:
all_sen = all_dt_pred.with_columns(pl.concat_str([pl.col('new_s_token'), pl.col('discourse_text')], separator="").alias('tokened_text'))
all_sen = all_sen.group_by(selected_cols, maintain_order=True).agg([
    pl.col('tokened_text').str.concat(' ')
])
all_sen = all_sen.with_columns(pl.col('tokened_text').str.replace_all(pat_1, pat_2))
all_sen

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,bool,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""<Claim>Many pe…"
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""<Lead>I am a s…"
"""001ab80""","""People always …","""Driverless car…",4,true,"""<Claim>People …"
"""001bdc0""","""We all heard a…","""Exploring Venu…",4,true,"""<Lead>We all h…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""<Position>Dear…"
…,…,…,…,…,…
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true,"""<Lead>the stor…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""<Claim>Technol…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""<Claim>If you …"
"""fffb49b""","""In ""The Challe…","""Exploring Venu…",1,false,"""<Lead>In ""The …"


In [36]:
tt = all_sen.with_columns([pl.col('tokened_text').str.count_matches(count_dt[i]).alias(f'{rev_dt_map[i]}' +'_count') for i in range(7)]).group_by(['score']).mean().sort('score')
tt.to_pandas().style.background_gradient()

Unnamed: 0,score,essay_id,full_text,prompt_name,kaggle_only,tokened_text,Lead_count,Position_count,Claim_count,Evidence_count,Concluding Statement_count,Counterclaim_count,Rebuttal_count
0,1,,,,0.245208,,0.354633,0.653355,2.046326,2.177316,0.369808,0.172524,0.279553
1,2,,,,0.214059,,0.344061,1.236714,1.919754,1.785306,0.519162,0.267838,0.253652
2,3,,,,0.266168,,0.428003,1.577891,2.266805,2.351227,0.568493,0.38165,0.365881
3,4,,,,0.330871,,0.515537,1.789353,2.677534,3.046103,0.615894,0.519358,0.519103
4,5,,,,0.135052,,0.575258,2.063918,3.16701,3.950515,0.646392,0.718557,0.863918
5,6,,,,0.096154,,0.615385,2.083333,3.224359,4.532051,0.711538,1.032051,1.448718


In [37]:
all_sen.write_csv('train_combined_tokened_D012.csv')

# Discourse type predictions from MLM

In [38]:
mlm_preds = pl.read_parquet('/kaggle/input/lal-aes2-infer-mlm/mlm_pred.parquet')
mlm_preds

essay_id,full_text,prompt_name,score,kaggle_only,masked,deberta_labels_idx,preds
str,str,str,i64,bool,str,list[i64],list[i64]
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[MASK]Many peo…","[1, 10, … 596]","[128003, 128006, … 128004]"
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""[MASK]I am a s…","[1, 18, … 379]","[128003, 128004, … 128007]"
"""001ab80""","""People always …","""Driverless car…",4,true,"""[MASK]People a…","[1, 31, … 637]","[128003, 128004, … 128007]"
"""001bdc0""","""We all heard a…","""Exploring Venu…",4,true,"""[MASK]We all h…","[1, 72, … 537]","[128003, 128005, … 128007]"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""[MASK]Dear, St…","[1, 20, … 455]","[128003, 128006, … 128007]"
…,…,…,…,…,…,…,…
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true,"""[MASK]the stor…","[1, 29, … 185]","[128003, 128005, … 128007]"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""[MASK]Technolo…","[1, 14, … 682]","[128003, 128006, … 128007]"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""[MASK]If you d…","[1, 27, … 249]","[128003, 128004, … 128007]"
"""fffb49b""","""In ""The Challe…","""Exploring Venu…",1,false,"""[MASK]In ""The …","[1, 46, … 267]","[128004, 128006, … 128007]"


In [39]:
new_repeated = []
for name, data in mlm_preds.explode('preds').group_by(['essay_id', 'full_text'], maintain_order=True):
    dtype = data['preds'].to_list()
    hold = -1
    for i in range(1, data.shape[0]):
        if dtype[i] == dtype[i - 1] or dtype[i] == hold:
            hold = dtype[i]
            dtype[i] = -1
    
    new_repeated.append(dtype)

In [40]:
tmp = mlm_preds.explode('preds').with_columns(preds=pl.Series(np.concatenate(new_repeated, axis=0)))
tmp = tmp.with_columns(pl.col('preds').replace(mlm_token_map, return_dtype=pl.String))
tmp

essay_id,full_text,prompt_name,score,kaggle_only,masked,deberta_labels_idx,preds
str,str,str,i64,bool,str,list[i64],str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[MASK]Many peo…","[1, 10, … 596]","""<Lead>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[MASK]Many peo…","[1, 10, … 596]","""<Evidence>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[MASK]Many peo…","[1, 10, … 596]","""<Claim>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[MASK]Many peo…","[1, 10, … 596]","""<Evidence>"""
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[MASK]Many peo…","[1, 10, … 596]",""""""
…,…,…,…,…,…,…,…
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""[MASK]Venus is…","[1, 11, … 167]","""<Evidence>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""[MASK]Venus is…","[1, 11, … 167]",""""""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""[MASK]Venus is…","[1, 11, … 167]","""<Rebuttal>"""
"""fffed3e""","""Venus is worth…","""Exploring Venu…",2,true,"""[MASK]Venus is…","[1, 11, … 167]",""""""


In [41]:
# tmp = mlm_preds.explode('preds').with_columns(pl.col('preds').replace(mlm_token_map, return_dtype=pl.String))
# tmp

In [42]:
# new_s_token = []

# for name, data in tmp.group_by(['essay_id', 'full_text'], maintain_order=True):

#     rev_s_token = ','.join(data['preds'].reverse().to_list())
#     t_len = data.group_by("preds").len()
    
#     for k in t_len.to_dicts():
#         if k['preds'] == '<Concluding>':
#             if k['len'] > 1:
#                 conclude_occur = rev_s_token.count('<Concluding>')
#                 rev_s_token = rev_s_token.replace('<Concluding>', '', conclude_occur - 1)

#     new_s_token.extend(reversed(rev_s_token.split(',')))

In [43]:
# tmp = tmp.with_columns(preds=pl.Series(new_s_token))
# tmp

In [44]:
masked_list = []
for name, data in tmp.group_by(['essay_id', 'full_text', 'prompt_name', 'score', 'kaggle_only'], maintain_order=True):
    s_token = data['preds'].to_list()
    masked = data['masked'][0]
    
    for token in s_token:
        masked = masked.replace('[MASK]', token, 1)
    
    masked_list.append(masked)

In [45]:
mlm_preds = mlm_preds.with_columns(tokened_text=pl.Series(masked_list).str.replace_all('\[(MASK)\]', ''))
mlm_preds

essay_id,full_text,prompt_name,score,kaggle_only,masked,deberta_labels_idx,preds,tokened_text
str,str,str,i64,bool,str,list[i64],list[i64],str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""[MASK]Many peo…","[1, 10, … 596]","[128003, 128006, … 128004]","""<Lead>Many peo…"
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""[MASK]I am a s…","[1, 18, … 379]","[128003, 128004, … 128007]","""<Lead>I am a s…"
"""001ab80""","""People always …","""Driverless car…",4,true,"""[MASK]People a…","[1, 31, … 637]","[128003, 128004, … 128007]","""<Lead>People a…"
"""001bdc0""","""We all heard a…","""Exploring Venu…",4,true,"""[MASK]We all h…","[1, 72, … 537]","[128003, 128005, … 128007]","""<Lead>We all h…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""[MASK]Dear, St…","[1, 20, … 455]","[128003, 128006, … 128007]","""<Lead>Dear, St…"
…,…,…,…,…,…,…,…,…
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true,"""[MASK]the stor…","[1, 29, … 185]","[128003, 128005, … 128007]","""<Lead>the stor…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""[MASK]Technolo…","[1, 14, … 682]","[128003, 128006, … 128007]","""<Lead>Technolo…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""[MASK]If you d…","[1, 27, … 249]","[128003, 128004, … 128007]","""<Lead>If you d…"
"""fffb49b""","""In ""The Challe…","""Exploring Venu…",1,false,"""[MASK]In ""The …","[1, 46, … 267]","[128004, 128006, … 128007]","""<Position>In ""…"


In [46]:
mlm_preds = mlm_preds.select(['essay_id', 'full_text', 'prompt_name', 'score', 'kaggle_only', 'tokened_text'])
mlm_preds

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,bool,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""<Lead>Many peo…"
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""<Lead>I am a s…"
"""001ab80""","""People always …","""Driverless car…",4,true,"""<Lead>People a…"
"""001bdc0""","""We all heard a…","""Exploring Venu…",4,true,"""<Lead>We all h…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""<Lead>Dear, St…"
…,…,…,…,…,…
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true,"""<Lead>the stor…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""<Lead>Technolo…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""<Lead>If you d…"
"""fffb49b""","""In ""The Challe…","""Exploring Venu…",1,false,"""<Position>In ""…"


In [47]:
mlm_combined_token = mlm_preds.with_columns(pl.col('tokened_text').str.replace_all(pat_1, pat_2))
mlm_combined_token

essay_id,full_text,prompt_name,score,kaggle_only,tokened_text
str,str,str,i64,bool,str
"""000d118""","""Many people ha…","""Car-free citie…",3,false,"""<Lead>Many peo…"
"""000fe60""","""I am a scienti…","""The Face on Ma…",3,true,"""<Lead>I am a s…"
"""001ab80""","""People always …","""Driverless car…",4,true,"""<Lead>People a…"
"""001bdc0""","""We all heard a…","""Exploring Venu…",4,true,"""<Lead>We all h…"
"""002ba53""","""Dear, State Se…","""Does the elect…",3,false,"""<Lead>Dear, St…"
…,…,…,…,…,…
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true,"""<Lead>the stor…"
"""ffddf1f""","""Technology has…","""Facial action …",4,false,"""<Lead>Technolo…"
"""fff016d""","""If you don't l…","""""A Cowboy Who …",2,false,"""<Lead>If you d…"
"""fffb49b""","""In ""The Challe…","""Exploring Venu…",1,false,"""<Position>In ""…"


In [48]:
mlm_combined_token.write_csv('train_mlm_combined_tokened_M007.csv')

In [49]:
tt = mlm_combined_token.with_columns([pl.col('tokened_text').str.count_matches(count_dt[i]).alias(f'{rev_dt_map[i]}' +'_count') for i in range(7)]).group_by(['score']).mean().sort('score')
tt.drop(['full_text', 'prompt_name', 'masked', 'deberta_labels_idx', 'preds', 'tokened_text']).to_pandas().style.background_gradient()

Unnamed: 0,score,essay_id,kaggle_only,Lead_count,Position_count,Claim_count,Evidence_count,Concluding Statement_count,Counterclaim_count,Rebuttal_count
0,1,,0.245208,0.486422,0.890575,2.545527,2.36901,0.579872,0.255591,0.232428
1,2,,0.214059,0.424518,0.938598,2.536312,2.536947,0.862587,0.348931,0.22888
2,3,,0.266168,0.5223,0.962249,3.344059,3.116279,1.066263,0.560688,0.402517
3,4,,0.330871,0.645186,0.970708,4.214213,3.763627,1.200204,0.856342,0.658686
4,5,,0.135052,0.736082,0.93299,5.209278,4.55567,1.231959,1.3,1.062887
5,6,,0.096154,0.74359,0.987179,5.724359,4.878205,1.24359,2.032051,1.679487


In [50]:
# 3d13ded