In [1]:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold, GroupKFold
import seaborn as sns
import warnings

warnings.simplefilter('ignore')

In [2]:
# Competition training essays.
train_df = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')

# PERSUADE corpus: keep only the columns used below and rename the id/score
# columns so they line up with the names used in train_df.
persuade_columns = ['essay_id_comp', 'full_text', 'prompt_name', 'holistic_essay_score']
persuade_renames = {'essay_id_comp': 'essay_id', 'holistic_essay_score': 'score'}
persuade_df = (
    pl.read_csv('/kaggle/input/persuade-data/persuade_2.0_human_scores_demo_id_github.csv')
    .select(persuade_columns)
    .rename(persuade_renames)
)

In [3]:
# Hand-curated essay ids that are removed from the PERSUADE corpus before matching.
# NOTE(review): the original trailing note on this list was "3.25E+11" --
# presumably an id that was mangled into scientific notation; confirm whether
# it also needs to be dropped.
p_drop_list = [
    'AA994A6CAF65',
    '288639E7060E',
    '98054C89C0F9',
    '3EB727C8562F',
    '9B23715DFB32',
    '0F1B167D414E',
    'CDD78907A391',
    '97C1CFD04E4B',
    '756D1A1C92B8',
    'BA13728424E7',
]

# NOTE(review): presumably essays that share the same opening "hook" -- confirm.
same_hook = [
    'E620DC04735F',
    'CDF90A57A956',
    '860CB3A4935E',
    '7742D58270C9',
    'E856D94C6B6B',
    '887ABF4584C2',
    '45A33DB5C7F7',
    '9741343661EE',
    'F1860C4149BF',
    '381E6F5B0009',
    '1A135C4E302F',
    '571F81ED5EC3',
    '6FCAA7BE2809',
    '80DD6E906303',
]

# NOTE(review): presumably a second group of same-hook essays -- confirm.
same_hook2 = [
    'DE50DF03FAD8',
    '6F101B276EB1',
    '2C44C3912AC9',
    '27E057AD3D7D',
    'F054050F442F',
    '3D8260196DC0',
    'D0D7E09A5578',
    '3453B0EC32D3',
    'E3ED7398948E',
    '4B1EE53F33D4',
    '518650734A2D',
    'FE72572BE11A',
]

# Everything to remove from the PERSUADE frame, in one list.
p_drop_list = p_drop_list + same_hook + same_hook2

# Essay ids to remove from the competition training frame.
# NOTE(review): 'e9be80d' was listed twice; the duplicate has been removed.
# If the second occurrence was a typo for a different id, add that id here.
train_drop_list = ['e9be80d', '6017fea']

In [4]:
# Remove the manually listed essay ids from each corpus.
keep_train_rows = ~pl.col('essay_id').is_in(train_drop_list)
keep_persuade_rows = ~pl.col('essay_id').is_in(p_drop_list)

train_df = train_df.filter(keep_train_rows)
persuade_df = persuade_df.filter(keep_persuade_rows)

In [5]:
# Training essays whose text also appears verbatim in the PERSUADE corpus.
# The join key is full_text only; the selected essay_id and score are the
# left-hand (train_df) columns, while prompt_name is contributed by persuade_df.
overlap_columns = ['essay_id', 'full_text', 'prompt_name', 'score']
overlapped = (
    train_df
    .join(persuade_df, on=['full_text'], how='inner')
    .select(overlap_columns)
)
overlapped

essay_id,full_text,prompt_name,score
str,str,str,i64
"""d5b6859""","""Cars have been…","""Car-free citie…",5
"""bb7cf0c""","""Have you ever …","""Car-free citie…",2
"""6f4d54b""","""cars have many…","""Car-free citie…",2
"""be705c4""","""There are many…","""Car-free citie…",4
"""7a43765""","""Wouldnt it be …","""Car-free citie…",4
…,…,…,…
"""01ec06b""","""Dear State Sen…","""Does the elect…",3
"""86ae7bb""","""Dear state sen…","""Does the elect…",3
"""a4801ba""","""I would like t…","""Does the elect…",2
"""864ba12""","""Dear State Sen…","""Does the elect…",4


In [6]:
# Training essays whose text was NOT matched in the PERSUADE corpus.
overlapping_texts = overlapped['full_text']
text_is_overlapping = pl.col('full_text').is_in(overlapping_texts)
non_overlapped = train_df.filter(~text_is_overlapping)
non_overlapped

essay_id,full_text,score
str,str,i64
"""000fe60""","""I am a scienti…",3
"""001ab80""","""People always …",4
"""001bdc0""","""We all heard a…",4
"""0033037""","""The posibilty …",2
"""0065bd6""","""Driverless car…",3
…,…,…
"""ffbd0b4""","""Do you think y…",2
"""ffcb061""","""Becoming a Sea…",3
"""ffcb264""","""Using technolo…",2
"""ffd378d""","""the story "" Th…",2


In [7]:
# Optional CSV exports of the two intermediate frames; commented out, so this cell writes nothing.
# non_overlapped.to_pandas().to_csv('nol_train_df.csv', index=False)
# overlapped.to_pandas().to_csv('train_df_OL_with_prompt.csv', index=False)

In [8]:
# Per-essay class predictions (columns: essay_id, class) loaded from an external dataset.
predicted_prompt_csv = '/kaggle/input/lal-aes2-infer-prompt-name/train_df_with_pred_prompt.csv'
predicted = pl.read_csv(predicted_prompt_csv)
predicted

essay_id,class
str,i64
"""000fe60""",4
"""001ab80""",5
"""001bdc0""",2
"""0033037""",3
"""0065bd6""",5
…,…
"""ffbd0b4""",2
"""ffcb061""",1
"""ffcb264""",3
"""ffd378d""",2


In [9]:
# Integer class id -> prompt name.
# Key 1 keeps its literal double quotes as part of the string
# (presumably to match the spelling used in the PERSUADE prompt_name column -- confirm).
labels_map = {
    0: 'Car-free cities',
    1: '"A Cowboy Who Rode the Waves"',
    2: 'Exploring Venus',
    3: 'Facial action coding system',
    4: 'The Face on Mars',
    5: 'Driverless cars',
    6: 'Does the electoral college work?'
}

# `Expr.map_dict` is deprecated and no longer exists in polars 1.x, so the
# prompt_name column is built with a plain dictionary lookup instead. This works
# on every polars version, keeps the row order, yields null for any class id
# missing from labels_map (same as the former `default=None`), and stays
# idempotent when the cell is re-run (with_columns overwrites prompt_name).
predicted = predicted.with_columns(
    pl.Series(
        'prompt_name',
        [labels_map.get(class_id) for class_id in predicted['class'].to_list()],
        dtype=pl.Utf8,
    )
)

In [10]:
# Attach the predicted prompt name to the training essays that had no PERSUADE match.
# Inner join: an essay without a row in `predicted` is dropped from the result.
prompt_columns = ['essay_id', 'full_text', 'prompt_name', 'score']
non_overlapped_train_with_prompt = (
    non_overlapped
    .join(predicted, on='essay_id', how='inner')
    .select(prompt_columns)
)
non_overlapped_train_with_prompt

essay_id,full_text,prompt_name,score
str,str,str,i64
"""000fe60""","""I am a scienti…","""The Face on Ma…",3
"""001ab80""","""People always …","""Driverless car…",4
"""001bdc0""","""We all heard a…","""Exploring Venu…",4
"""0033037""","""The posibilty …","""Facial action …",2
"""0065bd6""","""Driverless car…","""Driverless car…",3
…,…,…,…
"""ffbd0b4""","""Do you think y…","""Exploring Venu…",2
"""ffcb061""","""Becoming a Sea…","""""A Cowboy Who …",3
"""ffcb264""","""Using technolo…","""Facial action …",2
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2


In [11]:
# Stack both halves of the training set and flag where each row came from:
# kaggle_only=False -> text also found in PERSUADE (prompt name taken from there),
# kaggle_only=True  -> text only in the competition data (prompt name predicted).
matched_in_persuade = overlapped.with_columns(kaggle_only=False)
competition_only = non_overlapped_train_with_prompt.with_columns(kaggle_only=True)
train_df_with_prompt = pl.concat([matched_in_persuade, competition_only])
train_df_with_prompt

essay_id,full_text,prompt_name,score,kaggle_only
str,str,str,i64,bool
"""d5b6859""","""Cars have been…","""Car-free citie…",5,false
"""bb7cf0c""","""Have you ever …","""Car-free citie…",2,false
"""6f4d54b""","""cars have many…","""Car-free citie…",2,false
"""be705c4""","""There are many…","""Car-free citie…",4,false
"""7a43765""","""Wouldnt it be …","""Car-free citie…",4,false
…,…,…,…,…
"""ffbd0b4""","""Do you think y…","""Exploring Venu…",2,true
"""ffcb061""","""Becoming a Sea…","""""A Cowboy Who …",3,true
"""ffcb264""","""Using technolo…","""Facial action …",2,true
"""ffd378d""","""the story "" Th…","""Exploring Venu…",2,true


In [12]:
# PERSUADE essays that do not overlap with the training set.
# Overlap between the two corpora is defined by identical full_text, so the
# anti-join is keyed on full_text only. Also requiring prompt_name and score to
# match would let a PERSUADE essay slip through whenever its score differs from
# the training score or one of those key columns is null (null keys never match),
# leaving training text inside the supposedly non-overlapping set.
nol_persuade_df = persuade_df.join(train_df_with_prompt, on='full_text', how='anti')
nol_persuade_df

essay_id,full_text,prompt_name,score
str,str,str,i64
"""423A1CA112E2""","""Phones Modern…","""Phones and dri…",3
"""BC75783F96E3""","""This essay wil…","""Phones and dri…",4
"""74C8BC7417DE""","""Driving while …","""Phones and dri…",2
"""A8445CABFECE""","""Phones & Drivi…","""Phones and dri…",3
"""6B4F7A0165B9""","""Cell Phone Ope…","""Phones and dri…",4
…,…,…,…
"""18409261F5C2""","""80% of America…","""Seeking multip…",5
"""D46BCB48440A""","""When people as…","""Seeking multip…",4
"""0FB0700DAF44""","""During a group…","""Seeking multip…",4
"""D72CB1C11673""","""Making choices…","""Seeking multip…",4


In [13]:
# Write the three result frames as CSV, each sorted by essay_id and without the pandas index.
# NOTE(review): 'cleaned_persaude.csv' is misspelled, but the name is kept
# as-is because downstream consumers may already depend on it.
csv_exports = [
    (train_df_with_prompt, 'train_df_with_prompt.csv'),
    (persuade_df, 'cleaned_persaude.csv'),
    (nol_persuade_df, 'nol_cleaned_persuade.csv'),
]
for frame, csv_name in csv_exports:
    frame.to_pandas().sort_values('essay_id').to_csv(csv_name, index=False)