## Libraries

In [None]:
!pip install datasets==2.15

In [None]:
# Importing necessary libraries
import nltk
import numpy as np # linear algebra
import lightgbm as lgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
import matplotlib.pyplot as plt
from sklearn.metrics import cohen_kappa_score
from lightgbm import log_evaluation, early_stopping
from sklearn.linear_model import SGDClassifier
import polars as pl
import torch
from IPython.display import display
from datasets import Dataset,DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModel

# Fetch the WordNet corpus for NLTK.
# NOTE(review): nothing in the cells visible here uses WordNet — presumably it
# is needed elsewhere; confirm before removing.
nltk.download('wordnet')

### Reading the Data

In [None]:
# Extra column added to both splits: "full_text" split on blank lines ("\n\n")
# into a list of strings, stored under the name "paragraph".
columns = [pl.col("full_text").str.split(by="\n\n").alias("paragraph")]
PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"

# Load the training and test CSVs and attach the "paragraph" column to each.
train_csv = PATH + "train.csv"
test_csv = PATH + "test.csv"
train = pl.read_csv(train_csv).with_columns(columns)
test = pl.read_csv(test_csv).with_columns(columns)

# For quick test runs only: uncomment to work on a 200-row sample of train.
#train = train.sample(200)

# Display the first row of the training set.
train.head(1)

### Preprocessing 

In [None]:
def removeHTML(x):
    """Return ``x`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed individually, and ``.``
    does not match a newline, so a ``<`` whose closing ``>`` is on another
    line is left untouched.
    """
    return re.sub(r'<.*?>', r'', x)
def dataPreprocessing(x):
    """Normalise one essay string.

    Steps, in order:
      1. lowercase the text;
      2. strip HTML tags (via ``removeHTML``);
      3. remove URLs;
      4. remove ``@``-prefixed tokens;
      5. remove digit runs (together with an apostrophe directly before them);
      6. collapse every whitespace run to a single space;
      7. collapse repeated periods / commas to a single one;
      8. trim leading and trailing whitespace.

    Args:
        x: The text to clean (``str``).

    Returns:
        The cleaned text (``str``).
    """
    # Convert words to lowercase
    x = x.lower()
    # Remove HTML
    x = removeHTML(x)
    # Delete URLs. This runs before the "@" and digit rules so they cannot
    # mangle a URL first, and it consumes the whole URL up to the next
    # whitespace: the previous pattern ("http\w+") stopped at ":" and left
    # "://host/path" residue in the text.
    x = re.sub(r"(?:https?://|www\.)\S+", '', x)
    # Delete strings starting with @
    x = re.sub(r"@\w+", '', x)
    # Delete numbers (first the ones preceded by an apostrophe, e.g. '90)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Replace consecutive whitespace characters with a single space
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive periods / commas with a single period / comma
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove whitespace at the beginning and end
    x = x.strip()
    return x

### Feature extraction

In [None]:
# Clean "full_text" in both splits with dataPreprocessing, then convert the
# polars frames (through pandas) into a Hugging Face DatasetDict.
clean_full_text = pl.col('full_text').map_elements(dataPreprocessing, return_dtype=str)
train = train.with_columns(clean_full_text)
test = test.with_columns(clean_full_text)

ds_train = Dataset.from_pandas(train.to_pandas())
ds_test = Dataset.from_pandas(test.to_pandas())

ds = DatasetDict(
    {
        "train": ds_train,
        "test": ds_test,
    }
)

# Report the name and number of rows of every split.
for name, dataset in ds.items():
    print(f"{name} size:", len(dataset))


In [None]:
# Load the tokenizer stored alongside the model checkpoint.
model_ckpt = "/kaggle/input/es-deberta-large-fold0"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize the "full_text" entries of ``batch`` with padding enabled.

    Returns whatever the tokenizer produces for the batch; no truncation
    argument is passed.
    """
    texts = batch["full_text"]
    return tokenizer(texts, padding=True)


# Tokenize every split, 64 rows per call.
dst = ds.map(
    tokenize,
    batched=True,
    batch_size=64,
)
print(dst)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU,
# then load the checkpoint and move it to that device.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# Return "input_ids" and "attention_mask" as torch tensors
# (needed for the model's forward pass).
dst.set_format(
    type="torch",
    columns=["input_ids", "attention_mask"],
)

# Drop the text columns that are not wanted from here on.
columns_to_remove = ['full_text', 'paragraph']
dstr = dst.remove_columns(column_names=columns_to_remove)

In [None]:
# extract the features
def extract_hidden_states(batch):
    """Run the model on one batch and return its first-position hidden state.

    Only the entries of ``batch`` whose key is listed in
    ``tokenizer.model_input_names`` are moved to ``device`` and passed to the
    model. The forward pass runs under ``torch.no_grad()``.

    Returns:
        A dict with one key, "hidden_state", holding the slice
        ``last_hidden_state[:, 0]`` (the [CLS] position) as a NumPy array.
    """
    accepted_keys = tokenizer.model_input_names
    inputs = {}
    for key, tensor in batch.items():
        if key in accepted_keys:
            # Place model inputs on the same device as the model.
            inputs[key] = tensor.to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state

    cls_vectors = last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

# Compute the vectors for every split, two rows per forward pass.
dstr_hidden = dstr.map(
    extract_hidden_states,
    batched=True,
    batch_size=2,
)

In [None]:
# Collect the training split's hidden-state vectors into a DataFrame
# and preview the first three rows.
train_hidden = dstr_hidden['train']['hidden_state']
features_df = pd.DataFrame(train_hidden.numpy())
features_df.head(3)

In [None]:
# Attach the training split's "essay_id" values as an extra column so each
# feature row carries its identifier, then preview the result.
train_essay_ids = dstr_hidden['train']['essay_id']
features_df['essay_id'] = train_essay_ids
features_df.head(3)

In [None]:
# Write the feature table (hidden-state columns plus "essay_id") to CSV in the
# working directory; the DataFrame index is not written.
features_df.to_csv("features_deberta.csv",index=False)