## Feature from Feedback Competition

In [None]:
import sys
# Make the omegaconf copy under ../input importable.
# NOTE(review): presumably an offline Kaggle dataset, since it is added via sys.path instead of being installed — confirm.
sys.path.append("../input/omegaconf")

In [None]:
# basics
import os
import sys
import json
from copy import deepcopy
from itertools import chain
from omegaconf import OmegaConf

# Processing
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

# ipython
from IPython.display import display
from IPython.core.debugger import set_trace

In [None]:
import os

# Working folders: ./datasets is passed as --save_path to the dataset builder below,
# ./predictions is where the prediction CSVs are read from later in the notebook.
for out_dir in ("./datasets", "./predictions"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
%%writefile eff_inference.yaml

model:
    backbone_path: /kaggle/input/debertav3xsmall/deberta-v3-xsmall
    feature_extractor:
        num_layers: 4
    max_length: 448
    target_names:
        - cohesion
        - syntax
        - vocabulary
        - phraseology
        - grammar
        - conventions
    len_tokenizer: ???
    loss_fn: mse

infer_params:
    input_path: ../input/learning-agency-lab-automated-essay-scoring-2/test.csv
    infer_bs: 4
    agg_fn: mean

In [None]:
%%time
# Run the feedback-utility dataset builder for rank 0 using eff_inference.yaml.
# NOTE(review): `{rank}` in --save_path is presumably resolved from --rank by the script — confirm.
!python /kaggle/input/feedback-utility/create_datasets_main.py \
--config_path eff_inference.yaml \
--save_path ./datasets/task_dataset_{rank} \
--rank=0

In [None]:
%%time
# Run the feedback-utility dataset builder for rank 1 using eff_inference.yaml.
# NOTE(review): `{rank}` in --save_path is presumably resolved from --rank by the script — confirm.
!python /kaggle/input/feedback-utility/create_datasets_main.py \
--config_path eff_inference.yaml \
--save_path ./datasets/task_dataset_{rank} \
--rank=1

In [None]:
# Run the feedback-utility dataset builder for rank 2 using eff_inference.yaml.
# NOTE(review): `{rank}` in --save_path is presumably resolved from --rank by the script — confirm.
!python /kaggle/input/feedback-utility/create_datasets_main.py \
--config_path eff_inference.yaml \
--save_path ./datasets/task_dataset_{rank} \
--rank=2

In [None]:
# Run the feedback-utility dataset builder for rank 3 using eff_inference.yaml.
# NOTE(review): `{rank}` in --save_path is presumably resolved from --rank by the script — confirm.
!python /kaggle/input/feedback-utility/create_datasets_main.py \
--config_path eff_inference.yaml \
--save_path ./datasets/task_dataset_{rank} \
--rank=3

In [None]:
# Run the feedback-utility run.sh script.
# NOTE(review): presumably this runs inference and writes ./predictions/eff_preds_{0..3}.csv, which the next cell reads — confirm.
!sh /kaggle/input/feedback-utility/run.sh

In [None]:
# Load the four per-rank prediction files (eff_preds_0.csv .. eff_preds_3.csv).
eff_df_0, eff_df_1, eff_df_2, eff_df_3 = (
    pd.read_csv(f"./predictions/eff_preds_{rank}.csv") for rank in range(4)
)

# Stack the shards, order the rows by essay_id and rebuild a clean 0..n-1 index.
eff_df = (
    pd.concat([eff_df_0, eff_df_1, eff_df_2, eff_df_3], axis=0)
    .sort_values(by="essay_id")
    .reset_index(drop=True)
)

In [None]:
MODEL_WEIGHTS = [1.0] #[0.34, 0.33, 0.33]
print(f"sum of weights {np.sum(MODEL_WEIGHTS)}")

TARGET_COLS = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]

# One prediction frame per model; MODEL_WEIGHTS[i] is the weight of pred_dfs[i].
pred_dfs = [eff_df]

# Weighted blend of every target column across the models.
submission_df = pd.DataFrame()
submission_df["essay_id"] = pred_dfs[0]["essay_id"].values
for target in TARGET_COLS:
    for model_idx, model_preds in enumerate(pred_dfs):
        contribution = MODEL_WEIGHTS[model_idx] * model_preds[target]
        if model_idx:
            submission_df[target] += contribution
        else:
            # The first model creates the column, the others accumulate into it.
            submission_df[target] = contribution

# Keep a single prediction row per essay, then display the result.
eff_df = eff_df.drop_duplicates(subset=['essay_id'], keep='first')
eff_df

## feature engineering for text

In [None]:
import re
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.ensemble import VotingClassifier,VotingRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from tqdm.auto import tqdm,trange
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import pickle

In [None]:
import warnings
# Silence every Python warning for the rest of the session; note this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
class FeatureEngineering():
    """Build per-essay paragraph, sentence, word and TF-IDF features.

    The constructor reads the competition train/test CSVs with polars and adds
    a ``paragraph`` list column obtained by splitting ``full_text`` on blank
    lines. ``process`` returns the training feature frame (including
    ``score``) and fits the TF-IDF vectorizer; ``process_test`` returns the
    test feature frame using the already fitted vectorizer, so ``process``
    has to be called first.
    """

    # Text clean-up patterns, compiled once. Raw strings avoid the invalid
    # escape sequences ("\w", "\d") the plain literals would contain.
    _HTML_TAG = re.compile(r'<.*?>')
    _CLEANUP_STEPS = (
        (re.compile(r"@\w+"), ''),
        (re.compile(r"'\d+"), ''),
        (re.compile(r"\d+"), ''),
        (re.compile(r"http\w+"), ''),
        (re.compile(r"\s+"), " "),  # collapse any run of whitespace into one space
        (re.compile(r"\.+"), "."),  # collapse any run of periods into one period
        (re.compile(r"\,+"), ","),  # collapse any run of commas into one comma
    )

    def __init__(self):
        self.columns = [
            (pl.col("full_text").str.split(by="\n\n").alias("paragraph"))
        ]
        self.train_dataset = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv').with_columns(self.columns)
        self.test_dataset = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv').with_columns(self.columns)
        # Columns summarised per essay by Sentence_Eng.
        self.sentence_fea = ['sentence_len','sentence_word_cnt']
        # Columns summarised per essay by Paragraph_Eng.
        self.paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
        # NOTE(review): tokenizer/preprocessor are identity functions while the
        # input is raw strings, so the n-grams are presumably built over
        # characters rather than words. Left unchanged because a pretrained
        # model is later applied to these features — confirm before changing.
        self.vectorizer = TfidfVectorizer(tokenizer=lambda x: x,
                                          preprocessor=lambda x: x,
                                          token_pattern=None,
                                          strip_accents='unicode',
                                          analyzer = 'word',
                                          ngram_range=(2,3),
                                          min_df=0.05,
                                          max_df=0.9,
                                          sublinear_tf=True
        )

    def removeHTML(self,x):
        """Return *x* with every ``<...>`` tag removed."""
        return self._HTML_TAG.sub(r'', x)

    def dataPreprocessing(self,x):
        """Lower-case *x*, strip tags/mentions/digits and normalise whitespace and punctuation runs."""
        x = self.removeHTML(x.lower())
        for pattern, replacement in self._CLEANUP_STEPS:
            x = pattern.sub(replacement, x)
        return x.strip()

    @staticmethod
    def _summary_aggs(features):
        """Return max/mean/min/first/last aggregations for each column in *features*."""
        return [
            *[pl.col(fea).max().alias(f"{fea}_max") for fea in features],
            *[pl.col(fea).mean().alias(f"{fea}_mean") for fea in features],
            *[pl.col(fea).min().alias(f"{fea}_min") for fea in features],
            *[pl.col(fea).first().alias(f"{fea}_first") for fea in features],
            *[pl.col(fea).last().alias(f"{fea}_last") for fea in features],
        ]

    def _aggregate(self, frame, aggs, label):
        """Group *frame* by essay, apply *aggs* and return a pandas frame sorted by essay_id."""
        df = frame.group_by(['essay_id'], maintain_order=True).agg(aggs).sort("essay_id")
        df = df.to_pandas()
        print(f"done {label} +",len(df.columns),"features")
        return df

    def Paragraph_Preprocess(self,tmp):
        """Explode essays into cleaned paragraphs (>= 25 chars) with length, sentence and word counts."""
        tmp = tmp.explode('paragraph')
        # preprocess
        tmp = tmp.with_columns(pl.col('paragraph').map_elements(self.dataPreprocessing))
        # paragraph_len
        tmp = tmp.with_columns(pl.col('paragraph').map_elements(lambda x:len(x)).alias("paragraph_len"))
        # drop very short paragraphs
        tmp = tmp.filter(pl.col('paragraph_len')>=25)
        # paragraph_sentence_count/paragraph_word_count
        tmp = tmp.with_columns(pl.col('paragraph').map_elements(lambda x: len(x.split("."))).alias("paragraph_sentence_cnt"),
                               pl.col('paragraph').map_elements(lambda x: len(x.split(" "))).alias("paragraph_word_cnt")
                              )
        return tmp

    def Paragraph_Eng(self,train_tmp):
        """Aggregate paragraph rows into one feature row per essay (pandas, sorted by essay_id)."""
        aggs = [
            # number of paragraphs at least i characters long
            *[pl.col('paragraph').filter(pl.col('paragraph_len')>=i)
            .count().alias(f'paragraph_{i}_cnt') for i in [25,100,200,300,400,500,600,700]],
            # summary statistics
            *self._summary_aggs(self.paragraph_fea),
        ]
        return self._aggregate(train_tmp, aggs, "Paragraph_Eng")

    def Sentence_Preprocess(self,tmp):
        """Explode essays into cleaned sentences (>= 15 chars) with length and word count."""
        tmp = tmp.with_columns(pl.col('full_text').map_elements(self.dataPreprocessing).str.split(by=".").alias("sentence"))
        tmp = tmp.explode('sentence')
        # sentence_len
        tmp = tmp.with_columns(pl.col('sentence').map_elements(lambda x: len(x)).alias("sentence_len"))
        # drop very short sentences
        tmp = tmp.filter(pl.col('sentence_len')>=15)
        # sentence_word_cnt
        tmp = tmp.with_columns(pl.col('sentence').map_elements(lambda x: len(x.split(' '))).alias("sentence_word_cnt"))

        return tmp

    def Sentence_Eng(self,train_tmp):
        """Aggregate sentence rows into one feature row per essay (pandas, sorted by essay_id)."""
        aggs = [
            # number of sentences at least i characters long
            *[pl.col('sentence').filter(pl.col('sentence_len') >= i).count().alias(f"sentence_{i}_cnt") for i in [15,50,100,150,200,250,300] ],
            # summary statistics
            *self._summary_aggs(self.sentence_fea),
        ]
        return self._aggregate(train_tmp, aggs, "Sentence_Eng")

    # word feature
    def Word_Preprocess(self,tmp):
        """Explode essays into non-empty words with their character length."""
        tmp = tmp.with_columns(pl.col('full_text').map_elements(self.dataPreprocessing).str.split(by=" ").alias("word"))
        tmp = tmp.explode('word')
        # word_len
        tmp = tmp.with_columns(pl.col('word').map_elements(lambda x: len(x)).alias("word_len"))
        # drop empty tokens
        tmp = tmp.filter(pl.col('word_len')!=0)

        return tmp

    # feature_eng
    def Word_Eng(self,train_tmp):
        """Aggregate word rows into one feature row per essay (pandas, sorted by essay_id)."""
        aggs = [
            # number of words at least i+1 characters long
            *[pl.col('word').filter(pl.col('word_len') >= i+1)
              .count().alias(f"word_{i+1}_cnt") for i in range(15)],
            # distribution of word lengths
            pl.col('word_len').max().alias("word_len_max"),
            pl.col('word_len').mean().alias("word_len_mean"),
            pl.col('word_len').std().alias("word_len_std"),
            pl.col('word_len').quantile(0.25).alias("word_len_q1"),
            pl.col('word_len').quantile(0.50).alias("word_len_q2"),
            pl.col('word_len').quantile(0.75).alias("word_len_q3"),
        ]
        return self._aggregate(train_tmp, aggs, "Word_Eng")

    def _merge_sentence_word(self, feats, dataset):
        """Left-merge the sentence and word features of *dataset* onto *feats* by essay_id."""
        feats = feats.merge(self.Sentence_Eng(self.Sentence_Preprocess(dataset)), on='essay_id', how='left')
        feats = feats.merge(self.Word_Eng(self.Word_Preprocess(dataset)), on='essay_id', how='left')
        return feats

    def _tfidf_frame(self, dataset, fit):
        """Return the dense TF-IDF matrix of *dataset* as a frame keyed by essay_id.

        The rows of the matrix follow the order of *dataset*, so the essay_id
        column is taken from *dataset* itself rather than from the (sorted,
        possibly filtered) aggregated feature frame.
        """
        texts = dataset['full_text'].to_list()
        matrix = self.vectorizer.fit_transform(texts) if fit else self.vectorizer.transform(texts)
        df = pd.DataFrame(matrix.toarray())
        df.columns = [f'tfid_{i}' for i in range(len(df.columns))]
        df['essay_id'] = dataset['essay_id'].to_list()
        # Guard against duplicated ids multiplying rows in the left merge.
        return df.drop_duplicates(subset='essay_id', keep='first')

    def process(self):
        """Build the training feature frame (essay_id, features, score) and fit the TF-IDF vectorizer."""
        data = self.train_dataset
        train_feats = self.Paragraph_Eng(self.Paragraph_Preprocess(data))

        # Attach the label by essay_id: the aggregated frame is sorted by
        # essay_id and may miss filtered essays, so a positional assignment
        # could pair essays with the wrong score.
        scores = data.select(['essay_id', 'score']).to_pandas()
        scores = scores.drop_duplicates(subset='essay_id', keep='first')
        train_feats = train_feats.merge(scores, on='essay_id', how='left')

        train_feats = self._merge_sentence_word(train_feats, data)
        train_feats = train_feats.merge(self._tfidf_frame(data, fit=True), on='essay_id', how='left')
        # essay_id and score are not features
        print('feature_num: ',len(train_feats.columns)-2)
        return train_feats

    def process_test(self):
        """Build the test feature frame (essay_id, features) with the vectorizer fitted by ``process``."""
        data = self.test_dataset
        test_feats = self.Paragraph_Eng(self.Paragraph_Preprocess(data))
        test_feats = self._merge_sentence_word(test_feats, data)
        test_feats = test_feats.merge(self._tfidf_frame(data, fit=False), on='essay_id', how='left')
        # only essay_id is not a feature here (the test frame has no score column)
        print('feature_num: ',len(test_feats.columns)-1)

        return test_feats

In [None]:
class LGBM():
    """Voting ensemble of LightGBM regressors trained with a QWK-style objective.

    Targets are shifted by ``self.a`` before training (``score - a``) and
    predictions are shifted back, clipped to the 1..6 score range and rounded.

    NOTE(review): a model saved elsewhere is loaded with joblib later in this
    notebook; presumably the class, method and attribute names here must stay
    stable for that pickle to resolve — confirm before renaming anything.
    """

    def __init__(self):
        self.data_train = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
        self.data_test = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
        self.num_models = 3
        self.acc_metrics = []
        self.cohen_metrics = []

        # coefficients used by the QWK objective/metric (a is also the target offset)
        self.a = 2.948
        self.b = 1.092

        self.lgb_parameters = {
                                 'metrics': 'None',
                                 'objective': self.qwk_obj,
                                 'learning_rate': 0.05,
                                 'max_depth': 5,
                                 'num_leaves': 10, # should not exceed 2^max_depth
                                 'colsample_bytree': 0.3,
                                 'min_data_in_leaf': 100,
                                 'reg_alpha': 0.7,
                                 'reg_lambda' : 0.1,
                                 'n_estimators': 700,
                                 'extra_trees' : True,
                                 'verbosity': -100,
#                                  'device' : "gpu"
        }
        # num_models regressors that differ only by their random seed
        self.model = VotingRegressor(
            estimators = [
                            (f"lgb_{i}",lgb.LGBMRegressor(**self.lgb_parameters, random_state=i+40),)for i in range(self.num_models)
                         ],
                        n_jobs=-1
        )

    def quadratic_weighted_kappa(self,y_true,y_pred):
        """Return ``("QWK", kappa, True)`` for offset targets and offset predictions.

        Both inputs are expected in the shifted space (``score - a``). The
        labels are rounded after shifting back so that floating-point error
        from the subtract/add round trip cannot turn them into non-integers.
        """
        y_true = np.round(np.asarray(y_true) + self.a)
        y_pred = (y_pred + self.a).clip(1,6).round()
        qwk = cohen_kappa_score(y_true,y_pred,weights='quadratic')

        return "QWK",qwk,True

    def qwk_obj(self,y_true,y_pred):
        """Custom objective: return (gradient, hessian) of the QWK surrogate f/g.

        ``f`` is half the squared error between predictions and labels and
        ``g`` is half the squared deviation of the predictions from ``a`` plus
        ``b``; the hessian is fixed to ones.
        """
        labels = y_true + self.a
        preds = (y_pred + self.a).clip(1,6)
        n_samples = len(labels)

        residual = preds - labels
        centered = preds - self.a
        f = 1/2 * np.sum(residual**2)
        g = 1/2 * np.sum(centered**2+self.b)

        grad = (residual/g - f*centered/g**2)*n_samples
        hess = np.ones(n_samples)

        return grad,hess

    def split_folds(self, df):
        """Return the 5-fold shuffled (train_idx, val_idx) generator over the rows of *df*."""
        feature_names = [col for col in df.columns if col not in ['essay_id', 'score']]
        x = df[feature_names].values
        y = df['score'].values

        kfold = KFold(n_splits=5, random_state=44, shuffle=True)

        return kfold.split(x, y)

    def _plot_confusion_matrix(self, y_true, y_pred):
        """Show a 6x6 confusion matrix of integer scores (rows: actual, columns: predicted)."""
        # sklearn is already a dependency of this notebook; imported here
        # because the notebook's import cell does not bring this name in.
        from sklearn.metrics import confusion_matrix

        score_labels = [1, 2, 3, 4, 5, 6]
        tick_labels = [str(score) for score in score_labels]
        # Pin the label set so the matrix is always 6x6 and matches the ticks,
        # even when a fold lacks one of the scores.
        cm = confusion_matrix(y_true, y_pred, labels=score_labels)
        # NOTE(review): `sns` and `plt` are not imported in the visible cells;
        # this presumably needs `import seaborn as sns` and
        # `import matplotlib.pyplot as plt` — confirm.
        sns.heatmap(cm,
                    annot=True,
                    fmt='g',
                    xticklabels=tick_labels,
                    yticklabels=tick_labels)
        # sklearn puts the true labels on the rows (y axis) and the predicted
        # labels on the columns (x axis).
        plt.ylabel('Actual',fontsize=13)
        plt.xlabel('Prediction',fontsize=13)
        plt.title('Confusion Matrix',fontsize=17)
        plt.show()

    def fit(self, df,debug=False):
        """Cross-validate the ensemble on *df* and print the mean QWK.

        *df* must contain ``essay_id``, ``score`` and the feature columns.
        With ``debug`` set, only the first fold is run. The same
        ``self.model`` is refitted on every fold, so after the call it holds
        the model of the last fold that ran.
        """
        feature_names = [col for col in df.columns if col not in ('essay_id', 'score')]

        for fold_id, (trn_idx, val_idx) in enumerate(self.split_folds(df)):
            if debug and fold_id != 0:
                break

            train_part, val_part = df.iloc[trn_idx], df.iloc[val_idx]
            X_train, X_val = train_part[feature_names], val_part[feature_names]
            Y_train, Y_val = train_part['score'] - self.a, val_part['score'] - self.a

            print(f'\nFold_{fold_id} Training ================================\n')

            self.model.fit(X_train, Y_train)
            pred_val = self.model.predict(X_val)

            # plot confusion matrix on the original integer score scale
            y_true = np.rint(val_part['score'].to_numpy()).astype(int)
            y_pred = (pred_val + self.a).clip(1,6).round().astype(int)
            self._plot_confusion_matrix(y_true, y_pred)

            cohen_score = self.quadratic_weighted_kappa(Y_val.values, pred_val)
            self.cohen_metrics.append(cohen_score[1])

        average_cohen = np.mean(self.cohen_metrics)
        print(f'Average Cohen all fold: {average_cohen:.4f}')

    def save_model(self):
        """Placeholder; nothing is saved."""
        pass

    def predict(self,df):
        """Return rounded 1..6 scores for *df*, ignoring ``essay_id``/``score`` columns."""
        feature_names = [col for col in df.columns if col not in ('essay_id', 'score')]
        raw = self.model.predict(df[feature_names])
        return (raw + self.a).clip(1,6).round()

    def submit(self,df):
        """Return a polars frame with the test ``essay_id`` column and the predicted ``score``."""
        feature_names = [col for col in df.columns if col != 'essay_id']
        scores = (self.model.predict(df[feature_names])+self.a).clip(1, 6).round()
        return self.data_test.select('essay_id').with_columns(score = scores)

In [None]:
# Build the hand-crafted features, then attach the per-essay predictions held in
# eff_df to the test features (inner join on essay_id).
FE = FeatureEngineering()
train_feature = FE.process()
test_feature = FE.process_test().merge(eff_df, on='essay_id')
test_feature.shape

In [None]:
# The identifier is not a model input; keep only the feature columns.
test_feature = test_feature.drop(columns=['essay_id'])
test_feature

In [None]:
# Display the essay ids of the training feature frame (inspection only).
train_feature['essay_id']

## predict

In [None]:
from joblib import load

# NOTE(security): joblib.load unpickles arbitrary Python objects, so only load
# model files from a trusted source.
# NOTE(review): the pickle presumably references the LGBM class defined earlier
# in this notebook, which therefore has to be defined before this cell — confirm.
model = load('/kaggle/input/train-lgbm-extra-feature-from-feedback-comp/saved_models/model.joblib')

In [None]:
# Predict with the loaded model on the test features, in the row order of test_feature.
# NOTE(review): `pred` is written straight into the submission below, so the
# loaded model presumably returns final 1..6 scores — confirm.
pred = model.predict(test_feature)

## submission

In [None]:
submission = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')

# `pred` follows the test feature frame, which FeatureEngineering sorts by
# essay_id, whereas the sample submission keeps the order of its CSV. Place the
# i-th prediction on the row holding the i-th smallest essay_id so scores stay
# attached to the right essay even when the file is not sorted.
# NOTE(review): assumes the sample submission lists exactly the test essays — confirm.
if len(pred) != len(submission):
    raise ValueError(
        f"got {len(pred)} predictions for {len(submission)} submission rows"
    )
order = np.argsort(submission['essay_id'].to_numpy(), kind='stable')
aligned_scores = np.empty(len(submission), dtype=float)
aligned_scores[order] = np.asarray(pred, dtype=float)

submission['score'] = aligned_scores
submission['score']=submission['score'].astype(int)
submission.to_csv("submission.csv",index=False)
display(submission.head())

In [None]:
import pandas as pd
from sklearn.model_selection import GroupKFold, KFold

# Example DataFrame
data = {
    'feature1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'feature2': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'unique_column': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5]
}
df = pd.DataFrame(data)

# Number of unique values: one fold per group
unique_values = df['unique_column'].unique()
n_splits = len(unique_values)

# Initialize GroupKFold. Plain KFold ignores `groups` and only kept the groups
# together here because the rows happen to be ordered by group.
gkf = GroupKFold(n_splits=n_splits)

# Create folds: every test fold holds all rows of exactly one group
folds = {}
for fold, (train_index, test_index) in enumerate(gkf.split(df, groups=df['unique_column'])):
    train_df = df.iloc[train_index]
    test_df = df.iloc[test_index]
    folds[fold] = {'train': train_df, 'test': test_df}
    print(f"Fold {fold + 1} - Test Data (unique_column={test_df['unique_column'].unique()[0]}):\n{test_df}\n")
