# Library Imports

In [None]:
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
import spacy
import textstat
import xgboost as xgb
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoTokenizer, AutoModel

# Use the English model for spaCy

In [2]:
nlp = spacy.load('en_core_web_sm')

# Import Deberta Model from transformer

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model_name = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)

Using device: cuda




DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 1024, padding_idx=0)
    (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-23): 24 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (pos_dropout): Dropout(p=0.1, inplace=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
            (dropout): Drop

# Functions to enrich the data

In [12]:
def get_embeddings(texts, tokenizer, model, device, max_length=100):
    """Extract DeBERTa embeddings for a list of texts"""
    embeddings = []
    
    # Process texts in batches
    for text in tqdm(texts, desc="Extracting embeddings"):
        # Tokenize
        inputs = tokenizer(
            text, 
            return_tensors="pt", 
            max_length=max_length, 
            padding="max_length", 
            truncation=True
        )
        
        # Move to device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Get outputs
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Get the [CLS] token embedding from the last hidden state
        # This represents the whole sequence
        #cls_embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embedding)
    # Stack tensors (still on GPU)
    return (torch.cat(embeddings, dim=0)).cpu().numpy()

def extract_readability_metrics(text):
    """Extract readability metrics using textstat"""
    if pd.isna(text):
        return pd.Series({
            'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0,
            'smog_index': 0, 'coleman_liau_index': 0, 'automated_readability_index': 0,
            'dale_chall_readability_score': 0, 'difficult_words': 0, 'syllable_count': 0
        })
    
    return pd.Series({
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'smog_index': textstat.smog_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text),
        'difficult_words': textstat.difficult_words(text),
        'syllable_count': textstat.syllable_count(text),
    })

def extract_vocabulary_features(text):
    """Extract vocabulary richness metrics"""
    if pd.isna(text):
        return pd.Series({
            'unique_words': 0, 'lexical_diversity': 0,
            'hapax_legomena': 0, 'hapax_ratio': 0
        })
    
    # Count words and unique words
    words = re.findall(r'\b\w+\b', text.lower())
    unique_words = set(words)
    
    # Count word frequencies
    word_freq = Counter(words)
    
    # Hapax legomena (words appearing only once)
    hapax_legomena = sum(1 for word, count in word_freq.items() if count == 1)
    
    # Calculate metrics
    lexical_diversity = len(unique_words) / max(len(words), 1)
    hapax_ratio = hapax_legomena / max(len(unique_words), 1)
    
    return pd.Series({
        'unique_words': len(unique_words),
        'lexical_diversity': lexical_diversity,
        'hapax_legomena': hapax_legomena,
        'hapax_ratio': hapax_ratio
    })

def extract_nlp_features(text):
    """Extract advanced NLP features using spaCy"""
    if pd.isna(text):
        return pd.Series({
            'noun_count': 0, 'verb_count': 0, 'adj_count': 0, 'adv_count': 0,
            'pronoun_count': 0, 'noun_ratio': 0, 'verb_ratio': 0,
            'adj_ratio': 0, 'adv_ratio': 0, 'pronoun_ratio': 0,
            'named_entities': 0, 'entity_ratio': 0
        })
    
    # Process text with spaCy
    doc = nlp(text)
    
    # POS counts
    pos_counts = Counter([token.pos_ for token in doc])
    
    # Named entities
    named_entities = len(doc.ents)
    
    # Total tokens (excluding punctuation)
    total_tokens = sum(1 for token in doc if not token.is_punct)
    
    # Calculate ratios
    noun_ratio = pos_counts.get('NOUN', 0) / max(total_tokens, 1)
    verb_ratio = pos_counts.get('VERB', 0) / max(total_tokens, 1)
    adj_ratio = pos_counts.get('ADJ', 0) / max(total_tokens, 1)
    adv_ratio = pos_counts.get('ADV', 0) / max(total_tokens, 1)
    pronoun_ratio = pos_counts.get('PRON', 0) / max(total_tokens, 1)
    entity_ratio = named_entities / max(total_tokens, 1)
    
    return pd.Series({
        'noun_count': pos_counts.get('NOUN', 0),
        'verb_count': pos_counts.get('VERB', 0),
        'adj_count': pos_counts.get('ADJ', 0),
        'adv_count': pos_counts.get('ADV', 0),
        'pronoun_count': pos_counts.get('PRON', 0),
        'noun_ratio': noun_ratio,
        'verb_ratio': verb_ratio,
        'adj_ratio': adj_ratio,
        'adv_ratio': adv_ratio,
        'pronoun_ratio': pronoun_ratio,
        'named_entities': named_entities,
        'entity_ratio': entity_ratio
    })

In [13]:
sub_df = pd.read_csv(r"kaggle_dataset\train.csv").sample(1000, random_state=4567)
sub_df = sub_df.reset_index(drop=True)

In [14]:
readability_metrics = sub_df['full_text'].apply(extract_readability_metrics)
vocabulary_features = sub_df['full_text'].apply(extract_vocabulary_features)
nlp_features = sub_df['full_text'].apply(extract_nlp_features)

In [15]:
print("Extracting DeBERTa embeddings...")
embeddings = get_embeddings(sub_df['full_text'].to_list(), tokenizer, model, device)
print(f"Embeddings shape: {embeddings.shape}")

Extracting DeBERTa embeddings...


Extracting embeddings: 100%|██████████| 1000/1000 [01:35<00:00, 10.50it/s]

Embeddings shape: (1000, 1024)





In [16]:
embeddings_df = pd.DataFrame(embeddings)

In [17]:
new_sub_df = pd.concat([sub_df, readability_metrics, vocabulary_features, nlp_features, pd.DataFrame(embeddings)],axis=1)
new_sub_df.to_csv(r"kaggle_dataset\train_embeddings.csv", index=False)

In [18]:
X = new_sub_df[[i for i in range(0,1024)]+list(readability_metrics.columns)+list(vocabulary_features.columns)+list(nlp_features.columns)]
y = np.array(sub_df['score'])

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4567)


In [20]:
params_classifier = {
    # Core parameters
    'objective': 'multi:softmax',
    'num_class': 6,
    
    # Tree structure parameters
    'max_depth': 7, 
    'eta': 1e-3,
    # Regularization parameters
    'min_child_weight': 5,
    'gamma': 0.5,
    'reg_alpha': 0.01,
    'reg_lambda': 1,
    
    # Performance settings
    'booster': 'gbtree',
    'tree_method': 'exact',
    'seed': 4567,
    # Evaluation metric
    'eval_metric': 'mlogloss'
}

In [21]:
# For classification (predicting discrete score categories):
print("\nTraining XGBoost classification model...")
# Convert scores to 0-indexed for multi:softmax
y_train_cls = y_train - 1
y_test_cls = y_test - 1

dtrain_cls = xgb.DMatrix(X_train, label=y_train_cls)
dtest_cls = xgb.DMatrix(X_test, label=y_test_cls)

# Train with early stopping
evals = [(dtrain_cls, 'train'), (dtest_cls, 'eval')]
cls_model = xgb.train(
    params_classifier,
    dtrain_cls,
    num_boost_round=2000,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=True
)

# Make predictions
y_pred_cls = cls_model.predict(dtest_cls)

# Calculate accuracy
accuracy = accuracy_score(y_test_cls, y_pred_cls)
print(f"Classification Model Accuracy: {accuracy:.4f}")



Training XGBoost classification model...
[0]	train-mlogloss:1.79025	eval-mlogloss:1.79088
[1]	train-mlogloss:1.78874	eval-mlogloss:1.79001
[2]	train-mlogloss:1.78724	eval-mlogloss:1.78914
[3]	train-mlogloss:1.78574	eval-mlogloss:1.78826
[4]	train-mlogloss:1.78425	eval-mlogloss:1.78739
[5]	train-mlogloss:1.78276	eval-mlogloss:1.78653
[6]	train-mlogloss:1.78127	eval-mlogloss:1.78567
[7]	train-mlogloss:1.77978	eval-mlogloss:1.78480
[8]	train-mlogloss:1.77831	eval-mlogloss:1.78393
[9]	train-mlogloss:1.77682	eval-mlogloss:1.78307
[10]	train-mlogloss:1.77535	eval-mlogloss:1.78221
[11]	train-mlogloss:1.77388	eval-mlogloss:1.78133
[12]	train-mlogloss:1.77240	eval-mlogloss:1.78050
[13]	train-mlogloss:1.77094	eval-mlogloss:1.77962
[14]	train-mlogloss:1.76947	eval-mlogloss:1.77879
[15]	train-mlogloss:1.76801	eval-mlogloss:1.77793
[16]	train-mlogloss:1.76655	eval-mlogloss:1.77709
[17]	train-mlogloss:1.76508	eval-mlogloss:1.77626
[18]	train-mlogloss:1.76363	eval-mlogloss:1.77542
[19]	train-mloglos

In [28]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [31]:
rfc = RandomForestClassifier(n_estimators=500, random_state=4567)
rfc.fit(X_train, y_train_cls)
y_pred_rfc = rfc.predict(X_test)

In [32]:
y_pred_rfc+=1
print("Random Forest Classifier Accuracy:", accuracy_score(y_test, y_pred_rfc))
print(classification_report(y_test, y_pred_rfc, target_names=[str(i) for i in range(1, 7)]))

Random Forest Classifier Accuracy: 0.515
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        23
           2       0.50      0.60      0.55        50
           3       0.51      0.77      0.62        70
           4       0.58      0.43      0.49        44
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         3

    accuracy                           0.52       200
   macro avg       0.27      0.30      0.28       200
weighted avg       0.43      0.52      0.46       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
