In [22]:
# !rm -rf /kaggle/working/*

In [23]:
# !python --version  
# Python 3.10.12

# Importing Libraries

In [24]:
from typing import Optional
from datetime import datetime

import os
import pickle
import re

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter

!pip install /kaggle/input/pyspellchecker/pyspellchecker-0.8.0-py3-none-any.whl
from spellchecker import SpellChecker

import spacy

from sklearn.feature_extraction.text import TfidfVectorizer

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.8.0-py3-none-any.whl
pyspellchecker is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


# Globals

In [25]:
# CSV read by get_df_test(); predictions are produced for the rows of this file.
CSV_INPUT_PATH = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
# Root input directory (not referenced by the visible code in this notebook).
INPUT_PATH = '/kaggle/input/'
# Directory scanned for the *.pickle / *.csv artifacts that are loaded into globals.
CATBOOST_PATH = '/kaggle/input/catboost/'

In [26]:
def printime():
    """Print the current local time as ``Current Time = HH:MM:SS``."""
    print("Current Time =", datetime.now().strftime("%H:%M:%S"))

# Features

## Data Preparation

In [27]:
def get_df_test():
    """Load the test essays and attach their cleaned text.

    Reads the CSV at ``CSV_INPUT_PATH`` and adds a ``clean_text`` column
    holding the result of ``clean_text`` applied to the loaded frame.
    Spelling correction is not applied.

    Returns:
        pd.DataFrame: the loaded frame plus the ``clean_text`` column.
    """
    essays = pd.read_csv(CSV_INPUT_PATH)
    essays['clean_text'] = clean_text(essays)
    return essays

In [28]:
def correct_text_spelling(text, spell=None):
    """Replace misspelled words in ``text`` with the spell checker's suggestion.

    Args:
        text: the string to correct.
        spell: optional object exposing ``unknown(words)`` and
            ``correction(word)`` (for example a ``SpellChecker``). When
            omitted a new ``SpellChecker`` is created; pass a shared instance
            to avoid rebuilding it for every text.

    Returns:
        str: ``text`` where every whole-word occurrence that exactly matches
        a word reported by ``spell.unknown`` is replaced by its correction.
        Words for which the checker offers no correction are left untouched.
    """
    if spell is None:
        spell = SpellChecker()

    word_pattern = r'\b\w+\b'
    misspelled = spell.unknown(re.findall(word_pattern, text))

    # Look each correction up once; a falsy result means "no suggestion".
    corrections = {}
    for word in misspelled:
        suggestion = spell.correction(word)
        if suggestion:
            corrections[word] = suggestion

    if not corrections:
        return text

    # Substitute token by token in a single pass. A plain str.replace would
    # also rewrite the misspelling when it occurs inside a longer word, and
    # successive replaces could re-correct text produced by an earlier one.
    return re.sub(
        word_pattern,
        lambda match: corrections.get(match.group(0), match.group(0)),
        text,
    )

def correct_spelling(df, text_col: str = 'text') -> pd.Series:
    """Spell-correct every entry of ``df[text_col]``.

    Args:
        df: frame holding the text column; it is not modified.
        text_col: name of the column to correct.

    Returns:
        pd.Series: the corrected texts, named ``corrected`` and sharing the
        index of ``df``.
    """
    # Work on the column directly; copying the whole frame only to read one
    # column back out is unnecessary.
    return df[text_col].apply(correct_text_spelling).rename('corrected')

In [29]:
def get_corpus(df: pd.DataFrame, text_col: str = 'clean_text') -> list[str]:
    """Return the values of column ``text_col`` as a plain Python list."""
    column = df[text_col]
    return column.tolist()

## Text Cleaning and Features

In [30]:
# cleaned text
def clean_text(df: pd.DataFrame) -> pd.Series:
    """Build the cleaned version of ``df['text']``.

    Steps, in order: drop double newlines, drop ``'s``, drop the characters
    ``. , ? ! : ; ' \\ "`` and lower-case the result. ``df`` is not modified.

    Returns:
        pd.Series: the cleaned text, named ``clean_text``.
    """
    cleaned = df['text'].str.replace('\n\n', '')
    cleaned = cleaned.str.replace('\'s', '')
    cleaned = cleaned.str.replace('[.,?!:;\'\\\\"]', '', regex=True)
    cleaned = cleaned.str.lower()
    return cleaned.rename('clean_text')

In [31]:
def tfidf(corpus: list[str], vectorizer: Optional[TfidfVectorizer] = None) -> [pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame]]:
    """Vectorize ``corpus`` with TF-IDF.

    Args:
        corpus: the documents to vectorize.
        vectorizer: a vectorizer to reuse. When given, only ``transform`` is
            called on it; when omitted, a new ``TfidfVectorizer`` is fitted
            on ``corpus``.

    Returns:
        The result of ``vectorizer.transform(corpus)`` when a vectorizer is
        supplied, otherwise the tuple ``(fit_transform result, vectorizer)``.
        NOTE(review): the declared annotation mentions DataFrames, but the
        function returns whatever the vectorizer produces — confirm and
        tighten the annotation.
    """
    if not vectorizer:
        # Nothing to reuse: fit a new vectorizer and hand it back to the caller.
        fitted = TfidfVectorizer(
            lowercase=False,
            sublinear_tf=True,
            stop_words='english',
            ngram_range=(3, 5),
        )
        return fitted.fit_transform(corpus), fitted

    return vectorizer.transform(corpus)

In [32]:
def pos_feature(df: pd.DataFrame, text_col: str = 'clean_text') -> pd.DataFrame:
    """Append per-row part-of-speech tag counts.

    Every text in ``df[text_col]`` is run through the spaCy
    ``en_core_web_sm`` pipeline, its ``token.pos_`` tags are joined with
    spaces, and one ``<TAG>_count`` column is added per tag by counting
    occurrences of the tag name in that joined string.

    Returns:
        pd.DataFrame: a copy of ``df`` with the count columns appended; the
        temporary ``pos_tags`` / ``pos_text`` columns are dropped again.
    """
    tagged = df.copy()
    nlp = spacy.load("en_core_web_sm")

    tagged['pos_tags'] = [[token.pos_ for token in nlp(text)] for text in tagged[text_col]]
    tagged['pos_text'] = tagged['pos_tags'].apply(' '.join)

    # The counts are substring counts on the joined tag string, so 'X' also
    # matches the X inside 'AUX'.
    # NOTE(review): presumably the downstream models were fitted on exactly
    # these values, so the behaviour is kept — confirm before changing it.
    counted_tags = ('NOUN', 'VERB', 'ADJ', 'ADV', 'ADP', 'PRON',
                    'PROPN', 'PUNCT', 'AUX', 'NUM', 'X')
    for tag in counted_tags:
        tagged[f'{tag}_count'] = tagged['pos_text'].str.count(tag)

    return tagged.drop(columns=['pos_tags', 'pos_text'])
    
def stopwords_feature(df, text_col: str = 'clean_text') -> pd.DataFrame:
    """Count English stop words per row and build a stop-word-free text.

    Each row's ``text_col`` is tokenized with ``word_tokenize``; tokens whose
    lower-cased form is in NLTK's English stop-word list are counted and
    removed.

    Returns:
        pd.DataFrame: a copy of ``df`` with ``number_of_stopwords`` and
        ``filtered_text`` (remaining tokens joined by single spaces).
    """
    english_stopwords = set(stopwords.words('english'))

    def split_stopwords(row):
        tokens = word_tokenize(row[text_col])
        is_stop = [token.lower() in english_stopwords for token in tokens]
        kept = [token for token, flagged in zip(tokens, is_stop) if not flagged]
        return sum(is_stop), ' '.join(kept)

    result = df.copy()
    result[['number_of_stopwords', 'filtered_text']] = result.apply(
        split_stopwords, axis=1, result_type='expand'
    )
    return result

def caps_to_periods_ratio_feature(df) -> pd.DataFrame:
    """Append ``caps_to_periods_ratio`` to a copy of ``df``.

    The value is the number of ``A-Z`` characters in ``text`` divided by
    ``parentheses_count``.
    NOTE(review): despite the column name, the denominator is the
    parentheses count, not a period count. It is kept as-is because
    downstream models presumably expect this exact value — confirm.

    Returns:
        pd.DataFrame: a copy of ``df`` with the extra column.
    """
    # A zero denominator is swapped for 1 to avoid dividing by zero.
    denominator = df['parentheses_count'].replace(0, 1)
    uppercase_count = df['text'].str.count(r'[A-Z]')
    return df.assign(caps_to_periods_ratio=uppercase_count / denominator)

### Feature Extraction

In [33]:
def feature_extraction(df, text_col: str = 'clean_text') -> pd.DataFrame:
    """Extract text-based numeric features.

    Word-level statistics are computed on ``text_col``; punctuation and
    paragraph counts are always computed on the ``text`` column, which must
    therefore be present in ``df``.

    Args:
        df: input frame containing ``text`` and ``text_col``; not modified.
        text_col: column the word-level statistics are computed on.

    Returns:
        pd.DataFrame: a copy of ``df`` with the feature columns appended in a
        fixed order: word statistics, punctuation counts, POS tag counts
        (from ``pos_feature``) and ``caps_to_periods_ratio``.
    """
    df_ = df.copy()

    words_source = df_[text_col]

    # Text length: number of spaces + 1, after removing paragraph breaks.
    df_['num_of_words'] = words_source.str.replace('\n\n', '', regex=False).str.count(' ') + 1
    # Unique whitespace-separated words per row.
    df_['text_vocab_size'] = words_source.apply(lambda x: len(set(x.split())))
    # Literal periods in text_col.
    # NOTE(review): clean_text() strips periods, so with the default column
    # this is presumably always 0; kept unchanged because the fitted models
    # presumably expect the feature as computed here — confirm.
    df_['num_of_sentences'] = words_source.str.count(r'\.')

    # (column name, regex) pairs counted on the 'text' column. Raw strings
    # keep the regex escapes from being invalid Python escape sequences.
    # Column names (including the 'hypen_count' spelling) and their order are
    # kept exactly as they were.
    text_patterns = (
        ('parentheses_count', r'\(|\)'),
        ('semicolon_count', ';'),
        ('hypen_count', '-'),
        ('dash_count', '—'),
        ('comma_count', ','),
        ('qm_count', r'\?'),
        ('en_count', '!'),
        ('apostrophe_count', '’'),
        ('paragraph_count', '\n\n'),
    )
    source_text = df_['text']
    for column, pattern in text_patterns:
        df_[column] = source_text.str.count(pattern)

    # Part-of-speech tag counts.
    df_ = pos_feature(df_, text_col)

    # The stop-word and typo-count features are currently disabled.

    # Ratio feature built from the capital-letter count and parentheses_count.
    df_ = caps_to_periods_ratio_feature(df_)

    return df_

## Loading pickles and CSVs

In [34]:
# Load every *.pickle and *.csv artifact found directly in CATBOOST_PATH into
# a module-level variable named after the file (without its extension).
# Later cells use names such as text_tfidf_vectorizer, text_tfidf_ensemble,
# pos_tfidf_vectorizer, pos_tfidf_ensemble, num_ensemble and cat_clf_stack
# without defining them — presumably they are created here; confirm against
# the contents of the directory.
files = os.listdir(CATBOOST_PATH)

for file_name in files:
    file_path = os.path.join(CATBOOST_PATH, file_name)
    name, extension = os.path.splitext(file_name)

    # Only open entries that are actually loaded; opening everything first
    # fails on sub-directories and needlessly touches unrelated files.
    if extension == '.pickle':
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # use this with artifacts from a trusted source.
        with open(file_path, 'rb') as file:
            globals()[name] = pickle.load(file)
    elif extension == '.csv':
        globals()[name] = pd.read_csv(file_path, index_col=0)

# Models by Feature Sets

In [35]:
def text_tfidf_models_test():
    """Score the test essays with the word-level TF-IDF ensemble.

    The cleaned test texts are transformed with ``text_tfidf_vectorizer`` and
    passed to ``text_tfidf_ensemble.predict_proba``.

    Returns:
        Column 1 of the ``predict_proba`` output, one value per test row.
    """
    essays = get_df_test().drop(columns=['id', 'prompt_id'])
    text_matrix = tfidf(get_corpus(essays), text_tfidf_vectorizer)
    return text_tfidf_ensemble.predict_proba(text_matrix)[:, 1]

In [36]:
def pos_tfidf_models_test():
    """Score the test essays with the POS-tag TF-IDF ensemble.

    Each cleaned text is converted to a space-joined string of spaCy
    ``token.pos_`` tags, transformed with ``pos_tfidf_vectorizer`` and passed
    to ``pos_tfidf_ensemble.predict_proba``.

    Returns:
        Column 1 of the ``predict_proba`` output, one value per test row.
    """
    essays = get_df_test()

    # Turn every essay into its sequence of part-of-speech tags.
    nlp = spacy.load("en_core_web_sm")
    essays['pos_tags'] = [[token.pos_ for token in nlp(text)] for text in essays['clean_text']]
    essays['pos_text'] = essays['pos_tags'].apply(' '.join)

    essays = essays.drop(columns=['id', 'prompt_id'])

    pos_matrix = tfidf(get_corpus(essays, text_col='pos_text'), pos_tfidf_vectorizer)
    return pos_tfidf_ensemble.predict_proba(pos_matrix)[:, 1]

In [37]:
def numeric_features_models_test():
    """Score the test essays with the numeric-feature ensemble.

    Features come from ``feature_extraction``; the ``id``, ``prompt_id``,
    ``text`` and ``clean_text`` columns are removed before the remaining
    columns are passed to ``num_ensemble.predict_proba``.

    Returns:
        tuple: ``(ids, predictions)`` where ``ids`` is the ``id`` column of
        the test frame and ``predictions`` is column 1 of the
        ``predict_proba`` output.
    """
    features = feature_extraction(get_df_test(), 'clean_text')
    essay_ids = features['id']

    model_input = features.drop(columns=['id', 'prompt_id', 'text', 'clean_text'])
    return essay_ids, num_ensemble.predict_proba(model_input)[:, 1]

# Execution and Analysis

In [38]:
# Run the three base models one after another, printing the time before and
# after each so the duration of every stage can be read from the output.
printime()
# Word-level TF-IDF ensemble.
tfidf_test_pred = text_tfidf_models_test()
printime()
# POS-tag TF-IDF ensemble.
pos_test_pred = pos_tfidf_models_test()
printime()
# Numeric-feature ensemble; also returns the essay ids used for the submission.
X_test_id, test_pred = numeric_features_models_test()
printime()

Current Time = 19:06:24
Current Time = 19:06:24
Current Time = 19:06:25
Current Time = 19:06:26


In [39]:
# Collect the base-model outputs as the input columns of the stacking model.
pred_test = pd.DataFrame(
    dict(tfidf_pred=tfidf_test_pred, pos_pred=pos_test_pred, num_pred=test_pred)
)

In [40]:
# Feed the three base-model predictions to the stacking classifier and keep
# column 1 of its predict_proba output as the final prediction.
y_test_pred = cat_clf_stack.predict_proba(pred_test)[:,1]

# Submission

In [41]:
# Write the submission file: one row per essay id with its final prediction.
submission_columns = {'id': X_test_id, 'generated': y_test_pred}
results = pd.DataFrame(submission_columns)
results.to_csv('submission.csv', index=False)

In [42]:
# Print the time once the submission file has been written.
printime()

Current Time = 19:06:26
