In [57]:
import numpy as np
import pandas as pd
from spellchecker import SpellChecker
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import spacy

In [58]:
# Load the train/test splits, using the 'index' column as the row index, and
# discard the 'example_index' column from each.
train_df = pd.read_csv(
    '/Users/catherine/Desktop/Fall 2024/nlp-predict-AD/data/train_complete_v1_800.csv',
    index_col='index',
).drop(columns=['example_index'])
test_df = pd.read_csv(
    '/Users/catherine/Desktop/Fall 2024/nlp-predict-AD/data/test_complete_v1_149.csv',
    index_col='index',
).drop(columns=['example_index'])

## Total Number of Sentences & Average Length of Sentence

In [59]:
def count_sentences_average_length(text):
    """Count the sentences in ``text`` and their average length in words.

    Parenthesised spans (for example pause markers such as ``(.)`` or
    ``(...)``) are removed first so that the dots inside them are not treated
    as sentence boundaries. The remaining text is split on ``.``, ``?`` and
    ``!``. Only segments that contain at least one word count as sentences,
    so runs of punctuation such as ``?!`` do not create empty sentences and a
    trailing fragment without closing punctuation is still counted.

    Args:
        text: Transcript string to analyse.

    Returns:
        Tuple ``(sentence_count, average_words_per_sentence)``. When the text
        contains no words the result is ``(0, 0.0)`` rather than a
        ZeroDivisionError.
    """
    # Drop anything in parentheses before looking for sentence boundaries.
    text = re.sub(r'\([^)]*\)', '', text)

    # Split on sentence-ending punctuation (.!?) and keep non-empty segments.
    words_per_sentence = [
        len(segment.split()) for segment in re.split(r'[.?!]', text)
    ]
    words_per_sentence = [count for count in words_per_sentence if count > 0]

    sentence_count = len(words_per_sentence)
    if sentence_count == 0:
        return 0, 0.0
    return sentence_count, sum(words_per_sentence) / sentence_count

# Add both sentence-level features to each split. Wrapping the returned tuple
# in a Series lets pandas expand it into the two target columns.
for split_df in (train_df, test_df):
    split_df[['sentence_count', 'avg_sentence_length']] = split_df['line'].apply(
        lambda line: pd.Series(count_sentences_average_length(line)))

## Ratio of Open-class Words to Closed-class Words

In [60]:
# Load the spaCy "en_core_web_sm" pipeline once; the feature functions below
# call this shared `nlp` object to tag each transcript.
nlp = spacy.load("en_core_web_sm")

def open_closed_word_ratio(text):
    """Return the ratio of open-class to closed-class tokens in ``text``.

    Tokens are classified by their coarse part-of-speech tag (``token.pos_``):
    nouns, verbs, adjectives and adverbs are open class; pronouns,
    adpositions, conjunctions, determiners, auxiliaries and particles are
    closed class. Tokens with any other tag are ignored.

    Returns 0 when the text contains no closed-class token.
    """
    open_tags = {"NOUN", "VERB", "ADJ", "ADV"}
    closed_tags = {"PRON", "ADP", "CONJ", "DET", "AUX", "SCONJ", "CCONJ", "PART"}

    pos_tags = [token.pos_ for token in nlp(text)]
    n_open = sum(1 for tag in pos_tags if tag in open_tags)
    n_closed = sum(1 for tag in pos_tags if tag in closed_tags)

    # Guard against dividing by zero when nothing closed-class was found.
    return n_open / n_closed if n_closed else 0


train_df['open_closed_ratio'] = train_df['line'].apply(open_closed_word_ratio)
test_df['open_closed_ratio'] = test_df['line'].apply(open_closed_word_ratio)

## Total Number of Simple Verb Forms & Compound Verb Forms

In [61]:
def count_verbs(text):
    """Count simple and compound verb forms in ``text``.

    Every token tagged ``VERB`` is classified by its dependency children:
    it is a compound form when at least one child is an auxiliary, and a
    simple form otherwise. Both the ``aux`` and the passive ``auxpass``
    dependency labels count as auxiliaries, so passives such as "was eaten"
    are treated as compound rather than simple.

    Args:
        text: Transcript string to analyse.

    Returns:
        Tuple ``(simple_verbs, compound_verbs)``.
    """
    auxiliary_deps = {"aux", "auxpass"}
    simple_verbs = 0
    compound_verbs = 0

    for token in nlp(text):
        if token.pos_ != "VERB":
            continue
        # Scan the children once; each verb is exactly one of the two kinds.
        if any(child.dep_ in auxiliary_deps for child in token.children):
            compound_verbs += 1
        else:
            simple_verbs += 1

    return simple_verbs, compound_verbs

# Add the two verb-form counts to each split. The tuple returned by
# count_verbs is wrapped in a Series so it expands into two columns.
for split_df in (train_df, test_df):
    split_df[['simple_verbs', 'compound_verbs']] = split_df['line'].apply(
        lambda line: pd.Series(count_verbs(line)))

## Number of pauses

In [67]:
def count_pauses(text):
    """Return the number of pause markers in ``text``.

    A pause marker is one to three dots wrapped in parentheses, i.e. ``(.)``,
    ``(..)`` or ``(...)``.
    """
    pause_pattern = re.compile(r'\(\.{1,3}\)')
    return sum(1 for _ in pause_pattern.finditer(text))

# Store the per-transcript pause count on both splits.
for split_df in (train_df, test_df):
    split_df['pause_count'] = split_df['line'].apply(count_pauses)

## Average word length (in letters)

In [73]:
def average_word_length(text):
    """Return the mean number of characters per word in ``text``.

    Every character in ``string.punctuation`` is stripped before the text is
    split on whitespace, so tokens made only of punctuation (such as the
    pause marker ``(...)``) disappear and do not count as words.

    Args:
        text: Transcript string to analyse.

    Returns:
        Average word length as a float, or ``0.0`` when no words remain
        (empty or punctuation-only text) instead of raising
        ZeroDivisionError.
    """
    words = text.translate(str.maketrans('', '', string.punctuation)).split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Store the average word length of each transcript on both splits.
for split_df in (train_df, test_df):
    split_df['avg_word_length'] = split_df['line'].apply(average_word_length)


## Proportion of pronouns

In [77]:
def calculate_pronoun_proportion(text):
    """Return the number of pronoun tokens divided by the number of alphabetic tokens.

    The denominator counts tokens whose ``is_alpha`` flag is set; the
    numerator counts every token tagged ``PRON``, whether or not it is
    alphabetic. Returns 0 when the text has no alphabetic tokens.
    """
    alpha_count = 0
    pronoun_count = 0
    for token in nlp(text):
        if token.is_alpha:
            alpha_count += 1
        if token.pos_ == "PRON":
            pronoun_count += 1

    # Avoid dividing by zero for empty or non-alphabetic input.
    return pronoun_count / alpha_count if alpha_count else 0

# Store the pronoun proportion of each transcript on both splits.
for split_df in (train_df, test_df):
    split_df['pronoun_proportion'] = split_df['line'].apply(calculate_pronoun_proportion)

## Proportion of non-dictionary words

In [87]:
# Shared SpellChecker instance; calculate_non_dictionary_proportion tests
# word membership against it with `in`.
spell = SpellChecker()

def calculate_non_dictionary_proportion(text):
    """Return the fraction of words in ``text`` that are not dictionary words.

    The characters ``?!.,()+"`` are stripped and the text is split on
    whitespace. A word counts as non-dictionary when its lowercase form is
    not found in ``spell`` and does not end with ``'in'``.

    Args:
        text: Transcript string to analyse.

    Returns:
        Proportion in ``[0, 1]``, or ``0.0`` when no words remain after
        stripping (empty or punctuation-only text) instead of raising
        ZeroDivisionError.
    """
    words = text.translate(str.maketrans('', '', '?!.,()+"')).split()
    if not words:
        return 0.0

    # NOTE(review): the 'in' suffix exemption presumably spares transcribed
    # dropped-g forms such as "goin" — confirm against the transcription rules.
    non_dictionary_count = sum(
        1
        for word in (raw.lower() for raw in words)
        if word not in spell and not word.endswith('in')
    )
    return non_dictionary_count / len(words)

# Store the non-dictionary word proportion of each transcript on both splits.
for split_df in (train_df, test_df):
    split_df['non_dict_word_proportion'] = split_df['line'].apply(
        calculate_non_dictionary_proportion)

## Proportion of words in adverb category

In [91]:
def calculate_adverb_proportion(text):
    """Return the number of adverb tokens divided by the number of alphabetic tokens.

    The denominator counts tokens whose ``is_alpha`` flag is set; the
    numerator counts every token tagged ``ADV``.

    Args:
        text: Transcript string to analyse.

    Returns:
        The adverb proportion, or 0 when the text has no alphabetic tokens
        (the same convention as ``calculate_pronoun_proportion``) instead of
        raising ZeroDivisionError.
    """
    doc = nlp(text)
    total_words = sum(1 for token in doc if token.is_alpha)
    if total_words == 0:
        return 0
    adverbs = sum(1 for token in doc if token.pos_ == "ADV")
    return adverbs / total_words

# Store the adverb proportion of each transcript on both splits.
for split_df in (train_df, test_df):
    split_df['adverb_proportion'] = split_df['line'].apply(calculate_adverb_proportion)

In [93]:
# Write the training frame, including the feature columns added above, to CSV.
train_df.to_csv('/Users/catherine/Desktop/Fall 2024/nlp-predict-AD/data/train_features.csv')

In [94]:
# Write the test frame, including the feature columns added above, to CSV.
test_df.to_csv('/Users/catherine/Desktop/Fall 2024/nlp-predict-AD/data/test_features.csv')