## Feature extraction using python code
In the end, we have selected only the following features from our code: <br>
(future tense, past tense, sentence count, type-token ratio, content-word diversity, and the total number of first-person pronouns)<br>
The rest are derived from features extracted using LIWC.


### Imports

In [1]:
import numpy as np
import pandas as pd
import re, nltk

In [2]:
from nltk import pos_tag
from nltk import regexp_tokenize
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [3]:
from langdetect import detect

In [4]:
import spacy
nlp_en = spacy.load("en_core_web_sm")
nlp_de = spacy.load("de_core_news_sm")
nlp_en.max_length = 1700000 
nlp_de.max_length = 1700000 

### Dataset 

In [5]:
# Define the file path to the CSV file in the "data" directory
file_path = "../data/reports/sentences_from_reports.csv"

# Read the CSV file
df = pd.read_csv(file_path)

# Ensure the 'sentence' column is of string data type
df['sentence'] = df['sentence'].astype(str)

In [6]:
def espace(text) : 
    text = text+' '
    return text
df["sentence"] = df["sentence"].apply(espace)
sentences = df.groupby(['company','year'],as_index=False)['sentence'].sum()

### Utill Functions 

In [8]:
def cleanText(text):
        text = str(text)  # convert into string
        text = text.lower()  # convert to lower case
        text = re.sub('[0-9]', '', text)  # remove numbers
        text = re.sub(r'\[.+?\-,_]', ' ', text)  # remove bracket
        return text

In [9]:
df['sentence'] = df['sentence'].astype(str)

In [10]:
def detect_language(text):
    """
    Detect the language of the given text.

    Parameters:
    text (str): Input text.

    Returns:
    str: Detected language ('en' for English, 'de' for German), or 'unknown' if language detection failed.
    """
    try:
        detected_lang = detect(text)
        if detected_lang == 'en':
            return 'en'
        elif detected_lang == 'de':
            return 'de'
        else:
            return 'unknown'
    except Exception as e:
        print("Language detection failed.")
        return 'unknown'

### Linguistic features

#### Feature: word count

In [11]:
def word_count(raw, min_length = 1):
    """ Function to count the number of words in a passage of text.
        Supplying parameter 'min_length' gives number of words with
        at least min_length letters.
    """
    words = re.sub(r'[^\w\s]',' ',raw)
    text = re.sub('[0-9]', '', words)
    tokens = nltk.word_tokenize(words)
    return len([word for word in tokens if len(word) >= min_length])

In [12]:
df['word_counts'] = df['sentence'].apply(word_count)
df_word_counts = df.groupby(['company','year'],as_index=False)['word_counts'].sum()

#### Feature: sentence count

In [13]:
df_sen_counts = df.groupby(['company','year'],as_index=False).size()
df_sen_counts = df_sen_counts.rename(columns={"size": "sentence_counts"})

In [14]:
features = pd.concat([df_word_counts , df_sen_counts["sentence_counts"]], axis=1)

#### Feature: type-token ratio
The number of unique words was divided by the total number of words within each document

In [16]:
def getTypeData(tokenList):
        """
    Calculate type-based data for a given token list.

    Parameters:
    tokenList (list): List of tokens.

    Returns:
    DataFrame: DataFrame containing type-based data (type, frequency, probability, rank, cumulative probability, part of speech).
    """
        typeData = pd.DataFrame()
        freqDist = FreqDist(tokenList)
        typeNum = len(freqDist)
        typeData['type'] = FreqDist(tokenList).keys()
        typeData['freq'] = [freqDist[i] for i in typeData['type']]
        tokenNum = np.sum(typeData['freq'])
        typeData = typeData.sort_values('freq', ascending=False)
        typeData.index = range(typeNum)
        typeData['prob'] = [freq / tokenNum for freq in typeData['freq']]
        typeData['rank'] = list(range(1, typeNum + 1))
        typeData['cumProb'] = [np.sum(typeData['prob'][:i]) for i in
                               range(1, typeNum + 1)]
        typeData['pos'] = [i[1] for i in pos_tag(typeData['type'])]
        return typeData

def getTTR(rawtext):
        """
    Calculate type-token ratio (TTR) for a given text.

    Parameters:
    rawtext (str): Input text.

    Returns:
    float: Type-token ratio (TTR) for the input text.
    """
        text = cleanText(rawtext)  # clean text
        tokenList = regexp_tokenize(text, '\w+')  # tokenize
        tokenNum = len(tokenList)  # calculate token number
        typeData = getTypeData(tokenList)
        typeNum = len(typeData)
        ttr = typeNum / tokenNum
        return ttr

In [17]:
features["type_token"] = sentences["sentence"].apply(getTTR)

#### feature: content-word diversity
Content (or lexical) words according to consist of nouns, main verbs, adjectives and adverbs. <br>
The number of content words was divided by the total number of words within each document.

In [18]:
def calculate_content_word_diversity(text):
    """
    Calculate Content Word Diversity for a given text.
    Content words consist of nouns, main verbs, adjectives, and adverbs.
    Content Word Diversity = (Number of content words) / (Total number of words)

    Parameters:
    text (str): Input text for which Content Word Diversity is to be calculated.

    Returns:
    float: Content Word Diversity for the input text.
    """
    words = word_tokenize(text.lower())
    pos_tags = nltk.pos_tag(words)
    content_word_count = sum(1 for word, pos in pos_tags if pos.startswith(('N', 'V', 'R', 'J')))
    total_word_count = len(words)
    content_word_diversity = content_word_count / total_word_count
    return content_word_diversity

In [19]:
features["content_word_diversity"] = sentences["sentence"].apply(calculate_content_word_diversity)

In [20]:
#features

#### features : average sentence length and average word length

In [21]:
def average_sentence_length(text):
    """
    Calculate the average sentence length in terms of words in a given text.

    Parameters:
    text (str): Input text.

    Returns:
    float: Average sentence length (in words).
    """
    sentences = nltk.sent_tokenize(text)
    total_words = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
    total_sentences = len(sentences)
    if total_sentences == 0:
        return 0
    average_length = total_words / total_sentences
    return average_length

def average_word_length(text):
    """
    Calculate the average word length in characters in a given text.

    Parameters:
    text (str): Input text.

    Returns:
    float: Average word length (in characters).
    """
    words = nltk.word_tokenize(text)
    total_characters = sum(len(word) for word in words)
    total_words = len(words)
    if total_words == 0:
        return 0
    average_length = total_characters / total_words
    return average_length

In [22]:
features["average_sentence_length"] = sentences["sentence"].apply(average_sentence_length)
features["average_word_length"] = sentences["sentence"].apply(average_word_length)

In [23]:
#features

#### features : Space Ratio

In [24]:
def posTag(rawText):
    """
    This function performs part-of-speech tagging on the input raw text using NLTK.
    
    Parameters:
    rawText (str): The raw text to be tagged for part-of-speech.
    
    Returns:
    str: A string representing the tagged text, where each word is combined with its
         corresponding part-of-speech tag in the format 'word_POS'.
    """
    # Perform part-of-speech tagging on the raw text
    tagList = pos_tag(word_tokenize(rawText))
    
    # Construct a list of words and their part-of-speech tags in a specific format (word_POS)
    resultList = [i[0].lower() + '_' + i[1] for i in tagList]
    
    # Join the words and their part-of-speech tags to form the tagged text
    taggedText = ' '.join(resultList)
    
    # Return the tagged text
    return taggedText

    
def calculate_spatial_adverbial_ratio(text):
    """
    Calculate the ratio of spatial adverbials for the given language using part-of-speech tagging.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of spatial adverbials.
    """
    # Clean the text
    text = cleanText(text)
    language = detect_language(text)
    
    # Tokenize the text using regexp_tokenize
    tokenList = regexp_tokenize(text, '\w+')
    tokenNum = len(tokenList)
    
    # Perform part-of-speech tagging and join into a tagged text
    taggedText = posTag(text)

    # Define the pattern to match spatial adverbials based on the provided list
    english_spatial_pattern = r'( (aboard|above|abroad|across|ahead|alongside|around' \
                              r'|ashore|astern|away|behind|below|beneath|beside|downhill' \
                              r'|downstairs|downstream|east|far|here|hereabouts|indoors|inland' \
                              r'|inshore|inside|locally|near|nearby|north|nowhere|outdoors' \
                              r'|outside|overboard|overland|overseas|south|underfoot|there' \
                              r'|underground|underneath|uphill|upstairs|upstream|west)' \
                              r'_[A-Z]+)'

    german_spatial_pattern = r'( (hier| dort|dorthin|oben|unten|nahe|weit|innen|außen'\
                                r'|an Bord|überall|quer|voraus|entlang|um|an Land|achtern'\
                                r'|weg|hinter|neben|bergab|unten|stromabwärts|östlich|hier'\
                                r'|in der Nähe|drinnen|im Landesinneren|an Land|lokal|in der Nähe'\
                                r'|nördlich|nirgendwo|im Freien|über Bord|über Land|überseeisch'\
                                r'|südlich|unter den Füßen|unterirdisch|unterhalb|bergauf|oben|stromaufwärts|westlich)'\
                                r'_[A-Z]+)'
    
    # Select the appropriate pattern based on the language
    if language == 'en':
        pattern = english_spatial_pattern
    elif language == 'de':
        pattern = german_spatial_pattern
    else:
        pattern = german_spatial_pattern
        #print (text)
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of matches for the pattern (spatial adverbials)
    num = len(re.findall(pattern, taggedText))

    # Calculate the ratio of spatial adverbials
    spatial_adverbial_ratio = 100 * num / tokenNum if tokenNum != 0 else 0
    return spatial_adverbial_ratio

In [25]:
features["spatial_adverbial_ratio"] = sentences["sentence"].apply(calculate_spatial_adverbial_ratio)

#### features : Time Ratio

In [26]:
def calculate_time_adverbial_ratio(text):
    """
    Calculate the ratio of time adverbials for English and German using part-of-speech tagging.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of time adverbials.
    """
    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)
    
    # Tokenize the text using regexp_tokenize
    tokenList = regexp_tokenize(cleaned_text, '\w+')
    tokenNum = len(tokenList)

    # Perform part-of-speech tagging and join into a tagged text
    taggedText = posTag(text)

    # Define the patterns for time adverbials
    english_time_pattern =  r'( (afterwards|again|earlier|early|eventually|formerly' \
                            r'|immediately|initially|instantly|late|lately|later' \
                            r'|momentarily|now|nowadays|once|originally|presently' \
                            r'|previously|recently|shortly|simultaneously|soon' \
                            r'|subsequently|today|tomorrow|tonight|yesterday' \
                            r'|morning|afternoon|evening|night)' \
                            r'_[A-Z]+)'
    
    german_time_pattern =  r'( (nachher|wieder|früher|früh|schließlich|ehemals'\
                           r'|sofort|anfänglich|augenblicklich|spät|Zeit|später'\
                           r'|moment|jetzt|heutzutage|einmal|ursprünglich|gegenwärtig'\
                           r'|früher|kürzlich|bald|gleichzeitig|bald|anschließend'\
                           r'|morgen|nachmittag|abend|nachts)'\
                           r'_[A-Z]+)'

    # Select the appropriate pattern based on the language
    if language == 'en':
        pattern = english_time_pattern
    elif language == 'de':
        pattern = german_time_pattern
    else:
        pattern = german_time_pattern
        
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of matches for the pattern (time adverbials)
    num = len(re.findall(pattern, taggedText))
    
    # Calculate the ratio of time adverbials
    time_adverbial_ratio = 100 * num / tokenNum if tokenNum != 0 else 0
    return time_adverbial_ratio

In [27]:
features["time_adverbial_ratio"] = sentences["sentence"].apply(calculate_time_adverbial_ratio)

#### features : Generalizing word

In [28]:
def calculate_generalizing_term_ratio(text):
    """
    Calculate the ratio of generalizing terms to the total number of words in the text for English and German.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of generalizing terms to the total number of words.
    """
    # Detect the language of the text
    cleaned_text = cleanText(text)
    language = detect_language(text)

    if language not in ['en', 'de']:
        raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text, disable=['ner', 'parser'])
        generalizing_terms = {'often', 'sometimes', 'usually', 'generally', 'typically', 'commonly', 'mostly', 'general'}
    elif language == 'de':
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        generalizing_terms = {'oft', 'manchmal', 'normalerweise', 'generell', 'typischerweise', 'üblicherweise', 'meistens', 'allgemein'}
    else:
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        generalizing_terms = {'oft', 'manchmal', 'normalerweise', 'generell', 'typischerweise', 'üblicherweise', 'meistens', 'allgemein'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of generalizing terms
    
    generalizing_term_count = sum(1 for token in doc if token.text.lower() in generalizing_terms)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (generalizing_term_count / total_words) * 100 if total_words != 0 else 0
    return ratio

#### features : Count modal verbe

In [29]:
def calculate_modal_verb_ratio(text):
    """
    Calculate the ratio of modal verbs to the total number of words in the text using spaCy.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of modal verbs to the total number of words.
    """

    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text, disable=['ner', 'parser'])
        modal_verbs = {'can', 'could', 'will', 'would', 'shall', 'should', 'may', 'might', 'must'}
    elif language == 'de':
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        modal_verbs = {'können', 'kann', 'könnte', 'will', 'würde', 'soll', 'sollte', 'mögen', 'müsste', 'muss'}
    else:
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        modal_verbs = {'können', 'kann', 'könnte', 'will', 'würde', 'soll', 'sollte', 'mögen', 'müsste', 'muss'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of modal verbs
    modal_verb_count = sum(1 for token in doc if token.text.lower() in modal_verbs)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (modal_verb_count / total_words) * 100 if total_words != 0 else 0
    return ratio


In [30]:
features["modal_verb_ratio"] = sentences["sentence"].apply(calculate_modal_verb_ratio)

#### features : Total first person / Impersonal  pronouns 

In [31]:
def calculate_first_person_ratio(text):
    """
    Calculate the ratio of total first-person pronouns to the total number of words in the text using spaCy.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of total first-person pronouns to the total number of words.
    """

    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text , disable=['ner', 'parser'])
        first_person_pronouns = {'I', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'}
    elif language == 'de':
        doc = nlp_de(cleaned_text , disable=['ner', 'parser'])
        first_person_pronouns = {'ich', 'mich', 'mein', 'meine', 'mir', 'wir', 'uns', 'unser', 'unsere', 'uns'}
    else:
        doc = nlp_de(cleaned_text , disable=['ner', 'parser'])
        first_person_pronouns = {'ich', 'mich', 'mein', 'meine', 'mir', 'wir', 'uns', 'unser', 'unsere', 'uns'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of first-person pronouns
    first_person_count = sum(1 for token in doc if token.text.lower() in first_person_pronouns)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (first_person_count / total_words) * 100 if total_words != 0 else 0
    return ratio


In [32]:
def calculate_impersonal_pronoun_ratio(text):
    """
    Calculate the ratio of impersonal pronouns to the total number of words in the text using spaCy.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of impersonal pronouns to the total number of words.
    """

    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text , disable=['ner', 'parser'])
        impersonal_pronouns = {'it', 'its'}
    elif language == 'de':
        doc = nlp_de(cleaned_text , disable=['ner', 'parser'])
        impersonal_pronouns = {'es'}
    else:
        impersonal_pronouns = {'es'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of impersonal pronouns
    impersonal_pronoun_count = sum(1 for token in doc if token.text.lower() in impersonal_pronouns)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (impersonal_pronoun_count / total_words) * 100 if total_words != 0 else 0
    return ratio


In [33]:
features["first_person_ratio"] = sentences["sentence"].apply(calculate_first_person_ratio)

In [34]:
features["impersonal_pronoun_ratio"] = sentences["sentence"].apply(calculate_impersonal_pronoun_ratio)

#### features : Negations

In [35]:
def calculate_negation_ratio(text):
    """
    Calculate the ratio of negation words to the total number of words in the text.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of negation words to the total number of words.
    """

    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text, disable=['ner', 'parser'])
        negation_words = {'not', "n't", 'never', 'no', 'nothing', 'neither', 'nowhere', 'none', 'nor'}
    elif language == 'de':
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        negation_words = {'nicht', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'nirgendwo', 'niemals', 'nie'}
    else:
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        negation_words = {'nicht', 'kein', 'keine', 'keinem', 'keinen', 'keiner', 'nirgendwo', 'niemals', 'nie'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of negation words
    negation_count = sum(1 for token in doc if token.text.lower() in negation_words)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (negation_count / total_words) * 100 if total_words != 0 else 0
    return ratio


In [36]:
features["negation_ratio"] = sentences["sentence"].apply(calculate_negation_ratio)

#### features : Conjuctions Exclusive/ Inclusive
maybe i will use LIWC to calculate exclusive/inclusive because i can't finde the dic

In [37]:
def calculate_exclusive_conjunction_ratio(text):
    """
    Calculate the ratio of exclusive conjunctions to the total number of words in the text.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of exclusive conjunctions to the total number of words.
    """

    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text, disable=['ner', 'parser'])
        exclusive_conjunctions = {'but', 'yet'}
    elif language == 'de':
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        exclusive_conjunctions = {'aber', 'sondern'}
    else:
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        exclusive_conjunctions = {'aber', 'sondern'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of exclusive conjunctions
    exclusive_conjunction_count = sum(1 for token in doc if token.text.lower() in exclusive_conjunctions)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (exclusive_conjunction_count / total_words) * 100 if total_words != 0 else 0
    return ratio


def calculate_inclusive_conjunction_ratio(text):
    """
    Calculate the ratio of inclusive conjunctions to the total number of words in the text.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of inclusive conjunctions to the total number of words.
    """

    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text, disable=['ner', 'parser'])
        inclusive_conjunctions = {'and', 'or'}
    elif language == 'de':
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        inclusive_conjunctions = {'und', 'oder'}
    else:
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        inclusive_conjunctions = {'und', 'oder'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of inclusive conjunctions
    inclusive_conjunction_count = sum(1 for token in doc if token.text.lower() in inclusive_conjunctions)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (inclusive_conjunction_count / total_words) * 100 if total_words != 0 else 0
    return ratio


In [38]:
features["exclusive_conjunction_ratio"] = sentences["sentence"].apply(calculate_exclusive_conjunction_ratio)
features["inclusive_conjunction_ratio"] = sentences["sentence"].apply(calculate_inclusive_conjunction_ratio)

#### features : Tentative

In [39]:
def calculate_tentative_word_ratio(text):
    """
    Calculate the ratio of tentative words to the total number of words in the text.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of tentative words to the total number of words.
    """

    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text, disable=['ner', 'parser'])
        tentative_words = {'maybe', 'perhaps', 'possibly', 'probably', 'uncertain', 'doubt', 'likely', 'would', 'could'}
    elif language == 'de':
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        tentative_words = {'vielleicht', 'möglicherweise', 'eventuell', 'wahrscheinlich', 'unsicher', 'zweifel', 'vermutlich', 'würde', 'könnte'}
    else:
        doc = nlp_de(cleaned_text, disable=['ner', 'parser'])
        tentative_words = {'vielleicht', 'möglicherweise', 'eventuell', 'wahrscheinlich', 'unsicher', 'zweifel', 'vermutlich', 'würde', 'könnte'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of tentative words
    tentative_word_count = sum(1 for token in doc if token.text.lower() in tentative_words)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (tentative_word_count / total_words) * 100 if total_words != 0 else 0
    return ratio


In [40]:
features["tentative_word_ratio"] = sentences["sentence"].apply(calculate_tentative_word_ratio)

#### features : Informal
maybe i will use LIWC

#### features : Count Number

In [43]:
def calculate_number_count_ratio(text):
    """
    Calculate the ratio of number counts to the total number of words in the text for English and German.

    Parameters:
    text (str): Input text.

    Returns:
    float: Ratio of number counts to the total number of words.
    """
    # Detect the language of the text
    language = detect_language(text)

    #if language not in ['en', 'de']:
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(text, disable=['ner', 'parser'])
    elif language == 'de':
        doc = nlp_de(text, disable=['ner', 'parser'])
    else :
        doc = nlp_de(text, disable=['ner', 'parser'])

    # Count the number of numeric tokens
    number_count = sum(1 for token in doc if token.is_digit)

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (number_count / total_words) * 100 if total_words != 0 else 0
    return ratio


In [44]:
features["number_count_ratio"] = sentences["sentence"].apply(calculate_number_count_ratio)

#### features : Future Tense

In [45]:
def calculate_future_verb_ratio(text):
    """
    Calculate the percentage ratio of future tense verbs to the total number of words in the text using spaCy.

    Parameters:
    text (str): Input text.

    Returns:
    float: Percentage ratio of future tense verbs to the total number of words.
    """

    # Detect the language
    language = detect_language(text)

    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(text, disable=['ner', 'parser'])
        future_tense_forms = {'will', 'shall'}
    elif language == 'de':
        doc = nlp_de(text, disable=['ner', 'parser'])
        future_tense_forms = {'werde', 'wirst', 'wird', 'werden'}
    else:
        doc = nlp_de(text, disable=['ner', 'parser'])
        future_tense_forms = {'werde', 'wirst', 'wird', 'werden'}
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of future tense verbs
    future_verb_count = sum(1 for token in doc if token.lemma_ in future_tense_forms and token.pos_ == "AUX")

    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (future_verb_count / total_words) * 100 if total_words != 0 else 0
    return ratio

In [46]:
features["future_verb_ratio"] = sentences["sentence"].apply(calculate_future_verb_ratio)

#### features : Past Tense

In [47]:
def calculate_past_verb_ratio(text):
    """
    Calculate the percentage ratio of past tense verbs to the total number of words in the text using spaCy.

    Parameters:
    text (str): Input text.
    language (str): Language of the text ('en' for English, 'de' for German).

    Returns:
    float: Percentage ratio of past tense verbs to the total number of words.
    """
    
    # Clean the text and detect the language
    cleaned_text = cleanText(text)
    language = detect_language(text)
    
    # Process the text with the appropriate spaCy model
    if language == 'en':
        doc = nlp_en(cleaned_text,disable = ['ner', 'parser'])
    elif language == 'de':
        doc = nlp_de(cleaned_text,disable = ['ner', 'parser'])
    else:
        doc = nlp_de(cleaned_text,disable = ['ner', 'parser'])
        #raise ValueError("Unsupported language. Supported languages: 'en' (English), 'de' (German)")

    # Count the number of past tense verbs (VERB with past tense morphology)
    past_verb_count = sum(1 for token in doc if token.pos_ == 'VERB' and ('VBD'or'VVFIN' in token.tag_))
    
    # Total number of words
    total_words = len(doc)

    # Calculate the ratio
    ratio = (past_verb_count / total_words) * 100 if total_words != 0 else 0
    return ratio

In [48]:
features["past_verb_ratio"] = sentences["sentence"].apply(calculate_past_verb_ratio)

In [50]:
features

Unnamed: 0,company,year,word_counts,sentence_counts,type_token,content_word_diversity,average_sentence_length,average_word_length,spatial_adverbial_ratio,time_adverbial_ratio,modal_verb_ratio,first_person_ratio,impersonal_pronoun_ratio,negation_ratio,exclusive_conjunction_ratio,inclusive_conjunction_ratio,tentative_word_ratio,number_count_ratio,future_verb_ratio,past_verb_ratio
0,Aldi,2020,6283,371,0.178620,0.565418,19.759887,4.683345,0.102951,0.171585,0.645902,0.846849,0.100474,0.358835,0.028707,3.200804,0.043060,5.647287,0.485639,7.664705
1,Aldi,2021,8139,468,0.154746,0.572570,20.411111,4.652041,0.065738,0.065738,0.494234,1.219110,0.076881,0.186711,0.010983,2.877540,0.032949,5.176996,0.276390,7.655135
2,Allianz,2020,61333,2565,0.082896,0.578371,26.288994,5.114752,0.478576,0.162405,0.458408,2.774264,0.324458,0.151811,0.026790,4.226882,0.034232,2.453852,0.220803,10.165354
3,Allianz,2021,59048,2367,0.082174,0.575771,27.082408,5.089017,0.498630,0.111416,0.469732,2.858832,0.269085,0.183538,0.043551,4.104710,0.048218,2.500385,0.197196,9.651279
4,Asklepios,2020,12877,737,0.277373,0.828323,19.595598,6.322757,0.514974,0.158453,0.490402,2.494045,0.182149,0.427350,0.112092,3.909206,0.028023,1.378490,1.168568,8.301807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,telekom,2020,66723,3074,0.079060,0.563742,24.523163,4.913692,0.387579,0.169855,0.523400,3.281570,0.438164,0.235730,0.046613,3.332179,0.051940,1.960087,0.133044,9.446501
116,telekom,2021,86733,3809,0.069928,0.556177,26.194667,4.825764,0.256932,0.230884,0.385260,3.297784,0.450473,0.208682,0.033108,3.189429,0.036118,1.850748,0.074507,9.111796
117,volkswagen,2020,22970,599,0.138063,0.566038,46.541586,4.316807,0.324070,0.115422,0.639429,2.208935,0.535625,0.311410,0.103803,3.504401,0.041521,1.107255,0.235498,8.221226
118,volkswagen,2021,20322,532,0.135339,0.578111,48.993103,4.206175,0.324854,0.159928,0.643072,1.830642,0.492865,0.276943,0.098573,3.370259,0.028164,1.107011,0.233547,8.646264


In [49]:
features.to_csv("myfeatures.csv", encoding='utf-8', index=False)