# Inference of the English marker representations



In [None]:
# Mount Google Drive at /content/drive (Colab only); the data paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Print the notebook's current working directory.
print(os.getcwd())

/content


In [None]:
# Directory on the mounted Drive that holds the raw English data files (note the trailing slash:
# file names are appended to this string directly).
file_path='/content/drive/My Drive/Colab Notebooks/English_raw_data/'

In [None]:
# Names of the eight marker categories. Each name is also the stem of a '<name>.csv' file under file_path.
keys=['plural_noun_sentences', 'third_person_present_sentences', 'accusative_case_sentences', 'genitive_case_sentences', 'dative_case_sentences', 'sublative_case_sentences', 'translative_case_sentences', 'causal_final_case_sentences']

In [None]:
import pandas as pd
# Load one CSV per marker category into a dict keyed by the category name.
dfs={}
for key in keys:
    # file_path is used as a directory prefix here, so this must run while it still ends with '/'.
    df_path=file_path + f'{key}.csv'
    dfs[key]=pd.read_csv(df_path)

In [None]:
def read_data_as_sentence(file_path):
    """
    Parse a CoNLL-U Plus file and return one copy of each sentence per predicate.

    Reading: every line is split on TABs. Lines starting with '#' are skipped.
    A token is kept only if its ID contains no period and the line has more than
    10 columns. A blank line closes the current sentence; a sentence still open
    at the end of the file is closed as well.

    Expansion: each sentence is copied once for every token whose 'predicate'
    column is not '_'. In the copy for the n-th predicate, every token's
    'predicate' is replaced with the surface form of that predicate token, and
    'argument' is reduced to the label in the n-th argument column ('V' and
    'C-V' become '_'; a missing column is treated as '_'). Sentences without
    any predicate produce no copies.

    Parameters:
        file_path (str): The file path to the data to be preprocessed.

    Returns:
        list of list of dict: One list of token dictionaries per
        (sentence, predicate) pair. Each token has the keys 'id', 'form',
        'lemma', 'upos', 'xpos', 'head', 'dependency_relation',
        'dependency_graph', 'miscellaneous', 'predicate' and 'argument'.
    """
    sentences = []  # all sentences read from the file
    sentence = []   # tokens of the sentence currently being read

    with open(file_path, 'r', encoding="utf8") as file:
        for line in file:
            line = line.strip().split('\t')

            # Comment lines carry no token.
            if line[0].startswith('#'):
                continue

            # A blank line marks the end of a sentence.
            if line[0].strip() == '':
                sentences.append(sentence)
                sentence = []
                continue

            # Skip tokens whose ID contains a period and lines without a predicate column.
            if '.' not in line[0] and len(line) > 10:
                sentence.append({
                    'id': line[0],
                    'form': line[1],
                    'lemma': line[2],
                    'upos': line[3],
                    'xpos': line[4],
                    'head': line[6],
                    'dependency_relation': line[7],
                    'dependency_graph': line[8],
                    'miscellaneous': line[9],
                    'predicate': line[10],
                    'argument': line[11:],  # one column per predicate of the sentence
                })

    # Without this, the last sentence is lost when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)

    print(f'num of sentences before duplication{len(sentences)}\n')

    expanded_sentences = []
    for sentence in sentences:
        # Predicate tokens in sentence order; the n-th one owns the n-th argument column.
        predicate_tokens = [token for token in sentence if token['predicate'] != '_']

        for index, predicate_token in enumerate(predicate_tokens):
            # Take the form from the predicate token itself: looking it up by sense label
            # returns the wrong word when two predicates share the same label.
            predicate_form = predicate_token['form']

            sentence_copy = []
            for token in sentence:
                token_copy = token.copy()
                token_copy['predicate'] = predicate_form

                arguments = token['argument']
                # Tolerate rows with fewer argument columns than predicates.
                label = arguments[index] if index < len(arguments) else '_'
                # The predicate's own marker is not an argument label.
                token_copy['argument'] = '_' if label in ('V', 'C-V') else label
                sentence_copy.append(token_copy)

            expanded_sentences.append(sentence_copy)

    print(f'num of sentences after duplication {len(expanded_sentences)}\n')
    print(f'the difference between the two is {len(expanded_sentences) - len(sentences)}')

    return expanded_sentences

In [None]:
# Point file_path at the training file. The guard keeps this cell idempotent:
# without it, re-running the cell would append the file name a second time.
if not file_path.endswith('en_ewt-up-train.conllu'):
    file_path=f'{file_path}en_ewt-up-train.conllu'

In [None]:
# Parse the training file into per-predicate sentence copies (each a list of token dictionaries).
sents=read_data_as_sentence(file_path)

num of sentences before duplication12543

num of sentences after duplication 40482

the difference between the two is 27939


## Preprocessing of the English data to include the needed information

In [None]:
def create_sentence_dfs(sentences):
    """
    Build one DataFrame per sentence.

    Parameters:
        sentences (list of list of dict): Sentences, each given as a list of token dictionaries.

    Returns:
        list of pd.DataFrame: One DataFrame per input sentence, in input order,
        with one row per token dictionary.
    """
    frames = []
    for token_dicts in sentences:
        frames.append(pd.DataFrame(token_dicts))
    return frames

In [None]:
# One DataFrame per expanded sentence.
# NOTE(review): a later cell builds the same frames again as `sentence_dfs`; `sentences` does not
# appear to be used elsewhere in this notebook - confirm before relying on it.
sentences=create_sentence_dfs(sents)

In [None]:
import pandas as pd

def add_head_srl_labels(sentence_df):
    """
    Attach to every token the semantic role label carried by its syntactic head.

    The DataFrame is modified in place (a 'head_srl_label' column is written) and
    also returned.

    Parameters:
        sentence_df (pd.DataFrame): One sentence, one row per token, with the
            columns 'id', 'head' and 'argument'.

    Returns:
        pd.DataFrame: The same DataFrame object, now with a 'head_srl_label'
        column. Tokens whose head ID matches no token ID (e.g. the root) get "_".
    """
    # Token ID -> that token's own SRL label.
    label_by_id = dict(zip(sentence_df['id'], sentence_df['argument']))

    # Look up each token's head; heads that are not a token ID fall back to "_".
    sentence_df['head_srl_label'] = [
        label_by_id.get(head_id, "_") for head_id in sentence_df['head']
    ]
    return sentence_df



# One DataFrame per expanded sentence (one row per token).
sentence_dfs = create_sentence_dfs(sents)

# add_head_srl_labels writes the new column in place and returns its argument, so
# labeled_sentence_dfs holds the same DataFrame objects as sentence_dfs.
labeled_sentence_dfs = list(map(add_head_srl_labels, sentence_dfs))




In [None]:
import spacy
import pandas as pd

# Load spaCy's small English pipeline; add_morphological_info reads it as the module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

def add_morphological_info(sentence_dfs):
    """
    Annotate sentence DataFrames with spaCy morphology and the sentence text.

    For each DataFrame the 'form' values are joined with single spaces and the
    resulting string is parsed with the module-level ``nlp`` pipeline. A sentence
    is kept only when spaCy yields exactly as many tokens as the DataFrame has
    rows; all other sentences are left out of the result.

    Parameters:
        sentence_dfs (list of pd.DataFrame): Sentence DataFrames with a 'form' column.

    Returns:
        list of pd.DataFrame: Copies of the kept DataFrames with two extra columns:
        'morph' (one dict of morphological features per token) and 'text' (the
        joined sentence, repeated on every row). The input DataFrames are not modified.
    """
    annotated_dfs = []

    for sentence_df in sentence_dfs:
        text = " ".join(sentence_df['form'])
        morph_dicts = [spacy_token.morph.to_dict() for spacy_token in nlp(text)]

        # spaCy tokenizes the joined text itself; when its token count differs from the
        # row count the features cannot be assigned row by row, so skip the sentence.
        if len(morph_dicts) != len(sentence_df):
            continue

        annotated = sentence_df.copy()
        annotated['morph'] = morph_dicts
        annotated['text'] = text
        annotated_dfs.append(annotated)

    return annotated_dfs

# Add spaCy morphology ('morph') and the joined sentence text ('text') to each labeled sentence DataFrame.
# Sentences whose spaCy token count differs from their row count are left out of the result.
sentence_dfs_with_morph = add_morphological_info(labeled_sentence_dfs)

def filter_sentences(sentence_dfs):
    """
    Group sentence DataFrames by the grammatical markers they contain.

    A sentence is added to every category for which at least one of its tokens
    passes that category's test, so the same DataFrame object may appear in
    several lists. ``add_head_srl_labels`` is called on each DataFrame first,
    which (re)writes its 'head_srl_label' column in place.

    Parameters:
        sentence_dfs (list of pd.DataFrame): Sentence DataFrames with the columns
            'id', 'head', 'form', 'upos', 'argument' and 'morph' ('morph' holding
            one dict of morphological features per token).

    Returns:
        dict: Category name -> list of matching DataFrames, with the keys
        'plural_noun_sentences', 'third_person_present_sentences',
        'accusative_case_sentences', 'genitive_case_sentences',
        'dative_case_sentences', 'sublative_case_sentences',
        'translative_case_sentences' and 'causal_final_case_sentences'.
    """
    buckets = {
        "plural_noun_sentences": [],
        "third_person_present_sentences": [],
        "accusative_case_sentences": [],
        "genitive_case_sentences": [],
        "dative_case_sentences": [],
        "sublative_case_sentences": [],
        "translative_case_sentences": [],
        "causal_final_case_sentences": [],
    }

    for df in sentence_dfs:
        # Make sure the 'head_srl_label' column exists before it is tested below.
        add_head_srl_labels(df)

        upos = df['upos']
        morph = df['morph']
        form = df['form']
        head_label = df['head_srl_label']

        is_plural = morph.apply(lambda feats: feats.get('Number') == 'Plur')
        is_third_person = morph.apply(lambda feats: feats.get('Person') == '3')
        is_present = morph.apply(lambda feats: feats.get('Tense') == 'Pres')

        matches = {
            # 1. A noun with plural number.
            "plural_noun_sentences": ((upos == 'NOUN') & is_plural).any(),
            # 2. A verb that is both third person and present tense.
            "third_person_present_sentences": ((upos == 'VERB') & is_third_person & is_present).any(),
            # 3. "him" / "them" / "me" labelled ARG1.
            "accusative_case_sentences": (form.isin(['him', 'them', 'me']) & (df['argument'] == 'ARG1')).any(),
            # 4. A form ending in the possessive marker 's.
            "genitive_case_sentences": form.str.endswith("'s").any(),
            # 5. "to" / "for" whose head is labelled ARG2.
            "dative_case_sentences": (form.isin(['to', 'for']) & (head_label == 'ARG2')).any(),
            # 6. The form "on" (the test is on "on", not "onto").
            "sublative_case_sentences": (form == 'on').any(),
            # 7. The form "into".
            "translative_case_sentences": (form == 'into').any(),
            # 8. "for" whose head is not labelled ARG2.
            "causal_final_case_sentences": ((form == 'for') & (head_label != 'ARG2')).any(),
        }

        for category, matched in matches.items():
            if matched:
                buckets[category].append(df)

    return buckets

# Group the morphology-annotated sentences by marker category.
# The result is a dict: category name -> list of sentence DataFrames.
filtered_sentences = filter_sentences(sentence_dfs_with_morph)






In [None]:
import pandas as pd
import spacy
from transformers import BertTokenizer

# Load the spaCy model for English
nlp = spacy.load("en_core_web_sm")

# Load the BERT tokenizer. extract_target_morpheme_subtoken reads this module-level
# `tokenizer` directly rather than receiving it as an argument.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def extract_target_morpheme_subtoken(sentence, doc, bert_subtokens, bert_token_ids):
    """
    Describe the first token of a parsed sentence that matches a target class.

    The tokens of ``doc`` are scanned in order. Each token's text is tokenized on
    its own with the module-level ``tokenizer``, and a running count of those
    subtokens gives the token's start and end subtoken positions. The scan stops
    at the first token that matches one of the target classes.

    NOTE(review): the positions come from tokenizing words one by one; this
    assumes they line up with the tokenization of the whole sentence - TODO confirm.

    Parameters:
        sentence (str): The sentence text; stored unchanged in the result.
        doc: The spaCy-parsed sentence.
        bert_subtokens (list of str): Subtokens of the whole sentence; only joined
            with spaces into the 'BERT Subtokenized Sentence' field.
        bert_token_ids: Not used by this function.

    Returns:
        dict or None: A row describing the first matching token (word, lemma, POS,
        selected morphological features, its subtokens and their positions, the
        target class and the sentence), or None when no token matches.
    """

    def first_value(values):
        # Keep the first entry of a feature's value list, or None when it is empty.
        return values[0] if values else None

    def classify(token):
        # The tests are tried in this fixed order; the first one that holds wins.
        morph = token.morph
        text = token.text
        if token.pos_ == 'NOUN' and morph.get("Number") == ["Plur"]:
            return "Plural Noun"
        if token.pos_ == 'VERB' and morph.get("Person") == ["3"] and morph.get("Tense") == ["Pres"]:
            return "Third-Person Present Tense Verb"
        # NOTE(review): ent_type_ is spaCy's named-entity label, so the comparisons with
        # 'ARG1' / 'ARG2' below presumably never match an SRL label - confirm.
        if text in ['him', 'them', 'me'] and token.ent_type_ == 'ARG1':
            return "Accusative Case"
        if text.endswith("'s"):
            return "Genitive Case"
        if text in ['to', 'for'] and token.head.ent_type_ == 'ARG2':
            return "Dative Case"
        if text == 'on':
            return "Sublative Case"
        if text == 'into':
            return "Translative Case"
        if text == 'for' and token.head.ent_type_ != 'ARG2':
            return "Causal-Final Case"
        return None

    # Number of subtokens produced by the tokens seen so far.
    subtokens_before = 0

    for token in doc:
        form = token.text
        lemma = token.lemma_

        pieces = tokenizer.tokenize(form)
        piece_ids = tokenizer.convert_tokens_to_ids(pieces)

        target_class = classify(token)
        if target_class is None:
            subtokens_before += len(pieces)
            continue

        # Only the first target token of a sentence is reported.
        morph = token.morph
        return {
            "Word": form,
            "Lemma": lemma,
            "POS": token.pos_,
            "Case": first_value(morph.get("Case")),
            "Number": first_value(morph.get("Number")),
            "Person": first_value(morph.get("Person")),
            "Tense": first_value(morph.get("Tense")),
            "Mood": first_value(morph.get("Mood")),
            "Voice": first_value(morph.get("Voice")),
            "Form-Lemma Difference": 0 if form == lemma else len(form) - len(lemma),
            "Subtokens": pieces,
            "Subtoken IDs": piece_ids,
            "Subtoken Start Index": subtokens_before,
            "Subtoken End Index": subtokens_before + len(pieces) - 1,
            "Target Class": target_class,
            "BERT Subtokenized Sentence": ' '.join(bert_subtokens),
            "Sentence": sentence,
        }

    return None

def process_target_instances(df_list, sentence_column, nlp, tokenizer):
    """
    Collect one target-token row per distinct sentence found in a list of DataFrames.

    Each distinct value of ``sentence_column`` is processed once, the first time it
    is met across all DataFrames: it is parsed with ``nlp``, tokenized with
    ``tokenizer`` and handed to ``extract_target_morpheme_subtoken``.

    Parameters:
        df_list (list of pd.DataFrame): DataFrames that contain ``sentence_column``.
        sentence_column (str): Name of the column holding the sentence text.
        nlp: Callable that parses a sentence string.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        pd.DataFrame: One row per sentence for which a target token was found.
    """
    seen_sentences = set()
    collected_rows = []

    for df in df_list:
        # Within one DataFrame, only the first occurrence of each sentence is visited.
        first_occurrences = df.drop_duplicates(subset=[sentence_column])

        for sentence in first_occurrences[sentence_column]:
            # A sentence already handled in an earlier DataFrame is not processed again.
            if sentence in seen_sentences:
                continue
            seen_sentences.add(sentence)

            doc = nlp(sentence)
            subtokens = tokenizer.tokenize(sentence)
            subtoken_ids = tokenizer.convert_tokens_to_ids(subtokens)

            found = extract_target_morpheme_subtoken(sentence, doc, subtokens, subtoken_ids)
            if found:
                collected_rows.append(found)

    return pd.DataFrame(collected_rows)

# Example usage with a list of DataFrames (e.g., filtered_sentences values)
# final_target_df = process_target_instances(filtered_sentences_list, 'text', nlp, tokenizer)




In [None]:
# Build one table of target instances per marker category.
target_items={}
full_items=[]  # not populated in this cell; kept so the name stays defined
for key, value in filtered_sentences.items():
    # Pass every sentence DataFrame of the category. The previous `value[:-1]`
    # slice silently discarded the last sentence of each category.
    final_target_df = process_target_instances(value, 'text', nlp, tokenizer)

    target_items[key]=final_target_df

In [None]:
# Deduplicated copies are stored separately so that `target_items` stays untouched.
deduplicated_target_items = {}

for key, value in target_items.items():
    # Work on a copy of the category's table.
    df_copy = value.copy()

    # Report the category name and its row count before deduplication, then a blank line.
    print(key, len(value), "", sep="\n")

    # Compare rows only on columns whose first value is not a list
    # (presumably because list cells cannot be hashed for the comparison).
    non_list_columns = [col for col in df_copy.columns if not isinstance(df_copy[col].iloc[0], list)]
    deduplicated_df = df_copy.drop_duplicates(subset=non_list_columns)

    # Row count after deduplication.
    print(len(deduplicated_df))

    deduplicated_target_items[key] = deduplicated_df


plural_noun_sentences
4346

4346
third_person_present_sentences
1238

1238
accusative_case_sentences
326

326
genitive_case_sentences
737

737
dative_case_sentences
1025

1025
sublative_case_sentences
948

948
translative_case_sentences
175

175
causal_final_case_sentences
1388

1388


In [None]:
import torch
import numpy as np
import pickle
from transformers import BertTokenizer, BertModel

# Load the English BERT tokenizer and model with hidden states enabled
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

# Use the GPU when one is available; otherwise stay on the CPU so this cell
# does not fail on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Ensure the model is in evaluation mode
model.eval()

def extract_bert_hidden_representations(df, output_file="representations_english.pkl", model=model, tokenizer=tokenizer, sample_frac=0.5, random_state=42):
    """
    Extract the hidden states of each row's target subtoken from every model layer.

    A random sample of the rows is taken. For each sampled row the sentence is
    encoded with special tokens, run through the model without gradients, and the
    hidden state at position ``Subtoken End Index + 1`` is read from every entry
    of ``outputs.hidden_states`` (the +1 skips the leading special token). Each
    record is appended to ``output_file`` with pickle as soon as it is produced
    and is also kept in the returned list.

    Rows whose 'Subtoken End Index' does not fall inside the sentence's own
    subtokens are reported and skipped.

    NOTE: the file is opened in append mode, so repeated calls with the same
    ``output_file`` keep adding records to it.

    Parameters:
        df (pd.DataFrame): Rows with the columns 'Sentence', 'Word', 'Lemma' and
            'Subtoken End Index'.
        output_file (str): Path of the pickle file the records are appended to.
        model: Model returning ``hidden_states``; inputs are moved to its device.
        tokenizer: Tokenizer matching the model.
        sample_frac (float): Fraction of rows to process (default 0.5).
        random_state (int): Seed for the row sampling (default 42).

    Returns:
        list of dict: One record per processed row with the keys 'Word', 'Lemma',
        'Sentence' and 'Hidden Representations (All Layers)' (a list holding one
        NumPy vector per hidden-state layer).
    """
    df_sample = df.sample(frac=sample_frac, random_state=random_state)

    # Follow the model's device instead of assuming CUDA.
    device = next(model.parameters()).device
    on_cuda = device.type == 'cuda'

    representations = []  # all records, kept in memory for the caller

    for _, row in df_sample.iterrows():
        sentence = row['Sentence']
        encoded_input = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512, add_special_tokens=True)
        encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

        # Number of real subtokens: non-padding positions minus the two special tokens.
        tokenized_length = int((encoded_input['input_ids'] != tokenizer.pad_token_id).sum(dim=1).item()) - 2

        subtoken_end_idx = int(row['Subtoken End Index'])

        # Valid subtoken indices are 0 .. tokenized_length - 1. An index equal to
        # tokenized_length would select the trailing special token, so it is rejected too.
        # Checking before the forward pass avoids running the model for skipped rows.
        if subtoken_end_idx >= tokenized_length:
            print(f"Skipping token '{row['Word']}' in sentence '{row['Sentence']}' because subtoken_end_idx exceeds tokenized length.")
            if on_cuda:
                torch.cuda.empty_cache()
            continue

        with torch.no_grad():
            outputs = model(**encoded_input)
            all_hidden_states = outputs.hidden_states

        # +1 shifts past the special token at position 0.
        token_representations_all_layers = [layer_output[0, subtoken_end_idx + 1, :].cpu().numpy() for layer_output in all_hidden_states]

        representation = {
            'Word': row['Word'],
            'Lemma': row['Lemma'],
            'Sentence': row['Sentence'],
            'Hidden Representations (All Layers)': token_representations_all_layers
        }

        # Append immediately so finished records survive an interrupted run.
        with open(output_file, 'ab') as f:
            pickle.dump(representation, f)

        representations.append(representation)

        print(f"Processed and saved representation for token '{row['Word']}' in sentence: '{row['Sentence']}'")

        # Release cached GPU memory after each sentence.
        if on_cuda:
            torch.cuda.empty_cache()

    return representations


In [None]:
# Extract hidden representations for every marker category and keep them per category in `reps`.
reps={}
for key, value in deduplicated_target_items.items():

    # Every category is written to the same pickle file, which the function opens in append
    # mode; the records carry no category field, so the per-category grouping exists only in `reps`.
    representation=extract_bert_hidden_representations(value, output_file="representations_english.pkl", model=model, tokenizer=tokenizer)
    reps[key]=(representation)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed and saved representation for token 'planes' in sentence: 'On its journey across the Bay of Bengal the freighter was tracked by the Indian navy and Orissa - based spy planes of India 's Aviation Research Center ; it was intercepted by Indian naval vessels off Sri Lanka 's east coast .'
Processed and saved representation for token 'grips' in sentence: 'Laser grips are the most common for wangs .'
Processed and saved representation for token 'Words' in sentence: 'Words can be lethal .'
Processed and saved representation for token 'minutes' in sentence: 'Of course I could n't make it back in time ( and they apparently could not stay 5 extra minutes to wait for me ) .'
Processed and saved representation for token 'has' in sentence: 'He has encyclopedic knowledge of hundreds of different Supreme Court cases , and he can recite details from memory .'
Processed and saved representation for token 'reasons' in sentence: '

In [None]:
import pandas as pd
from google.colab import files

# Write one CSV per marker category and download it through the Colab file API.
# NOTE(review): the 'Hidden Representations (All Layers)' column holds lists of NumPy arrays;
# to_csv presumably stores their string form - confirm the files can be parsed back as needed.
for key, representation_list in reps.items():
    # One row per extracted record.
    df = pd.DataFrame(representation_list)

    # The file is named after the category key.
    file_name = f"{key}_representations.csv"

    # Save without the index column.
    df.to_csv(file_name, index=False)

    # Trigger a browser download of the file (Colab only).
    files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# List the marker categories present in `reps`.
for key, value in reps.items():
  print(key)

plural_noun_sentences
third_person_present_sentences
accusative_case_sentences
genitive_case_sentences
dative_case_sentences
sublative_case_sentences
translative_case_sentences
causal_final_case_sentences


In [None]:
import pandas as pd
from google.colab import files

# Export the accusative category on its own; its key in `reps` is "accusative_case_sentences".
if "accusative_case_sentences" in reps:
    # One row per extracted accusative record.
    df_accusative = pd.DataFrame(reps["accusative_case_sentences"])

    # File name used for this separate export.
    file_name = "Accusative_representations.csv"

    # Save without the index column.
    df_accusative.to_csv(file_name, index=False)

    # Trigger a browser download of the file (Colab only).
    files.download(file_name)
else:
    print("The 'Accusative' marker was not found in reps.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Print each marker category followed by the number of records extracted for it.
for key, value in reps.items():
    print(key)
    print(len(value))

plural_noun_sentences
2173
third_person_present_sentences
619
accusative_case_sentences
163
genitive_case_sentences
368
dative_case_sentences
512
sublative_case_sentences
474
translative_case_sentences
88
causal_final_case_sentences
694
