## Imports

In [10]:
import os
import pandas as pd
from tqdm import tqdm

# Import custom modules
import data_utils
import conll_processing
import analysis

## Configuration

In [11]:
DATA_DIR = "data"
MIN_COUNT = 10  # Minimum occurrence count for positions

## 1. Load Metadata

In [12]:
# Load metadata from notebook 01
metadata = data_utils.load_metadata(os.path.join(DATA_DIR, 'metadata.pkl'))

langShortConllFiles = metadata['langShortConllFiles']
langNames = metadata['langNames']
langnameGroup = metadata['langnameGroup']
appearance_dict = metadata['appearance_dict']
ud_version = metadata['ud_version']

print(f"UD version: {ud_version}")
print(f"Languages: {len(langShortConllFiles)}")
total_files = sum(len(files) for files in langShortConllFiles.values())
print(f"Total short files to process: {total_files}")

Loaded metadata from data/metadata.pkl
UD version: 2.17
Languages: 186
Total short files to process: 818


## 2. Process CoNLL Files in Parallel

This step extracts dependency sizes for all verbal constructions across all languages.
It uses multiprocessing to parallelize across CPU cores.

**Note**: This is the most computationally intensive step and may take a long time depending on your system. On Calcul, this takes less than 40 seconds without bastards, and about 1 min 15 seconds with bastards.

In [13]:
# Flatten all short files into a single list
allshortconll = []
for lang, files in langShortConllFiles.items():
    allshortconll.extend(files)

print(f"Processing {len(allshortconll)} files in parallel...")

Processing 818 files in parallel...


In [14]:
# Process files in parallel
# this first adds spans and kids to each tree
# then it calls get_dep_sizes to compute the size of each verbal dependent
# finally it calls get_type_freq_all_files_parallel to compute the frequency of each configuration

all_langs_position2num, all_langs_position2sizes, all_langs_average_sizes = conll_processing.get_type_freq_all_files_parallel(allshortconll, include_bastards=True)
print("Processing complete!")
print(f"Results computed for {len(all_langs_position2num)} languages")

Starting processing all files, running on 80 cores


Processing files: 100%|██████████| 818/818 [00:44<00:00, 18.26it/s]


Finished processing. Combining results...
Done!
Processing complete!
Results computed for 185 languages


## 3. Filter by Minimum Count

Remove positions that occur fewer than MIN_COUNT times to reduce noise.

## 4. Compute Mean Aggregate Length (MAL)

For each language, compute MALₙ for n=1 to max available right dependents.

### MAL Formula

$$
\text{MAL}_n = \frac{\sum_{i=1}^{n} \text{position2sizes}[\text{right}\_i\text{\_totright}\_n]}{\sum_{i=1}^{n} \text{position2num}[\text{right}\_i\text{\_totright}\_n]}
$$

In [15]:
# Filter positions
filtered_position2num, filtered_position2sizes = analysis.filter_by_min_count(
    all_langs_position2num,
    all_langs_position2sizes,
    min_count=MIN_COUNT
)

# Count total positions before and after filtering
total_before = sum(len(positions) for positions in all_langs_position2num.values())
total_after = sum(len(positions) for positions in filtered_position2num.values())
print(f"Positions before filtering: {total_before}")
print(f"Positions after filtering (>= {MIN_COUNT}): {total_after}")
print(f"Removed: {total_before - total_after} ({100*(total_before - total_after)/total_before:.1f}%)")

Filtered to positions with at least 10 occurrences
Positions before filtering: 11984
Positions after filtering (>= 10): 6602
Removed: 5382 (44.9%)


## 5. Data Structure Documentation

### all_langs_position2num
Dictionary mapping each language code to a dictionary of position keys → occurrence counts.

**Structure**: `{lang_code: {position_key: count}}`

**Position keys**:
- `left_N`: N-th dependent to the left of verb (N=1 is closest)
- `right_N`: N-th dependent to the right of verb (N=1 is closest)
- `left_N_totleft_M`: N-th left dependent when there are M total left dependents
- `right_N_totright_M`: N-th right dependent when there are M total right dependents
- `average_totleft_M`: average size across all left dependents when M total
- `average_totright_M`: average size across all right dependents when M total

**Example**:
```python
all_langs_position2num['en'] = {
    'left_1': 12543,
    'right_1': 18732,
    'right_1_totright_2': 5234,
    ...
}
```

### all_langs_position2sizes
Dictionary mapping each language code to a dictionary of position keys → total size (sum of all dependency sizes).

**Structure**: `{lang_code: {position_key: total_size}}`

**Example**:
```python
all_langs_position2sizes['en'] = {
    'left_1': 25086,  # Total size = 12543 occurrences * ~2 words average
    'right_1': 56196,  # Total size = 18732 occurrences * ~3 words average
    ...
}
```

### all_langs_average_sizes
Dictionary mapping each language code to a dictionary of position keys → average dependency size.

**Structure**: `{lang_code: {position_key: average_size}}`

**Computation**: `average_size = total_size / count`

**Example**:
```python
all_langs_average_sizes['en'] = {
    'left_1': 2.0,  # 25086 / 12543
    'right_1': 3.0,  # 56196 / 18732
    ...
}
```

### lang2MAL
Dictionary mapping each language code to a dictionary of n → MALₙ values.

**Structure**: `{lang_code: {n: MAL_n}}`

**Computation**: For each n, compute the mean aggregate length of right dependents from position 1 to n.

**Formula**:
$$
\text{MAL}_n = \frac{\sum_{i=1}^{n} \text{position2sizes}[\text{right}\_i\text{\_totright}\_n]}{\sum_{i=1}^{n} \text{position2num}[\text{right}\_i\text{\_totright}\_n]}
$$

**Example**:
```python
lang2MAL['en'] = {
    1: 2.5,  # Average size when 1 right dependent
    2: 3.2,  # Average aggregate size when 2 right dependents
    3: 3.8,  # Average aggregate size when 3 right dependents
    ...
}
```

**Interpretation**: MALₙ typically increases with n, indicating that having more dependents correlates with longer dependencies.

In [16]:
# Compute MAL for each language
lang2MAL = analysis.compute_MAL_per_language(
    filtered_position2sizes,
    filtered_position2num
)

print(f"MAL computed for {len(lang2MAL)} languages")

# Show sample
sample_lang = list(lang2MAL.keys())[0]
print(f"\nSample (language '{sample_lang}'):")
print(lang2MAL[sample_lang])

MAL computed for 185 languages

Sample (language 'abq'):
{1: 1.7714285714285714}


## Analysis of bastard frequency

In [17]:
# Run analysis with bastards enabled
print("Running analysis WITH bastards...")
all_langs_position2num_with, all_langs_position2sizes_with, all_langs_average_sizes_with = conll_processing.get_type_freq_all_files_parallel(allshortconll, include_bastards=True)

# Run analysis WITHOUT bastards
print("\nRunning analysis WITHOUT bastards...")
all_langs_position2num_without, all_langs_position2sizes_without, all_langs_average_sizes_without = conll_processing.get_type_freq_all_files_parallel(allshortconll, include_bastards=False)

print("Done comparing!")

Running analysis WITH bastards...
Starting processing all files, running on 80 cores


Processing files: 100%|██████████| 818/818 [00:51<00:00, 15.76it/s]


Finished processing. Combining results...
Done!

Running analysis WITHOUT bastards...
Starting processing all files, running on 80 cores


Processing files: 100%|██████████| 818/818 [00:58<00:00, 13.88it/s]


Finished processing. Combining results...
Done!
Done comparing!


In [18]:
# Compute bastard statistics
lang_bastard_stats, global_bastard_relations = conll_processing.get_bastard_stats_all_files_parallel(allshortconll)

# Create a DataFrame for ranking
ranking_data = []
for lang, stats in lang_bastard_stats.items():
    verbs = stats['verbs']
    bastards = stats['bastards']
    percentage = (bastards / verbs * 100) if verbs > 0 else 0
    
    # Find most frequent relation
    relations = stats.get('relations', {})
    if relations:
        top_rel = max(relations, key=relations.get)
        top_rel_count = relations[top_rel]
        top_rel_str = f"{top_rel} ({top_rel_count})"
    else:
        top_rel_str = "None"

    ranking_data.append({
        'Code': lang,
        'Language': langNames.get(lang, lang),
        'Verbs': verbs,
        'Bastards': bastards,
        'Bastards_per_Verb_Pct': percentage,
        'Top_Bastard_Rel': top_rel_str
    })

df_ranking = pd.DataFrame(ranking_data)
df_ranking = df_ranking.sort_values('Bastards_per_Verb_Pct', ascending=False).reset_index(drop=True)

print("\nTop 20 Languages by Bastard Frequency (per Verb):")
print(df_ranking.head(20))

print("\nGlobal Bastard Relation Frequencies:")
sorted_relations = sorted(global_bastard_relations.items(), key=lambda x: x[1], reverse=True)
for rel, count in sorted_relations[:20]:
    print(f"{rel}: {count}")

# Export examples
examples_dir = os.path.join(DATA_DIR, 'examples')
os.makedirs(examples_dir, exist_ok=True)

print(f"\nExporting examples to {examples_dir}...")
count_exported = 0

for lang, stats in lang_bastard_stats.items():
    relations = stats.get('relations', {})
    examples = stats.get('examples', {})
    
    if relations and examples:
        # Get most frequent relation
        top_rel = max(relations, key=relations.get)
        
        if top_rel in examples:
            # Create file content
            content = f"# Language: {lang} ({langNames.get(lang, lang)})\n"
            content += f"# Most frequent bastard relation: {top_rel}\n"
            content += f"# Total bastards with this relation: {relations[top_rel]}\n\n"
            
            for i, tree_str in enumerate(examples[top_rel]):
                content += f"# Example {i+1}\n"
                content += tree_str + "\n"
            
            # Save to file
            filename = os.path.join(examples_dir, f"{lang}_{top_rel}_examples.conllu")
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(content)
            count_exported += 1

print(f"Exported example files for {count_exported} languages.")

Starting bastard analysis, running on 80 cores


Analyzing bastards: 100%|██████████| 818/818 [00:52<00:00, 15.65it/s]


Finished processing. Combining results...
Done!

Top 20 Languages by Bastard Frequency (per Verb):
   Code           Language   Verbs  Bastards  Bastards_per_Verb_Pct  \
0   grc       AncientGreek   80843     24326              30.090422   
1   xpg           Phrygian     234        58              24.786325   
2    la              Latin  149655     29636              19.802880   
3    ps             Pashto     327        63              19.266055   
4   pro       OldProvençal    5468       895              16.367959   
5   orv      OldEastSlavic   69895     11071              15.839473   
6    sa           Sanskrit   40150      6230              15.516812   
7    hu          Hungarian    3663       491              13.404313   
8   swl        SwedishSign     611        81              13.256956   
9   fro          OldFrench   37730      4609              12.215743   
10  gub          Guajajára    1060       129              12.169811   
11  hit            Hittite     213        25     

In [19]:
# showing the bastard tables as df

pd.set_option('display.max_rows', 50)

print("Top Languages by Bastard Frequency (per Verb):")
display(df_ranking)

print("\nGlobal Bastard Relation Frequencies:")
df_global_relations = pd.DataFrame(
    sorted(global_bastard_relations.items(), key=lambda x: x[1], reverse=True),
    columns=['Relation', 'Count']
)
display(df_global_relations)

Top Languages by Bastard Frequency (per Verb):


Unnamed: 0,Code,Language,Verbs,Bastards,Bastards_per_Verb_Pct,Top_Bastard_Rel
0,grc,AncientGreek,80843,24326,30.090422,nmod (4353)
1,xpg,Phrygian,234,58,24.786325,conj (18)
2,la,Latin,149655,29636,19.802880,acl (3835)
3,ps,Pashto,327,63,19.266055,acl (25)
4,pro,OldProvençal,5468,895,16.367959,obj (160)
...,...,...,...,...,...,...
180,bor,Borôro,27057,0,0.000000,
181,az,Azerbaijani,190,0,0.000000,
182,aii,Assyrian,57,0,0.000000,
183,apu,Apurinã,215,0,0.000000,



Global Bastard Relation Frequencies:


Unnamed: 0,Relation,Count
0,acl,27396
1,conj,21929
2,obl,20282
3,obj,19591
4,nmod,18216
5,advcl,10957
6,mark,10643
7,advmod,10638
8,det,9313
9,cc,9259


## 6. Export Analysis Results

In [20]:
# Save all analysis results
analysis.save_analysis_results(
    all_langs_position2num,
    all_langs_position2sizes,
    all_langs_average_sizes,
    filtered_position2num,
    filtered_position2sizes,
    lang2MAL,
    output_dir=DATA_DIR
)

print(f"Analysis results saved to {DATA_DIR}/")

Saved all_langs_position2num.pkl
Saved all_langs_position2sizes.pkl
Saved all_langs_average_sizes.pkl
Saved filtered_position2num.pkl
Saved filtered_position2sizes.pkl
Saved lang2MAL.pkl
All analysis results saved to data/
Analysis results saved to data/


## Summary

This notebook has:
- ✅ Loaded metadata from notebook 01
- ✅ Processed all CoNLL files in parallel (extracted dependency sizes)
- ✅ Filtered positions by minimum count (>= 10)
- ✅ Computed Mean Aggregate Length (MAL) for each language
- ✅ Exported 6 analysis result files to data/
- ✅ Analyzed bastard frequencies and exported examples

**Next step**: Run `03_visualization.ipynb` to create plots and explore results.