# 04 Data Processing and Factor Computation

This notebook loads the processed dependency size data, computes linguistic factors (HCS factors, plus the horizontal and diagonal growth factors shown in the tables), and generates the verb-centered analysis tables.

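**Output**:
- `data/hcs_factors.csv`: Computed HCS factors, one row per language.
- `data/verb_centered_table.txt`: Verb-centered constituent size table.
- `data/all_langs_average_sizes_filtered.pkl`: Copy of the loaded average sizes, saved for downstream notebooks.
- `data/verb_centered_table.tsv`: Written as a side effect of the table step in section 3 (reported in that cell's output).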

In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from importlib import reload

# Custom modules
import data_utils
import compute_factors
import verb_centered_analysis

# Reload every custom module so that edits made on disk are picked up without
# restarting the kernel. data_utils is included (it was previously skipped) and
# goes first in case the other two modules import from it.
# A loop is used instead of bare reload(...) expressions so the cell does not
# echo a module repr, which contains an absolute local path, as its output.
for _module in (data_utils, compute_factors, verb_centered_analysis):
    reload(_module)

<module 'verb_centered_analysis' from '/bigstorage/kim/typometrics/dataanalysis/verb_centered_analysis.py'>

In [2]:
# Configuration: inputs are read from DATA_DIR and results are written to
# OUTPUT_DIR. Both currently point at the same relative folder.
DATA_DIR = "data"
OUTPUT_DIR = DATA_DIR

## 1. Load Data

In [3]:
# Load the metadata file and pull out the two entries used by the factor
# computation further down.
metadata_path = os.path.join(DATA_DIR, 'metadata.pkl')
metadata = data_utils.load_metadata(metadata_path)
langNames, langnameGroup = metadata['langNames'], metadata['langnameGroup']

print(f"Loaded metadata for {len(langNames)} languages")

Loaded metadata from data/metadata.pkl
Loaded metadata for 186 languages


In [4]:
# Read the average sizes from disk.
sizes_in_path = os.path.join(DATA_DIR, 'all_langs_average_sizes.pkl')
with open(sizes_in_path, 'rb') as src:
    all_langs_average_sizes_filtered = pickle.load(src)

# Write the same object back out under the "_filtered" file name, for
# notebook 05 (if needed by downstream).
# NOTE(review): no filtering is applied in this cell, so the saved object is
# exactly what was loaded above — confirm that this is intended.
sizes_out_path = os.path.join(DATA_DIR, 'all_langs_average_sizes_filtered.pkl')
with open(sizes_out_path, 'wb') as dst:
    pickle.dump(all_langs_average_sizes_filtered, dst)

print(f"Loaded average sizes for {len(all_langs_average_sizes_filtered)} languages")

Loaded average sizes for 185 languages


## 2. Compute HCS Factors

In [5]:
# Build the HCS factor table from the average sizes and the language metadata.
hcs_df = compute_factors.compute_hcs_factors(all_langs_average_sizes_filtered, langNames, langnameGroup)

# Report how many rows were produced, then show the first few of them.
print(f"Computed HCS factors for {len(hcs_df)} languages")
hcs_preview = hcs_df.head()
print(hcs_preview)

Computed HCS factors for 170 languages
    language_code  language_name           group  right_1_totright_2  \
80             ko         Korean           Other            1.881159   
116           pad        Paumarí  South-American            2.333333   
146            tn         Tswana     Niger-Congo            2.500000   
137           ssp    SpanishSign   Indo-European            1.750000   
144           qte  TeluguEnglish       Dravidian            1.000000   

     right_2_totright_2  hcs_factor  
80             1.055072    0.560863  
116            1.666667    0.714286  
146            2.250000    0.900000  
137            1.625000    0.928571  
144            1.000000    1.000000  


In [6]:
# Persist the HCS table as CSV; index=False keeps the row index out of the file.
hcs_path = os.path.join(OUTPUT_DIR, 'hcs_factors.csv')
hcs_df.to_csv(
    path_or_buf=hcs_path,
    index=False,
)
print(f"Saved HCS factors to {hcs_path}")

Saved HCS factors to data/hcs_factors.csv


## 3. Verb-Centered Constituent Size Analysis

In [7]:
# Step 1: compute the average-size table from the loaded data.
position_averages = verb_centered_analysis.compute_average_sizes_table(
    all_langs_average_sizes_filtered
)

# Step 2: render it as text with the formatter's default options and show it.
table_str = verb_centered_analysis.format_verb_centered_table(
    position_averages
)
print(table_str)

VERB-CENTERED CONSTITUENT SIZE TABLE

                                        V                                   
R tot=4:                                  V   1.878   2.475   3.350   6.684
R tot=3:                                  V   1.757   2.744   5.689
R tot=2:                                  V   2.000   4.753
R tot=1:                                  V   4.326
------------------------------------------------------------------------------------------------------------------------
L tot=1:                            2.251 V
L tot=2:                    1.853   2.429 V
L tot=3:            1.749   2.128   2.646 V
L tot=4:    1.642   2.072   2.528   2.964 V
Mode: Factors=False, Diagonals=False, Direction=diverging
Table saved to: data/verb_centered_table.tsv


In [8]:
# Write the rendered table to a text file in OUTPUT_DIR.
table_path = os.path.join(OUTPUT_DIR, 'verb_centered_table.txt')
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# locale default. Other modes of the same formatter emit non-ASCII symbols
# (×, →, ↗), which would fail or be garbled under a non-UTF-8 default.
with open(table_path, 'w', encoding='utf-8') as f:
    f.write(table_str)
print(f"Saved table to {table_path}")

Saved table to data/verb_centered_table.txt


In [9]:
# 

In [23]:
# Render the table a second time, annotated with horizontal growth factors.
from importlib import reload
import verb_centered_analysis

# Pick up any edits made to the module since it was first imported.
reload(verb_centered_analysis)

# Best-effort guard: if the table data was never computed in this kernel,
# print a hint instead of raising a NameError.
if 'position_averages' not in locals():
    print("position_averages variable not found. Please run previous cells.")
else:
    table_with_factors = verb_centered_analysis.format_verb_centered_table(
        position_averages,
        show_horizontal_factors=True,
        arrow_direction='diverging',
    )
    print(table_with_factors)

VERB-CENTERED CONSTITUENT SIZE TABLE
WITH GROWTH FACTORS

                                                                            V                                                                       
R tot=4:                                                                      V   1.878  ×1.32 →      2.475  ×1.35 →      3.350  ×2.00 →      6.684
R tot=3:                                                                      V   1.757  ×1.56 →      2.744  ×2.07 →      5.689
R tot=2:                                                                      V   2.000  ×2.38 →      4.753
R tot=1:                                                                      V   4.326
------------------------------------------------------------------------------------------------------------------------
L tot=1:                                                                2.251 V
L tot=2:                                            1.853  ×0.76 ←      2.429 V
L tot=3:                        1.749  ×

In [10]:
# Render the most detailed variant of the table: horizontal AND diagonal
# growth factors, with all arrows pointing rightwards.
from importlib import reload
import verb_centered_analysis

# Pick up any edits made to the module since it was first imported.
reload(verb_centered_analysis)

# Best-effort guard: if the table data was never computed in this kernel,
# print a hint instead of raising a NameError.
if 'position_averages' not in locals():
    print("position_averages variable not found. Please run previous cells.")
else:
    table_diag = verb_centered_analysis.format_verb_centered_table(
        position_averages,
        show_horizontal_factors=True,
        show_diagonal_factors=True,
        arrow_direction='rightwards',
    )
    print(table_diag)

VERB-CENTERED CONSTITUENT SIZE TABLE
WITH GROWTH FACTORS

                                                                            V                                                                       
R tot=4:                                                                      V   1.878  ×1.32 →      2.475  ×1.35 →      3.350  ×2.00 →      6.684
                                                                                         ×1.41 ↗             ×1.22 ↗             ×1.17 ↗           
R tot=3:                                                                      V   1.757  ×1.56 →      2.744  ×2.07 →      5.689
                                                                                         ×1.37 ↗             ×1.20 ↗           
R tot=2:                                                                      V   2.000  ×2.38 →      4.753
                                                                                         ×1.10 ↗           
R tot=1:                 

In [25]:
# Table with horizontal and DIAGONAL growth factors, arrows drawn rightwards.
# NOTE(review): this cell repeats the preceding diagonal-factor cell verbatim
# and rebinds the same `table_diag` name; consider keeping only one of them.
from importlib import reload
import verb_centered_analysis

# Re-import the module from disk so recent edits take effect.
reload(verb_centered_analysis)

have_position_averages = 'position_averages' in locals()
if not have_position_averages:
    # Upstream cell was not run in this kernel; report it rather than fail.
    print("position_averages variable not found. Please run previous cells.")
else:
    diag_options = dict(
        show_horizontal_factors=True,
        show_diagonal_factors=True,
        arrow_direction='rightwards',
    )
    table_diag = verb_centered_analysis.format_verb_centered_table(position_averages, **diag_options)
    print(table_diag)

VERB-CENTERED CONSTITUENT SIZE TABLE
WITH GROWTH FACTORS

                                                                            V                                                                       
R tot=4:                                                                      V   1.878  ×1.32 →      2.475  ×1.35 →      3.350  ×2.00 →      6.684
                                                                                         ×1.41 ↗             ×1.22 ↗             ×1.17 ↗           
R tot=3:                                                                      V   1.757  ×1.56 →      2.744  ×2.07 →      5.689
                                                                                         ×1.37 ↗             ×1.20 ↗           
R tot=2:                                                                      V   2.000  ×2.38 →      4.753
                                                                                         ×1.10 ↗           
R tot=1:                 