In [1]:
import pandas as pd
import json
import os
from IPython.display import display, Markdown
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from collections import Counter

In [17]:
# Load the IQR-based thresholds and pivot them into a feature-by-level table.
# The loops below read the JSON as {level: {category: {feature: value}}}.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding (which differs between machines).
with open('./results/thresholds_IQR.json', encoding='utf-8') as json_data:
    thresholds = json.load(json_data)

# Regroup by "category/feature": one row per feature, one entry per top-level
# key (the keys become the N1..N4 columns selected below).
flattened = {}

for level, categories in thresholds.items():
    for category, features in categories.items():
        for feature, value in features.items():
            index = f"{category}/{feature}"
            flattened.setdefault(index, {})[level] = value

df = pd.DataFrame.from_dict(flattened, orient='index')
# Pin the column order; this raises KeyError if one of the four keys is absent.
df = df[['N1', 'N2', 'N3', 'N4']]

df

Unnamed: 0,N1,N2,N3,N4
sentence-token-level/max_size_aux_verbs,5.0,3.5,2.0,2.0
sentence-token-level/max_size_passive,0.0,5.0,5.0,5.0
sentence-token-level/max_size_named_entities,7.0,9.5,9.5,7.0
sentence-token-level/max_size_np_pp_modifiers,29.5,16.5,16.0,15.625
sentence-token-level/max_size_subordination,21.5,31.5,49.0,53.25
sentence-token-level/max_size_coordination,3.5,6.0,6.0,6.0
document-token-level/total_token_ratio_aux_verbs,0.1623,0.113,0.1019,0.0639
document-token-level/total_token_ratio_passive,0.0,0.022,0.022,0.0211
document-token-level/total_token_ratio_named_entities,0.4142,0.2401,0.1955,0.157
document-token-level/total_token_ratio_subordination,0.5601,0.6977,0.8173,0.7142


In [20]:
# Load the logistic-regression thresholds and pivot them into a
# feature-by-level table.
# The loops below read the JSON as {level: {category: {feature: value}}}.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding (which differs between machines).
with open('./results/thresholds_LogReg.json', encoding='utf-8') as json_data:
    thresholds = json.load(json_data)

# Regroup by "category/feature": one row per feature, one entry per level.
flattened = {}

for level, categories in thresholds.items():
    for category, features in categories.items():
        for feature, value in features.items():
            index = f"{category}/{feature}"
            flattened.setdefault(index, {})[level] = value

df = pd.DataFrame.from_dict(flattened, orient='index')
# Pin the column order; this raises KeyError if one of the four keys is absent.
df = df[['N1', 'N2', 'N3', 'N4']]

df

Unnamed: 0,N1,N2,N3,N4
sentence-token-level/max_size_aux_verbs,1.3,1.656,1.76,
sentence-token-level/max_size_passive,0.599,0.793,0.985,
sentence-token-level/max_size_named_entities,3.625,3.666,3.422,
sentence-token-level/max_size_np_pp_modifiers,7.672,7.981,8.122,
sentence-token-level/max_size_subordination,12.864,17.807,22.538,
sentence-token-level/max_size_coordination,2.214,2.308,2.392,
document-token-level/total_token_ratio_aux_verbs,0.036,0.033,0.03,
document-token-level/total_token_ratio_passive,0.006,0.0,0.006,
document-token-level/total_token_ratio_named_entities,0.115,0.092,0.079,
document-token-level/total_token_ratio_subordination,0.245,0.312,0.369,


In [14]:
import pandas as pd
import numpy as np
import json

# Summarise every stored feature distribution with min / max / mean / std /
# median, one row per "level/category/feature".
# The loops below read the JSON as {level: {category: {feature: [values]}}}.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding.
with open('./results/distributions.json', encoding='utf-8') as json_data:
    distributions = json.load(json_data)

stats = {}
# Row label -> (category, feature, level) sort key. Keeping the parts here
# avoids re-splitting the joined label on "/", which would truncate any
# feature name that itself contains a slash.
row_order = {}

for level, categories in distributions.items():
    for category, features in categories.items():
        for feature, values in features.items():
            index = f"{level}/{category}/{feature}"
            arr = np.array(values, dtype=float)
            if arr.size:
                stats[index] = {
                    "min": arr.min(),
                    "max": arr.max(),
                    "mean": arr.mean(),
                    "std": arr.std(),  # NumPy default ddof=0 (population std)
                    "median": np.median(arr),
                }
            else:
                # min()/max() raise on a zero-size array; report an empty
                # distribution as an all-NaN row instead of aborting the cell.
                stats[index] = dict.fromkeys(
                    ("min", "max", "mean", "std", "median"), np.nan
                )
            row_order[index] = (category, feature, level)

# Order rows by category, then feature, then level, and round for readability.
df = (
    pd.DataFrame.from_dict(stats, orient="index")
    .loc[sorted(stats, key=row_order.__getitem__)]
    .round(3)
)

df


Unnamed: 0,min,max,mean,std,median
N1/document-document-level/lexical_diversity,0.606,1.00,0.931,0.089,0.953
N2/document-document-level/lexical_diversity,0.800,1.00,0.975,0.025,0.982
N3/document-document-level/lexical_diversity,0.850,1.00,0.983,0.020,0.989
N4/document-document-level/lexical_diversity,0.952,1.00,0.987,0.012,0.989
N1/document-document-level/sentence_count,1.000,25.00,7.842,7.464,5.000
...,...,...,...,...,...
N4/token-level-low/familiarity,2.250,7.00,6.151,1.033,6.714
N1/token-level-low/lexical_frequency,0.000,38928.92,6290.458,9995.912,164.930
N2/token-level-low/lexical_frequency,0.000,38928.92,6427.584,10259.371,140.070
N3/token-level-low/lexical_frequency,0.000,38928.92,6891.971,11048.043,162.300
