In [1]:
import sys
sys.path.append("../code")
import annotation
import pandas as pd
from importlib import reload
from IPython.display import display, clear_output, HTML

# Notebook-wide pandas rendering limits: cap the displayed width of each column
# at 20 characters and allow up to 2000 rows before a frame is truncated.
pd.options.display.max_colwidth = 20
pd.options.display.max_rows = 2000

In [2]:
# All three CSVs are read with their first column as the index and with NA
# detection switched off (na_filter=False), so cell values are kept as read.
vacc = pd.read_csv("VACC.csv", index_col=0, na_filter=False)
vacw = pd.read_csv("VACW.csv", index_col=0, na_filter=False)
# For RBC, rows whose interaction_id starts with "Instructions" are dropped
# right after loading.
rbc = (
    pd.read_csv("RBC.csv", index_col=0, na_filter=False)
    .loc[lambda frame: ~(frame.interaction_id.str.startswith("Instructions"))]
)

In [3]:
def lemma_stats(df, top_n=100):
    """Display per-lemma frequency statistics across interactions.

    Builds an interaction x lemma matrix of occurrence counts (a lemma that
    never occurs in an interaction counts as 0 there) and, for every lemma,
    computes:

    * ``mean_count_per_interaction`` -- mean of the per-interaction counts,
    * ``std_dev_from_mean`` -- standard deviation of those counts (pandas
      default, i.e. the sample standard deviation; NaN when ``df`` contains a
      single interaction),
    * ``interaction_range`` -- number of interactions in which the lemma
      occurs at least once.

    The resulting table is sorted by mean count (descending) and its first
    ``top_n`` rows are shown via ``display``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"interaction_id"`` and ``"lemma"``; each
        row is counted as one occurrence of that lemma in that interaction.
    top_n : int, default 100
        Number of rows of the sorted table to display.

    Returns
    -------
    None
        The table is only displayed, so calling this as the last expression
        of a cell does not render it a second time.
    """
    # Step 1: count occurrences of each lemma in each interaction
    # (rows: interactions, columns: lemmas, missing combinations -> 0).
    interaction_counts = (
        df.groupby(["interaction_id", "lemma"])
        .size()
        .unstack(fill_value=0)
    )

    # Step 2: column-wise statistics; every Series below is indexed by lemma,
    # so the lemma index is turned into the leading "lemma" column afterwards.
    stats_per_lemma = (
        pd.DataFrame({
            "mean_count_per_interaction": interaction_counts.mean(axis=0),
            "std_dev_from_mean": interaction_counts.std(axis=0),
            "interaction_range": (interaction_counts > 0).sum(axis=0),
        })
        .rename_axis("lemma")
        .reset_index()
    )

    display(
        stats_per_lemma
        .sort_values("mean_count_per_interaction", ascending=False)
        .reset_index(drop=True)
        .head(top_n)
    )

In [4]:
# Per-lemma statistics for the frame loaded from VACC.csv.
lemma_stats(vacc)

Unnamed: 0,lemma,mean_count_per_interaction,std_dev_from_mean,interaction_range
0,der,47.324074,35.982298,108
1,ich,30.944444,26.298372,106
2,alexa,22.12963,8.722184,108
3,termin,22.12963,24.248747,56
4,sein,22.037037,16.424808,107
5,uhr,20.898148,23.511755,60
6,um,17.638889,18.846383,69
7,haben,17.527778,14.199156,103
8,an,17.222222,13.890986,108
9,es,16.555556,13.878197,87


In [5]:
# Per-lemma statistics for the frame loaded from VACW.csv.
lemma_stats(vacw)

Unnamed: 0,lemma,mean_count_per_interaction,std_dev_from_mean,interaction_range
0,der,7.274454,9.608085,878
1,sein,4.414055,5.864071,934
2,computer,3.463438,3.876889,1050
3,ich,2.476733,4.192683,717
4,und,2.433998,2.979439,749
5,in,1.890788,2.769751,658
6,ein,1.861349,3.080936,605
7,von,1.589744,2.554867,571
8,haben,1.494777,2.392801,591
9,du,1.476733,4.062099,462


In [6]:
# Per-lemma statistics for the RBC frame (rows whose interaction_id starts with
# "Instructions" were already removed when it was loaded).
lemma_stats(rbc)

Unnamed: 0,lemma,mean_count_per_interaction,std_dev_from_mean,interaction_range
0,sie,22.177778,6.001082,90
1,der,19.655556,6.195615,90
2,ich,11.944444,4.092638,90
3,wir,9.233333,3.940712,90
4,können,8.822222,2.479479,90
5,sein,8.288889,3.63888,90
6,ein,8.288889,3.346118,90
7,für,8.155556,3.35506,90
8,ja,6.077778,5.69689,87
9,Tisch,6.033333,2.916247,90


In [None]:
def alternative_condition(df):
    """Return a boolean mask over the rows of ``df`` based on its ``"lemma"`` column.

    A row is selected when its lemma is "danke" or "dankeschön", or when its
    lemma is "danke" and the lemma of the following row is "uhr".

    The two parenthesised comparisons were previously written next to each
    other without an operator, which Python parses as calling a Series and
    which raised ``TypeError`` on every call; they are now combined with ``&``.
    """
    # NOTE(review): the second clause is a subset of the first (any "danke" row
    # already matches the isin test), so it is currently redundant -- confirm
    # which lemma pair was actually intended.
    return (df["lemma"].isin(["danke", "dankeschön"])) | (
        (df["lemma"] == "danke") & (df["lemma"].shift(-1) == "uhr")
    )
# Calls the project-local helper annotation.display_context with the RBC frame,
# the term list ["danke"] and the column name "lemma". Its behaviour is not
# visible in this notebook -- presumably it shows each match in its surrounding
# context; confirm against ../code/annotation.py.
# NOTE(review): alternative_condition defined above is not passed to this call.
annotation.display_context(rbc, ["danke"], "lemma")