<a href="https://colab.research.google.com/github/w-decker/wiscs-stats/blob/main/stats_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# @title # Load data

import requests
from typing import Union
import os
import pandas as pd

# Base URL of the raw CSV files. File names are appended to it directly,
# so the trailing slash is required.
DATA_PATH = "https://raw.githubusercontent.com/w-decker/wiscs-stats/main/data/"

# Files to fetch and load. Order matters: later cells pick the loaded
# DataFrames out of the resulting list by position.
FILES = [
    "simulated_Potter1975.csv",
    "simulated_main.csv",
    "simulated_alt.csv",
]

# Directory the CSVs are read back from (download_data saves into "data").
LOCAL_DATA_PATH = "data/"

def download_data(url: str, file: Union[str, list[str]],
                  local_dir: str = "data", timeout: float = 30.0):
    """Download one or more files from ``url`` into a local directory.

    Parameters
    ----------
    url
        Base URL; each file name is appended to it as-is, so it should end
        with a slash.
    file
        A single file name or a list of file names.
    local_dir
        Directory the files are written to; created if it does not exist.
    timeout
        Seconds to wait on the server for each request. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    None. One status line is printed per file. A non-200 response is reported
    and skipped rather than raised.
    """
    # A bare string would otherwise be iterated character by character.
    fnames = [file] if isinstance(file, str) else list(file)

    os.makedirs(local_dir, exist_ok=True)

    # Download each file
    for fname in fnames:
        file_url = url + fname
        local_path = os.path.join(local_dir, fname)
        response = requests.get(file_url, timeout=timeout)

        if response.status_code == 200:
            with open(local_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {fname}")
        else:
            print(f"Failed to download: {fname} (Status Code: {response.status_code})")

def import_data(path: str, file: Union[str, list[str]]) -> list[pd.DataFrame]:
    """Read one or more CSV files from ``path`` into DataFrames.

    Parameters
    ----------
    path
        Directory containing the files.
    file
        A single file name or a list of file names.

    Returns
    -------
    A list with one DataFrame per file, in the order the names were given.
    A single name still yields a one-element list.
    """
    # A bare string would otherwise be iterated character by character.
    fnames = [file] if isinstance(file, str) else list(file)

    frames = [pd.read_csv(os.path.join(path, fname)) for fname in fnames]
    print('Data imported')
    return frames

# Fetch the CSVs listed in FILES, then read them back from LOCAL_DATA_PATH.
# `df` is a list of DataFrames in the same order as FILES (not a single frame).
download_data(DATA_PATH, FILES)
df = import_data(LOCAL_DATA_PATH, FILES)


Downloaded: simulated_Potter1975.csv
Downloaded: simulated_main.csv
Downloaded: simulated_alt.csv
Data imported


In [37]:
# @title # Helper functions
import math
from typing import Any, Mapping, Tuple

from statsmodels.regression.mixed_linear_model import MixedLMResultsWrapper # type: ignore

def aic(models):
    """Return ``{label: result.aic}`` for every entry of the ``models`` mapping."""
    scores = {}
    for label, fitted in models.items():
        scores[label] = fitted.aic
    return scores

def best_model(models: Mapping[Any, "MixedLMResultsWrapper"],
               metric: str = "aic", print_aic: bool = False) -> Tuple[Any, "MixedLMResultsWrapper"]:
    """Return the label and result with the lowest information criterion.

    Parameters
    ----------
    models
        Mapping from a label to a fitted result exposing ``aic`` and ``bic``
        attributes.
    metric
        Which attribute to compare: ``"aic"`` or ``"bic"``.
    print_aic
        When true, print ``label: value`` for every model. Despite the name,
        the value printed is the selected ``metric``.

    Returns
    -------
    ``(label, result)`` of the model with the smallest metric. Ties go to the
    entry that comes first in the mapping's iteration order.

    Raises
    ------
    AssertionError
        If ``metric`` is not ``"aic"`` or ``"bic"``.
    ValueError
        If ``models`` is empty or every model's metric is NaN.
    """
    assert metric in ["aic", "bic"], "Invalid metric"

    scores = {label: getattr(result, metric) for label, result in models.items()}

    if print_aic:
        for label, score in scores.items():
            print(f'{label}: {score}')

    # NaN compares false against everything, so leaving it in would make the
    # outcome of min() depend on iteration order (a NaN listed first would
    # "win"). statsmodels mixed models can report NaN criteria, e.g. for REML
    # fits -- confirm for the version in use.
    comparable = [label for label, score in scores.items() if not math.isnan(score)]
    if not comparable:
        raise ValueError(f"No model with a comparable {metric} value")

    label = min(comparable, key=scores.__getitem__)
    return label, models[label]

In [35]:
# @title # Imports
import statsmodels.formula.api as smf # type: ignore

import warnings
# NOTE(review): this silences every warning from this point on, including any
# raised while the mixed models are fitted in later cells. The recorded summary
# of the winning model reports "Converged: No", so consider narrowing this
# filter to specific categories once the noisy warnings are identified.
warnings.filterwarnings('ignore')

In [42]:
# `df` holds the loaded frames in FILES order:
#   0 -> simulated_Potter1975.csv, 1 -> simulated_main.csv, 2 -> simulated_alt.csv
potter1975 = df[0]
main = df[1]
alt = df[2]

Unnamed: 0,subject,rt,question,item,modality
0,0,386.471825,0,0,image
1,0,398.316232,0,1,image
2,0,380.544709,0,2,image
3,0,397.270523,0,3,image
4,0,393.769889,0,4,image


In [44]:
def _fit_modality_model(data):
    """Fit ``rt ~ modality`` grouped by ``subject`` and return ``(model, result)``.

    The fit uses maximum likelihood (``reml=False``).
    """
    model = smf.mixedlm("rt ~ modality", data, groups=data["subject"])
    return model, model.fit(reml=False)


# Same specification fitted separately to each dataset.
# NOTE(review): the recorded summary for the MAIN fit shows "Converged: No" and
# a group variance of 0.000 -- check convergence before interpreting results.
model1, result1 = _fit_modality_model(main)
model2, result2 = _fit_modality_model(alt)
model3, result3 = _fit_modality_model(potter1975)

In [45]:
# Label each fitted result by the dataset it was estimated on.
models = dict(MAIN=result1, ALT=result2, potter1975=result3)

# NOTE(review): the three results were fitted to three different datasets, so
# the lowest AIC may largely reflect differences in the data (e.g. sample size)
# rather than model quality -- confirm this comparison is intended.
label, winner = best_model(models, print_aic=True)

print('-' * 50)
print(f'\nwinner: {label}')

# Keep the summary as the cell's last expression so it renders as rich output.
winner.summary()

MAIN: 829504.196578702
ALT: 1747940.5165951252
potter1975: 1748078.0975339063
--------------------------------------------------

winner: MAIN


0,1,2,3
Model:,MixedLM,Dependent Variable:,rt
No. Observations:,100000,Method:,ML
No. Groups:,50,Scale:,234.4131
Min. group size:,2000,Log-Likelihood:,-414748.0983
Max. group size:,2000,Converged:,No
Mean group size:,2000.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,397.496,0.068,5805.322,0.000,397.362,397.631
modality[T.word],24.997,0.097,258.150,0.000,24.807,25.187
Group Var,0.000,,,,,
