In [1]:
%load_ext autoreload
%autoreload 2

import data_strings

import os  
# Set the environment for R.
# R_HOME is chosen per machine here, ahead of the rpy2 import that follows.
laptop_r_home = r'C:\Users\tomha\miniconda3\envs\octagon_analysis\lib\R'
if os.environ.get('HOME') == '/home/tom':
    os.environ['R_HOME'] = '/home/tom/miniconda3/envs/octagon_analysis/lib/R'  # Lab desktop
elif os.path.isdir(laptop_r_home):
    # os.environ values are strings (or None when unset), so the earlier `== 1` comparison
    # could never be true and this branch was unreachable. The laptop is now recognised by
    # the presence of its R installation folder.
    # NOTE(review): confirm this is the intended way to identify the laptop.
    os.environ['R_HOME'] = laptop_r_home  # Laptop
elif data_strings.DATA_FOLDER == r'D:\Users\Tom\OneDrive\PhD\SWC\data\pseudonymised_json_files':
    os.environ['R_HOME'] = r'D:\Users\Tom\miniconda3\envs\octagon_analysis\lib\R'  # Home desktop
# elif:
    # append other machines here

import rpy2.robjects as robjects
print(robjects.r('R.version.string'))

import numpy as np
import pandas as pd
from ipywidgets import IntProgress
from IPython.display import display
from pymer4.models import Lmer
import populate_dataframes
import re




[1] "R version 4.1.3 (2022-03-10)"



In [2]:
# Number of folds: each shuffled dataframe is split into k parts further down
k = 20
# True: reshuffle the dataframes and overwrite the saved shuffle on disk.
# False: load the previously saved shuffle instead.
new_split_dataframes = False

### load data

In [3]:
import pickle

# Folder holding the pickled analysis results, relative to the working directory
analysis_dir = os.path.join('..', 'data')
# analysis_file = 'analysis_results_2levelsFirstSeenWall_normalisedWallSep.pkl'
analysis_file = 'analysis_results_3levelsFirstSeenWall_normalisedWallSep.pkl'
filename = os.path.join(analysis_dir, analysis_file)
# load the analysis results
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
with open(filename, 'rb') as f:
    analysis_results = pickle.load(f)

### populate dataframes for glm input

In [4]:
# populate dataframes for solo, solosocial, and social analysis_type
# (all three are built from the same analysis_results object; only analysis_type differs)
glm_df_solo = populate_dataframes.populate_dataframe(analysis_results, analysis_type='solo')
glm_df_solosocial = populate_dataframes.populate_dataframe(analysis_results, analysis_type='solosocial')
glm_df_social = populate_dataframes.populate_dataframe(analysis_results, analysis_type='social')


### create reference to dataframes

In [6]:
# Inspect how many rows of the social dataframe fall in each WallSep value
glm_df_social['WallSep'].value_counts()

WallSep
0.25    3888
0.50    2428
1.00    2130
Name: count, dtype: int64

In [7]:
# Name -> dataframe lookup, so the shuffle and split steps can loop over all three dataframes
dataframes = {
    'glm_df_solo': glm_df_solo,
    'glm_df_solosocial': glm_df_solosocial,
    'glm_df_social': glm_df_social
}

### shuffle the dataframes for k-fold index selection


In [8]:
shuffled_dataframes_path = os.path.join('..', 'data', 'k_fold_CV', 'shuffled_dataframes.pkl')

if new_split_dataframes:

    # Shuffle each dataframe with a fixed seed so the row order (and hence the folds) is
    # reproducible. A new dictionary is built instead of overwriting `dataframes`, so
    # re-running this cell always shuffles the original row order and gives the same result,
    # and `dataframes` holds the same (unshuffled) data whichever branch is taken.
    shuffled_dataframes = {
        name: df.sample(frac=1, random_state=17).reset_index(drop=True)
        for name, df in dataframes.items()
    }

    # make sure the output folder exists before writing to it
    os.makedirs(os.path.dirname(shuffled_dataframes_path), exist_ok=True)

    # pickle save shuffled dataframes, as one dictionary
    with open(shuffled_dataframes_path, 'wb') as f:
        pickle.dump(shuffled_dataframes, f)

else:
    # load the previously saved shuffled dataframes
    # NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
    with open(shuffled_dataframes_path, 'rb') as f:
        shuffled_dataframes = pickle.load(f)



### Split each dataframe into folds

In [9]:
# split each dataframe into k (near-)equal parts, keeping the shuffled row order.
# np.array_split is applied to the row positions rather than to the DataFrame itself:
# passing a DataFrame directly relies on the deprecated DataFrame.swapaxes in recent pandas
# versions and emits a FutureWarning. The resulting folds contain the same rows.
split_dataframes = {
    name: [df.iloc[fold_rows] for fold_rows in np.array_split(np.arange(len(df)), k)]
    for name, df in shuffled_dataframes.items()
}

  return bound(*args, **kwds)


### Cross-validation functions

In [10]:
from contextlib import redirect_stdout

def fit_models(split_df, model_formula):
    '''
    Fit one Lmer model (binomial family) per fold.

    For fold index i, every other fold in split_df is concatenated into the
    training data and fold i is left out. A progress widget is displayed and
    advanced by one after each fit.

    Args:
        split_df (list): Dataframes, one per fold.
        model_formula (str): Formula passed to Lmer.

    Returns:
        list: The fitted models, in fold order.
    '''

    n_folds = len(split_df)
    progress = IntProgress(min=0, max=n_folds, description='Fitting models')
    display(progress)

    fitted = []

    # Everything written to stdout while fitting is sent to the null device
    with open(os.devnull, 'w') as sink, redirect_stdout(sink):
        for held_out in range(n_folds):
            # Training data: all folds except the held-out one
            training_folds = [fold for idx, fold in enumerate(split_df) if idx != held_out]
            train_data = pd.concat(training_folds, ignore_index=True)

            model = Lmer(model_formula, data=train_data, family='binomial')
            # bobyqa optimizer with maxfun=10000
            model.fit(control="optimizer='bobyqa', optCtrl=list(maxfun=10000)")
            fitted.append(model)

            # this message is written while stdout is redirected, so it is not shown
            print(f"Model {held_out} fit with {len(train_data)} rows")
            progress.value += 1

    return fitted

def drop_nans_relevant_columns(df, model_formula):
    ''' Remove rows that have a NaN in any column used by the model formula.

    Only columns that contain at least one NaN are considered. A column counts as
    used when its name occurs in model_formula as a complete variable name, so a
    column such as 'D2H' is not treated as used just because 'OpponentD2H' is in
    the formula. The dependent variable is handled like any other column: if it is
    named in the formula and contains NaNs, those rows are dropped as well, since
    they have no ground truth to compare predictions against.

    Args:
        df (pd.DataFrame): Data to filter. It is not modified.
        model_formula (str): Model formula whose variables decide which columns matter.

    Returns:
        pd.DataFrame: df without the affected rows. When no NaN-containing column is
        used by the formula, df itself is returned unchanged.
    '''

    nan_counts = df.isna().sum()
    nan_containing_cols = nan_counts[nan_counts > 0].index

    relevant_cols = []
    for column in nan_containing_cols:
        # Escape the name so regex metacharacters are taken literally, and refuse matches
        # that are part of a longer identifier (letters, digits, '_' or '.' on either side)
        pattern = rf'(?<![\w.]){re.escape(str(column))}(?![\w.])'
        if re.search(pattern, model_formula):
            relevant_cols.append(column)

    if not relevant_cols:
        return df

    # a list subset drops rows with a NaN in any of the relevant columns in one pass
    return df.dropna(subset=relevant_cols)

def calculate_predictions(split_df, models):
    '''
    Generate predictions for each fold with the model of the same index.

    Model i is applied to a copy of split_df[i], after drop_nans_relevant_columns
    has removed the rows with NaNs in columns named in that model's formula.

    Args:
        split_df (list): Dataframes, one per fold.
        models (list): Fitted models, one per fold, each exposing .formula and .predict.

    Returns:
        tuple: (predictions per fold, filtered dataframe per fold that was predicted on).
    '''

    fold_predictions = []
    fold_inputs = []

    for fold_idx, fitted_model in enumerate(models):
        # copy the fold, then remove rows with NaNs in columns the formula refers to
        held_out = drop_nans_relevant_columns(split_df[fold_idx].copy(), fitted_model.formula)

        # kept so the rows behind each prediction are available later
        fold_inputs.append(held_out)

        fold_predictions.append(
            fitted_model.predict(held_out, skip_data_checks=True, verify_predictions=False)
        )

    return fold_predictions, fold_inputs



def calculate_likelihoods(split_df, predictions, model_formula):
    '''
    Calculate the likelihood of each prediction given the true output.

    The likelihood is p^y * (1-p)^(1-y), where p is the predicted probability and
    y is the true output (0 or 1) taken from the 'ChooseHigh' column.

    For every fold, split_df[i] is filtered with drop_nans_relevant_columns using
    model_formula, which must leave exactly the rows that predictions[i] was computed
    for. Rows whose ground truth is NaN (social trials without confident inferred
    choices) are excluded from the result.

    Args:
        split_df (list): Dataframes, one per fold.
        predictions (list): Predicted probabilities per fold, same order as split_df.
        model_formula (str): Formula of the model that produced the predictions.

    Returns:
        list: One float array of likelihoods per fold.

    Raises:
        IndexError: If a fold has a different number of predictions than filtered rows.
    '''

    fold_likelihoods = []
    for fold_idx, prediction_fold in enumerate(predictions):

        predicted_output = np.asarray(prediction_fold, dtype=float)

        # apply the same row filter that was used when the predictions were made
        true_output = drop_nans_relevant_columns(split_df[fold_idx].copy(), model_formula)

        if len(predicted_output) != len(true_output):
            raise IndexError(
                f"Fold {fold_idx}: {len(predicted_output)} predictions do not match "
                f"{len(true_output)} ground-truth rows"
            )

        # only calculate the likelihood for non-NaN ground truth values.
        # A plain NumPy mask is used so indexing never depends on the pandas index.
        ground_truth = true_output['ChooseHigh'].to_numpy(dtype=float)
        nonnan_mask = ~np.isnan(ground_truth)
        ground_truth = ground_truth[nonnan_mask]
        predicted_output = predicted_output[nonnan_mask]

        # vectorised p^y * (1-p)^(1-y)
        likelihoods = predicted_output**ground_truth * (1 - predicted_output)**(1 - ground_truth)

        # report any NaN likelihoods (e.g. from a NaN prediction) to help debugging
        for row in np.flatnonzero(np.isnan(likelihoods)):
            prediction = predicted_output[row]
            truth = ground_truth[row]
            print(f"NaN likelihood for prediction {prediction} and ground truth {truth}")
            print(f"prediction**ground_truth: {prediction**truth}")
            print(f"(1 - prediction)**(1 - ground_truth): {(1 - prediction)**(1 - truth)}")

        fold_likelihoods.append(likelihoods)

    return fold_likelihoods

def calculate_nlls(fold_likelihoods):
    '''
    Convert per-fold likelihoods into average negative log likelihoods (NLL).

    For each fold the negative log of every likelihood is taken and averaged,
    ignoring NaNs; the per-fold average is printed. The overall value is the
    NaN-ignoring mean of the per-fold averages.

    Args:
        fold_likelihoods (list): One array of likelihoods per fold.

    Returns:
        tuple: (array of per-fold average NLLs, mean of those averages).
    '''

    per_fold = np.full(len(fold_likelihoods), np.nan)

    for fold_idx, likelihoods in enumerate(fold_likelihoods):
        mean_nll = np.nanmean(-np.log(likelihoods))
        print(f"Fold {fold_idx} average NLL: {mean_nll}")
        per_fold[fold_idx] = mean_nll

    return per_fold, np.nanmean(per_fold)


def save_cross_validation_results(name, model_formula, split_df, num_folds, predictions,
                                  predict_data, fold_avg_nlls, model_avg_nll):
    ''' Save the cross-validation results to a pickle file.

    The file is written to ../data/k_fold_CV/{num_folds}-fold-CV_results_{name}.pickle,
    relative to the current working directory. The folder is created if it does not
    exist yet, and the path is printed once the file has been written.

      Args:
        name (str): Name of the model type.
        model_formula (str): The formula used for the model.
        split_df (list): List of dataframes for each fold.
        num_folds (int): Number of folds in the cross-validation.
        predictions (list): List of arrays of predictions for each fold.
        predict_data (list): List of dataframes used for predictions (different to split_df
        in that all of the rows with nans in relevant prediction columns are removed).
        fold_avg_nlls (np.array): Average negative log likelihoods for each fold.
        model_avg_nll (float): Average negative log likelihood across all folds.'''

    # the fitted models themselves are not stored
    cross_validation_results = {
        "name": name,
        "model_formula": model_formula,
        "split_df": split_df,
        "num_folds" : num_folds,
        "predictions" : predictions,
        "predict_data": predict_data,
        "fold_nlls" : fold_avg_nlls,
        "average_nll" : model_avg_nll
    }

    # Save the cross-validation results to a file, creating the folder on first use
    output_dir = os.path.join('..', 'data', 'k_fold_CV')
    os.makedirs(output_dir, exist_ok=True)
    filename = f'{num_folds}-fold-CV_results_{name}.pickle'
    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'wb') as f:
        pickle.dump(cross_validation_results, f)

    print(f"{num_folds}-fold CV data saved to: ", filepath)

In [11]:
def run_cross_validation(split_df, model_formula, name, save_results=False):
    '''
    Run k-fold cross-validation for one model formula.

    The folds in split_df are passed through fit_models, calculate_predictions,
    calculate_likelihoods and calculate_nlls in turn. When save_results is True
    the outcome is also written to disk by save_cross_validation_results under
    the given name.

    Returns a 4-tuple:
    - the average negative log likelihood (NLL) across all folds,
    - a list of the fitted model for each fold,
    - a list of arrays of predictions for each fold,
    - the average NLL for each fold.
    '''

    num_folds = len(split_df)

    # fit -> predict -> likelihoods -> NLLs
    fitted_models = fit_models(split_df, model_formula)
    held_out_predictions, held_out_data = calculate_predictions(split_df, fitted_models)
    likelihoods_per_fold = calculate_likelihoods(split_df, held_out_predictions, model_formula)
    nll_per_fold, mean_nll = calculate_nlls(likelihoods_per_fold)

    # optional: persist the results (the filtered prediction data is saved but not returned)
    if save_results:
        save_cross_validation_results(name, model_formula, split_df, num_folds,
                                      held_out_predictions, held_out_data,
                                      nll_per_fold, mean_nll)

    return mean_nll, fitted_models, held_out_predictions, nll_per_fold


### Solo models

In [17]:
# Solo data, distances only: D2H and D2L with a random intercept per GlmPlayerID
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + (1|GlmPlayerID)'
(model_avg_nll, models,
predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solo'], model_formula,
                                                    "solo_randomintercepts_onlydistance",
                                                      save_results=True)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.2766952133024275
Fold 1 average NLL: 0.29547240143271825
Fold 2 average NLL: 0.35201941241464557
Fold 3 average NLL: 0.3050664850705333
Fold 4 average NLL: 0.27903473587078614
Fold 5 average NLL: 0.32706753096465824
Fold 6 average NLL: 0.33218722977686727
Fold 7 average NLL: 0.2911329382949599
Fold 8 average NLL: 0.291881688905881
Fold 9 average NLL: 0.33534334032056234
Fold 10 average NLL: 0.2609944338529338
Fold 11 average NLL: 0.31158630675134713
Fold 12 average NLL: 0.3202526104138483
Fold 13 average NLL: 0.3755990009843784
Fold 14 average NLL: 0.26480240115494963
Fold 15 average NLL: 0.286986946589787
Fold 16 average NLL: 0.2842538365962073
Fold 17 average NLL: 0.3213138941841158
Fold 18 average NLL: 0.3050602679679428
Fold 19 average NLL: 0.3462938016127225
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solo_randomintercepts_onlydistance.pickle


In [18]:
# model_formula = 'ChooseHigh ~ 1 + FirstSeenWall + (1|GlmPlayerID)'
# (model_avg_nll, models,
#   predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solo'], model_formula,
#                                                     "solo_randomintercepts_firstseenonly",
#                                                       save_results=True)

In [19]:
# model_formula = 'ChooseHigh ~ 1 + WallSep + FirstSeenWall + (1|GlmPlayerID)'
# (model_avg_nll, models,
#   predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solo'], model_formula,
#                                                     "solo_randomintercepts_wallsep_firstseen",
#                                                       save_results=True)

In [20]:
# Solo data: distances plus WallSep, random intercept per GlmPlayerID
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + WallSep + (1|GlmPlayerID)'
(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solo'], model_formula,
                                                    "solo_randomintercepts_distancepluswallsep",
                                                      save_results=True)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.2608716532217139
Fold 1 average NLL: 0.28423615569857774
Fold 2 average NLL: 0.3537761641730709
Fold 3 average NLL: 0.2765300163988237
Fold 4 average NLL: 0.268371630167254
Fold 5 average NLL: 0.32541908624876065
Fold 6 average NLL: 0.3232770412527495
Fold 7 average NLL: 0.2828335964917251
Fold 8 average NLL: 0.2721974469167486
Fold 9 average NLL: 0.316280123888434
Fold 10 average NLL: 0.2647775482642371
Fold 11 average NLL: 0.3004453462536089
Fold 12 average NLL: 0.2994898980519323
Fold 13 average NLL: 0.3555282746523882
Fold 14 average NLL: 0.2575248079485923
Fold 15 average NLL: 0.26859325569930953
Fold 16 average NLL: 0.2846100754785221
Fold 17 average NLL: 0.30559057058977246
Fold 18 average NLL: 0.29440382299777795
Fold 19 average NLL: 0.3404700918331636
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solo_randomintercepts_distancepluswallsep.pickle


In [21]:
# Solo data: distances, FirstSeenWall and WallSep as main effects, random intercept per GlmPlayerID
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + (1|GlmPlayerID)'
(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solo'], model_formula,
                                                    "solo_randomintercepts",
                                                      save_results=True)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.2028514652524724
Fold 1 average NLL: 0.22639336248567762
Fold 2 average NLL: 0.3175652338695547
Fold 3 average NLL: 0.2511471480833932
Fold 4 average NLL: 0.23648463508359766
Fold 5 average NLL: 0.2429810302280435
Fold 6 average NLL: 0.26330095099959133
Fold 7 average NLL: 0.21977558989693743
Fold 8 average NLL: 0.2418839827183015
Fold 9 average NLL: 0.23474079654119584
Fold 10 average NLL: 0.22813391568421867
Fold 11 average NLL: 0.2381150325607726
Fold 12 average NLL: 0.22334582116974142
Fold 13 average NLL: 0.2786534588603849
Fold 14 average NLL: 0.20793963417313363
Fold 15 average NLL: 0.2254346650344118
Fold 16 average NLL: 0.21139702041152308
Fold 17 average NLL: 0.20681982968963997
Fold 18 average NLL: 0.2085992434064052
Fold 19 average NLL: 0.2734534861790109
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solo_randomintercepts.pickle


In [22]:
# model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + (1|GlmPlayerID)'
# (model_avg_nll, models,
#   predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solo'], model_formula,
#                                                     "solo_randomintercepts_distanceplusfirstseen",
#                                                       save_results=True)

In [23]:
# Solo data: main effects plus the D2H:FirstSeenWall and D2L:FirstSeenWall interactions
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + D2H:FirstSeenWall + D2L:FirstSeenWall + (1|GlmPlayerID)'
(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solo'], model_formula,
                                                    "solo_randomintercepts_distancesinteraction",
                                                      save_results=True)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.20014338210088178
Fold 1 average NLL: 0.22826607915380842
Fold 2 average NLL: 0.30784508589420334
Fold 3 average NLL: 0.24507121754761882
Fold 4 average NLL: 0.24380197133039772
Fold 5 average NLL: 0.23714140504291556
Fold 6 average NLL: 0.26320417954844183
Fold 7 average NLL: 0.22047372125783424
Fold 8 average NLL: 0.2490422880612596
Fold 9 average NLL: 0.2339755197248215
Fold 10 average NLL: 0.2226618990053057
Fold 11 average NLL: 0.23574990613721608
Fold 12 average NLL: 0.22613322693950336
Fold 13 average NLL: 0.2790930080832302
Fold 14 average NLL: 0.20224940817251963
Fold 15 average NLL: 0.22382585923474965
Fold 16 average NLL: 0.21301978349971998
Fold 17 average NLL: 0.2053446185693556
Fold 18 average NLL: 0.20170337557385334
Fold 19 average NLL: 0.2679330262080944
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solo_randomintercepts_distancesinteraction.pickle


In [24]:
# Solo data: main effects plus the distance:FirstSeenWall and FirstSeenWall:WallSep interactions.
# The results are unpacked into the same names as every other model cell: the first value is
# the model's average NLL and the last holds the per-fold average NLLs (not likelihoods).
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + D2H:FirstSeenWall + D2L:FirstSeenWall + FirstSeenWall:WallSep + (1|GlmPlayerID)'
(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solo'], model_formula,
                                                    "solo_randomintercepts_distancesandwallsepinteraction",
                                                      save_results=True)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.19983984801028123
Fold 1 average NLL: 0.22225474962594832
Fold 2 average NLL: 0.29599880530204975
Fold 3 average NLL: 0.23961011967606782
Fold 4 average NLL: 0.24788890002978067
Fold 5 average NLL: 0.22847044968944616
Fold 6 average NLL: 0.2512963368373837
Fold 7 average NLL: 0.20964622782961265
Fold 8 average NLL: 0.24624720612164097
Fold 9 average NLL: 0.2303474004048569
Fold 10 average NLL: 0.21832877693643668
Fold 11 average NLL: 0.2309651116179803
Fold 12 average NLL: 0.22073586117716654
Fold 13 average NLL: 0.28777764011756846
Fold 14 average NLL: 0.19779428512453878
Fold 15 average NLL: 0.22797894000434438
Fold 16 average NLL: 0.21018117645372303
Fold 17 average NLL: 0.20596021564579636
Fold 18 average NLL: 0.19720898426324812
Fold 19 average NLL: 0.2669010004274252
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solo_randomintercepts_distancesandwallsepinteraction.pickle


### Social models

In [None]:
# Social data: same main effects as the solo model, without any opponent terms
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + (1|GlmPlayerID)'
(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_social'], model_formula,
                                                    "social_randomintercepts_no-opponentvisible",
                                                      save_results=True)

In [None]:
# Social data: main effects plus OpponentVisible
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + OpponentVisible + (1|GlmPlayerID)'

(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_social'], model_formula,
                                                    "social_randomintercepts_opponentvisible",
                                                      save_results=True)

In [None]:
# Social data: main effects plus the opponent distances OpponentD2H and OpponentD2L
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep  + OpponentD2H' \
' + OpponentD2L + (1|GlmPlayerID)'

(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_social'], model_formula,
                                                    "social_randomintercepts_no-opponentvisible_opponentdistance",
                                                      save_results=True)

In [None]:
# Social data: main effects, opponent distances and OpponentFirstSeenWall
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep  + OpponentD2H' \
' + OpponentD2L + OpponentFirstSeenWall + (1|GlmPlayerID)'

(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_social'], model_formula,
                                                    "social_randomintercepts_no-opponentvisible_opponentdistance_opponentfirstseen",
                                                      save_results=True)

In [None]:
# Social data: opponent terms plus the FirstSeenWall:WallSep and distance:FirstSeenWall interactions
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep  + OpponentD2H' \
' + OpponentD2L + OpponentFirstSeenWall + FirstSeenWall:WallSep + D2H:FirstSeenWall + D2L:FirstSeenWall + (1|GlmPlayerID)'

(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_social'], model_formula,
                                                    "social_randomintercepts_no-opponentvisible_opponentdistance_opponentfirstseen_solointeractions",
                                                      save_results=True)

In [None]:
# Social data: as the previous formula, plus the D2H:OpponentD2H and D2L:OpponentD2L interactions
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep  + OpponentD2H' \
' + OpponentD2L + OpponentFirstSeenWall + FirstSeenWall:WallSep + D2H:FirstSeenWall + D2L:FirstSeenWall' \
' + D2H:OpponentD2H + D2L:OpponentD2L + (1|GlmPlayerID)'

(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_social'], model_formula,
                                                    "social_randomintercepts_no-opponentvisible_opponentdistance_opponentfirstseen_solo-and-social-interactions",
                                                      save_results=True)

### Solo-social models

In [12]:
# Solo-social data: main effects only, no SocialContext term
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + (1 |GlmPlayerID)'
(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solosocial'], model_formula,
                                                    "solosocial_randomintercepts", 
                                                      save_results=True)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.44323099397258114
Fold 1 average NLL: 0.4086499953412541
Fold 2 average NLL: 0.44923306186286266
Fold 3 average NLL: 0.44458361398484164
Fold 4 average NLL: 0.4008870885123465
Fold 5 average NLL: 0.3765003028731802
Fold 6 average NLL: 0.44218749475310126
Fold 7 average NLL: 0.4870591465294052
Fold 8 average NLL: 0.44195917475551594
Fold 9 average NLL: 0.40260156832320965
Fold 10 average NLL: 0.41531309159036767
Fold 11 average NLL: 0.4374880427427129
Fold 12 average NLL: 0.43156253283252405
Fold 13 average NLL: 0.41741624699418894
Fold 14 average NLL: 0.38879645885512537
Fold 15 average NLL: 0.4650907073892123
Fold 16 average NLL: 0.4161268917373218
Fold 17 average NLL: 0.4586117770733195
Fold 18 average NLL: 0.4278904590960472
Fold 19 average NLL: 0.39610183418212785
20-fold CV data saved to:  ..\data\k_fold_CV\20-fold-CV_results_solosocial_randomintercepts.pickle


In [13]:
# Solo-social data: main effects plus SocialContext
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + SocialContext + (1 |GlmPlayerID)'
(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solosocial'], model_formula,
                                                    "solosocial_randomintercepts_plus-socialcontext", 
                                                      save_results=True)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.3850923702353639
Fold 1 average NLL: 0.3619628082892744
Fold 2 average NLL: 0.38782187462559164
Fold 3 average NLL: 0.4013579832536461
Fold 4 average NLL: 0.35853661711954526
Fold 5 average NLL: 0.3346764705585567
Fold 6 average NLL: 0.3743304154714246
Fold 7 average NLL: 0.43506792389723287
Fold 8 average NLL: 0.3983580424434907
Fold 9 average NLL: 0.3485542541173233
Fold 10 average NLL: 0.3567213691436151
Fold 11 average NLL: 0.37745269797401126
Fold 12 average NLL: 0.36347606513353614
Fold 13 average NLL: 0.39281605353113963
Fold 14 average NLL: 0.33511785095594454
Fold 15 average NLL: 0.4271343490234305
Fold 16 average NLL: 0.3502830178873332
Fold 17 average NLL: 0.39761965908475394
Fold 18 average NLL: 0.3744165026712919
Fold 19 average NLL: 0.3617709229894236
20-fold CV data saved to:  ..\data\k_fold_CV\20-fold-CV_results_solosocial_randomintercepts_plus-socialcontext.pickle


In [57]:
# Solo-social data: SocialContext plus the FirstSeenWall:WallSep and distance:FirstSeenWall interactions
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + SocialContext +' \
' FirstSeenWall:WallSep + D2H:FirstSeenWall + D2L:FirstSeenWall + (1 |GlmPlayerID)'
(model_avg_nll, models,
  predictions, fold_avg_nlls) = run_cross_validation(split_dataframes['glm_df_solosocial'], model_formula,
                                                    "solosocial_randomintercepts_socialcontext_fullsolointeractions", 
                                                      save_results=True)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.38777123463247526
Fold 1 average NLL: 0.341216131496972
Fold 2 average NLL: 0.37322288849466134
Fold 3 average NLL: 0.405168205900133
Fold 4 average NLL: 0.35741841963081933
Fold 5 average NLL: 0.31961069738885306
Fold 6 average NLL: 0.34659226712489855
Fold 7 average NLL: 0.42223155486492714
Fold 8 average NLL: 0.37123444560587965
Fold 9 average NLL: 0.34640983274946663
Fold 10 average NLL: 0.34303172033953433
Fold 11 average NLL: 0.35486445854223275
Fold 12 average NLL: 0.3544704784472647
Fold 13 average NLL: 0.3802617000055061
Fold 14 average NLL: 0.3284275930332824
Fold 15 average NLL: 0.41465735542363097
Fold 16 average NLL: 0.32915292249584144
Fold 17 average NLL: 0.38490522560333484
Fold 18 average NLL: 0.3674733376991403
Fold 19 average NLL: 0.3670216238971828
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solosocial_randomintercepts_socialcontext_fullsolointeractions.pickle


In [58]:
# Random-intercept model on the solo+social data: main effects plus
# SocialContext and its interaction with FirstSeenWall.
# The formula is assembled term-by-term so each predictor is easy to audit;
# the joined string is identical to the single-literal form.
model_formula = 'ChooseHigh ~ ' + ' + '.join([
    # intercept and main effects
    '1', 'D2H', 'D2L', 'FirstSeenWall', 'WallSep', 'SocialContext',
    # SocialContext x FirstSeenWall interaction
    'SocialContext:FirstSeenWall',
    # random intercept grouped by GlmPlayerID
    '(1 |GlmPlayerID)',
])

# The third argument is the run label; it shows up in the name of the pickle
# reported by the "CV data saved to" line of the cell output.
model_avg_nll, models, predictions, fold_avg_nlls = run_cross_validation(
    split_dataframes['glm_df_solosocial'],
    model_formula,
    "solosocial_randomintercepts_plus-socialcontext_fsw-interactions",
    save_results=True,
)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.3846275075567164
Fold 1 average NLL: 0.36234343514584305
Fold 2 average NLL: 0.3873502459973704
Fold 3 average NLL: 0.4012546182150781
Fold 4 average NLL: 0.36096868575623176
Fold 5 average NLL: 0.33401574969543896
Fold 6 average NLL: 0.375139690964165
Fold 7 average NLL: 0.43611067534315373
Fold 8 average NLL: 0.3974477870270088
Fold 9 average NLL: 0.34890755122680966
Fold 10 average NLL: 0.35599937451690267
Fold 11 average NLL: 0.3768744017289001
Fold 12 average NLL: 0.3623506470728137
Fold 13 average NLL: 0.392605874329421
Fold 14 average NLL: 0.3346935919225609
Fold 15 average NLL: 0.4272139610391643
Fold 16 average NLL: 0.3506504046830323
Fold 17 average NLL: 0.39802496923631553
Fold 18 average NLL: 0.3742834116436786
Fold 19 average NLL: 0.3612375214981439
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solosocial_randomintercepts_plus-socialcontext_fsw-interactions.pickle


In [59]:
# Random-intercept model on the solo+social data: main effects plus
# SocialContext and its interactions with the two distance predictors.
# The formula is assembled term-by-term so each predictor is easy to audit;
# the joined string is identical to the single-literal form.
model_formula = 'ChooseHigh ~ ' + ' + '.join([
    # intercept and main effects
    '1', 'D2H', 'D2L', 'FirstSeenWall', 'WallSep', 'SocialContext',
    # SocialContext x distance interactions
    'D2H:SocialContext', 'D2L:SocialContext',
    # random intercept grouped by GlmPlayerID
    '(1 |GlmPlayerID)',
])

# NOTE(review): the run label below contains "fsw-interactions", but this
# formula has no SocialContext:FirstSeenWall term. Confirm whether the label
# or the formula reflects the intended model before comparing saved results.
# The label is kept unchanged because it determines the saved pickle's name.
model_avg_nll, models, predictions, fold_avg_nlls = run_cross_validation(
    split_dataframes['glm_df_solosocial'],
    model_formula,
    "solosocial_randomintercepts_plus-socialcontext_fsw-interactions_socialcontext-dist_interactions",
    save_results=True,
)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.38596294642034606
Fold 1 average NLL: 0.35639297556985194
Fold 2 average NLL: 0.3800264229777872
Fold 3 average NLL: 0.40143100767363765
Fold 4 average NLL: 0.35728499255787666
Fold 5 average NLL: 0.32811226156624623
Fold 6 average NLL: 0.3671101552968014
Fold 7 average NLL: 0.4278953364125747
Fold 8 average NLL: 0.39380652398587057
Fold 9 average NLL: 0.3508724544916122
Fold 10 average NLL: 0.3466662525643493
Fold 11 average NLL: 0.37569896484303067
Fold 12 average NLL: 0.36171663114662195
Fold 13 average NLL: 0.3865608456688019
Fold 14 average NLL: 0.3331236527152994
Fold 15 average NLL: 0.41434044610366466
Fold 16 average NLL: 0.3439543693865903
Fold 17 average NLL: 0.3895402551270885
Fold 18 average NLL: 0.36876876164637773
Fold 19 average NLL: 0.361714927831515
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solosocial_randomintercepts_plus-socialcontext_fsw-interactions_socialcontext-dist_interactions.pickle


In [60]:
# Random-intercept model on the solo+social data: main effects plus
# SocialContext, its interactions with the two distance predictors, and the
# FirstSeenWall interaction terms.
# The formula is assembled term-by-term so each predictor is easy to audit;
# the joined string is identical to the two-literal form.
model_formula = 'ChooseHigh ~ ' + ' + '.join([
    # intercept and main effects
    '1', 'D2H', 'D2L', 'FirstSeenWall', 'WallSep', 'SocialContext',
    # SocialContext x distance interactions
    'D2H:SocialContext', 'D2L:SocialContext',
    # interactions with FirstSeenWall
    'FirstSeenWall:WallSep', 'D2H:FirstSeenWall', 'D2L:FirstSeenWall',
    # random intercept grouped by GlmPlayerID
    '(1 |GlmPlayerID)',
])

# NOTE(review): the run label below contains "fsw-interactions", but this
# formula has no SocialContext:FirstSeenWall term. Confirm whether the label
# or the formula reflects the intended model before comparing saved results.
# The label is kept unchanged because it determines the saved pickle's name.
model_avg_nll, models, predictions, fold_avg_nlls = run_cross_validation(
    split_dataframes['glm_df_solosocial'],
    model_formula,
    "solosocial_randomintercepts_plus-socialcontext_fsw-interactions_socialcontext-dist-interactions_solo-interactions",
    save_results=True,
)

IntProgress(value=0, description='Fitting models', max=20)

  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(
  ran_vars = ran_vars.applymap(


Fold 0 average NLL: 0.3905287063059076
Fold 1 average NLL: 0.33633441285843413
Fold 2 average NLL: 0.3688640269729019
Fold 3 average NLL: 0.40403817613974463
Fold 4 average NLL: 0.35445463902409946
Fold 5 average NLL: 0.3146852458950841
Fold 6 average NLL: 0.3422057833672172
Fold 7 average NLL: 0.4167291459502028
Fold 8 average NLL: 0.3664822810411201
Fold 9 average NLL: 0.34714920126273013
Fold 10 average NLL: 0.33521781978902837
Fold 11 average NLL: 0.352955803355181
Fold 12 average NLL: 0.35272081308056075
Fold 13 average NLL: 0.3744979056345644
Fold 14 average NLL: 0.32642372297384353
Fold 15 average NLL: 0.40513215048725115
Fold 16 average NLL: 0.3248745211525455
Fold 17 average NLL: 0.3772293241899721
Fold 18 average NLL: 0.36245558666022365
Fold 19 average NLL: 0.3651005096239049
20-fold CV data saved to:  ../data/k_fold_CV/20-fold-CV_results_solosocial_randomintercepts_plus-socialcontext_fsw-interactions_socialcontext-dist-interactions_solo-interactions.pickle
