In [3]:
%load_ext autoreload
%autoreload 2

import os 
# Tell rpy2 where the R installation of the conda environment lives. This is set
# before the `rpy2` import below — presumably because rpy2 looks up R_HOME when
# it is imported (TODO confirm).
# NOTE(review): this is a machine-specific absolute path; switch to the matching
# line (or set R_HOME outside the notebook) when running on another machine.
os.environ['R_HOME']= r'C:\Users\tomha\miniconda3\envs\octagon_analysis\lib\R'
# os.environ['R_HOME']= r'D:\Users\Tom\miniconda3\envs\octagon_analysis\lib\R'
# os.environ['R_HOME']=r'D:\Users\Tom\miniconda3\envs\octagon_analysis\lib\R'

import rpy2

import rpy2.robjects as robjects
print(robjects.r('R.version.string'))

import parse_data.prepare_data as prepare_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import globals
import data_strings
import data_extraction.get_indices as get_indices
import analysis.wall_visibility_and_choice as wall_visibility_and_choice
from trajectory_analysis import trajectory_vectors
from plotting import plot_octagon
import parse_data.identify_filepaths as identify_filepaths 
from data_extraction.trial_list_filters import filter_trials_other_visible
from analysis import opponent_visibility
from ipywidgets import IntProgress
from IPython.display import display
import time
from pymer4.models import Lmer



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[1] "R version 4.1.3 (2022-03-10)"



### load data

In [None]:
import pickle

# Load the pre-computed analysis results. The cells below index this object as
# analysis_results[session_id][player_id][<'solo'|'social'>][...].
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that were produced by this project.
with open('./data/analysis_results_2levelsforFirstSeenWall.pkl', 'rb') as f:
    analysis_results = pickle.load(f)

#### Populate a dataframe, with a row for each trial, and fields for regressors (only including trials with fully-populated regressors)

#### Social df

In [4]:
# Build the social-context GLM dataframe: one row per trial, one block of rows
# per (session, player). Per-player frames are collected in a list and
# concatenated once at the end, instead of re-copying the growing dataframe on
# every iteration.
player_frames_social = []

for session_id, players in analysis_results.items():
    for player_id in players:

        # take each filtered_regressor array and fill the relevant df field for this player
        social_results = analysis_results[session_id][player_id]['social']
        player_data = social_results['regressors']
        choice = social_results['dependent']['choice']

        # The other player of the session (player ids are assumed to be 0 and 1).
        # Previously this expression evaluated to 1 for both players.
        opponent_player_id = 1 if player_id == 0 else 0
        # NOTE(review): currently unused — the opponent regressors below are read
        # from this player's own '*_opponent' fields.
        opponent_player_data = analysis_results[session_id][opponent_player_id]['social']['regressors']

        df_player = pd.DataFrame(
            {
                "SessionID": session_id,
                "PlayerID": player_id,
                # unique id per (session, player) pair, used as the grouping factor
                "GlmPlayerID": session_id*2 + player_id,
                "ChooseHigh": choice,
                "WallSep": player_data['wall_sep'],
                "FirstSeenWall": player_data['first_seen'],
                "D2H": player_data['d2h'],
                "D2L": player_data['d2l'],
                "OpponentVisible": player_data['opponent_visible'],
                "OpponentFirstSeenWall": player_data['first_seen_opponent'],
                "OpponentD2H": player_data['d2h_opponent'],
                "OpponentD2L": player_data['d2l_opponent'],
            }
        )

        player_frames_social.append(df_player)

# stack the per-player dataframes into the full dataframe
glm_df_social = pd.concat(player_frames_social, ignore_index=True)

# treat these regressors as categorical factors in the model
glm_df_social["FirstSeenWall"] = glm_df_social["FirstSeenWall"].astype(str).astype("category")
glm_df_social["OpponentFirstSeenWall"] = glm_df_social["OpponentFirstSeenWall"].astype(str).astype("category")

glm_df_social["WallSep"] = glm_df_social["WallSep"].astype(str).astype("category")

#### solo-social combined df

In [None]:
# Build the combined solo + social GLM dataframe: for every (session, player)
# the solo trials are followed by the social trials, and `SocialContext` marks
# which is which. Per-player frames are collected in a list and concatenated
# once, instead of re-copying the growing dataframe on every iteration.
player_frames_solo_social = []

for session_id, players in analysis_results.items():
    for player_id in players:

        # take each filtered_regressor array and fill the relevant df field for this player
        player_results = analysis_results[session_id][player_id]
        player_data_solo = player_results['solo']['regressors']
        player_data_social = player_results['social']['regressors']
        choice_solo = player_results['solo']['dependent']['choice']
        choice_social = player_results['social']['dependent']['choice']

        n_solo_trials = player_data_solo["wall_sep"].shape[0]
        n_social_trials = player_data_social["wall_sep"].shape[0]

        df_player = pd.DataFrame(
            {
                "SessionID": session_id,
                "PlayerID": player_id,
                # unique id per (session, player) pair, used as the grouping factor
                "GlmPlayerID": session_id*2 + player_id,
                "ChooseHigh": np.concatenate([choice_solo, choice_social]),
                "WallSep": np.concatenate([player_data_solo['wall_sep'], player_data_social['wall_sep']]),
                "FirstSeenWall": np.concatenate([player_data_solo['first_seen'], player_data_social['first_seen']]),
                "D2H": np.concatenate([player_data_solo['d2h'], player_data_social['d2h']]),
                "D2L": np.concatenate([player_data_solo['d2l'], player_data_social['d2l']]),
                # 0 for solo, 1 for social
                "SocialContext": np.concatenate([np.zeros(n_solo_trials), np.ones(n_social_trials)]),
            }
        )

        player_frames_solo_social.append(df_player)

# stack the per-player dataframes into the full dataframe
glm_df_solo_social = pd.concat(player_frames_solo_social, ignore_index=True)

# treat these regressors as categorical factors in the model
glm_df_solo_social["FirstSeenWall"] = glm_df_solo_social["FirstSeenWall"].astype(str).astype("category")
glm_df_solo_social["WallSep"] = glm_df_solo_social["WallSep"].astype(str).astype("category")

#### Solo df

In [3]:
# Build the solo-context GLM dataframe: one row per trial, one block of rows per
# (session, player). Per-player frames are collected in a list and concatenated
# once, instead of re-copying the growing dataframe on every iteration.
player_frames_solo = []

for session_id, players in analysis_results.items():
    for player_id in players:

        # take each filtered_regressor array and fill the relevant df field for this player
        solo_results = analysis_results[session_id][player_id]['solo']
        player_data = solo_results['regressors']
        choice = solo_results['dependent']['choice']

        df_player = pd.DataFrame(
            {
                "SessionID": session_id,
                "PlayerID": player_id,
                # unique id per (session, player) pair, used as the grouping factor
                "GlmPlayerID": session_id*2 + player_id,
                "ChooseHigh": choice,
                "WallSep": player_data['wall_sep'],
                "FirstSeenWall": player_data['first_seen'],
                "D2H": player_data['d2h'],
                "D2L": player_data['d2l'],
            }
        )

        player_frames_solo.append(df_player)

# stack the per-player dataframes into the full dataframe
glm_df_solo = pd.concat(player_frames_solo, ignore_index=True)

# treat these regressors as categorical factors in the model
glm_df_solo["FirstSeenWall"] = glm_df_solo["FirstSeenWall"].astype(str).astype("category")
glm_df_solo["WallSep"] = glm_df_solo["WallSep"].astype(str).astype("category")

In [1]:
from contextlib import redirect_stdout


def generate_leave_one_out_dataframes(df, random_state=17):
    ''' Build every leave-one-out split of a shuffled copy of `df`.

        The rows of `df` are first shuffled (reproducibly, via `random_state`) and
        re-indexed 0..len(df)-1. Position i in both returned lists therefore refers
        to row i of the SHUFFLED dataframe, not to row i of `df`.

        Note that this materialises len(df) dataframes of len(df)-1 rows each, so
        memory use grows quadratically with the number of rows.

        Arguments:
        df: DataFrame to split.
        random_state: Seed passed to `DataFrame.sample` for the shuffle
                      (default 17, the previously hard-coded value).

        Returns:
        dfs_with_row_removed: List of DataFrames, each with one row removed.
        dfs_with_removed_row: List of single-row DataFrames holding the removed row. '''

    # randomise the order of the rows
    df_shuffle = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    dfs_with_row_removed = []
    dfs_with_removed_row = []

    for i in range(len(df_shuffle)):
        # training set: everything except row i
        dfs_with_row_removed.append(df_shuffle.drop(index=i).reset_index(drop=True))

        # held-out set: row i only (double brackets keep it a DataFrame)
        dfs_with_removed_row.append(df_shuffle.iloc[[i]].reset_index(drop=True))

    return dfs_with_row_removed, dfs_with_removed_row


def select_data_for_models(dfs_with_row_removed, dfs_with_removed_row, original_df_size, n=5, random_seed=None):
    ''' Randomly pick `n` of the leave-one-out splits (without replacement).

        Arguments:
        dfs_with_row_removed: List of training DataFrames (one row removed each).
        dfs_with_removed_row: List of the matching held-out single-row DataFrames.
        original_df_size: Number of available splits; indices are drawn from
                          range(original_df_size).
        n: Number of splits to select.
        random_seed: Optional seed. When given, a private generator is seeded with
                     it, which yields the same indices as seeding the global NumPy
                     generator but leaves the global random state untouched. When
                     None, the global NumPy generator is used.

        Returns:
        dfs_with_row_removed_sampled: The selected training DataFrames.
        dfs_with_removed_row_sampled: The selected held-out DataFrames (same order).
        random_indices: Array of the selected positions. '''

    # np.random (module) and RandomState expose the same `choice` API
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)

    # randomly generate n integers in [0, original_df_size), without replacement
    random_indices = rng.choice(original_df_size, size=n, replace=False)

    # restrict both lists to only the randomly selected indices
    dfs_with_row_removed_sampled = [dfs_with_row_removed[i] for i in random_indices]
    dfs_with_removed_row_sampled = [dfs_with_removed_row[i] for i in random_indices]

    return dfs_with_row_removed_sampled, dfs_with_removed_row_sampled, random_indices

def fit_models(dfs_with_row_removed_sampled, model_formula):
    ''' Fit one binomial `Lmer` model per training DataFrame.

        A progress-bar widget is displayed and advanced after each fit. Python's
        stdout is redirected to the null device for the duration of the fitting
        loop, so anything printed inside it (including the per-model message) is
        discarded.

        Arguments:
        dfs_with_row_removed_sampled: List of training DataFrames.
        model_formula: Model formula string passed to `Lmer`.

        Returns:
        models: List of fitted models, in the same order as the input list. '''

    progress_bar = IntProgress(min=0, max=len(dfs_with_row_removed_sampled), description='Fitting models')
    display(progress_bar)

    models = []

    # silence stdout while the models are being fit
    with open(os.devnull, 'w') as sink, redirect_stdout(sink):
        for i, training_df in enumerate(dfs_with_row_removed_sampled):
            fitted = Lmer(model_formula, data=training_df, family='binomial')
            fitted.fit()
            models.append(fitted)
            print(f"Model {i} fit with {len(training_df)} rows")
            progress_bar.value += 1

    return models

def calculate_predictions(models, original_df_size, dfs_with_removed_row_sampled, random_indices):
    ''' Predict each held-out row with the model that was fit without it.

        Arguments:
        models: Fitted models; models[i] is paired with dfs_with_removed_row_sampled[i].
        original_df_size: Length of the index-preserving output array.
        dfs_with_removed_row_sampled: Held-out rows, one per model.
        random_indices: Position of each held-out row among all leave-one-out splits.

        Returns:
        predictions: Array with one prediction per model, in model order.
        predictions_maintained_index: Array of length `original_df_size`, NaN
            everywhere except at `random_indices`, where the predictions are stored. '''

    predictions_maintained_index = np.full(original_df_size, np.nan)
    predictions = np.full(len(dfs_with_removed_row_sampled), np.nan)

    for slot, fitted_model in enumerate(models):
        # predict the row that this model never saw
        held_out_row = dfs_with_removed_row_sampled[slot]
        prediction = fitted_model.predict(held_out_row, skip_data_checks=True, verify_predictions=False)

        # store it both at its split position and in model order
        predictions_maintained_index[random_indices[slot]] = prediction[0]
        predictions[slot] = prediction[0]

    return predictions, predictions_maintained_index

def calculate_likelihoods(df, predictions_maintained_index, random_indices):
    ''' Bernoulli likelihood of the observed choice under each prediction.

        For every position in `random_indices`, the predicted probability p is read
        from `predictions_maintained_index` and the observed outcome y from the
        'ChooseHigh' column of `df` at the same position; the likelihood is
        p**y * (1 - p)**(1 - y).

        Arguments:
        df: DataFrame with a 'ChooseHigh' column, indexed positionally.
        predictions_maintained_index: Array of predicted probabilities.
        random_indices: Positions to evaluate.

        Returns:
        likelihoods: Array with one likelihood per entry of `random_indices`. '''

    def _bernoulli_likelihood(p, y):
        return p**y * (1 - p)**(1 - y)

    likelihoods = np.full(len(random_indices), np.nan)
    for slot, row_position in enumerate(random_indices):
        likelihoods[slot] = _bernoulli_likelihood(
            predictions_maintained_index[row_position],
            df.iloc[row_position]['ChooseHigh'],
        )

    return likelihoods

def calculate_nll(likelihoods):
    ''' Return the negative log-likelihood: minus the sum of log(likelihood). '''
    return -np.sum(np.log(likelihoods))

def save_cross_validation_results(name, model_formula, df, random_indices, models, predictions, nll):
    ''' Pickle the cross-validation results to ./data/CV_results_<name>.pickle.

        The saved dictionary holds the name, model formula, dataframe, random
        indices, predictions and NLL. The fitted `models` are accepted but not
        written to the file. '''

    filename = f'./data/CV_results_{name}.pickle'

    cross_validation_results = dict(
        name=name,
        model_formula=model_formula,
        dataframe=df,
        random_indices=random_indices,
        # models are deliberately left out of the saved results
        predictions=predictions,
        nll=nll,
    )

    with open(filename, 'wb') as out_file:
        pickle.dump(cross_validation_results, out_file)

    print("Data saved to: ", filename)



In [8]:
def run_cross_validation(df, model_formula, name, n=50, save_results=False, random_seed=None):
    ''' Run (sub-sampled) leave-one-out cross-validation on the given dataframe and model formula.

        `n` of the leave-one-out splits are chosen at random; for each one a model
        is fit without the held-out row and then used to predict that row.

        The leave-one-out splits are generated from a SHUFFLED copy of `df`, so
        `random_indices` are positions in that shuffled copy, not in `df`. The
        likelihoods are therefore computed against the 'ChooseHigh' value of each
        held-out row itself, rather than against `df` at the same position.

        Arguments:
        df: DataFrame containing the data for cross-validation.
        model_formula: String representing the model formula for the GLM.
        name: String representing the name used when saving the results.
        n: Number of random samples to select for cross-validation.
        save_results: Boolean indicating whether to save the results to file.
        random_seed: Optional seed for the random selection of held-out rows.

        Returns:
        nll: Negative log-likelihood of the model.
        models: List of fitted models.
        random_indices: Array of random split positions used for cross-validation.
        predictions: Array of predictions from the models.
        likelihoods: Array of likelihoods calculated from the predictions. '''

    n_rows = df.shape[0]

    # Step 1: Generate leave-one-out dataframes
    dfs_with_row_removed, dfs_with_removed_row = generate_leave_one_out_dataframes(df)

    # Step 2: Select data for models
    (dfs_with_row_removed_sampled,
     dfs_with_removed_row_sampled,
     random_indices) = select_data_for_models(dfs_with_row_removed, dfs_with_removed_row, n_rows, n, random_seed=random_seed)

    # Step 3: Fit models
    models = fit_models(dfs_with_row_removed_sampled, model_formula)

    # Step 4: Calculate predictions (predictions[i] belongs to held-out row i)
    predictions, predictions_maintained_index = calculate_predictions(models, n_rows, dfs_with_removed_row_sampled, random_indices)

    # Step 5: Calculate likelihoods.
    # The true outcome must come from the very row each model predicted. Looking
    # it up in `df` by position would pair the prediction with a different trial,
    # because the held-out rows were taken from a shuffled copy of `df`.
    held_out_outcomes = pd.DataFrame(
        {"ChooseHigh": [row_df["ChooseHigh"].iloc[0] for row_df in dfs_with_removed_row_sampled]}
    )
    likelihoods = calculate_likelihoods(held_out_outcomes, predictions, np.arange(len(held_out_outcomes)))

    # Step 6: Calculate NLL
    nll = calculate_nll(likelihoods)

    # Step 7: Save results to file (optional)
    # NOTE(review): the saved `random_indices` index the shuffled copy of `df`,
    # not the saved `df` itself.
    if save_results:
        save_cross_validation_results(name, model_formula, df, random_indices, models, predictions, nll)

    return nll, models, random_indices, predictions, likelihoods

In [None]:
# Solo data, main effects only, random intercept per player.
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + (1|GlmPlayerID)'
nll, models, random_indices, predictions, likelihoods = run_cross_validation(
    df=glm_df_solo,
    model_formula=model_formula,
    name="solo_randomintercepts_400",
    n=400,
    save_results=True,
    random_seed=17,
)

In [None]:
# Solo data, main effects only, with a per-player random term on D2L
# (`(D2L||GlmPlayerID)`).
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + (D2L||GlmPlayerID)'
nll, models, random_indices, predictions, likelihoods = run_cross_validation(
    df=glm_df_solo,
    model_formula=model_formula,
    name="solo_randomintercepts_randomd2l_400",
    n=400,
    save_results=True,
    random_seed=17,
)

In [None]:
# Solo data, main effects plus the WallSep:FirstSeenWall interaction.
# NOTE(review): the saved name contains "randomd2l", but this formula only has a
# random intercept `(1|GlmPlayerID)` — confirm which of the two is intended.
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + WallSep:FirstSeenWall + (1|GlmPlayerID)'
nll, models, random_indices, predictions, likelihoods = run_cross_validation(
    df=glm_df_solo,
    model_formula=model_formula,
    name="solo_randomintercepts_randomd2l_lowinteractions_400",
    n=400,
    save_results=True,
    random_seed=17,
)

In [None]:
# Solo data, main effects plus the WallSep:FirstSeenWall and D2H:FirstSeenWall
# interactions.
# NOTE(review): the saved name contains "randomd2l", but this formula only has a
# random intercept `(1|GlmPlayerID)` — confirm which of the two is intended.
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + WallSep:FirstSeenWall + D2H:FirstSeenWall + (1|GlmPlayerID)'
nll, models, random_indices, predictions, likelihoods = run_cross_validation(
    df=glm_df_solo,
    model_formula=model_formula,
    name="solo_randomintercepts_randomd2l_midinteractions_400",
    n=400,
    save_results=True,
    random_seed=17,
)

In [None]:
# Solo data, main effects plus the WallSep:FirstSeenWall, D2H:FirstSeenWall and
# D2L:FirstSeenWall interactions.
# NOTE(review): the saved name contains "randomd2l", but this formula only has a
# random intercept `(1|GlmPlayerID)` — confirm which of the two is intended.
model_formula = 'ChooseHigh ~ 1 + D2H + D2L + FirstSeenWall + WallSep + WallSep:FirstSeenWall + D2H:FirstSeenWall + D2L:FirstSeenWall + (1|GlmPlayerID)'
nll, models, random_indices, predictions, likelihoods = run_cross_validation(
    df=glm_df_solo,
    model_formula=model_formula,
    name="solo_randomintercepts_randomd2l_allinteractions_400",
    n=400,
    save_results=True,
    random_seed=17,
)

In [None]:
# random_indices, models, predictions, likelihoods, nll

(array([ 376, 3187, 3099, 2178, 1094,  680, 3275, 2389, 3185, 1418,  248,
        1000, 3539, 3516, 2481, 2030,  767, 2264,  184, 1238, 2989, 1208,
        3695, 3177, 1074, 4280,  794, 1120, 3079, 2630, 3744, 3434, 2254,
        4387, 3135, 1719, 3800, 3824, 3219, 3791, 2652, 2397, 1051,   39,
        3016, 4548,   69, 4470, 3915, 3890, 3223,  808, 1028, 2722, 2698,
        3523, 4170, 3501,  265, 2691, 2559, 1401, 2257, 4377,  805,   65,
        3278, 2443,  684, 1565, 1965, 2633, 2791, 3853, 2575, 2715, 4477,
        2611, 1442, 1882,  243,  756,  981, 2747,  394, 4151, 1908, 1339,
        2203, 3486,  936, 3745, 2029, 4273, 2689,  676,  606, 3239,  749,
        1264,   18, 2199,  686, 1687, 4164, 1662, 2503, 3527, 2265, 3195,
        1482,  179,  866, 1995, 3034, 2111,  912, 4072, 1073, 3851, 1817,
        2384, 2857,    3,  822, 4035, 2579,  780, 1335, 1907,   29, 3491,
        1083, 3119,   72, 3794, 1985, 2196, 1691, 1972,  644, 4317, 2960,
        1718, 3156, 3966, 3797, 1706, 