In [None]:
%load_ext autoreload
%autoreload 2

import parse_data.prepare_data as prepare_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import globals
import data_strings
import data_extraction.get_indices as get_indices
import analysis.wall_visibility_and_choice as wall_visibility_and_choice
from trajectory_analysis import trajectory_vectors
from plotting import plot_octagon
import identify_filepaths
from data_extraction.trial_list_filters import filter_trials_other_visible
from analysis import opponent_visibility


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Create a dataframe to feed into a GLM using D2H, D2L, First Seen, Wall Separation, and PlayerID (random effect) to predict P(Choose High)

In [None]:
data_folder = data_strings.DATA_FOLDER
json_filenames_social, json_filenames_solo = identify_filepaths.get_filenames()

In [51]:
json_filenames_social

['240913_1\\2024-09-13_11-31-00_YW13_JL13_Social.json',
 '240927_1\\2024-09-27_14-25-20_SH27_EN27_Social.json',
 '241017_1\\2024-10-17_14-28-40_SP17_AW17_Social.json',
 '241017_2\\2024-10-17_16-41-38_ZH17_EM17_Social.json',
 '241112_1\\2024-11-12_13-31-14_KA12_WM12_Social.json',
 '241112_2\\2024-11-12_15-23-24_FA12_SL12_Social.json',
 '241113_1\\2024-11-13_14-18-54_NK13_RD13_Social.json',
 '241113_2\\2024-11-13_15-28-07_YL13_HC13_Social.json']

In [54]:
# restrict data for testing
json_filenames_social = json_filenames_social[:8]
json_filenames_solo = json_filenames_solo[:32]

In [None]:
df, trial_lists_social = prepare_data.prepare_data(data_folder, json_filenames_social, combine=False)

In [None]:
df, trial_lists_solo = prepare_data.prepare_data(data_folder, json_filenames_solo, combine=False)

In [None]:
# create a list of combined pre- and post- social solo sessions, removing 5 trials from each pre
trial_lists_combined_solo = []
cut_trials = 5
for i in range(0,len(trial_lists_solo), 2): # iterate over each individual
    # get the trial lists for both solo sessions
    trial_list_first_solo = trial_lists_solo[i]
    trial_list_second_solo = trial_lists_solo[i + 1]

    # cut first cut_trials trials (learning controls/associations) from the first solo
    trial_list_first_solo = trial_list_first_solo[cut_trials:]

    # combine trial lists from the first and second solo sessions (the current and consecutive index)
    trial_list = trial_list_first_solo + trial_list_second_solo

    trial_lists_combined_solo.append(trial_list)


In [130]:
len(trial_lists_combined_solo), len(trial_lists_social)

(16, 8)

### Regressor value extraction functions (for one session)

In [None]:
def extract_wall_sep(trial_list):
    ''' Return wall separation for one session '''
    
    wall_sep = np.full(len(trial_list), np.nan)
    for i, trial in enumerate(trial_list):
        wall_sep_this_trial = get_indices.get_wall_difference(trial=trial)
        wall_sep[i] = wall_sep_this_trial

    return wall_sep


def extract_first_wall_seen(trial_list, player_id):
    ''' Return first visible walls for one player across one session.
        1 for WALL_1, 2 for WALL_2, np.nan for no visible wall (or both initially visible) '''
        
    high_wall_first_visible_session = wall_visibility_and_choice.get_given_wall_first_visible_session(trial_list,
                                                                                                        player_id,
                                                                                                        wall_index=0,
                                                                                                        current_fov=110)

    low_wall_first_visible_session = wall_visibility_and_choice.get_given_wall_first_visible_session(trial_list,
                                                                                                        player_id, 
                                                                                                        wall_index=1,
                                                                                                        current_fov=110)
    low_wall_first_visible_session = low_wall_first_visible_session*2
    
    first_visible_session = high_wall_first_visible_session + low_wall_first_visible_session

    first_visible_session[first_visible_session == 0] = np.nan

    return first_visible_session

# double check code
def extract_distances_to_walls(trial_list, player_id):
    ''' Return a trial_num, 2 sized array, where column 1
        is distance to WALL_1, and column 2 is distance to WALL_2.
        Data applies to one full session, and specified player_id'''
    
    # get octagon alcove coordinates
    alcove_coordinates = plot_octagon.return_alcove_centre_points()

    positions_session = np.full((len(trial_list), 2), np.nan)
    walls_session = np.full((len(trial_list), 2), np.nan)
    distances_session = np.full((len(trial_list), 2), np.nan)

    # get distances for each trial in the session
    for i, trial in enumerate(trial_list):
        # get WALL_1 and WALL_2 coordinates
        trial_walls = get_indices.get_walls(trial)
        high_wall_idx = trial_walls[0] - 1
        low_wall_idx = trial_walls[1] - 1
        trial_high_coordinates = alcove_coordinates[:,high_wall_idx]
        trial_low_coordinates = alcove_coordinates[:, low_wall_idx]

        # index trajectory at timepoint 0 to get player starting coordinates
        trajectory = trajectory_vectors.extract_trial_player_trajectory(trial=trial, player_id=player_id)
        trial_start_position = trajectory[:,0]

        # find distance between start position and WALL_1/WALL_2
        d2h = np.linalg.norm(trial_high_coordinates - trial_start_position) # WALL_1
        d2l = np.linalg.norm(trial_low_coordinates - trial_start_position) # WALL_2

        walls_session[i,:] = trial_walls 
        positions_session[i,:] = trial_start_position
        distances_session[i,:] = np.hstack((d2h, d2l))

    return distances_session


def extract_opponent_visibility_slice_onset(trial_list, player_id, current_fov=110):
    ''' Return opponent visibility at slice onset for one player for one session '''
    
    # slice onset angle of Other from self centre FoV
    orientation_angle_to_other_session = opponent_visibility.get_angle_of_opponent_from_player_session(player_id, trial_list)

    # boolean array of Other visible
    other_visible_session = opponent_visibility.get_other_visible_session(orientation_angle_to_other_session, current_fov)
    other_visible_session = other_visible_session.astype(int) # converted to int for categorical regressor

    return other_visible_session


def extract_player_choice(trial_list, player_id, inferred_choice=True):
    ''' Return (inferred by default) player choice for one player for one session.
        Where inferred and actual choice are both missing, values are np.nan '''

    # array of wall numbers where player won, np.nan where player did not
    player_choice = wall_visibility_and_choice.get_player_wall_choice(trial_list, player_id,
                                                                        inferred_choice=inferred_choice, debug=False)

    # 2 where player chose High, 0 where player chose Low, np.nan where player lost
    high_wall_chosen_session = get_indices.was_given_wall_chosen(trial_list, player_choice,
                                                                    given_wall_index=0)
    high_wall_chosen_session = high_wall_chosen_session*2

    # 1 where player chose Low, 0 where player chose High, np.nan where player lost
    low_wall_chosen_session  = get_indices.was_given_wall_chosen(trial_list, player_choice,
                                                                    given_wall_index=1)

    # 1 or 2 where player chose Low or High respectively, np.nan where player lost
    chosen_wall_session = high_wall_chosen_session + low_wall_chosen_session

    return chosen_wall_session


def extract_trial_outcome(trial_list, player_id):
    ''' Return whether this player won the trial for one player for one session '''
    
    trigger_activators = get_indices.get_trigger_activators(trial_list)
    this_player_won_session = (trigger_activators-1)*-1 if player_id == 0 else trigger_activators

    return this_player_won_session

### Extract 1D arrays for each player for the regressor values (Sandbox, applied to a single trial list)

In [70]:
trial_list = trial_lists_social[6]
trial_indices = get_indices.get_trials_trialtype(trial_list, trial_type=globals.HIGH_LOW)
trial_list = [trial_list[i] for i in trial_indices]
player_id = 0
current_fov = 110

In [71]:
len(trial_list)

93

#### Wall Separation

In [None]:

wall_sep = np.full(len(trial_list), np.nan)
for i, trial in enumerate(trial_list):
    wall_sep_this_trial = get_indices.get_wall_difference(trial=trial)
    wall_sep[i] = wall_sep_this_trial


In [14]:
np.count_nonzero(wall_sep == 4)/np.count_nonzero(wall_sep)

0.3157894736842105

#### First Seen

In [16]:
    
high_wall_first_visible_session = wall_visibility_and_choice.get_given_wall_first_visible_session(trial_list,
                                                                                                    player_id,
                                                                                                    wall_index=0,
                                                                                                    current_fov=110)

low_wall_first_visible_session = wall_visibility_and_choice.get_given_wall_first_visible_session(trial_list,
                                                                                                     player_id, 
                                                                                                     wall_index=1,
                                                                                                     current_fov=110)
low_wall_first_visible_session = low_wall_first_visible_session*2



first_visible_session = high_wall_first_visible_session + low_wall_first_visible_session

In [17]:
first_visible_session

array([2., 1., 1., 2., 0., 0., 2., 1., 1., 1., 2., 2., 1., 1., 2., 2., 2.,
       2., 2., 1., 1., 0., 0., 2., 2., 0., 2., 2., 1., 2., 2., 2., 2., 2.,
       2., 2., 0., 1., 1., 2., 2., 0., 2., 1., 0., 0., 2., 2., 1., 0., 2.,
       2., 0., 2., 0., 0., 1., 1., 1., 1., 2., 2., 2., 2., 1., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 2., 2., 2., 1., 2., 2., 0., 2., 1., 1., 2.,
       2., 2., 2., 1., 1., 0., 2., 2., 2., 1., 1., 2., 1., 1., 0., 2., 1.,
       1., 2., 1., 2., 2., 2., 2., 1., 1., 2., 1., 0.])

#### Distance to High, Low
(Check code validity)

In [None]:
# get octagon alcove coordinates
alcove_coordinates = plot_octagon.return_alcove_centre_points()

start_positions = np.full((len(trial_list), 2), np.nan)
session_walls = np.full((len(trial_list), 2), np.nan)
distances = np.full((len(trial_list), 2), np.nan)

# get distances for each trial in the session
for i, trial in enumerate(trial_list):
    # get WALL_1 and WALL_2 for each trial in the session
    trial_walls = get_indices.get_walls(trial)
    high_wall_idx = trial_walls[0] - 1
    low_wall_idx = trial_walls[1] - 1
    trial_high_coordinates = alcove_coordinates[:,high_wall_idx]
    trial_low_coordinates = alcove_coordinates[:, low_wall_idx]

    # index trajectory at timepoint 0 to get starting position
    trajectory = trajectory_vectors.extract_trial_player_trajectory(trial=trial, player_id=player_id)
    trial_start_position = trajectory[:,0]

    # find distance between start position and WALL_1/WALL_2
    d2h = np.linalg.norm(trial_high_coordinates - trial_start_position) # WALL_1
    d2l = np.linalg.norm(trial_low_coordinates - trial_start_position)

    session_walls[i,:] = trial_walls 
    start_positions[i,:] = trial_start_position
    distances[i,:] = np.hstack((d2h, d2l))


#### Opponent visibility

In [82]:
# slice onset angle of Other from self centre FoV
orientation_angle_to_other_session = opponent_visibility.get_angle_of_opponent_from_player_session(player_id, trial_list)

# boolean array of Other visible
other_visible_session = opponent_visibility.get_other_visible_session(orientation_angle_to_other_session, current_fov)
other_visible_session = other_visible_session.astype(int) # converted to int for categorical regressor


In [81]:
other_visible_session

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0])

#### Player choice

In [None]:
# array of wall numbers where player choice is confident, np.nan where player lost and choice is unconfident
player_choice = wall_visibility_and_choice.get_player_wall_choice(trial_list, player_id,
                                                                    inferred_choice=True, debug=False)

# 2 where player chose High, 0 where player chose Low, np.nan where player lost
high_wall_chosen_session = get_indices.was_given_wall_chosen(trial_list, player_choice,
                                                                given_wall_index=0)
high_wall_chosen_session = high_wall_chosen_session*2

# 1 where player chose Low, 0 where player chose High, np.nan where player lost
low_wall_chosen_session  = get_indices.was_given_wall_chosen(trial_list, player_choice,
                                                                given_wall_index=1)

# 1 or 2 where player chose Low or High respectively, np.nan where player lost
chosen_wall_session_wins_and_losses = high_wall_chosen_session + low_wall_chosen_session

### Filter trials to only include those with full information for the GLM 
- Remove trials without recorded choice (np.nan in choice array) (whether I'm using inferred-choice or not)
- Remove trials without a first visible wall (np.nan in first seen array)
- Filter HighLow trials initially

The best way to do the above may be to keep an array of 'original indices', filter this array in the same way as I do my normal trial list filtering, and then I have an indices array with preserved numbering that I can use to index valid trials to add to my dataframe

In [109]:
np.sum(one_wall_first_visible_mask)

np.int64(82)

In [110]:
np.sum(retrievable_choice_mask)

np.int64(87)

In [104]:
# identify indices of trial list with HighLow trials
high_low_trial_indices = get_indices.get_trials_trialtype(trial_list, trial_type=globals.HIGH_LOW)

# get choice and first visible wall data for one player, session
player_choice_session = extract_player_choice(trial_list, player_id)
first_wall_seen_session = extract_first_wall_seen(trial_list, player_id)


# apply masks for one visible wall and a retrievable choice to the set of HighLow trials
retrievable_choice_mask = ~np.isnan(player_choice_session[high_low_trial_indices])
one_wall_first_visible_mask = ~np.isnan(first_wall_seen_session[high_low_trial_indices])

# combine masks into one 
final_mask = retrievable_choice_mask & one_wall_first_visible_mask

In [115]:
filtered_indices = high_low_trial_indices[final_mask]
filtered_trials = [trial_list[i] for i in filtered_indices]

#### Filtering function

In [131]:
def filter_valid_trial_indices(trial_list, player_id):
    ''' Return the indices of a filtered trial list that correspond to trials in which there is a 
        retrievable choice and an identifiable first seen wall.'''

    # identify indices of trial list with HighLow trials
    high_low_trial_indices = get_indices.get_trials_trialtype(trial_list, trial_type=globals.HIGH_LOW)

    # get choice and first visible wall data for one player, session
    player_choice_session = extract_player_choice(trial_list, player_id)
    first_wall_seen_session = extract_first_wall_seen(trial_list, player_id)

    # apply masks for one visible wall and a retrievable choice to the set of HighLow trials
    retrievable_choice_mask = ~np.isnan(player_choice_session[high_low_trial_indices])
    one_wall_first_visible_mask = ~np.isnan(first_wall_seen_session[high_low_trial_indices])

    # combine masks
    final_mask = retrievable_choice_mask & one_wall_first_visible_mask

    # filter the trial list indices based on masks
    filtered_indices = high_low_trial_indices[final_mask]

    assert filtered_indices.size > 0
    
    return filtered_indices

### Create a dictionary to hold, for each session and player, regressor values for the session, only including trials with fully-populated regessors
- Fields for each of the regressors applied to all trials
- Fields for each of the regressors with only valid trials


In [22]:
solo = False
player_ids = [0] if solo else [0,1]


In [None]:
analysis_results = {
    experiment_id: {
        player_id: {
            session_type: {

                'regressors': {
                    'wall_sep': None,
                    'first_seen': None,
                    'd2h': None,
                    'd2l': None,
                    'opponent_visible': None,
                },

                'dependent': {
                    'choice': None
                }
                
            }
            for session_type in ['solo', 'social']
        }   
        for player_id in player_ids
    }
    for experiment_id in np.arange(len(trial_lists_social))
}

In [None]:
analysis_results_solo = {
    session_id: {
        player_id: {

            # unfiltered regressors
            'regressors': {
                'wall_sep': None,
                'first_seen': None,
                'd2h': None,
                'd2l': None,
                'opponent_visible': None,
            },

            # regressors filtered for trials fully-populated regressor values
            'regressors_filtered': {
                'wall_sep': None,
                'first_seen': None,
                'd2h': None,
                'd2l': None,
                'opponent_visible': None,
            },

            'dependent': {
                'choice': None
            }

        }
        for player_id in player_ids
    }
    for session_id in np.arange(len(trial_lists_social))
}

### Populate the dictionary with data

In [24]:
inferred_choice = True

In [98]:
np.count_nonzero(analysis_results[14][1]['regressors_filtered']['outcome'] == 1)

48

In [121]:
len(trial_lists_solo)

32

In [None]:
for experiment_id, players in analysis_results.items():
    for player_id, data in players.items():
        
        # get the trial lists for this session and player
        trial_list_social = trial_lists_social[experiment_id]
        trial_list_solo = trial_lists_combined_solo[experiment_id*2 + player_id] # check this fits the above

        # filter the trial list for regressor valid trials
        trial_list_social = [trial_list_social[i] for i in filter_valid_trial_indices(trial_list_social, player_id)]
        trial_list_solo = [trial_list_solo[i] for i in filter_valid_trial_indices(trial_list_solo, player_id)]
        
        # regressors social
        player_data = analysis_results[experiment_id][player_id]['social']
        distances = extract_distances_to_walls(trial_list_social, player_id)
        player_data['regressors']['wall_sep'] = extract_wall_sep(trial_list_social)
        player_data['regressors']['first_seen'] = extract_first_wall_seen(trial_list_social, player_id)
        player_data['regressors']['d2h'] = distances[:,1]
        player_data['regressors']['d2l'] = distances[:,2]
        player_data['regressors']['opponent_visible'] = extract_opponent_visibility_slice_onset(trial_list_social, player_id)

        # dependent variable social
        player_data['dependent']['outcome'] = extract_trial_outcome(trial_list_social, player_id)

        # regressors solo
        player_data = analysis_results[experiment_id][player_id]['solo']





#### Populate a dataframe, with a row for each trial, and fields for regressors (only including trials with fully-populated regressors)

In [152]:
glm_df = pd.DataFrame()

for session_id, players in analysis_results.items():
    for player_id in players:
        
        # take each filtered_regressor array and fill the relevant df field for this player
        player_data = analysis_results[session_id][player_id]['regressors_filtered']
        df_player = pd.DataFrame(
                    {
                        "SessionID" : session_id,
                        "PlayerID" : player_id,
                        "GlmPlayerID" : session_id*2 + player_id,
                        "WallSep" : player_data['wall_sep'],
                        "FirstSeenWall" : player_data['first_seen'],
                        "ChooseHigh" : player_data['choice'],
                        "PlayerWin" : player_data['outcome']
                    }
        )


        # append this smaller dataframe to the the full dataframe
        glm_df = pd.concat([glm_df, df_player], ignore_index=True)

In [100]:
analysis_results[14][0]['regressors_filtered']['outcome']

array([ 1., -0., -0.,  1.,  1.,  1.,  1.,  1.,  1., -0.,  1.,  1., -0.,
       -0.,  1.,  1., -0., -0.,  1., -0.,  1.,  1., -0.,  1.,  1., -0.,
       -0., -0., -0.,  1.,  1., -0.,  1., -0.,  1.,  1.,  1., -0., -0.,
       -0., -0., -0.,  1., -0.,  1.,  1., -0.,  1., -0., -0.,  1., -0.,
       -0., -0., -0.,  1.,  1.,  1., -0., -0., -0.,  1.,  1., -0.,  1.,
        1., -0., -0.,  1., -0.])

In [118]:
glm_df[(glm_df['SessionID'] == 14) & (glm_df['PlayerID'] == 1)].iloc[-20:-10]

Unnamed: 0,SessionID,PlayerID,GlmPlayerID,WallSep,FirstSeenWall,ChooseHigh,PlayerWin
1924,14,1,29,1.0,2.0,1.0,1.0
1925,14,1,29,2.0,2.0,1.0,1.0
1926,14,1,29,1.0,2.0,2.0,1.0
1927,14,1,29,4.0,1.0,0.0,0.0
1928,14,1,29,2.0,1.0,2.0,0.0
1929,14,1,29,1.0,2.0,2.0,0.0
1930,14,1,29,2.0,1.0,2.0,1.0
1931,14,1,29,1.0,1.0,2.0,1.0
1932,14,1,29,2.0,2.0,1.0,1.0
1933,14,1,29,1.0,2.0,1.0,1.0


In [119]:
glm_df['ChooseHigh']

0       2.0
1       1.0
2       1.0
3       1.0
4       2.0
       ... 
2466    1.0
2467    1.0
2468    1.0
2469    2.0
2470    1.0
Name: ChooseHigh, Length: 2471, dtype: float64

### Build the GLM in statsmodels

In [132]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [153]:
# # Convert categorical predictors into factors
glm_df["Separation"] = glm_df["WallSep"].astype("category")
glm_df["Outcome"] = glm_df["PlayerWin"].astype("category")
glm_df["FirstSeenWall"] = glm_df["FirstSeenWall"].astype("category")

# Fit a logistic regression (GLM with binomial link)
glm_model = smf.glm(
    formula="ChooseHigh ~ WallSep + PlayerWin + FirstSeenWall",
    data=glm_df,
    family=sm.families.Binomial()
).fit()

# Display the results
print(glm_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             ChooseHigh   No. Observations:                 2471
Model:                            GLM   Df Residuals:                     2467
Model Family:                Binomial   Df Model:                            3
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                   -inf
Date:                Mon, 10 Feb 2025   Deviance:                   1.2997e+05
Time:                        15:10:07   Pearson chi2:                 7.55e+18
No. Iterations:                     4   Pseudo R-squ. (CS):                nan
Covariance Type:            nonrobust                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept             4.591e+15 

  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  n * np.log(1 - mu + 1e-20)) * var_weights


In [None]:
smf.mixedlm("Outcome ~ X1 + X2", data, groups=data["PlayerID"])

In [154]:
glmm_model = smf.mixedlm(

    formula = "ChooseHigh ~ WallSep + PlayerWin + FirstSeenWall",
    data = glm_df,
    groups = glm_df['GlmPlayerID'],
    family = sm.families.Binomial()
).fit()

print(glmm_model.summary())

ValueError: argument family not permitted for MixedLM initialization