In [19]:
%load_ext autoreload
%autoreload 2

import parse_data.prepare_data as prepare_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import globals
import data_strings
import data_extraction.get_indices as get_indices
import analysis.wall_visibility_and_choice as wall_visibility_and_choice
from trajectory_analysis import trajectory_vectors
from plotting import plot_octagon



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Create a dataframe to feed into a GLM using Choice, Outcome, Wall Separation, and PlayerID to predict P(Choose High)

In [7]:
data_folder = data_strings.DATA_FOLDER
json_filenames_all_social = data_strings.JSON_FILENAMES_SOCIAL
json_filenames_all_solo = data_strings.JSON_FILENAMES_SOLO

In [8]:
json_filenames = json_filenames_all_social[:8]

In [9]:
df, trial_lists = prepare_data.prepare_data(data_folder, json_filenames, combine=False)

filepath: C:\Users\tomha\OneDrive\PhD\SWC\data\240913_1\2024-09-13_11-31-00_YW13_JL13_Social.json
Data is from period before 2024-09-13 00:00:00
Running dataframe through playerinfo_playerposition_conversion.
Loading complete.
Preprocessing complete.
filepath: C:\Users\tomha\OneDrive\PhD\SWC\data\240927_1\2024-09-27_14-25-20_SH27_EN27_Social.json
Loading complete.
Preprocessing complete.
filepath: C:\Users\tomha\OneDrive\PhD\SWC\data\241017_1\2024-10-17_14-28-40_SP17_AW17_Social.json
Loading complete.
Preprocessing complete.
filepath: C:\Users\tomha\OneDrive\PhD\SWC\data\241017_2\2024-10-17_16-41-38_ZH17_EM17_Social.json
Loading complete.
Preprocessing complete.
filepath: C:\Users\tomha\OneDrive\PhD\SWC\data\241112_1\2024-11-12_13-31-14_KA12_WM12_Social.json
Loading complete.
Preprocessing complete.
filepath: C:\Users\tomha\OneDrive\PhD\SWC\data\241112_2\2024-11-12_15-23-24_FA12_SS12_Social.json
Loading complete.
Preprocessing complete.
filepath: C:\Users\tomha\OneDrive\PhD\SWC\data\24

### Regressor value extraction functions

In [10]:
def extract_wall_sep(trial_list):
    
    wall_sep = np.full(len(trial_list), np.nan)
    for i, trial in enumerate(trial_list):
        wall_sep_this_trial = get_indices.get_wall_difference(trial=trial)
        wall_sep[i] = wall_sep_this_trial

    return wall_sep

def extract_first_wall_seen(trial_list, player_id):
        
    high_wall_first_visible_session = wall_visibility_and_choice.get_given_wall_first_visible_session(trial_list,
                                                                                                        player_id,
                                                                                                        wall_index=0,
                                                                                                        current_fov=110)

    low_wall_first_visible_session = wall_visibility_and_choice.get_given_wall_first_visible_session(trial_list,
                                                                                                        player_id, 
                                                                                                        wall_index=1,
                                                                                                        current_fov=110)
    low_wall_first_visible_session = low_wall_first_visible_session*2
    
    first_visible_session = high_wall_first_visible_session + low_wall_first_visible_session

    return first_visible_session


def extract_player_choice(trial_list, player_id, inferred_choice):
    
    # array of wall numbers where player won, np.nan where player did not
    player_choice = wall_visibility_and_choice.get_player_wall_choice(trial_list, player_id,
                                                                        inferred_choice=inferred_choice, debug=False)

    # 2 where player chose High, 0 where player chose Low, np.nan where player lost
    high_wall_chosen_session = get_indices.was_given_wall_chosen(trial_list, player_choice,
                                                                    given_wall_index=0)
    high_wall_chosen_session = high_wall_chosen_session*2

    # 1 where player chose Low, 0 where player chose High, np.nan where player lost
    low_wall_chosen_session  = get_indices.was_given_wall_chosen(trial_list, player_choice,
                                                                    given_wall_index=1)

    # 1 or 2 where player chose Low or High respectively, np.nan where player lost
    chosen_wall_session = high_wall_chosen_session + low_wall_chosen_session

    return chosen_wall_session


def extract_trial_outcome(trial_list, player_id):
    
    trigger_activators = get_indices.get_trigger_activators(trial_list)
    this_player_won = (trigger_activators-1)*-1 if player_id == 0 else trigger_activators

    return this_player_won

### Extract 1D arrays for each player for the regressor values (Sandbox)

#### Wall Separation

In [13]:
trial_list = trial_lists[6]

wall_sep = np.full(len(trial_list), np.nan)
for i, trial in enumerate(trial_list):
    wall_sep_this_trial = get_indices.get_wall_difference(trial=trial)
    wall_sep[i] = wall_sep_this_trial


In [14]:
np.count_nonzero(wall_sep == 4)/np.count_nonzero(wall_sep)

0.3157894736842105

#### First Seen

In [15]:
player_id = 0

In [16]:
    
high_wall_first_visible_session = wall_visibility_and_choice.get_given_wall_first_visible_session(trial_list,
                                                                                                    player_id,
                                                                                                    wall_index=0,
                                                                                                    current_fov=110)

low_wall_first_visible_session = wall_visibility_and_choice.get_given_wall_first_visible_session(trial_list,
                                                                                                     player_id, 
                                                                                                     wall_index=1,
                                                                                                     current_fov=110)
low_wall_first_visible_session = low_wall_first_visible_session*2



first_visible_session = high_wall_first_visible_session + low_wall_first_visible_session

In [17]:
first_visible_session

array([2., 1., 1., 2., 0., 0., 2., 1., 1., 1., 2., 2., 1., 1., 2., 2., 2.,
       2., 2., 1., 1., 0., 0., 2., 2., 0., 2., 2., 1., 2., 2., 2., 2., 2.,
       2., 2., 0., 1., 1., 2., 2., 0., 2., 1., 0., 0., 2., 2., 1., 0., 2.,
       2., 0., 2., 0., 0., 1., 1., 1., 1., 2., 2., 2., 2., 1., 2., 2., 2.,
       2., 1., 1., 1., 1., 1., 2., 2., 2., 1., 2., 2., 0., 2., 1., 1., 2.,
       2., 2., 2., 1., 1., 0., 2., 2., 2., 1., 1., 2., 1., 1., 0., 2., 1.,
       1., 2., 1., 2., 2., 2., 2., 1., 1., 2., 1., 0.])

#### Distance to High, Low
(Check code validity)

In [None]:
# get octagon alcove coordinates
alcove_coordinates = plot_octagon.return_alcove_centre_points()

start_positions = np.full((len(trial_list), 2), np.nan)
session_walls = np.full((len(trial_list), 2), np.nan)
distances = np.full((len(trial_list), 2), np.nan)

# get distances for each trial in the session
for i, trial in enumerate(trial_list):
    # get WALL_1 and WALL_2 for each trial in the session
    trial_walls = get_indices.get_walls(trial)
    high_wall_idx = trial_walls[0] - 1
    low_wall_idx = trial_walls[1] - 1
    trial_high_coordinates = alcove_coordinates[:,high_wall_idx]
    trial_low_coordinates = alcove_coordinates[:, low_wall_idx]

    # index trajectory at timepoint 0 to get starting position
    trajectory = trajectory_vectors.extract_trial_player_trajectory(trial=trial, player_id=player_id)
    trial_start_position = trajectory[:,0]

    # find distance between start position and WALL_1/WALL_2
    d2h = np.linalg.norm(trial_high_coordinates - trial_start_position) # WALL_1
    d2l = np.linalg.norm(trial_low_coordinates - trial_start_position)

    session_walls[i,:] = trial_walls 
    start_positions[i,:] = trial_start_position
    distances[i,:] = np.hstack((d2h, d2l))








### Filter trials to only include those with full information for the GLM 
- Remove trials without recorded choice (np.nan in choice array) (whether I'm using inferred-choice or not)
- Remove trials without a first visible wall (np.nan in first seen array)
- Filter HighLow trials initially

The best way to do the above may be to keep an array of 'original indices', filter this array in the same way as I do my normal trial list filtering, and then I have an indices array with preserved numbering that I can use to index valid trials to add to my dataframe

In [17]:
# identify indices of trial list with HighLow trials
high_low_trial_indices = get_indices.get_trials_trialtype(trial_list, trial_type=globals.HIGH_LOW)

In [18]:
# apply masks for one visible wall, and a retrievable choice, to the set of HighLow trials
retrievable_choice_mask = ~np.isnan(chosen_wall_session_wins_and_losses[high_low_trial_indices])
one_wall_first_visible_mask = first_visible_session[high_low_trial_indices] > 0

# combine masks into one 
final_mask = retrievable_choice_mask & one_wall_first_visible_mask

In [19]:
filtered_indices = high_low_trial_indices[final_mask]

In [20]:
filtered_indices

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  12,  15,  17,  18,
        20,  21,  22,  24,  26,  31,  32,  33,  36,  37,  38,  39,  41,
        43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  56,  57,
        61,  64,  71,  72,  73,  77,  78,  82,  83,  85,  89,  90,  92,
        93,  95,  97,  98, 100, 101, 102, 103, 104, 105, 106, 109, 110,
       111, 113, 114, 121, 122, 123, 125, 131, 132, 134])

#### Filtering function

In [21]:
def filter_valid_trial_indices(trial_list, first_visible_session, chosen_wall_session):

    # identify indices of trial list with HighLow trials
    high_low_trial_indices = get_indices.get_trials_trialtype(trial_list, trial_type=globals.HIGH_LOW)

    # apply masks for one visible wall, and a retrievable choice, to the set of HighLow trials
    retrievable_choice_mask = ~np.isnan(chosen_wall_session[high_low_trial_indices])
    one_wall_first_visible_mask = first_visible_session[high_low_trial_indices] > 0

    # combine masks into one 
    final_mask = retrievable_choice_mask & one_wall_first_visible_mask

    filtered_indices = np.array(high_low_trial_indices)[final_mask]

    return filtered_indices

### Create a dictionary to hold, for each session and player, regressor values for the session, only including trials with fully-populated regessors
- Fields for each of the regressors applied to all trials
- Fields for each of the regressors with only valid trials


In [22]:
solo = False
player_ids = [0] if solo else [0,1]


In [23]:
analysis_results = {
    session_id: {
        player_id: {

            # unfiltered regressors
            'regressors': {
                'wall_sep': None,
                'first_seen': None,
                'choice': None,
                'outcome': None
            },

            # regressors filtered for trials fully-populated regressor values
            'regressors_filtered': {
                'wall_sep': None,
                'first_seen': None,
                'choice': None,
                'outcome': None
            }
        }
        for player_id in player_ids
    }
    for session_id in np.arange(len(trial_lists))
}

### Populate the dictionary with data

In [24]:
inferred_choice = True

In [98]:
np.count_nonzero(analysis_results[14][1]['regressors_filtered']['outcome'] == 1)

48

In [100]:
analysis_results[14][0]['regressors_filtered']['outcome']

array([ 1., -0., -0.,  1.,  1.,  1.,  1.,  1.,  1., -0.,  1.,  1., -0.,
       -0.,  1.,  1., -0., -0.,  1., -0.,  1.,  1., -0.,  1.,  1., -0.,
       -0., -0., -0.,  1.,  1., -0.,  1., -0.,  1.,  1.,  1., -0., -0.,
       -0., -0., -0.,  1., -0.,  1.,  1., -0.,  1., -0., -0.,  1., -0.,
       -0., -0., -0.,  1.,  1.,  1., -0., -0., -0.,  1.,  1., -0.,  1.,
        1., -0., -0.,  1., -0.])

In [None]:
for session_id, players in analysis_results.items():
    for player_id, data in players.items():
        
        # get the trial list for this session
        trial_list = trial_lists[session_id]
        
        # get regressors for all trials in session
        player_data = analysis_results[session_id][player_id]['regressors']
        player_data['wall_sep'] = extract_wall_sep(trial_list)
        player_data['first_seen'] = extract_first_wall_seen(trial_list, player_id)
        player_data['choice'] = extract_player_choice(trial_list, player_id, inferred_choice)
        player_data['outcome'] = extract_trial_outcome(trial_list, player_id)

        # filtered the trials to only include those with fully-populated regressors, and store for only these indices
        filtered_indices = filter_valid_trial_indices(trial_list, player_data['first_seen'], player_data['choice'])

        # account for filtering removing all trials from a player
        if filtered_indices.size > 0:
            player_data_valid_trials = analysis_results[session_id][player_id]['regressors_filtered']
            player_data_valid_trials['wall_sep'] = player_data['wall_sep'][filtered_indices]
            player_data_valid_trials['first_seen'] = player_data['first_seen'][filtered_indices]
            player_data_valid_trials['choice'] = player_data['choice'][filtered_indices]
            player_data_valid_trials['outcome'] = player_data['outcome'][filtered_indices]
        else:
            player_data_valid_trials['wall_sep'] = np.array([])
            player_data_valid_trials['first_seen'] = np.array([])
            player_data_valid_trials['choice'] = np.array([])
            player_data_valid_trials['outcome'] = np.array([])




#### Populate a dataframe, with a row for each trial, and fields for regressors (only including trials with fully-populated regressors)

In [152]:
glm_df = pd.DataFrame()

for session_id, players in analysis_results.items():
    for player_id in players:
        
        # take each filtered_regressor array and fill the relevant df field for this player
        player_data = analysis_results[session_id][player_id]['regressors_filtered']
        df_player = pd.DataFrame(
                    {
                        "SessionID" : session_id,
                        "PlayerID" : player_id,
                        "GlmPlayerID" : session_id*2 + player_id,
                        "WallSep" : player_data['wall_sep'],
                        "FirstSeenWall" : player_data['first_seen'],
                        "ChooseHigh" : player_data['choice'],
                        "PlayerWin" : player_data['outcome']
                    }
        )


        # append this smaller dataframe to the the full dataframe
        glm_df = pd.concat([glm_df, df_player], ignore_index=True)

In [118]:
glm_df[(glm_df['SessionID'] == 14) & (glm_df['PlayerID'] == 1)].iloc[-20:-10]

Unnamed: 0,SessionID,PlayerID,GlmPlayerID,WallSep,FirstSeenWall,ChooseHigh,PlayerWin
1924,14,1,29,1.0,2.0,1.0,1.0
1925,14,1,29,2.0,2.0,1.0,1.0
1926,14,1,29,1.0,2.0,2.0,1.0
1927,14,1,29,4.0,1.0,0.0,0.0
1928,14,1,29,2.0,1.0,2.0,0.0
1929,14,1,29,1.0,2.0,2.0,0.0
1930,14,1,29,2.0,1.0,2.0,1.0
1931,14,1,29,1.0,1.0,2.0,1.0
1932,14,1,29,2.0,2.0,1.0,1.0
1933,14,1,29,1.0,2.0,1.0,1.0


In [119]:
glm_df['ChooseHigh']

0       2.0
1       1.0
2       1.0
3       1.0
4       2.0
       ... 
2466    1.0
2467    1.0
2468    1.0
2469    2.0
2470    1.0
Name: ChooseHigh, Length: 2471, dtype: float64

### Build the GLM in statsmodels

In [132]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [153]:
# # Convert categorical predictors into factors
glm_df["Separation"] = glm_df["WallSep"].astype("category")
glm_df["Outcome"] = glm_df["PlayerWin"].astype("category")
glm_df["FirstSeenWall"] = glm_df["FirstSeenWall"].astype("category")

# Fit a logistic regression (GLM with binomial link)
glm_model = smf.glm(
    formula="ChooseHigh ~ WallSep + PlayerWin + FirstSeenWall",
    data=glm_df,
    family=sm.families.Binomial()
).fit()

# Display the results
print(glm_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             ChooseHigh   No. Observations:                 2471
Model:                            GLM   Df Residuals:                     2467
Model Family:                Binomial   Df Model:                            3
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                   -inf
Date:                Mon, 10 Feb 2025   Deviance:                   1.2997e+05
Time:                        15:10:07   Pearson chi2:                 7.55e+18
No. Iterations:                     4   Pseudo R-squ. (CS):                nan
Covariance Type:            nonrobust                                         
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept             4.591e+15 

  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  n * np.log(1 - mu + 1e-20)) * var_weights


In [None]:
smf.mixedlm("Outcome ~ X1 + X2", data, groups=data["PlayerID"])

In [154]:
glmm_model = smf.mixedlm(

    formula = "ChooseHigh ~ WallSep + PlayerWin + FirstSeenWall",
    data = glm_df,
    groups = glm_df['GlmPlayerID'],
    family = sm.families.Binomial()
).fit()

print(glmm_model.summary())

ValueError: argument family not permitted for MixedLM initialization