In [1]:
# IPython magic tools
%load_ext autoreload
%autoreload 2

from sklearn.model_selection import StratifiedKFold, KFold
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

import os 
import pandas as pd
import seaborn as sns
sns.set_context('talk')
import matplotlib.pyplot as plt

import sys
sys.path.append("G:/My Drive/WORKING_MEMORY/EXPERIMENTS/ELECTROPHYSIOLOGY/ANALYSIS/src/functions/")
import ephys_functions as ephys
import model_functions as mod
import behavioral_functions as beh

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings. filterwarnings('ignore', category=UserWarning)
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Folder holding the per-session summary files that the decoding cells read.
path_to_data = r'C:\Users\tiffany.ona\Documents\Ephys\summary_complete'

# Output folders for the resubmission; both live under the same root.
# They are not referenced by the cells visible in this part of the notebook.
_resubmission_root = r'G:\My Drive\WORKING_MEMORY\PAPER\2ND_SUBMISSION_NAT_NEURO'
save_data = _resubmission_root + '\\' + 'data_for_resubmission'
save_figures = _resubmission_root + '\\' + 'figures_for_resubmission'

____________________
# Decode external drive across different conditions

### **Look for the code splitting and aligning on stimulus and delay_OFF**

In [None]:
# Decode the `drive` label session by session. For every cross-validation fold
# the decoder output is plotted aligned to Stimulus_ON (left panel) and to
# Delay_OFF (right panel), and appended to the cumulative summaries below.

# Cumulative summaries, updated through ephys.add_to_summary on every fold.
df_cum_res = pd.DataFrame()      # Delay_OFF-aligned results
df_cum_shuffle = pd.DataFrame()  # shuffle summary, shared by both alignments
df_cum_sti = pd.DataFrame()      # Stimulus_ON-aligned results

# ---- Variables used for decoder training (identical for every session) ----
nsplits = 2  # number of splits of the Kfold
decode = 'drive'  # decoding variable
align = 'Delay_OFF'  # aligning epoch
r = 0.25  # binning window
start = -0.25  # Window to select the training window/obtain number of trials for each condition
stop = 0.0  # Window to select the training window/obtain number of trials for each condition
variable_train = 'WM_roll'
hit_train = 1
ratio_train = 0.6

# ---- Variables for testing ----
# The test loop zips these lists, so it stops at the shortest one: with a
# single colour only the first condition (WM_roll, hit == 1) is evaluated.
colors = ['indigo']
variables_test = ['WM_roll', 'RL_roll']
hits_test = [1, 'all']
ratios_test = [0.6, 0.4]
variables_combined = [variables_test[0] + '_' + str(hits_test[0])]

# variables_combined = [variables_test[0]+'_'+str(hits_test[0]),variables_test[1]+'_'+str(hits_test[1])]

# Extra control
reduce_WM = 'all'  # Reduce the number of WM trials to match RL correct numbers (either all or only correct)

# The sessions are read from path_to_data, so the listing has to come from the
# same folder (listing the current working directory only worked when the
# kernel happened to be chdir'ed there already).
# NOTE(review): the first two directory entries are skipped, as before.
# os.listdir returns entries in arbitrary order, so confirm which files this is
# meant to exclude.
for filename in os.listdir(path_to_data)[2:]:
    if filename.endswith('pdf'):
        continue
    df = pd.read_csv(os.path.join(path_to_data, filename), sep=',', index_col=0)
    print(filename)

    df = ephys.add_time_before_stimulus(df, -12)

    df['delay'] = np.around(df.delay, 2)

    # Drop the trials whose delay is 0.1 or 0.2. The copy makes the column
    # assignments below act on an independent frame instead of a slice.
    df = df.loc[(df.delay != 0.1) & (df.delay != 0.2)].copy()

    if decode == 'previous_vector_answer':
        # This is only for doing the previous choice
        df['after_correct'] = np.where(
            df['previous_vector_answer'] == df['previous_reward_side'], 1, 0)
        df = df.loc[df.after_correct == 1]
    elif decode == 'drive':
        # Label every time bin as inside (1) or outside (0) the labelled window.
        if align == 'Delay_OFF':
            df['drive'] = np.where((df.times > -df.delay) & (df.times < 0), 1, 0)
        elif align == 'Stimulus_ON':
            df['drive'] = np.where((df.times > 0) & (df.times < df.delay + 0.4), 1, 0)

    # Recover all the neurons in this session. Some trial types have no
    # activity for neurons that were present in the training set; passing the
    # full list lets the helpers fill those neurons with 0.
    cluster_list = df.cluster_id.unique()

    # Training trials: each criterion is applied unless it is set to 'all'.
    train_mask = np.ones(len(df), dtype=bool)
    if variable_train != 'all':
        train_mask &= (df[variable_train] > ratio_train).to_numpy()
    if hit_train != 'all':
        train_mask &= (df.hit == hit_train).to_numpy()
    df_train = df.loc[train_mask]

    df_final, y = ephys.interval_extraction(
        df_train, decode=decode, align=align, start=start, stop=stop,
        cluster_list=cluster_list)
    df_final = df_final.reset_index().drop(columns='y')
    y = y.reset_index().drop(columns='trial')

    print('Trials for training: ', len(df_final))

    # Require more than 10 RL trials (RL_roll > 0.4) covering both answers.
    df_rl = df.loc[df['RL_roll'] > 0.4]
    if len(df_rl.trial.unique()) <= 10 or len(df_rl.vector_answer.unique()) != 2:
        print('Not enough RL trials')
        continue

    # skf = KFold(n_splits=nsplits)
    # NOTE(review): no random_state is set, so the folds differ between runs.
    skf = StratifiedKFold(n_splits=nsplits, shuffle=True)

    if len(y) <= 6:
        print('Not enough total amount of trials')
        continue

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

    fold_no = 1
    for train_index, test_index in skf.split(df_final, y):
        print(fold_no)

        # The label check depends only on the fold, so it is done once here.
        # A skipped fold advances fold_no exactly once (it used to be bumped
        # both in the skip branch and at the end of the fold, which shifted
        # the fold labels stored in the summaries).
        # NOTE(review): the check looks for labels 1 and -1 while `drive` is
        # built as 0/1 above -- confirm what ephys.interval_extraction returns.
        if len(test_index) > 1:
            y_train = y.iloc[train_index].y.values
            y_test = y.iloc[test_index].y.values
            both_labels_everywhere = all(
                label in labels
                for labels in (y_train, y_test)
                for label in (1, -1))
            if not both_labels_everywhere:
                print('Skip this fold because only one choice')
                fold_no += 1
                continue

        for color, variable, hit, ratio in zip(colors, variables_test, hits_test, ratios_test):
            print(variable, hit)
            variable_full = str(variable) + '_' + str(hit)

            # ------------- Stimulus_ON alignment (left panel)
            epoch = 'Stimulus_ON'
            df_real, df_temp = ephys.train_test(
                df, decode=decode, epoch=epoch, initrange=-6, endrange=1.5, r=r,
                cluster_list=cluster_list,
                variable=variable, hit=hit, ratio=ratio, nsurrogates=100,
                train_index=train_index, test_index=test_index,
                variable_train=variable_train, ratio_train=ratio_train, hit_train=hit_train)
            if len(df_temp) == 0:
                continue

            # numeric_only=True keeps the string 'trial_type' column out of the
            # mean; recent pandas raises instead of silently dropping it.
            real = (df_real.loc[df_real['trial_type'] == variable_full]
                    .mean(axis=0, numeric_only=True).to_numpy())
            # Every column except the last one is a time-bin label.
            times = np.around(np.array(df_real.columns)[:-1].astype(float), 2)

            df_new = df_temp.loc[df_temp.epoch == epoch].groupby('times')['score']
            shuffle_mean = df_new.mean().values

            ephys.plot_results_decoder(fig, real, times, df_new, ax1, color=color, epoch=epoch, substract=False)
            df_cum_sti, df_cum_shuffle = ephys.add_to_summary(
                real, shuffle_mean, times, filename, variable_full, epoch,
                fold_no=fold_no, df_iter=df_temp,
                df_cum=df_cum_sti, df_cum_shuffle=df_cum_shuffle, substract=False)

            # -------------- Delay_OFF alignment (right panel)
            epoch = 'Delay_OFF'
            df_real, df_temp = ephys.train_test(
                df, decode=decode, epoch=epoch, initrange=-1, endrange=4, r=r,
                cluster_list=cluster_list,
                variable=variable, hit=hit, ratio=ratio, nsurrogates=100,
                train_index=train_index, test_index=test_index,
                variable_train=variable_train, ratio_train=ratio_train, hit_train=hit_train)
            # Same guard as for the Stimulus_ON alignment.
            if len(df_temp) == 0:
                continue

            real = (df_real.loc[df_real['trial_type'] == variable_full]
                    .mean(axis=0, numeric_only=True).to_numpy())
            times = np.around(np.array(df_real.columns)[:-1].astype(float), 2)
            df_new = df_temp.loc[df_temp.epoch == epoch].groupby('times')['score']
            shuffle_mean = df_new.mean().values

            ephys.plot_results_decoder(fig, real, times, df_new, ax2, color=color, epoch=epoch, substract=False)
            df_cum_res, df_cum_shuffle = ephys.add_to_summary(
                real, shuffle_mean, times, filename, variable_full, epoch,
                fold_no=fold_no, df_iter=df_temp,
                df_cum=df_cum_res, df_cum_shuffle=df_cum_shuffle, substract=False)
            sns.despine()

        fold_no += 1
    plt.show()
    # Release the figure so memory does not grow with the number of sessions.
    plt.close(fig)

### **Decode the external drive across delays**

In [None]:
# Decode the `drive` label with a decoder trained on the selected training
# trials of each session, then test it separately on the trials of each delay
# (one figure per cross-validation fold, one curve per delay).

## Dataframe used for cumulative analysis
df_cum = pd.DataFrame()
df_cum_shuffle = pd.DataFrame()

# NOTE(review): `substract` is not read below; the plotting call passes
# substract=True and the summary call substract=False -- confirm the intent.
substract = True

# ---- Variables used for decoder training (identical for every session) ----
decode = 'drive'
align = 'Delay_OFF'
ratio = 0.6
delay_train = 'all'
start = -0.5
stop = 0
type_trial = 'WM_roll'
hit = 1
nsplits = 5

# ---- Variables for testing: one entry per tested delay ----
colors = ['black', 'darkgreen', 'orangered']
variables = ['WM_roll', 'WM_roll', 'WM_roll']
hits = [1, 1, 1]
ratios = [0.6, 0.6, 0.6]
delays = [1, 3, 10]
# Built from scratch so the cell does not depend on a list created by another
# cell and does not grow by three entries for every session.
variables_combined = [
    variable_test + '_' + str(hit_test)
    for variable_test, hit_test in zip(variables, hits)
]

# Test-window settings: bin width and, per delay, the end of the window passed
# to ephys.test.
r = 0.25
endrange_by_delay = {0.1: 3.5, 0.2: 3.5, 1: 4.5, 3: 6.5, 10: 14.5}

# Kept for backward compatibility: this changes the working directory of the
# whole kernel, which other cells may rely on.
os.chdir(path_to_data)
for filename in os.listdir(path_to_data):
    if filename.endswith('pdf'):
        continue
    df = pd.read_csv(os.path.join(path_to_data, filename), sep=',', index_col=0)

    print(filename, '/ Total session trials: ', len(df.trial.unique()), '/ Number of neurons: ', len(df.cluster_id.unique()))

    df = ephys.add_time_before_stimulus(df, 4)
    df['delay'] = np.around(df.delay, 2)

    cluster_list = df.cluster_id.unique()

    if decode == 'previous_vector_answer':
        # This is only for doing the previous choice
        df['after_correct'] = np.where(
            df['previous_vector_answer'] == df['previous_reward_side'], 1, 0)
        df = df.loc[df.after_correct == 1]
    elif decode == 'drive':
        # Label every time bin as inside (1) or outside (0) the labelled window.
        if align == 'Delay_OFF':
            df['drive'] = np.where((df.times > -df.delay) & (df.times < 0), 1, 0)
        elif align == 'Stimulus_ON':
            df['drive'] = np.where((df.times > 0) & (df.times < df.delay + 0.4), 1, 0)

    # NOTE(review): no random_state is set, so the folds differ between runs.
    skf = StratifiedKFold(n_splits=nsplits, shuffle=True)
    # skf = KFold(n_splits=nsplits, shuffle=True)

    # Training trials: each criterion is applied unless it is set to 'all'
    # (setting both to 'all' now keeps every trial instead of selecting none).
    train_mask = np.ones(len(df), dtype=bool)
    if type_trial != 'all':
        train_mask &= (df[type_trial] >= ratio).to_numpy()
    if hit != 'all':
        train_mask &= (df.hit == hit).to_numpy()
    df_train = df.loc[train_mask]

    if delay_train != 'all':
        df_train = df_train.loc[df_train.delay == delay_train]
    else:
        df_train = df_train.loc[(df_train.delay != 0.2) & (df_train.delay != 0.1)]

    df_final, y = ephys.interval_extraction_trial(
        df_train, variable=decode, align=align, start=start, stop=stop,
        cluster_list=cluster_list)
    df_final = df_final.reset_index().drop(columns='trial')

    if len(y) < nsplits:
        print('Skip session because not enough trials')
        continue

    fold_no = 1
    for train_index, test_index in skf.split(df_final, y):
        print('Fold_no:', fold_no)
        model, train_cols, score, sc_fit = ephys.train(
            df_train, decode=decode, align=align, start=start, stop=stop,
            cluster_list=cluster_list,
            test_index=test_index, train_index=train_index)

        # Trials used to fit this fold; they are removed from every test set.
        # Assumes the rows returned by ephys.interval_extraction_trial follow
        # the order of df_train.trial.unique() -- TODO confirm.
        list_train_trials = df_train.trial.unique()[train_index]

        fig, ax1 = plt.subplots(1, 1, figsize=(8, 4), sharey=True)

        # Distinct loop names keep the training configuration (`hit`, `ratio`)
        # intact for the following sessions.
        for color, variable_test, delay_test, hit_test, ratio_test in zip(colors, variables, delays, hits, ratios):
            df_delay = df.loc[np.around(df.delay, 1) == delay_test]
            if df_delay.empty:
                # The session has no trial with this delay.
                print('Not enough trials with this condition')
                continue
            delay = np.around(df_delay.delay.iloc[0], 1)
            print('Delay:', delay)

            if delay not in endrange_by_delay:
                print('No test window defined for delay', delay)
                continue
            endrange = endrange_by_delay[delay]

            # Create a dataframe for testing data
            test_mask = np.ones(len(df_delay), dtype=bool)
            if variable_test != 'all':
                test_mask &= (df_delay[variable_test] >= ratio_test).to_numpy()
            if hit_test != 'all':
                test_mask &= (df_delay.hit == hit_test).to_numpy()
            df_test = df_delay.loc[test_mask]

            if fold_no == 1:
                print(delay, variable_test, 'Threshold:', ratio_test, 'Hit:', hit_test, 'Nº of trials:', len(df_test.trial.unique()))

            # -----------  Remove the trials that overlap with the training set.
            df_test = df_test[~df_test['trial'].isin(list_train_trials)]

            if len(df_test.trial.unique()) < 5:
                print('Not enough trials with this condition')
                continue

            df_real, df_temp = ephys.test(
                df_test, sc_fit, decode=decode, epoch='Stimulus_ON',
                initrange=-2, endrange=endrange, r=r, model=model,
                delay_only=delay, variable=variable_test, hit=hit_test,
                nsurrogates=100, train_cols=train_cols, cluster_list=cluster_list)

            variable_full = str(variable_test) + '_' + str(hit_test)

            # Alignment for Stimulus cue
            real = df_real.loc[df_real['trial_type'] == variable_full].to_numpy()
            # Every column except the last one is a time-bin label.
            times = np.around(np.array(df_real.columns)[:-1].astype(float), 2)

            df_new = df_temp.loc[df_temp.epoch == 'Stimulus_ON'].groupby('times')['score']
            y_mean = df_new.mean().values

            # Only the first matching row is used, truncated to the number of
            # shuffle time bins.
            real_trace = real[0][:len(y_mean)]

            # NOTE(review): the data are Stimulus_ON-aligned but the plot is
            # labelled epoch='Delay_OFF' -- confirm this is intended.
            ephys.plot_results_decoder(fig, real_trace, times, df_new, ax1, color=color, epoch='Delay_OFF',
                                       y_range=[-0.05, 0.5], x_range=None, substract=True)

            df_cum, df_cum_shuffle = ephys.add_to_summary(
                real_trace, y_mean, times, filename, variable_full,
                epoch='Stimulus_ON', fold_no=fold_no, df_iter=df_temp,
                df_cum=df_cum, df_cum_shuffle=df_cum_shuffle,
                substract=False, delay=delay)

            sns.despine()

        fold_no += 1
        plt.tight_layout()
        plt.show()
        # Release the figure so memory does not grow with sessions x folds.
        plt.close(fig)
