In [1]:
# IPython magic tools
%load_ext autoreload
%autoreload 2

from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import os 
import pandas as pd
import seaborn as sns
sns.set_context('talk')

import sys
sys.path.append("G:/My Drive/WORKING_MEMORY/EXPERIMENTS/ELECTROPHYSIOLOGY/ANALYSIS/src/functions/")
import ephys_functions as ephys

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings. filterwarnings('ignore', category=UserWarning)
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
pd.options.mode.chained_assignment = None  # default='warn'

In [1]:
# Input / output locations.
# The defaults are the original absolute paths; each one can be overridden with an
# environment variable so the notebook also runs on machines where those drives are
# not mounted (the loading cell fails with FileNotFoundError otherwise).
path_to_data = os.environ.get('EPHYS_DATA_DIR', r'E:\Ephys\summary_complete')
save_data = os.environ.get(
    'EPHYS_SAVE_DATA_DIR',
    r'G:\My Drive\WORKING_MEMORY\PAPER\2ND_SUBMISSION_NAT_NEURO\data_for_resubmission')
save_figures = os.environ.get(
    'EPHYS_SAVE_FIGURES_DIR',
    r'G:\My Drive\WORKING_MEMORY\PAPER\2ND_SUBMISSION_NAT_NEURO\figures_for_resubmission')

In [2]:
def interval_extraction(df, cluster_list=[], decode='vector_answer', align='Delay_OFF', start:float=0.0, stop=1.0, delay_only=False):
    """Count spikes per trial and cluster inside a time window aligned to an event.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per spike. Must contain the columns ``trial``, ``cluster_id``,
        ``fixed_times``, ``delay``, ``align`` and ``decode``.
    cluster_list : sequence, optional
        Cluster ids to use as feature columns. Clusters without spikes in the
        window are added as all-zero columns. When empty, the clusters observed
        in the window are kept (previously an empty list removed every feature
        column).
    decode : str
        Column holding the label to decode; the first value of each trial inside
        the window is used.
    align : str
        Column with the event time; spike times are expressed relative to it.
    start, stop : float
        Window bounds relative to ``align``. Both bounds are exclusive.
    delay_only : bool
        When falsy, trials whose delay is too short to cover the requested window
        are removed (delays 0.1 and 0.2, plus delay 1 for the wider windows).

    Returns
    -------
    result : pandas.DataFrame
        One row per trial that has at least one spike in the window, one column
        per cluster with the spike count, and a ``y`` column with the label
        (label 0 is recoded to -1).
    y : pandas.Series
        The ``y`` column of ``result``.

    Notes
    -----
    The caller's ``df`` is never modified.
    """
    if not delay_only:
        # The second condition of each pair implies the first, so the longer
        # exclusion list simply replaces the shorter one.
        excluded_delays = []
        if (align == 'Delay_OFF' and start < 0) or (align == 'Stimulus_ON' and stop > 0.5):
            excluded_delays = [0.1, 0.2]
        if (align == 'Delay_OFF' and start < -1.1) or (align == 'Stimulus_ON' and stop > 1.5):
            excluded_delays = [0.1, 0.2, 1]
        if excluded_delays:
            keep = np.logical_and.reduce([df.delay != d for d in excluded_delays])
            df = df.loc[keep]

    # Spike times relative to the alignment event. assign() builds a new frame,
    # so the DataFrame passed by the caller does not gain an extra column.
    aligned_col = 'a_' + align
    spikes = df.assign(**{aligned_col: df['fixed_times'] - df[align]}).sort_values('trial')

    # Keep only the spikes that fall inside the analysed window
    in_window = (spikes[aligned_col] > start) & (spikes[aligned_col] < stop)
    spikes = spikes.loc[in_window]

    # Label of each trial (only trials with spikes in the window survive)
    y = spikes.groupby('trial')[decode].first()

    # trial x cluster matrix of spike counts
    counts = spikes.pivot_table(
        index='trial', columns='cluster_id', values='fixed_times',
        aggfunc='count', fill_value=0).rename_axis(None, axis=1)
    if len(cluster_list) > 0:
        counts = counts.reindex(cluster_list, axis=1, fill_value=0)

    result = pd.merge(counts, y, how="right", on=["trial"]).fillna(0)
    result = result.rename(columns={decode: "y"})
    result['y'] = np.where(result['y'] == 0, -1, result['y'])

    return result, result['y']

____________________
# Decode external drive across different conditions

### **Look for the code splitting and aligning on stimulus and delay_OFF**

In [3]:
def train(df_final, decode='vector_answer', align='Delay_OFF', start=-0.5, stop=0, cluster_list = [], 
            ratio=0.65, test_index=[],  train_index=[], fakey=[], delay_only=False):
    """Fit a class-balanced, L2-penalised logistic regression on spike counts.

    Parameters
    ----------
    df_final : pandas.DataFrame
        One row per sample; a ``y`` column with the labels and one column per
        feature. It is not modified.
    test_index, train_index : sequence, optional
        Row labels used to split ``df_final``. When ``test_index`` is empty the
        model is fitted and scored on the whole frame.
    fakey : sequence, optional
        Replacement labels (session shuffle). When it has more than one element,
        its last ``len(df_final)`` values replace the ``y`` column.
    decode, align, start, stop, cluster_list, ratio, delay_only
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    model : sklearn.linear_model.LogisticRegression
        The fitted model.
    train_cols : pandas.Index
        Columns of the training frame, including ``y``.
    score : float
        Mean probability the model assigns to the true class on the test rows.

    Raises
    ------
    ValueError
        If ``fakey`` is used but is shorter than ``df_final``.
    """
    # This is mainly for the session shuffles
    if len(fakey) > 1:
        print('Using shuffled session')
        n_rows = len(df_final)
        if len(fakey) < n_rows:
            raise ValueError('fakey must contain at least as many labels as df_final has rows')
        # The original code read len(y) before y existed (UnboundLocalError).
        # Work on a copy so the caller keeps the real labels; np.asarray avoids
        # index alignment if fakey is a pandas Series.
        df_final = df_final.copy()
        df_final['y'] = np.asarray(fakey)[len(fakey) - n_rows:]

    train_cols = df_final.columns

    if len(test_index) >= 1:
        train_rows = df_final.loc[train_index, :]
        test_rows = df_final.loc[test_index, :]
    else:
        # No split requested: score on the training data itself
        train_rows = test_rows = df_final

    x_train, y_train = train_rows.drop(columns='y'), train_rows['y']
    x_test, y_test = test_rows.drop(columns='y'), test_rows['y']

    model = LogisticRegression(solver='liblinear', penalty = 'l2', class_weight='balanced').fit(x_train, y_train)

    p_pred = model.predict_proba(x_test)
    y_pred = model.predict(x_test)
    f1score = f1_score(y_test, y_pred, average='weighted')

    # Pick, for every sample, the probability of its true class. Labels -1/1 are
    # mapped to columns 0/1 of predict_proba.
    # NOTE(review): this assumes model.classes_ is [-1, 1] - confirm for other label sets.
    class_idx = np.where(y_test == -1, 0, y_test).reshape(-1, 1).astype(int)
    mean_score = np.mean(np.take_along_axis(p_pred, class_idx, axis=1))

    print('score:', mean_score, 'f1_score ', f1score)

    return model, train_cols, mean_score

In [4]:
def test(df,model, epoch='Stimulus_ON',initrange=-0.4,endrange=1.5,r=0.2, train_cols=None, variable='ra_accuracy',
                      hit=1, nsurrogates = 100, decode='vector_answer', ratio=0, cluster_list = [], test_index=[], fakey=[], 
                        delay_only=False, score_options = 'standard'):
    '''
    Score a previously trained model on consecutive time bins of a session.

    For every bin of width r between initrange and endrange the spikes are
    counted with interval_extraction, the feature columns are aligned with the
    training columns, and the model is scored. A surrogate distribution is built
    by scoring the same predictions against shuffled labels.

    Attributes
        - df: DataFrame. One row per spike (see interval_extraction).
        - model: fitted classifier exposing predict_proba and score.
        - epoch: str. Column the spike times are aligned to.
        - initrange: float. Start of the first bin.
        - endrange: float. Upper limit of the bins.
        - r: float. Bin width (bins do not overlap).
        - train_cols: Index. Columns returned by train(), including 'y'. Required.
        - variable, hit: used only to build the '<variable>_<hit>' label.
        - nsurrogates: int. Number of label shuffles per bin.
        - decode: str. Column holding the label to decode.
        - ratio: unused.
        - cluster_list: list. Cluster ids passed to interval_extraction.
        - test_index: list. Positional rows to score; empty means all rows.
        - fakey: list. Replacement labels for session shuffles.
        - delay_only: passed to interval_extraction.
        - score_options: str.
            'standard'  -> accuracy (model.score)
            'drive_on'  -> mean of column 1 of predict_proba
            'drive_off' -> mean of column 0 of predict_proba
            anything else -> mean probability of the true class

    Return
        - df_real: one-row DataFrame, one column per bin centre plus 'trial_type'.
        - df_iter: DataFrame with one row per bin and surrogate
          (columns iteration, score, times, epoch, variable).
    '''
    if train_cols is None:
        raise ValueError('train_cols is required: pass the columns returned by train()')
    train_cols = pd.Index(train_cols)

    trial_type = str(variable) + '_' + str(hit)
    iter_columns = ['iteration', 'score', 'times', 'epoch', 'variable']

    times = []           # centre of every bin
    real_score = []      # score of the real labels, one per bin
    surrogate_rows = []  # one record per bin and surrogate

    for start, stop in zip(np.arange(initrange,endrange-r,r),np.arange(initrange+r,endrange,r)):
        bin_centre = (start+stop)/2
        times.append(bin_centre)
        df_final, y = interval_extraction(df,decode = decode, align = epoch, start = start, stop = stop, cluster_list=cluster_list, delay_only=delay_only)

        # Train and test sets may hold different neurons. Reindexing on the
        # training columns adds the missing ones filled with 0, drops the extra
        # ones and restores the training column order in a single step.
        df_final = df_final.reindex(columns=train_cols, fill_value=0)

        # This is for the session shuffles
        if len(fakey) > 1:
            print('Using shuffled session')
            df_final['y'] = np.asarray(fakey)[len(fakey)-len(y):]

        if len(test_index) >= 1:
            # test_index is positional, so the trial index is discarded first
            held_out = df_final.reset_index(drop=True).loc[test_index,:]
        else:
            held_out = df_final
        x_test = held_out.drop(columns='y')
        y_test = held_out['y']

        p_pred = model.predict_proba(x_test)

        if score_options == 'standard':
            score_ = model.score(x_test, y_test)
        elif score_options == 'drive_on':
            score_ = p_pred[:, 1]
        elif score_options == 'drive_off':
            score_ = p_pred[:, 0]
        else:
            # Probability given to the true class. The recoded labels live in
            # their own variable so that y_test keeps the -1/1 coding needed by
            # model.score in the surrogate loop below.
            class_idx = np.where(y_test == -1, 0, y_test).reshape(-1, 1).astype(int)
            score_ = np.take_along_axis(p_pred, class_idx, axis=1)

        # Average over trials. The original score_[0] failed on the scalar
        # returned by model.score and kept a single trial for the other options.
        real_score.append(np.mean(score_))

        for i in np.arange(nsurrogates):
            y_perr = shuffle(y_test)
            surrogate_rows.append({'iteration': i, 'score': model.score(x_test, y_perr), 'times': bin_centre,
                                   'epoch' : epoch, 'variable' : trial_type})

    # Built once from the collected records instead of inserting row by row
    df_iter = pd.DataFrame(surrogate_rows, columns=iter_columns)

    times.append('trial_type')
    real_score.append(trial_type)
    df_real = pd.DataFrame([real_score], columns = times)

    return df_real, df_iter

In [5]:
def test_trial(df,sc_fit, epoch='Stimulus_ON',initrange=-0.4,endrange=1.5,r=0.2, model = None, train_cols=None, variable='ra_accuracy',
                      hit=1, nsurrogates = 100, decode='vector_answer', ratio=0, cluster_list = [], test_index=[], fakey=[], delay_only=False):
    '''
    Score a previously trained model on overlapping time bins of a session.

    Bins are r wide and advance in steps of r/2. For every bin the spikes are
    counted with interval_extraction_trial, the feature columns are aligned with
    the training columns and the accuracy of the model is stored. A surrogate
    distribution is built by scoring against shuffled labels.

    Attributes
        - df: DataFrame. One row per spike (see interval_extraction_trial).
        - sc_fit: unused (left over from a scaler that is no longer applied).
        - epoch: str. Column the spike times are aligned to.
        - initrange: float. Start of the first bin.
        - endrange: float. Upper limit of the bins.
        - r: float. Bin width.
        - model: fitted classifier exposing score. Required.
        - train_cols: Index. Columns returned by train(), including 'y'. Required.
        - variable, hit: used only to build the '<variable>_<hit>' label.
        - nsurrogates: int. Number of label shuffles per bin.
        - decode: str. Column holding the label to decode.
        - ratio: unused.
        - cluster_list: list. Cluster ids passed to interval_extraction_trial.
        - test_index: list. Positional rows to score; empty means all rows.
        - fakey: unused.
        - delay_only: passed to interval_extraction_trial.

    Return
        - df_real: one-row DataFrame, one column per bin centre plus 'trial_type'.
        - df_iter: DataFrame with one row per bin and surrogate
          (columns iteration, score, times, epoch, variable); iteration runs
          from 1 to nsurrogates.
    '''
    if model is None:
        raise ValueError('model is required: pass the estimator returned by train()')
    if train_cols is None:
        raise ValueError('train_cols is required: pass the columns returned by train()')
    train_cols = pd.Index(train_cols)

    trial_type = str(variable) + '_' + str(hit)
    iter_columns = ['iteration', 'score', 'times', 'epoch', 'variable']

    times = []           # centre of every bin
    real_score = []      # accuracy on the real labels, one per bin
    surrogate_rows = []  # one record per bin and surrogate

    for start, stop in zip(np.arange(initrange,endrange-r,r/2),np.arange(initrange+r,endrange,r/2)):
        bin_centre = (start+stop)/2
        times.append(bin_centre)
        df_final, y = interval_extraction_trial(df,variable = decode, align = epoch, start = start, stop = stop, cluster_list=cluster_list, delay_only=delay_only)

        # Train and test sets may hold different neurons. Reindexing on the
        # training columns adds the missing ones filled with 0, drops the extra
        # ones and puts the features back in the order seen during fit.
        df_final = df_final.reindex(columns=train_cols, fill_value=0)

        if len(test_index) >= 1:
            print('Train splitting trials')
            # test_index is positional, so the trial index is discarded first
            held_out = df_final.reset_index(drop=True).loc[test_index,:]
        else:
            held_out = df_final
        x_test = held_out.drop(columns='y')
        y_test = held_out['y']

        real_score.append(model.score(x_test, y_test))

        # Exactly nsurrogates shuffles (the former `while i <= nsurrogates`
        # loop produced one extra).
        for i in range(1, nsurrogates + 1):
            y_perr = shuffle(y_test)
            surrogate_rows.append({'iteration': i, 'score': model.score(x_test, y_perr), 'times': bin_centre,
                                   'epoch' : epoch, 'variable' : trial_type})

    # Built once from the collected records instead of concatenating in the loop
    df_iter = pd.DataFrame(surrogate_rows, columns=iter_columns)

    times.append('trial_type')
    real_score.append(trial_type)
    df_real = pd.DataFrame([real_score], columns = times)

    return df_real, df_iter

In [6]:
def interval_extraction_trial(df, cluster_list=[], variable = 'vector_answer', align = 'Delay_OFF', start = 0, stop = 1, delay_only=False):
    """Count spikes per trial and cluster in a window, keeping every trial.

    Unlike ``interval_extraction``, the label is the per-trial mean of
    ``variable`` computed before the window filter, so trials without any spike
    in the window are kept with all-zero counts.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per spike. Must contain the columns ``trial``, ``cluster_id``,
        ``fixed_times``, ``delay``, ``align`` and ``variable``.
    cluster_list : sequence, optional
        Cluster ids to use as feature columns. Clusters without spikes in the
        window are added as all-zero columns. When empty, the clusters observed
        in the window are kept (previously an empty list removed every feature
        column).
    variable : str
        Column holding the label; averaged within each trial.
    align : str
        Column with the event time; spike times are expressed relative to it.
    start, stop : float
        Window bounds relative to ``align``. Both bounds are exclusive.
    delay_only : bool
        When falsy, trials whose delay is too short to cover the requested window
        are removed (delays 0.1 and 0.2, plus delay 1 for the wider windows).

    Returns
    -------
    result : pandas.DataFrame
        One row per trial, one column per cluster with the spike count, and a
        ``y`` column with the label (label 0 is recoded to -1).
    y : pandas.Series
        The ``y`` column of ``result``.

    Notes
    -----
    The caller's ``df`` is never modified.
    """
    if not delay_only:
        # The second condition of each pair implies the first, so the longer
        # exclusion list simply replaces the shorter one.
        excluded_delays = []
        if (align == 'Delay_OFF' and start < 0) or (align == 'Stimulus_ON' and stop > 0.5):
            excluded_delays = [0.1, 0.2]
        if (align == 'Delay_OFF' and start < -1) or (align == 'Stimulus_ON' and stop > 1.5):
            excluded_delays = [0.1, 0.2, 1]
        if excluded_delays:
            keep = np.logical_and.reduce([df.delay != d for d in excluded_delays])
            df = df.loc[keep]

    # Spike times relative to the alignment event. assign() builds a new frame,
    # so the DataFrame passed by the caller does not gain an extra column.
    aligned_col = 'a_' + align
    spikes = df.assign(**{aligned_col: df['fixed_times'] - df[align]}).sort_values('trial')

    # Label of every trial, taken before the window filter so silent trials stay
    y = spikes.groupby('trial')[variable].mean()

    # Keep only the spikes that fall inside the analysed window
    in_window = (spikes[aligned_col] > start) & (spikes[aligned_col] < stop)
    spikes = spikes.loc[in_window]

    # trial x cluster matrix of spike counts
    counts = spikes.pivot_table(
        index='trial', columns='cluster_id', values='fixed_times',
        aggfunc='count', fill_value=0).rename_axis(None, axis=1)
    if len(cluster_list) > 0:
        counts = counts.reindex(cluster_list, axis=1, fill_value=0)

    # The right merge keeps every labelled trial; missing counts become 0
    result = pd.merge(counts, y, how="right", on=["trial"]).fillna(0)
    result = result.rename(columns={variable: "y"})
    result['y'] = np.where(result['y'] == 0, -1, result['y'])

    return result, result['y']

In [10]:
# --- Decoder training settings ---------------------------------------------
nsplits = 2                 # folds of the StratifiedKFold
decode = 'drive'            # column the decoder learns to predict
align = 'Stimulus_ON'       # event the spike times are aligned to
r = 0.25                    # width of each time bin
start, stop = -0.25, 0.0    # reference window for training / counting trials per condition
variable_train = 'WM_roll'
hit_train = 1
ratio_train = 0.6

# --- Testing settings ---------------------------------------------------------
colors = ['indigo', 'darkorange']
variables_test = ['WM_roll', 'RL_roll']
hits_test = [1, 1]
ratios_test = [0.6, 0.4]
# '<variable>_<hit>' label of every tested condition
variables_combined = [
    name + '_' + str(outcome)
    for name, outcome in zip(variables_test, hits_test)
]

In [11]:
# Summaries accumulated over all sessions; ephys.add_to_summary returns the
# updated version of both frames.
df_cum = pd.DataFrame()
df_cum_shuffle = pd.DataFrame()

# NOTE(review): os.listdir() returns entries in arbitrary order, so skipping the
# first two with [2:] is fragile - confirm which entries are meant to be skipped.
for filename in os.listdir(path_to_data)[2:]:
    if filename.endswith('pdf'):
        continue
    df = pd.read_csv(os.path.join(path_to_data, filename), sep=',', index_col=0)
    print(filename)

    df = ephys.add_time_before_stimulus(df, -4)
    delay = 10
    df['delay'] = np.around(df.delay, 2)
    df = df.loc[df.delay == delay]
    if df.empty:
        # Nothing to train or test on in this session
        print('No trials with delay', delay, '- skipping session')
        continue

    df['drive'] = np.where((df['a_'+align]>0)&(df['a_'+align] < delay + .5), 1, 0)

    # Recover all the neurons of this session. In some trial types a neuron may
    # be silent even though it fired in the training set; such neurons must
    # always be present, filled with 0.
    cluster_list = df.cluster_id.unique()
    if hit_train == 'all' and variable_train == 'all':
        df_train = df
    elif hit_train == 'all':
        df_train = df.loc[(df[variable_train] > ratio_train)]
    elif variable_train == 'all':
        df_train = df.loc[(df['hit'] == hit_train)]
    else:
        df_train = df.loc[(df[variable_train] > ratio_train)
                          & (df.hit == hit_train)]

    # Stack the spike counts of every training window. The loop variables have
    # their own names so the configured `start` / `stop` are not overwritten.
    window_frames = []
    for win_start, win_stop in zip(np.arange(-4, delay + 0.5, r), np.arange(-4 + r, delay + 0.5 + r, r)):
        df_final, y = interval_extraction(df_train, decode=decode, align=align, start=win_start, stop=win_stop, cluster_list=cluster_list)
        df_final = df_final.reset_index()
        window_frames.append(df_final)
    cum_df_final = pd.concat(window_frames, axis=0).drop(columns=['trial'])

    # df_final and y now hold the last training window
    print('Trials for training: ', len(df_final))

    skf = StratifiedKFold(n_splits=nsplits, shuffle=True)
    model, train_cols, score = train(cum_df_final, decode=decode, align=align, start=start,stop=stop, cluster_list = cluster_list)

    # Trials available for testing (formerly recomputed for every fold and
    # variable inside a bare try/except)
    df_delay = df.loc[np.around(df.delay,1)==delay]
    if df_delay.empty:
        continue
    delay = np.around(df_delay.delay.iloc[0],1)

    # NOTE(review): train_index / test_index are not used - the model is fitted
    # once on all windows above and test() is called without test_index, so each
    # fold repeats the same evaluation with new surrogate shuffles.
    for fold_no, (train_index, test_index) in enumerate(skf.split(df_final, y), start=1):
        fig, ax1 = plt.subplots(1,1, figsize=(8, 4), sharey=True)

        for color, variable, hit, ratio in zip(colors, variables_test, hits_test, ratios_test):
            # Create a dataframe for testing data
            if variable == 'all':
                df_test = df_delay.loc[(df_delay.hit==hit)]
            elif hit == 'all':
                df_test = df_delay.loc[(df_delay[variable]>=ratio)]
            else:
                df_test = df_delay.loc[(df_delay[variable]>=ratio)&(df_delay.hit==hit)]

            if df_test.empty or df_test.trial.nunique() < 5:
                continue

            # delay_only=True skips the short-delay filtering (the numeric delay
            # was previously passed here and acted as a truthy flag)
            df_sti, df_iter = test(df_test, decode= decode,epoch='Stimulus_ON',initrange=-4,endrange=delay+0.5, r=r, model = model, delay_only=True, variable=variable, hit=hit, nsurrogates = 2, train_cols = train_cols, cluster_list = cluster_list, score_options='drive_off')

            trial_type = str(variable)+'_'+str(hit)

            # Alignment to the stimulus cue
            real = df_sti.loc[(df_sti['trial_type'] == trial_type)].to_numpy()
            times = np.around(np.array(df_sti.columns)[:-1].astype(float),2)

            surrogate_scores = df_iter.loc[(df_iter.epoch=='Stimulus_ON')].groupby('times')['score']
            y_mean = surrogate_scores.mean().values

            # NOTE(review): epoch='Delay_OFF' is passed although the data are
            # aligned to Stimulus_ON - confirm against ephys.plot_results_decoder.
            ephys.plot_results_decoder(fig, real[0][:len(y_mean)], times, surrogate_scores,  ax1, color = color, epoch = 'Delay_OFF', 
                        y_range = [0.1, 1], x_range = None, substract=False)

            df_cum, df_cum_shuffle = ephys.add_to_summary(real[0][:len(y_mean)], y_mean, times, filename, trial_type, epoch = 'Stimulus_ON', fold_no=fold_no, df_iter=df_iter, df_cum=df_cum, df_cum_shuffle=df_cum_shuffle, substract=False, delay=delay)

            sns.despine()
        plt.show()

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'E:\\Ephys\\summary_complete'

In [None]:
# Single panel summarising the decoder results accumulated in df_cum
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4), sharey=True)

ephys.plot_results_session_summary(
    ax,
    df_cum,
    colors,
    variables_combined,
    y_range=[-0.1, 1],
    x_range=[-2, 5],
    epoch='Stimulus_ON',
    baseline=0.5,
)