In [2]:
# ! pip install pycaret

In [3]:
# ! pip install --upgrade networkx

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
def history_presentation(sides, outputs):

    """
    sides : np.array of the sides where the stimulus was presented (-1: Left, 1: right)
    outputs : np.array of the outputs (0: incorrect, 1: correct)
    """
    reinforced = sides * outputs
    punished = sides * (1 - outputs)

    return(np.sum(reinforced - punished))


def history_decision(decisions, outputs):

    """
    decisions : np.array of the decisions where the stimulus was presented (-1: Left, 1: right)
    outputs : np.array of the outputs (0: incorrect, 1: correct)
    """
    reinforced = decisions * outputs
    punished = decisions * (1 - outputs)

    return(np.sum(reinforced - punished))


def bias(sides):
    """
    sides : np.array of the sides where the stimulus was presented (-1: Left, 1: right)
    """

    return(np.mean(sides))


In [6]:
class IBLTrialDataset:

    @staticmethod
    def apply_hist_per_subject(x, func, window_size=1):
        """
        Applies function (func) on dataframe (x) in given window (wind) after groupBy per subject.

        x: pd.DataFrame: (N, n_cols [1 or 2]) - two or one-column dataframe with values need to run function
        window_size: int - numer of trials in history

        """
        x = x.values
        result = np.zeros(x.shape[0])
        if x.ndim > 1:
            padded = np.vstack([np.zeros((window_size, x.shape[-1])), x])
            for i in range(window_size, padded.shape[0]):
                x1, x2 = padded[i-window_size: i, 0], padded[i-window_size: i, 1]
                result[i-window_size] = func(x1, x2)
        else:
            padded = np.concatenate([np.zeros(window_size), x])
            for i in range(window_size, padded.shape[0]):
                x1 = padded[i-window_size: i]
                result[i-window_size] = func(x1)

        return pd.DataFrame(result)


    def create_dataset(self, df, window_size):
        """
        Creates dataset as model input (X,y)

        df: pd.DataFrame - input data frame
        window_size: int - window_size of history trials

        output:
        X: np.array(N, num_features) - transformed dataset of features that can be an input to the model
        y: np.array(N,) - numpy array of choices true choices
        """

        df.sort_values(by=['session_start_time', 'trial_id'], inplace=True) # to make sure we're sorted for the grouping later on

        # ground truth stimuli sides
        df['sides'] = df['signed_contrast']
        df.loc[df.sides > 0, 'sides'] = 1
        df.loc[df.sides < 0, 'sides'] = -1
        
        # the actual mice response to the stimuli
        # TODO: verify !
        df.loc[df.trial_response_choice == 'No Go'] = 0
        df.loc[df.trial_response_choice == 'CCW', 'trial_response_choice'] = 1
        df.loc[df.trial_response_choice == 'CW', 'trial_response_choice'] = -1
        
        df.trial_response_choice = df.trial_response_choice.astype('float')

        # derive a column for correct answers
        df['correct'] = df.trial_response_choice == df.sides
        df['correct'] = df['correct'].astype('float')

#         # get history presentation per trial
        df['history_presentation'] = (df.groupby('subject_uuid')['sides', 'correct'].apply(lambda x: self.apply_hist_per_subject(x, history_presentation, window_size)).reset_index()).iloc[:, -1]
    
#         # get history decision per trial
        df['history_decision'] = (df.groupby('subject_uuid')['trial_response_choice', 'correct'].apply(lambda x: self.apply_hist_per_subject(x, history_decision, window_size)).reset_index()).iloc[:, -1]

        # get bias history per trial
        df['bias'] = (df.groupby('subject_uuid')['sides'].apply(lambda x: self.apply_hist_per_subject(x, bias, window_size)).reset_index()).iloc[:, -1]
#         df = df.dropna()
        # TODO: return specified columns
        df = df.loc[df.trial_response_choice != 0.0]

        
        return df[['sides', 'history_presentation', 'history_decision', 'bias', 'signed_contrast']], df['trial_response_choice']


In [7]:
##### TO CHANGE DEPENDING ON YOUR PATH
data_df = pd.read_csv('/home/a103/Documents/GitHub/Neuromatch_IBL/ibl_dataframe_fully_trained_mice.csv')

ds = IBLTrialDataset()
X, y = ds.create_dataset(data_df, window_size = 10)

  df['history_presentation'] = (df.groupby('subject_uuid')['sides', 'correct'].apply(lambda x: self.apply_hist_per_subject(x, history_presentation, window_size)).reset_index()).iloc[:, -1]
  df['history_decision'] = (df.groupby('subject_uuid')['trial_response_choice', 'correct'].apply(lambda x: self.apply_hist_per_subject(x, history_decision, window_size)).reset_index()).iloc[:, -1]


In [8]:
# just a validation
type(X)
X.head()

Unnamed: 0,sides,history_presentation,history_decision,bias,signed_contrast
1418035,-1.0,-6.0,-6.0,-0.6,-0.5
1418036,-1.0,-6.0,-6.0,-0.6,-0.5
1418037,-1.0,-6.0,-6.0,-0.6,-0.5
1418038,-1.0,-7.0,-6.0,-0.7,-0.5
1418039,-1.0,-6.0,-4.0,-0.6,-0.5


In [9]:
from pycaret.classification import *

In [10]:
# seeting up experiments with pre-processing activiites, including specification of CV and missing value handling
s = setup(data = X, target = y, session_id=2607)

Unnamed: 0,Description,Value
0,Session id,2607
1,Target,trial_response_choice
2,Target type,Binary
3,Target mapping,"-1.0: 0, 1.0: 1"
4,Original data shape,"(3763674, 6)"
5,Transformed data shape,"(3763674, 6)"
6,Transformed train set shape,"(2634571, 6)"
7,Transformed test set shape,"(1129103, 6)"
8,Numeric features,5
9,Preprocess,True


In [None]:
# create logistic regression model
logistic_model = create_model('lr')

Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# relative importance of variables in un-tuned LR model
plot_model(logistic_model, plot = 'feature')


In [None]:
# tuning of hyperparameters
tuned_logistic = tune_model(logistic_model)

In [None]:
# see the tuned LR model
tuned_logistic

In [None]:
# relative importance of variables in tuned LR model
plot_model(tuned_logistic, plot = 'feature')

In [None]:
# create random forest model
rf_model = create_model('rf')


In [None]:
# tuning of hyperparameters
tuned_rf = tune_model(rf_model)

In [None]:
# see the tuned RF model
tuned_rf

In [None]:
# relative importance of variables in un-tuned RF model
plot_model(rf_model, plot = 'feature')


In [None]:
# relative importance of variables in tuned RF model
plot_model(tuned_rf, plot = 'feature')

In [None]:
# importance of variables 
rf_model.feature_importances_

In [None]:
#save models
save_model(tuned_logistic, 'my_first_pipeline_lr')
save_model(tuned_rf, 'my_first_pipeline_rf')

In [None]:
# compare models for different classifiers

#compare_tree_models = compare_models(include = ['rf', 'lr'])

# Above function above has return trained model object as an output. 
# The scoring grid is only displayed and not returned. 
# If you need access to the scoring grid you can use pull function to access the dataframe.
#compare_tree_models_results = pull()
#compare_tree_models_results
