## Spots that are always excluded:
##### Test spots
##### Spots marked as problematic
##### Suspicious for analysis == 'Yes' spots
##### Fake Sale spots: This means that Reason for Cancelling Survey is any combination which contains 'Fake Sale'
##### Spots whose start date was before 2018


## CAN CANCEL:
##### Spots which cancelled more than 2 months before their Available Cancellation Date are excluded
##### Spots whose Available Cancellation Date is 2 or more months after the date of analysis
##### Months during which spots couldn't have cancelled according to the above conditions

## wo CB (without Closed Business) sets:
##### Closed Business, Sold Business and Non-payment spots are excluded. This means Reason for Cancelling Survey is any combination which contains 'Closed Business', 'Closed/Sold Business', 'Sold Business', 'Sold/Closed Business' or 'Non-payment'
## canc conf (cancellation confirmed) event:
##### Spots for which Date Cancellation Confirmed is more than 60 days after Date Cancellation Requested

#### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
import pandas as pd
import numpy as np
import importlib
import get_started_doing_something_variables
import prepare_for_the_models
import fit_tv_cox_models
#import read_vars_for_separate_models_for_dependent_vars
import read_a_combination_of_variables
import yaml
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, Markdown

In [None]:
# Read the YAML file with the list of parameters needed for the report.
# NOTE(review): yaml.load with FullLoader is kept as in the original; if the
# parameter files only hold plain scalars/lists/dicts, yaml.safe_load would be
# the safer choice - confirm before switching.
with open(r'./parameters/started_doing_something_report_parameters.yaml') as file:
    parameters = yaml.load(file, Loader=yaml.FullLoader)

date_of_analysis = parameters['date_of_analysis']
# same date with '-' replaced by '_'
date_dir = date_of_analysis.replace('-', '_')
### name of the data directory ###
churn_based_on_behaviour_dir = parameters['churn_based_on_behaviour_dir']
### penalizer value when fitting the models ###
penalizer = parameters['penalizer']
### model type ###
model_type = parameters['model_type']
### coefficient and p values when dropping insignificant variables ###
coefficient_limit_for_numerical_vars = parameters['coefficient_limit_for_numerical_vars']
coefficient_limit_for_cat_vars = parameters['coefficient_limit_for_cat_vars']
p_limit = parameters['p_limit']
additional_higher_p_limit = parameters['additional_higher_p_limit']
additional_lower_p_limit = parameters['additional_lower_p_limit']

# Read the YAML file with the data set parameters and pick the entry for the
# data set analysed in this notebook.
# data_set_name = input('Data set name: ')
data_set_name = 'ALL_spots_with_CB_cancellation_confirmed'
with open(r'./parameters/data_sets.yaml') as file:
    data_sets_parameters = yaml.load(file, Loader=yaml.FullLoader)

data_set_parameters = data_sets_parameters[data_set_name]
spots_set = data_set_parameters['spots_set']
with_wo_CB = data_set_parameters['with_wo_CB']
event_date_full_name = data_set_parameters['event_date_type']

# True only for the 'with_CB' setting; any other value counts as False.
with_wo_CB_boolean = with_wo_CB == 'with_CB'

# Map the full event date name to its abbreviation. Fail right here on an
# unknown name instead of leaving event_date undefined for the later cells.
if event_date_full_name == 'cancellation_requested':
    event_date = 'canc_req'
elif event_date_full_name == 'cancellation_confirmed':
    event_date = 'canc_conf'
else:
    raise ValueError(
        "Unsupported event_date_type %r for data set %r; expected "
        "'cancellation_requested' or 'cancellation_confirmed'"
        % (event_date_full_name, data_set_name)
    )

##### base columns #####
base_cols = ['spot_id', 'time', 'event']

In [None]:
##### Model 0: all variables - prepare data for the model ####
# The variable list is read from the directory of variable combinations that
# are not dependent (the alternative reader,
# read_vars_for_separate_models_for_dependent_vars, is not used here).
cols_to_use = read_a_combination_of_variables.main(
    model_number=0,
    dir_name='combinations_of_variables_that_are_not_dependent/',
)

#### get behavioural variables ####
(
    variables_to_use_for_the_model,
    did_something_last_month_vars,
    did_something_before_and_didnt_last_month_vars,
) = get_started_doing_something_variables.main(
    date_of_analysis=date_of_analysis,
    variables_to_use_for_the_model=cols_to_use,
)

# Full column list: base columns first, then the three groups of variables.
cols = (
    base_cols
    + variables_to_use_for_the_model
    + did_something_before_and_didnt_last_month_vars
    + did_something_last_month_vars
)

#### get data for the model ###
data, base_df, df_timeline_all_vars = prepare_for_the_models.get_data_for_the_MV_Cox_model(
    date_of_analysis=date_of_analysis,
    spots_set=spots_set,
    with_wo_CB=with_wo_CB,
    event_date=event_date,
    columns=cols,
    data_dir=churn_based_on_behaviour_dir,
    C=100,
)

model_numbers = read_a_combination_of_variables.get_a_list_of_model_numbers(
    dir_name='combinations_of_variables_that_are_not_dependent/',
)

## save the initial set of all variables - dependent ones included ##
# Every timeline column except the identifier, event and interval columns.
original_cols = [
    name
    for name in df_timeline_all_vars.columns.tolist()
    if name not in ('spot_id', 'event', 'start', 'stop')
]

In [None]:
# Number of models to fit: every entry of model_numbers except the first one.
number_of_models = len(model_numbers[1:])

# Fit all models and collect their summaries, names and condition descriptions.
(
    all_models_summaries,
    all_models_names,
    conditions_described,
) = fit_tv_cox_models.fit_all_models_and_get_all_summaries(
    number_of_models=number_of_models,
    model_numbers=model_numbers,
    date_of_analysis=date_of_analysis,
    churn_based_on_behaviour_dir=churn_based_on_behaviour_dir,
    spots_set=spots_set,
    with_wo_CB=with_wo_CB,
    event_date_full_name=event_date_full_name,
    df_timeline_all_vars=df_timeline_all_vars,
    base_cols=base_cols,
    base_df=base_df,
    model_type=model_type,
    coefficient_limit_for_numerical_vars=coefficient_limit_for_numerical_vars,
    coefficient_limit_for_cat_vars=coefficient_limit_for_cat_vars,
    p_limit=p_limit,
    additional_higher_p_limit=additional_higher_p_limit,
    additional_lower_p_limit=additional_lower_p_limit,
    penalizer=penalizer,
)

In [None]:
### calculate average coefficients and p values for each variable ###
summary_cols = ['variable', 'coef - AVERAGE', 'exp(coef) - AVERAGE', 'p value - AVERAGE']


def _average_summary(model_summaries, variables):
    """Average 'coef', 'exp(coef)' and 'p' of each variable over the given model summaries.

    Parameters
    ----------
    model_summaries : re-iterable collection of DataFrames indexed by variable
        name, each with the columns 'coef', 'exp(coef)' and 'p'.
    variables : list of variable names to report on.

    Returns
    -------
    DataFrame indexed by 'variable' with the three '- AVERAGE' columns, each
    rounded to 3 decimals. Variables without an average coefficient (for
    example because no summary contains them) are left out.
    """
    averaged = pd.DataFrame(columns=summary_cols)
    averaged['variable'] = variables
    averaged.set_index('variable', inplace=True)

    # (output column, column of the per-model summary it is averaged from)
    column_pairs = list(zip(summary_cols[1:], ['coef', 'exp(coef)', 'p']))

    for variable in variables:
        containing = [summary for summary in model_summaries if variable in summary.index]
        if not containing:
            # Nothing to average: the row stays NaN and is dropped below. This
            # avoids calling np.mean on an empty array.
            continue
        for target_col, source_col in column_pairs:
            values = [summary.loc[variable, source_col] for summary in containing]
            averaged.loc[variable, target_col] = round(np.mean(np.array(values)), 3)

    # keep only the variables for which an average coefficient exists
    averaged.drop(averaged[averaged['coef - AVERAGE'].isnull()].index, inplace=True)
    return averaged


final_summaries = []
all_conditions_described = []
# One averaged summary (and its condition description) per key, in the same order.
for key, model_summaries in all_models_summaries.items():
    all_conditions_described.append(conditions_described[key])
    final_summaries.append(_average_summary(model_summaries, original_cols))

# Average coefficients and p values for all combinations of variables - sorted by p value

In [None]:
# Allow up to 500 rows to be shown per table.
pd.set_option('display.max_rows', 500)
# One heading plus one table per averaged summary, sorted by average p value.
for conditions_text, summary_table in zip(all_conditions_described, final_summaries):
    display(Markdown("## " + conditions_text))
    display(summary_table.sort_values('p value - AVERAGE'))

# Average coefficients and p values for all combinations of variables - sorted by variable name

In [None]:
# One heading plus one table per averaged summary, sorted by variable name (the index).
for conditions_text, summary_table in zip(all_conditions_described, final_summaries):
    display(Markdown("## " + conditions_text))
    display(summary_table.sort_index())