## Spots that are always excluded:
##### Test spots
##### Spots marked as problematic
##### Suspicious for analysis == 'Yes' spots
##### Fake Sale spots: This means that Reason for Cancelling Survey is any combination which contains 'Fake Sale'
##### Spots whose stop date was before 2018


## CAN CANCEL:
##### Spots which cancelled more than 2 months before their Available Cancellation Date are excluded
##### Spots whose Available Cancellation Date is 2 or more months after the date of analysis
##### Months during which spots couldn't have cancelled according to the above conditions

## wo CB sets:
##### Closed Business, Sold Business and Non-payment spots are excluded. This means Reason for Cancelling Survey is any combination which contains 'Closed Business', 'Closed/Sold Business', 'Sold Business', 'Sold/Closed Business' or 'Non-payment'
## canc conf event
##### Spots for which Date Cancellation Confirmed is more than 60 days after Date Cancellation Requested

#### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
import importlib
import get_stopped_doing_something_variables
import prepare_for_the_models
import fit_tv_cox_models
#import read_vars_for_separate_models_for_dependent_vars
import read_a_combination_of_variables
import yaml
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, Markdown

In [2]:
# Read the YAML file with the list of parameters needed for the report.
# NOTE(review): yaml.load with FullLoader is kept unchanged; if these parameter files could
# ever come from an untrusted source, switch to yaml.safe_load.
with open(r'./parameters/stopped_doing_something_report_parameters.yaml') as file:
    parameters = yaml.load(file, Loader=yaml.FullLoader)

date_of_analysis = parameters['date_of_analysis']
# the analysis date with '-' replaced by '_'
date_dir = date_of_analysis.replace('-', '_')
### name of the data directory ###
churn_based_on_behaviour_dir = parameters['churn_based_on_behaviour_dir']
### penalizer value when fitting the models ###
penalizer = parameters['penalizer']
### model type ###
model_type = parameters['model_type']
### coefficient and p values when dropping insignificant variables ###
coefficient_limit_for_numerical_vars = parameters['coefficient_limit_for_numerical_vars']
coefficient_limit_for_cat_vars = parameters['coefficient_limit_for_cat_vars']
p_limit = parameters['p_limit']
additional_higher_p_limit = parameters['additional_higher_p_limit']
additional_lower_p_limit = parameters['additional_lower_p_limit']

# Read the YAML file with the data set parameters.
# data_set_name = input('Data set name: ')
data_set_name = 'ALL_spots_with_CB_cancellation_confirmed'
with open(r'./parameters/data_sets.yaml') as file:
    data_sets_parameters = yaml.load(file, Loader=yaml.FullLoader)

data_set_parameters = data_sets_parameters[data_set_name]
spots_set = data_set_parameters['spots_set']
with_wo_CB = data_set_parameters['with_wo_CB']
event_date_full_name = data_set_parameters['event_date_type']

# True only for the 'with_CB' setting; any other value counts as "without".
with_wo_CB_boolean = (with_wo_CB == 'with_CB')

# Short event-date codes for the supported event date types.
EVENT_DATE_SHORT_NAMES = {
    'cancellation_requested': 'canc_req',
    'cancellation_confirmed': 'canc_conf',
}
if event_date_full_name not in EVENT_DATE_SHORT_NAMES:
    # Fail here with a clear message instead of leaving `event_date` undefined,
    # which would only surface later as a NameError.
    raise ValueError(
        "Unsupported event_date_type {!r} for data set {!r}; expected one of {}".format(
            event_date_full_name, data_set_name, sorted(EVENT_DATE_SHORT_NAMES)
        )
    )
event_date = EVENT_DATE_SHORT_NAMES[event_date_full_name]

##### base columns #####
base_cols = ['spot_id', 'time', 'event']

In [3]:
##### Model 0: all variables - prepare data for the model ####
# Alternative variable source (currently disabled):
# read_vars_for_separate_models_for_dependent_vars.main(
#     model_number=0, dir_name='separate_models_for_dependent_variables/')
cols_to_use = read_a_combination_of_variables.main(
    model_number=0,
    dir_name='combinations_of_variables_that_are_not_dependent/',
)

#### get behavioural variables ####
(
    variables_to_use_for_the_model,
    did_something_before_vars,
    did_something_last_month_vars,
) = get_stopped_doing_something_variables.main(
    date_of_analysis=date_of_analysis,
    variables_to_use_for_the_model=cols_to_use,
)
# base columns first, then the three groups of variables returned above
cols = (
    base_cols
    + variables_to_use_for_the_model
    + did_something_before_vars
    + did_something_last_month_vars
)

#### get data for the model ###
(data, base_df, df_timeline_all_vars) = prepare_for_the_models.get_data_for_the_MV_Cox_model(
    date_of_analysis=date_of_analysis,
    spots_set=spots_set,
    with_wo_CB=with_wo_CB,
    event_date=event_date,
    columns=cols,
    data_dir=churn_based_on_behaviour_dir,
    C=100,  # NOTE(review): meaning of C is defined by prepare_for_the_models - confirm there
)

model_numbers = read_a_combination_of_variables.get_a_list_of_model_numbers(
    dir_name='combinations_of_variables_that_are_not_dependent/'
)

## save the initial set of all variables - dependent ones included ##
# every timeline column except the identifier, event and interval columns
non_variable_cols = ('spot_id', 'event', 'start', 'stop')
original_cols = [
    column_name
    for column_name in df_timeline_all_vars.columns.tolist()
    if column_name not in non_variable_cols
]

In [4]:
# Number of models to fit: every entry of model_numbers except the first one
# (presumably model 0, the all-variables model prepared earlier - confirm).
number_of_models = len(model_numbers[1:])

# Fit all models and collect their summaries, names and the descriptions of the
# variable-dropping conditions.
(all_models_summaries, all_models_names, conditions_described) = (
    fit_tv_cox_models.fit_all_models_and_get_all_summaries(
        number_of_models=number_of_models,
        model_numbers=model_numbers,
        date_of_analysis=date_of_analysis,
        churn_based_on_behaviour_dir=churn_based_on_behaviour_dir,
        spots_set=spots_set,
        with_wo_CB=with_wo_CB,
        event_date_full_name=event_date_full_name,
        df_timeline_all_vars=df_timeline_all_vars,
        base_cols=base_cols,
        base_df=base_df,
        model_type=model_type,
        coefficient_limit_for_numerical_vars=coefficient_limit_for_numerical_vars,
        coefficient_limit_for_cat_vars=coefficient_limit_for_cat_vars,
        p_limit=p_limit,
        additional_higher_p_limit=additional_higher_p_limit,
        additional_lower_p_limit=additional_lower_p_limit,
        penalizer=penalizer,
    )
)

In [5]:
### calculate average coefficients and p values for each variable ###
summary_cols = ['variable', 'coef - AVERAGE', 'exp(coef) - AVERAGE', 'p value - AVERAGE']
# column of a single model summary -> column of the averaged report
averaged_stat_cols = dict(zip(['coef', 'exp(coef)', 'p'], summary_cols[1:]))


def _average_model_summaries(model_summaries, variables):
    """Average 'coef', 'exp(coef)' and 'p' per variable over a list of model summaries.

    Parameters
    ----------
    model_summaries : iterable of pandas.DataFrame
        Summaries indexed by variable name with 'coef', 'exp(coef)' and 'p' columns.
    variables : list of str
        Variables to report, in the order they should appear.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'variable', with one numeric column per averaged statistic
        (rounded to 3 decimals). Variables that appear in none of the summaries,
        or whose averaged coefficient is NaN, are left out.
    """
    rows = []
    for var_ in variables:
        # only the summaries of models that actually contain this variable
        fitted = [summary_ for summary_ in model_summaries if var_ in summary_.index]
        if not fitted:
            # skip explicitly instead of averaging an empty list into NaN
            continue
        row = {'variable': var_}
        for stat_col, averaged_col in averaged_stat_cols.items():
            values = [summary_.loc[var_, stat_col] for summary_ in fitted]
            row[averaged_col] = round(np.mean(values), 3)
        rows.append(row)
    # built in one go, so the statistic columns are numeric rather than object dtype
    averaged = pd.DataFrame(rows, columns=summary_cols).set_index('variable')
    return averaged.dropna(subset=['coef - AVERAGE'])


# NOTE(review): the averaged p value is a descriptive summary across models,
# not a formally combined p value.
final_summaries = []
all_conditions_described = []
for key, model_summaries in all_models_summaries.items():
    all_conditions_described.append(conditions_described[key])
    final_summary = _average_model_summaries(model_summaries, original_cols)
    #display(final_summary)
    final_summaries.append(final_summary)

# Average coefficients and p values for all combinations of variables - sorted by p value

In [6]:
# Show up to 500 rows so the summary tables are not truncated.
pd.set_option('display.max_rows', 500)
# One section per set of conditions: heading first, then the table ordered by average p value.
for position, summary_table in enumerate(final_summaries):
    heading = "## " + all_conditions_described[position]
    display(Markdown(heading))
    display(summary_table.sort_values('p value - AVERAGE'))

## with all initial variables

Unnamed: 0_level_0,coef - AVERAGE,exp(coef) - AVERAGE,p value - AVERAGE
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
had_tickets_before,-0.638,0.529,0.0
had_posts_on_facebook_last_month,-0.597,0.551,0.0
had_clicked_emails_last_3_months,-0.57,0.566,0.0
Website.Views.last.month.total.log2,-0.223,0.8,0.0
their_own_website_no,-1.957,0.141,0.0
Monthly,1.003,2.726,0.0
metro_area_Atlanta,-0.893,0.409,0.002
spot_category_Mexican / Latin,0.742,2.103,0.003
their_own_website_dontKnow,-1.04,0.354,0.006
had_visited_qrcode_flyers_page_before,-0.841,0.432,0.011


## without categorical variables with |coeff| < 0.2 and p value > 0.2; without numerical variables with |coeff| < 0.01 and p value > 0.2

Unnamed: 0_level_0,coef - AVERAGE,exp(coef) - AVERAGE,p value - AVERAGE
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
their_own_website_no,-1.816,0.164,0.0
had_posts_on_facebook_last_month,-0.61,0.543,0.0
Monthly,1.021,2.777,0.0
had_tickets_before,-0.633,0.531,0.0
had_clicked_emails_last_3_months,-0.587,0.556,0.0
Website.Views.last.month.total.log2,-0.219,0.804,0.0
spot_category_Mexican / Latin,0.678,1.972,0.001
metro_area_Atlanta,-0.788,0.455,0.003
their_own_website_dontKnow,-1.043,0.353,0.004
metro_area_Boston,0.732,2.08,0.005


## without variables p value >= 0.5

Unnamed: 0_level_0,coef - AVERAGE,exp(coef) - AVERAGE,p value - AVERAGE
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
their_own_website_no,-1.795,0.167,0.0
had_posts_on_facebook_last_month,-0.624,0.536,0.0
had_tickets_before,-0.634,0.531,0.0
Monthly,1.021,2.779,0.0
had_clicked_emails_last_3_months,-0.598,0.55,0.0
Website.Views.last.month.total.log2,-0.213,0.808,0.0
spot_category_Mexican / Latin,0.671,1.959,0.001
metro_area_Atlanta,-0.77,0.463,0.003
their_own_website_dontKnow,-1.029,0.358,0.004
metro_area_Boston,0.726,2.069,0.005


## without variables p value >= 0.2

Unnamed: 0_level_0,coef - AVERAGE,exp(coef) - AVERAGE,p value - AVERAGE
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
their_own_website_no,-1.607,0.201,0.0
had_posts_on_facebook_last_month,-0.62,0.538,0.0
Monthly,0.995,2.706,0.0
Website.Views.last.month.total.log2,-0.197,0.821,0.0
had_clicked_emails_last_3_months,-0.598,0.55,0.0
had_tickets_before,-0.639,0.528,0.0
spot_category_Mexican / Latin,0.656,1.929,0.001
metro_area_Boston,0.797,2.221,0.002
had_posts_seen_before,0.59,1.804,0.002
had_preview_page_views_before,0.639,1.896,0.002


# Average coefficients and p values for all combinations of variables - sorted by variable name

In [7]:
# One section per set of conditions: heading first, then the table ordered by variable name.
for position, summary_table in enumerate(final_summaries):
    heading = "## " + all_conditions_described[position]
    display(Markdown(heading))
    display(summary_table.sort_index())

## with all initial variables

Unnamed: 0_level_0,coef - AVERAGE,exp(coef) - AVERAGE,p value - AVERAGE
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Annual Commitment with 2 semi-annual payments,-0.19,0.828,0.645
Annual Prepay,-0.054,0.948,0.724
Fb.page.likes.analysis.avg.log2,-0.038,0.963,0.043
Instagram.Add.on,0.211,1.236,0.135
Monthly,1.003,2.726,0.0
New.email.subscriber.signups.last.month.div10,0.0,1.001,0.927
New.email.subscribers.allinclusive.last.month.div10,-0.062,0.94,0.3
Opted.out.of.facebook,0.126,1.134,0.571
SpotHopper_site_not_live_yet,-0.168,0.846,0.656
Website.Views.last.month.total.log2,-0.223,0.8,0.0


## without categorical variables with |coeff| < 0.2 and p value > 0.2; without numerical variables with |coeff| < 0.01 and p value > 0.2

Unnamed: 0_level_0,coef - AVERAGE,exp(coef) - AVERAGE,p value - AVERAGE
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Annual Commitment with 2 semi-annual payments,-0.253,0.777,0.525
Fb.page.likes.analysis.avg.log2,-0.036,0.965,0.05
Instagram.Add.on,0.226,1.253,0.089
Monthly,1.021,2.777,0.0
New.email.subscriber.signups.last.month.div10,-0.012,0.988,0.75
New.email.subscribers.allinclusive.last.month.div10,-0.061,0.941,0.168
SpotHopper_site_not_live_yet,-0.277,0.758,0.447
Website.Views.last.month.total.log2,-0.219,0.804,0.0
had_added_events_manually_edited_events_before,0.183,1.201,0.261
had_added_events_manually_edited_events_last_3_months,-0.341,0.712,0.196


## without variables p value >= 0.5

Unnamed: 0_level_0,coef - AVERAGE,exp(coef) - AVERAGE,p value - AVERAGE
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Annual Commitment with 2 semi-annual payments,-0.351,0.704,0.367
Fb.page.likes.analysis.avg.log2,-0.036,0.965,0.05
Instagram.Add.on,0.231,1.26,0.081
Monthly,1.021,2.779,0.0
New.email.subscriber.signups.last.month.div10,-0.054,0.947,0.373
New.email.subscribers.allinclusive.last.month.div10,-0.058,0.944,0.171
SpotHopper_site_not_live_yet,-0.28,0.756,0.44
Website.Views.last.month.total.log2,-0.213,0.808,0.0
had_added_events_manually_edited_events_before,0.182,1.2,0.256
had_added_events_manually_edited_events_last_3_months,-0.353,0.704,0.18


## without variables p value >= 0.2

Unnamed: 0_level_0,coef - AVERAGE,exp(coef) - AVERAGE,p value - AVERAGE
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fb.page.likes.analysis.avg.log2,-0.035,0.966,0.045
Instagram.Add.on,0.202,1.225,0.099
Monthly,0.995,2.706,0.0
New.email.subscribers.allinclusive.last.month.div10,-0.064,0.938,0.122
Website.Views.last.month.total.log2,-0.197,0.821,0.0
had_added_events_manually_edited_events_before,0.213,1.238,0.171
had_added_events_manually_edited_events_last_3_months,-0.388,0.678,0.127
had_added_specials_edited_specials_before,-0.163,0.85,0.271
had_added_specials_edited_specials_last_3_months,0.482,1.619,0.069
had_catering_submissions_before,-0.106,0.9,0.631
