## Spots that are always excluded:
##### Test spots
##### Spots marked as problematic
##### Suspicious for analysis == 'Yes' spots
##### Fake Sale spots: This means that Reason for Cancelling Survey is any combination which contains 'Fake Sale'
##### Spots whose start date was before 2018


## CAN CANCEL:
##### Spots which cancelled more than 2 months before their Available Cancellation Date are excluded
##### Spots whose Available Cancellation Date is 2 or more months after the date of analysis are excluded
##### Months during which spots couldn't have cancelled according to the above conditions

## wo CB sets:
##### Closed Business, Sold Business and Non-payment spots are excluded. This means Reason for Cancelling Survey is any combination which contains 'Closed Business', 'Closed/Sold Business', 'Sold Business', 'Sold/Closed Business' or 'Non-payment'
## canc conf event
##### Spots for which Date Cancellation Confirmed is more than 60 days after Date Cancellation Requested

#### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import math
import importlib
from dateutil.relativedelta import relativedelta
import get_started_doing_something_variables
import prepare_for_the_models
import fit_tv_cox_models
import read_a_combination_of_variables
import yaml
import warnings
# Suppress every warning for the rest of the session (applies to all categories and modules).
warnings.filterwarnings('ignore')
from IPython.display import display, Markdown

#### Helpers ####
import os
import sys
# Put the parent of the current working directory at the front of the module search
# path; the `helpers` imports below presumably rely on this -- TODO confirm the
# notebook is always started from the expected directory.
sys.path.insert(0, os.path.abspath('../'))
from helpers.s3_bucket_utils import S3BucketUtils
from helpers import db_utils
from helpers import settings

# NOTE(review): `bucket` is not referenced anywhere in the cells visible here.
bucket = S3BucketUtils()
################

import calculate_all_vars_for_churn_prediction
import churn_risk_calculation_test
import churn_risk_calculation
import model_testing

In [2]:
# Read the YAML file with the list of parameters needed for the report.
# NOTE(review): yaml.load with FullLoader is kept as in the original; if these files
# could ever come from an untrusted source, switch to yaml.safe_load.
with open(r'./parameters/started_doing_something_report_parameters.yaml') as file:
    parameters = yaml.load(file, Loader=yaml.FullLoader)

date_of_analysis = parameters['date_of_analysis']
# Directory names use underscores where the analysis date uses dashes.
date_dir = date_of_analysis.replace('-', '_')
### name of the data directory ###
churn_based_on_behaviour_dir = parameters['churn_based_on_behaviour_dir']
hs_list_filename = parameters['hs_list_filename']
hs_list_path = 'churn_analysis/data/'+date_dir+'/'+hs_list_filename

prediction_months = parameters['prediction_months']
### penalizer value when fitting the models ###
penalizer = parameters['penalizer']
### model type ###
model_type = parameters['model_type']
### coefficient and p values when dropping insignificant variables ###
coefficient_limit_for_numerical_vars = parameters['coefficient_limit_for_numerical_vars']
coefficient_limit_for_cat_vars = parameters['coefficient_limit_for_cat_vars']
p_limit = parameters['p_limit']
additional_higher_p_limit = parameters['additional_higher_p_limit']
additional_lower_p_limit = parameters['additional_lower_p_limit']

# Read the YAML file with the data set parameters.
# The data set is fixed here; the interactive alternative is kept for reference:
# data_set_name = input('Data set name: ')
data_set_name = 'ALL_spots_with_CB_cancellation_confirmed'
with open(r'./parameters/data_sets.yaml') as file:
    data_sets_parameters = yaml.load(file, Loader=yaml.FullLoader)

spots_set = data_sets_parameters[data_set_name]['spots_set']
with_wo_CB = data_sets_parameters[data_set_name]['with_wo_CB']
event_date_full_name = data_sets_parameters[data_set_name]['event_date_type']

# True only for the 'with_CB' setting; any other value counts as "without CB".
with_wo_CB_boolean = (with_wo_CB == 'with_CB')

# Translate the long event-date name into the short code passed to the data
# preparation calls. Fail right here on an unknown value: otherwise `event_date`
# would stay undefined (or keep a stale value from an earlier run of this cell)
# and the problem would only surface later as a confusing error or a wrong data set.
if event_date_full_name == 'cancellation_requested':
    event_date = 'canc_req'
elif event_date_full_name == 'cancellation_confirmed':
    event_date = 'canc_conf'
else:
    raise ValueError(
        "Unsupported event_date_type {!r} for data set {!r}; expected "
        "'cancellation_requested' or 'cancellation_confirmed'".format(
            event_date_full_name, data_set_name))

In [3]:
# Base columns: prepended to the model-specific variables when the column list
# for the data preparation calls is assembled.
base_cols = ['spot_id', 'time', 'event']

In [4]:
# Model 0 (all variables): build the data that the later per-model cells reuse.
cols_to_use = read_a_combination_of_variables.main(
    model_number=0,
    dir_name='combinations_of_variables_that_are_not_dependent/',
)

# Behavioural variables derived from the model-0 variable combination.
variables_to_use_for_the_model, did_something_last_month_vars, did_something_before_and_didnt_last_month_vars = (
    get_started_doing_something_variables.main(
        date_of_analysis=date_of_analysis,
        variables_to_use_for_the_model=cols_to_use,
    )
)

# Full column list: base columns first, then the model variables, then both
# groups of behavioural variables.
cols = (
    base_cols
    + variables_to_use_for_the_model
    + did_something_before_and_didnt_last_month_vars
    + did_something_last_month_vars
)

# Data for all spots, as returned by the churn-prediction variable calculation.
data_all_spots = calculate_all_vars_for_churn_prediction.get_data_for_the_MV_Cox_model(
    date_of_analysis=date_of_analysis,
    hs_filename=hs_list_filename,
    spots_set=spots_set,
    with_wo_CB=with_wo_CB,
    event_date=event_date,
    columns=cols,
    data_dir=churn_based_on_behaviour_dir,
)

# Model input: the data frame, the base frame and the timeline frame holding all variables.
data, base_df, df_timeline_all_vars = prepare_for_the_models.get_data_for_the_MV_Cox_model(
    date_of_analysis=date_of_analysis,
    spots_set=spots_set,
    with_wo_CB=with_wo_CB,
    event_date=event_date,
    columns=cols,
    data_dir=churn_based_on_behaviour_dir,
    C=100,
)

# Numbers of every model defined in the variable-combinations directory.
model_numbers = read_a_combination_of_variables.get_a_list_of_model_numbers(
    dir_name='combinations_of_variables_that_are_not_dependent/',
)

In [5]:
# properly_used_inquiries_models = read_a_combination_of_variables.\
# get_properly_used_inquiries_model_names_and_numbers(dir_name='combinations_of_variables_that_are_not_dependent/')

In [6]:
## Determine which models are not already fit, then fit, export and test each of them ##
import models_to_be_fit

# Directory with the variable-combination definitions, shared by every lookup below.
combinations_dir = 'combinations_of_variables_that_are_not_dependent/'

num_of_p_values = 1
num_of_prediction_months = len(prediction_months)
churn_risk_prediction_exports = 'data/'+date_dir+'/exports/churn_risk_prediction/'+model_type+'/'
coefficients_and_pvalues_exports = 'data/'+date_dir+'/exports/coefficients_and_pvalues/'+model_type+'/'
(already_fit, yet_to_be_fit) = models_to_be_fit.main(
    model_numbers=model_numbers,
    model_type=model_type,
    date_dir=date_dir,
    churn_risk_prediction_exports=churn_risk_prediction_exports,
    coefficients_and_pvalues_exports=coefficients_and_pvalues_exports,
    data_set_name=data_set_name,
    num_of_p_values=num_of_p_values,
    num_of_prediction_months=num_of_prediction_months,
)

# for model_number in properly_used_inquiries_models['model_number'].unique():
for model_number in yet_to_be_fit:
    cols_to_use = read_a_combination_of_variables.main(
        model_number=model_number, dir_name=combinations_dir)

    print(sorted(cols_to_use))

    model_name = read_a_combination_of_variables.get_model_names(
        model_number=model_number, dir_name=combinations_dir)
    display(Markdown("# Model "+ str(model_number) + ": " + model_name))

    #### get behavioural variables ####
    (variables_to_use_for_the_model,
     did_something_last_month_vars,
     did_something_before_and_didnt_last_month_vars) = get_started_doing_something_variables.main(
        date_of_analysis=date_of_analysis, variables_to_use_for_the_model=cols_to_use)
    cols = (variables_to_use_for_the_model
            + did_something_before_and_didnt_last_month_vars
            + did_something_last_month_vars)

    #### data for the model ####
    # Keep the timeline bookkeeping columns, this model's variables, and every
    # spot-category / metro-area column of the all-variables timeline.
    vars_that_stay = (
        ['spot_id', 'start', 'stop', 'event']
        + [x for x in cols if x not in base_cols]
        + [x for x in df_timeline_all_vars.columns
           if 'spot_category_' in x or 'metro_area_' in x]
    )
    # Select only the needed columns (original column order preserved) and copy
    # that slice, rather than copying the whole all-variables frame for every
    # model and dropping most of it again.
    df_timeline = df_timeline_all_vars.loc[
        :, df_timeline_all_vars.columns.isin(vars_that_stay)].copy()

    ### variables to skip ###
    # Columns with no non-zero entry at all are passed to the fit as skip_vars;
    # the per-column count is computed once and reused for the filter.
    nonzero_counts = (df_timeline != 0).sum()
    skip_vars = list(nonzero_counts[nonzero_counts == 0].index)

    ctv = fit_tv_cox_models.fit_the_models_and_print_summaries(
        df_timeline=df_timeline,
        base_df=base_df,
        date_of_analysis=date_of_analysis,
        model_type=model_type,
        variables_to_use_for_the_model=cols_to_use,
        coefficient_limit_for_numerical_vars=coefficient_limit_for_numerical_vars,
        coefficient_limit_for_cat_vars=coefficient_limit_for_cat_vars,
        p_limit=p_limit,
        additional_higher_p_limit=additional_higher_p_limit,
        additional_lower_p_limit=additional_lower_p_limit,
        skip_vars=skip_vars,
        penalizer=penalizer,
    )

    ### save coefs and p values for p < 0.2 ###
    # The 0.2 threshold here is fixed and independent of the report-level p_limit.
    fit_tv_cox_models.save_results(
        df=ctv.summary.reset_index(),
        date_of_analysis=date_of_analysis,
        data_dir=churn_based_on_behaviour_dir,
        dir_name='exports/coefficients_and_pvalues',
        results_name='coef_and_pvalues',
        spots_set=spots_set,
        with_wo_CB=with_wo_CB,
        event_date_type=event_date_full_name,
        p_limit=0.2,
        model_number=model_number,
        model_type=model_type,
    )

    churn_risk_calculation.calculate_churn_risk(
        date_of_analysis=date_of_analysis,
        df_timeline=df_timeline,
        vars_=list(ctv.summary.index),
        data_all_spots=data_all_spots,
        ctv=ctv,
        model_type=model_type,
        model_number=model_number,
        with_add_vars=False,
        p_limit=p_limit,
        event_date_type=event_date_full_name,
        spots_set=spots_set,
        with_wo_CB=with_wo_CB,
    )

    # Test the fitted model once per prediction horizon; the frames are handed
    # over as copies so the shared ones stay untouched between iterations.
    for prediction_month in prediction_months:
        testing_results = model_testing.get_testing_results(
            date_of_analysis=date_of_analysis,
            df_timeline=df_timeline.copy(),
            base_df=base_df.copy(),
            data_all_spots=data_all_spots.copy(),
            hs_list_path=hs_list_path,
            model_type=model_type,
            model_number=model_number,
            with_add_vars=False,
            p_limit=p_limit,
            event_date_type=event_date_full_name,
            prediction_month=prediction_month,
            ctv=ctv,
            spots_set=spots_set,
            with_wo_CB=with_wo_CB,
            penalizer=penalizer,
            test_with_req_canc_before=True,
        )

        # Only the first element of the returned results is saved and displayed.
        model_testing.save_results(
            date_of_analysis=date_of_analysis,
            df=testing_results[0],
            results_name='testing_results',
            model_type=model_type,
            model_number=model_number,
            spots_set=spots_set,
            with_wo_CB=with_wo_CB,
            event_date_type=event_date_full_name,
            with_add_vars=False,
            p_limit=p_limit,
            prediction_month=prediction_month,
        )
        display(Markdown("## Model "+ str(model_number) + ": testing results"))
        display(testing_results[0].set_index('index'))