In [1]:
import pandas as pd
import numpy as np
import yaml

#### Project helpers ####
import os
import sys
# Put the parent directory first on the module search path so that the
# `helpers` imports below resolve (presumably the package lives one level
# above this notebook -- TODO confirm).
sys.path.insert(0, os.path.abspath('../'))
from helpers.s3_bucket_utils import S3BucketUtils
from helpers import db_utils
from helpers import settings

# Shared S3 helper instance; the functions below use it to load and store CSV files.
bucket = S3BucketUtils()
################

def get_regular_churn_cases():
    """Build the names of all regular-churn data set cases.

    A name is ``<spots set><variant infix><p limit>``. For every spots set the
    ``_with_zero_churn_vars`` variant is listed first, then the plain variant,
    and within each variant the p-value limits appear in the order 0.2, 0.5.

    Returns:
        list of str: the 8 case names (2 spots sets x 2 variants x 2 limits).
    """
    spots_set_names = ('ALL_spots_with_CB_cancellation_requested',
                       'CAN_CANCEL_spots_wo_CB_cancellation_requested')
    # Order matters: the "with zero churn vars" variant precedes the plain one.
    variant_infixes = ('_with_zero_churn_vars_for_p_below_',
                       '_for_p_below_')
    p_value_limits = (0.2, 0.5)

    return [spots_set_name + infix + str(limit)
            for spots_set_name in spots_set_names
            for infix in variant_infixes
            for limit in p_value_limits]

def get_behavioural_churn_cases(model_names):
    """Build the names of all behavioural-churn data set cases.

    A name is ``<spots set>_<model type>_model_<model number>_for_p_below_0.2``.
    Names are ordered by spots set, then model type, then model number.

    Args:
        model_names: table with a ``model_number`` column (a pandas DataFrame
            in the visible caller).

    Returns:
        list of str: one name per (spots set, model type, model number).
    """
    spots_set_names = ('ALL_spots_with_CB_cancellation_requested',
                       'CAN_CANCEL_spots_wo_CB_cancellation_requested')
    behaviour_kinds = ('started_doing_something', 'stopped_doing_something')
    p_value_limit = 0.2

    # The first unique model number is skipped
    # (presumably a baseline/placeholder model -- TODO confirm).
    model_numbers = model_names['model_number'].unique()[1:]

    case_names = []
    for spots_set_name in spots_set_names:
        for behaviour_kind in behaviour_kinds:
            name_prefix = spots_set_name + '_' + behaviour_kind + '_model_'
            case_names.extend(
                name_prefix + str(number) + '_for_p_below_' + str(p_value_limit)
                for number in model_numbers
            )
    return case_names

def get_mean_testing_results_one_case(testing_results_one_month_one_case, metric):
    """Return one metric's value from a single testing-results table.

    Despite the historical name, no mean is computed: the result is the value
    stored in the ``all_spots_that_requested_cancellation_wo_Closed_Sold_Business``
    column, converted to ``float`` and rounded to 6 decimals. The
    ``all_spots_that_requested_cancellation`` column is not needed and is no
    longer read, so its absence or content cannot make the call fail.

    Args:
        testing_results_one_month_one_case: table indexed by metric name that
            contains the column named above.
        metric: index label of the row to read.

    Returns:
        float: the metric value rounded to 6 decimal places.

    Raises:
        KeyError: if the metric row or the column is missing.
    """
    value_column = 'all_spots_that_requested_cancellation_wo_Closed_Sold_Business'
    return round(float(testing_results_one_month_one_case.loc[metric, value_column]), 6)

def _load_testing_results(file_name):
    """Load one testing-results CSV from S3 and index it by its 'index' column."""
    return bucket.load_csv_from_s3(file_name=file_name).set_index('index')


def _store_case_results(results_frame, case_name, testing_results, metrics):
    """Write every metric of one loaded testing-results table into row `case_name`."""
    for metric in metrics:
        results_frame.loc[case_name, metric] = get_mean_testing_results_one_case(
            testing_results_one_month_one_case=testing_results, metric=metric)


def _best_cases_and_value(one_month_results, metric):
    """Return (names of the best cases, best value) for one metric of one month.

    'log_loss' is minimised; every other metric is maximised.
    """
    pick_best = min if metric == 'log_loss' else max
    best_value = pick_best(one_month_results[metric])
    best_cases = list(one_month_results[one_month_results[metric] == best_value].index.unique())
    return best_cases, best_value


def _export_csv(data_frame, file_name, local_dir, s3_dir):
    """Write `data_frame` to `local_dir + file_name` and store the same frame on S3."""
    # The local export directory may not exist yet for a new analysis date.
    os.makedirs(local_dir, exist_ok=True)
    data_frame.to_csv(local_dir + file_name, index=False)
    bucket.store_csv_to_s3(data_frame=data_frame, file_name=s3_dir + file_name)


def main(date_of_analysis):
    """Compare regular and behavioural churn models and pick the best data set.

    For every prediction month (read from the report parameters YAML) the
    testing results of every regular and behavioural churn case are loaded
    from S3, the best case(s) per metric are selected for both analysis types,
    and the analysis type with the better mean of those best values is chosen.
    Three CSV reports are written locally under ``data/<date>/exports/`` and
    stored on S3 under ``churn_analysis_based_on_behaviour/data/<date>/exports/``.

    Args:
        date_of_analysis (str): analysis date; '-' is replaced by '_' to build
            directory names.

    Returns:
        tuple: ``(chosen_data_set, chosen_analysis,
        data_sets_with_the_best_results, best_mean_results,
        best_testing_results_comparison)``.
    """
    date_dir = date_of_analysis.replace('-', '_')

    regular_churn_dir_path_base = 'churn_analysis/data/' + date_dir + '/exports/'
    churn_based_on_behaviour_dir_path_base = 'churn_analysis_based_on_behaviour/data/' + date_dir + '/exports/'
    local_export_dir = 'data/' + date_dir + '/exports/'

    # Only this metric is evaluated; the 'log_loss' handling further down is
    # kept so that the metric can be added back to this list.
    metrics = ['perc_of_requested_cancellation_spots_in_the_first_20_%_of_all_spots']

    ### read model names and numbers ###
    model_names = bucket.load_csv_from_s3(
        file_name='churn_analysis_based_on_behaviour/combinations_of_variables_that_are_not_dependent/'
                  'model_names.csv')

    ## get all possible combinations (data set cases) ##
    regular_churn_all_cases = get_regular_churn_cases()
    behavioural_churn_all_cases = get_behavioural_churn_cases(model_names=model_names)

    # Only 'prediction_months' is read here; the original note says it does
    # not matter which model type's parameter file is used for that.
    # NOTE(review): FullLoader can construct arbitrary Python objects; this is
    # acceptable only while the parameters file is a trusted project file.
    with open(r'./parameters/started_doing_something_report_parameters.yaml') as parameters_file:
        parameters = yaml.load(parameters_file, Loader=yaml.FullLoader)
    prediction_months = parameters['prediction_months']

    # One (cases x metrics) table per prediction month and analysis type.
    regular_churn_testing_results = {
        month: pd.DataFrame(index=regular_churn_all_cases, columns=metrics)
        for month in prediction_months
    }
    behavioural_churn_testing_results = {
        month: pd.DataFrame(index=behavioural_churn_all_cases, columns=metrics)
        for month in prediction_months
    }

    spots_sets = ['ALL_spots_with_CB_cancellation_requested',
                  'CAN_CANCEL_spots_wo_CB_cancellation_requested']

    ## load regular churn testing results ##
    # (S3 sub-directory, case-name infix) for the two regular-churn variants.
    regular_variants = [
        ('coefficients_and_pvalues_with_add_important_vars/', '_with_zero_churn_vars_for_p_below_'),
        ('coefficients_and_pvalues/', '_for_p_below_'),
    ]
    for spots_set in spots_sets:
        for results_subdir, case_infix in regular_variants:
            for p_limit in [0.2, 0.5]:
                for prediction_month in prediction_months:
                    testing_results = _load_testing_results(
                        regular_churn_dir_path_base + results_subdir + 'testing_results_'
                        + prediction_month.replace('-', '_') + '_' + spots_set
                        + '_p_below_' + str(p_limit).replace('.', '_') + '.csv')
                    _store_case_results(regular_churn_testing_results[prediction_month],
                                        spots_set + case_infix + str(p_limit),
                                        testing_results, metrics)

    ## load behavioural churn testing results ##
    model_types = ['started_doing_something',
                   'stopped_doing_something']
    # The first unique model number is skipped, matching get_behavioural_churn_cases.
    model_numbers = model_names['model_number'].unique()[1:]
    for spots_set in spots_sets:
        for model_type in model_types:
            for model_number in model_numbers:
                churn_based_on_behaviour_dir_path = (
                    churn_based_on_behaviour_dir_path_base + 'coefficients_and_pvalues/'
                    + model_type + '/model_' + str(model_number) + '/')
                for p_limit in [0.2]:
                    for prediction_month in prediction_months:
                        testing_results = _load_testing_results(
                            churn_based_on_behaviour_dir_path + 'testing_results_'
                            + prediction_month.replace('-', '_') + '_' + spots_set
                            + '_p_below_' + str(p_limit).replace('.', '_') + '.csv')
                        _store_case_results(
                            behavioural_churn_testing_results[prediction_month],
                            spots_set + '_' + model_type + '_model_' + str(model_number)
                            + '_for_p_below_' + str(p_limit),
                            testing_results, metrics)

    ## extract best testing results for each analysis type (regular and behavioural) ##
    best_columns = sorted(['best_' + x + '_data_set' for x in metrics]
                          + ['best_' + x + '_value' for x in metrics])
    best_testing_results_for_behavioural_churn_df = pd.DataFrame(index=prediction_months,
                                                                 columns=best_columns)
    best_testing_results_for_regular_churn_df = pd.DataFrame(index=prediction_months,
                                                             columns=best_columns)

    for prediction_month in prediction_months:
        for metric in metrics:
            for best_frame, one_month_results in (
                    (best_testing_results_for_regular_churn_df,
                     regular_churn_testing_results[prediction_month]),
                    (best_testing_results_for_behavioural_churn_df,
                     behavioural_churn_testing_results[prediction_month])):
                best_cases, best_value = _best_cases_and_value(one_month_results, metric)
                best_frame.loc[prediction_month, 'best_' + metric + '_data_set'] = best_cases
                best_frame.loc[prediction_month, 'best_' + metric + '_value'] = best_value

    ## create a data frame for comparison purposes ##
    best_testing_results_for_regular_churn_df = \
        best_testing_results_for_regular_churn_df.add_prefix('REGULAR CHURN ')
    best_testing_results_for_behavioural_churn_df = \
        best_testing_results_for_behavioural_churn_df.add_prefix('BEHAVIOURAL CHURN ')

    best_testing_results_comparison = best_testing_results_for_regular_churn_df.reset_index().merge(
        best_testing_results_for_behavioural_churn_df.reset_index(), on=['index'])
    best_testing_results_comparison.set_index('index', inplace=True)

    cols_to_export = []
    for metric in metrics:
        cols_to_export.append('REGULAR CHURN ' + 'best_' + metric + '_data_set')
        cols_to_export.append('REGULAR CHURN ' + 'best_' + metric + '_value')
        cols_to_export.append('BEHAVIOURAL CHURN ' + 'best_' + metric + '_data_set')
        cols_to_export.append('BEHAVIOURAL CHURN ' + 'best_' + metric + '_value')
    best_testing_results_comparison = best_testing_results_comparison[cols_to_export]

    ## decide, per metric, which analysis type has the better mean best value ##
    best_mean_results = pd.DataFrame(index=metrics, columns=['REGULAR/BEHAVIOURAL'])
    for metric in metrics:
        beh_mean = best_testing_results_comparison['BEHAVIOURAL CHURN ' + 'best_' + metric + '_value'].mean()
        reg_mean = best_testing_results_comparison['REGULAR CHURN ' + 'best_' + metric + '_value'].mean()
        if metric == 'log_loss':
            # Lower is better for log loss.
            behavioural_wins = beh_mean < reg_mean
            regular_wins = beh_mean > reg_mean
        else:
            behavioural_wins = beh_mean > reg_mean
            regular_wins = beh_mean < reg_mean

        if behavioural_wins:
            best_mean_results.loc[metric] = 'BEHAVIOURAL CHURN'
        elif regular_wins:
            best_mean_results.loc[metric] = 'REGULAR CHURN'
        else:
            best_mean_results.loc[metric] = 'THE SAME'

    # NOTE(review): if 'THE SAME' is the most frequent verdict, the column
    # lookup below raises KeyError because no 'THE SAME ...' column exists.
    chosen_analysis = best_mean_results['REGULAR/BEHAVIOURAL'].value_counts().index[0]

    # How often each data set was (one of) the best for the chosen analysis;
    # explode() splits list cells so that tied data sets each get counted.
    best_data_set_counts = pd.DataFrame(
        best_testing_results_comparison[chosen_analysis + ' best_' + metrics[0] + '_data_set'].values
    ).explode(0)[0].value_counts()
    chosen_data_set = best_data_set_counts.index[0]

    # rename_axis/reset_index(name=...) labels the columns explicitly, so the
    # result does not depend on how the installed pandas version names the
    # value_counts() output.
    data_sets_with_the_best_results = best_data_set_counts.rename_axis('data_set').\
        reset_index(name='how many times data set had the best results')

    ## export the reports locally and to S3 ##
    _export_csv(data_sets_with_the_best_results,
                'data_sets_with_the_best_results.csv',
                local_export_dir, churn_based_on_behaviour_dir_path_base)
    _export_csv(best_testing_results_comparison.reset_index().rename(columns={'index': 'prediction_month'}),
                'best_testing_results_comparison.csv',
                local_export_dir, churn_based_on_behaviour_dir_path_base)
    _export_csv(best_mean_results.reset_index().rename(columns={'index': 'metric'}),
                'better_mean_testing_results.csv',
                local_export_dir, churn_based_on_behaviour_dir_path_base)

    return (chosen_data_set, chosen_analysis, data_sets_with_the_best_results, best_mean_results, best_testing_results_comparison)

In [2]:
import yaml

# The analysis date comes from the report parameters file.
with open(r'./parameters/started_doing_something_report_parameters.yaml') as file:
    model_params = yaml.load(file, Loader=yaml.FullLoader)
date_of_analysis = model_params['date_of_analysis']

# Run the full comparison; the results are kept as notebook-level variables
# so that the following cells can display them.
(
    chosen_data_set,
    chosen_analysis,
    data_sets_with_the_best_results,
    best_mean_results,
    best_testing_results_comparison,
) = main(date_of_analysis=date_of_analysis)

In [3]:
chosen_analysis

'BEHAVIOURAL CHURN'

In [4]:
chosen_data_set

'CAN_CANCEL_spots_wo_CB_cancellation_requested_stopped_doing_something_model_37_for_p_below_0.2'