In [1]:
import pandas as pd
import numpy as np
import important_variables
from dateutil.relativedelta import relativedelta
from datetime import datetime
from datetime import timedelta

#### Helpers ####
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from helpers.s3_bucket_utils import S3BucketUtils
from helpers import db_utils
from helpers import settings

# Shared S3 helper instance; the loaders/exporters below call
# bucket.load_csv_from_s3(...) and bucket.store_csv_to_s3(...) on it.
bucket = S3BucketUtils()
################

# Maps (model, variable_type) -> the comparison a coefficient expresses.
# Outer key: the model label stored in the 'model' column.
# Inner key: the variable type produced by get_var_type().
for_interpretation = {
    'model_started': {
        'did_something_last_X_months': 'continued_vs_never_did',
        'did_something_before_and_didnt_last_X_months': 'stopped_vs_never_did',
    },
    'model_stopped': {
        'did_something_before': 'stopped_vs_never_did',
        'did_something_last_X_months': 'continued_vs_stopped',
    },
}

### read model names and numbers ###
# Load the model-name table from S3, then turn each comma-separated
# 'model_name' cell (quotes stripped, leading spaces removed) into a list
# and explode it so every name gets its own row.
model_names = bucket.load_csv_from_s3(
    file_name='churn_analysis_based_on_behaviour/combinations_of_variables_that_are_not_dependent/'
              + 'model_names.csv'
)

model_names['model_name'] = model_names['model_name'].map(
    lambda raw: [part.lstrip(' ') for part in raw.replace("'", "").split(',')]
)
model_names = model_names.explode('model_name')

def add_month(date, m):
    """Shift a 'YYYY-MM-DD' date by ``m`` calendar months.

    Args:
        date: Date parsed with ``pd.to_datetime(..., format='%Y-%m-%d')``.
        m: Number of months to add (negative values move backwards).

    Returns:
        The first 10 characters of the shifted timestamp's string form,
        i.e. the 'YYYY-MM-DD' part.
    """
    shifted = pd.to_datetime(date, format='%Y-%m-%d') + relativedelta(months=m)
    return str(shifted)[:10]

def get_key_based_on_value_in_a_dict(dict_, value_of_interest):
    """Reverse lookup: return the first key whose value equals ``value_of_interest``.

    Keys are scanned in the dict's iteration order. Returns ``None`` when no
    value matches.
    """
    return next(
        (candidate for candidate, stored in dict_.items() if stored == value_of_interest),
        None,
    )

def get_var_type(var_name, first_var, second_var):
    """Classify a variable name into one of the behavioural variable types.

    Args:
        var_name: Variable name to classify.
        first_var: Dict whose values are behavioural variable names.
        second_var: Second dict whose values are behavioural variable names.

    Returns:
        'not_behavioural' when the name is a value of neither dict; otherwise
        a type derived from the substrings 'before', 'last_month', 'months'
        and 'didnt' in the name. Returns ``None`` when the name contains
        'before' plus 'months'/'last_month' but not 'didnt'.
    """
    is_behavioural = var_name in first_var.values() or var_name in second_var.values()
    if not is_behavioural:
        return 'not_behavioural'

    if 'before' not in var_name:
        return 'did_something_last_X_months'

    # From here on the name contains 'before'.
    mentions_recent_window = 'last_month' in var_name or 'months' in var_name
    if not mentions_recent_window:
        return 'did_something_before'
    if 'didnt' in var_name:
        return 'did_something_before_and_didnt_last_X_months'
    return None

def get_base_var_names(df, first_var, second_var):
    """Add 'variable_base_name' and 'variable_type' columns to ``df`` in place.

    'variable_type' comes from get_var_type(). 'variable_base_name' starts as
    a copy of 'variable'; for every variable that is a value of ``first_var``
    or ``second_var`` it is replaced by the matching dict key (``second_var``
    is applied last, so it wins when both dicts contain the value).

    Returns:
        The same ``df`` object, with the two columns added.
    """
    df['variable_base_name'] = df['variable']
    df['variable_type'] = df['variable'].apply(
        lambda name: get_var_type(var_name=name, first_var=first_var, second_var=second_var)
    )

    # 'variable' is never modified below, so its distinct values can be
    # collected once.
    present_variables = df['variable'].unique()
    for mapping in (first_var, second_var):
        for full_name in mapping.values():
            if full_name not in present_variables:
                continue
            base_name = get_key_based_on_value_in_a_dict(dict_=mapping, value_of_interest=full_name)
            df.loc[(df['variable'] == full_name), 'variable_base_name'] = base_name

    return df

def check_the_vars_with_the_same_interpretation(x):
    """Tell whether the first two rows of a group have coefficients on opposite sides of 1.

    Args:
        x: DataFrame with 'interpretation' and 'exp(coef) - AVERAGE' columns.

    Returns:
        True when ``x`` has at least two rows, the first two share the same
        interpretation, and one 'exp(coef) - AVERAGE' is > 1 while the other
        is < 1. False otherwise. Only the first two rows are inspected.
    """
    if len(x) < 2:
        return False

    labels = x['interpretation']
    if not labels.iloc[0] == labels.iloc[1]:
        return False

    coefs = x['exp(coef) - AVERAGE']
    first, second = coefs.iloc[0], coefs.iloc[1]
    opposite_sides = (first > 1 and second < 1) or (first < 1 and second > 1)
    return bool(opposite_sides)

def model_to_choose_from(x):
    """Return the 'model' of the row with the smallest 'p value - AVERAGE'.

    Args:
        x: DataFrame with 'model' and 'p value - AVERAGE' columns.

    Returns:
        The 'model' value of the first row whose p-value equals the minimum.
        NaN p-values are ignored when searching for the minimum; if every
        p-value is NaN, the first row's model is returned.
    """
    p_values = x['p value - AVERAGE']
    # Series.min() skips NaN. The builtin min() is order-dependent with NaN:
    # a NaN seen before the real minimum wins, the equality filter below then
    # matches nothing, and .iloc[0] raises IndexError.
    smallest = p_values.min()
    if pd.isna(smallest):
        return x['model'].iloc[0]
    return x.loc[p_values == smallest, 'model'].iloc[0]


def get_data_type(df, var_):
    """Label a column 'categorical' if it is a 0/1 indicator, else 'numerical'.

    A column counts as an indicator when it has exactly two distinct values
    (as counted by ``Series.nunique()``) and both 0 and 1 occur in it.
    """
    column = df[var_]
    distinct_values = column.unique()
    is_indicator = column.nunique() == 2 and 1 in distinct_values and 0 in distinct_values
    return 'categorical' if is_indicator else 'numerical'

def get_perc_of_spots(df, var_, var_type):
    """Percentage of distinct 'spot_id' values that have ``var_ == 1`` on some row.

    Args:
        df: DataFrame with a 'spot_id' column and the ``var_`` column.
        var_: Name of the indicator column.
        var_type: 'categorical' or 'numerical' (see get_data_type).

    Returns:
        The percentage rounded to 2 decimals for 'categorical', ``np.nan`` for
        'numerical', and ``None`` for any other ``var_type``.
    """
    if var_type == 'categorical':
        spots_with_flag = df.loc[df[var_] == 1, 'spot_id'].nunique()
        all_spots = df['spot_id'].nunique()
        return round(100 * (spots_with_flag / all_spots), 2)
    if var_type == 'numerical':
        return np.nan
    return None
    
def get_perc_of_spots_last_month(df, var_, var_type, last_month):
    """Same as get_perc_of_spots, restricted to rows where 'left_limit' == ``last_month``.

    Args:
        df: DataFrame with 'spot_id', 'left_limit' and the ``var_`` column.
        var_: Name of the indicator column.
        var_type: 'categorical' or 'numerical' (see get_data_type).
        last_month: Value compared for equality against 'left_limit'.

    Returns:
        The percentage rounded to 2 decimals for 'categorical', ``np.nan`` for
        'numerical', and ``None`` for any other ``var_type``.
    """
    if var_type == 'categorical':
        spots_with_flag = df[(df[var_] == 1) & (df['left_limit'] == last_month)]['spot_id'].nunique()
        spots_in_month = df[df['left_limit'] == last_month]['spot_id'].nunique()
        return round(100 * (spots_with_flag / spots_in_month), 2)
    if var_type == 'numerical':
        return np.nan
    return None
    
    
def get_coef_and_p_for_a_specific_model(var_base_name, var_, model, spots_set, model_names, date_dir):
    """Look up exp(coef) and p for one covariate in one model's coefficients file.

    Args:
        var_base_name: Base name of the variable, used to pick the model number.
            Names containing 'changed_inquiry_status' are first translated via
            ./parameters/for_properly_used_inquiries_vars.yaml.
        var_: Covariate name to look up in the coefficients file.
        model: 'model_started' selects the 'started_doing_something' files;
            any other value selects 'stopped_doing_something'.
        spots_set: Spots-set label that is part of the file name.
        model_names: DataFrame with 'model_name' and 'model_number' columns.
        date_dir: Date directory (e.g. '2022_09_01') that is part of the path.

    Returns:
        ``(exp(coef) rounded to 4 decimals, p rounded to 3 decimals)`` as plain
        floats for the first matching covariate row, or ``np.nan`` when the
        covariate is not in the file.
    """
    if 'changed_inquiry_status' in var_base_name:
        # Imported here because the notebook only imports yaml in a later cell.
        import yaml

        # NOTE(review): yaml.load with FullLoader is kept as in the rest of the
        # notebook; prefer yaml.safe_load if this file could ever be untrusted.
        with open(r'./parameters/for_properly_used_inquiries_vars.yaml') as file:
            for_properly_used_inquiries_vars = yaml.load(file, Loader=yaml.FullLoader)

        var_base_name = for_properly_used_inquiries_vars['changed_inquiry_status_to_properly_used'][var_base_name]

    # Exact match on the model name. A substring test could select another
    # model whose name merely contains var_base_name.
    matching_numbers = model_names.loc[model_names['model_name'] == var_base_name, 'model_number']
    model_number = matching_numbers.values[0] if len(matching_numbers) > 0 else 1

    model_dir = 'started_doing_something' if model == 'model_started' else 'stopped_doing_something'
    coefs_and_p_values = bucket.load_csv_from_s3(
        file_name='churn_analysis_based_on_behaviour/data/' + date_dir + '/exports/coefficients_and_pvalues/'
                  + model_dir + '/model_' + str(model_number) + '/coef_and_pvalues_' + spots_set + '_p_below_0_2.csv'
    )

    matches = coefs_and_p_values.loc[coefs_and_p_values['covariate'] == var_, ['exp(coef)', 'p']]
    if matches.empty:
        return np.nan

    first_match = matches.iloc[0]
    # Round as NumPy floats (same numeric result as before), then convert to
    # plain floats so the tuple renders identically under any NumPy version.
    exp_coef = float(round(np.float64(first_match['exp(coef)']), 4))
    p_value = float(round(np.float64(first_match['p']), 3))
    return (exp_coef, p_value)

# def get_coef_and_p_for_a_specific_model_for_non_behavioural_vars(var_base_name, var_, model, spots_set, model_names, date_dir):
#     for model_number in model_names['model_number'].unique()[1:]:
#         if model == 'model_started':
#             coefs_and_p_values = \
#             bucket.load_csv_from_s3(file_name='churn_analysis_based_on_behaviour/data/'+date_dir+'/exports/coefficients_and_pvalues/'+\
#                                    'started_doing_something/model_'+str(model_number)+'/coef_and_pvalues_'+spots_set+'_p_below_0_2.csv')
#         else:
#             coefs_and_p_values = \
#             bucket.load_csv_from_s3(file_name='churn_analysis_based_on_behaviour/data/'+date_dir+'/exports/coefficients_and_pvalues/'+\
#                                'stopped_doing_something/model_'+str(model_number)+'/coef_and_pvalues_'+spots_set+'_p_below_0_2.csv')
#         if var_ in coefs_and_p_values['covariate'].unique():
#             if coefs_and_p_values[(coefs_and_p_values['covariate']==var_)]['p'].values[0]<=0.05:
#                 model_name = model_names[model_names['model_number']==model_number]['model_name'].values[0]
#                 return coefs_and_p_values[(coefs_and_p_values['covariate']==var_)][['exp(coef)', 'p']].\
#                       apply(lambda x: (round(x[0], 4), round(x[1], 3), 'model_for_'+str(model_name)), axis = 1).values[0]
#     return np.nan
    


def main(date_of_analysis):
    """Combine the per-model important-variable exports and write the summary tables.

    For each of the two input files (significant variables, and not-significant
    variables with p below 0.2) the 'started' and 'stopped' model exports are
    read from ``data/<date_dir>/exports/<model_type>/``, annotated with base
    name, type and interpretation, de-duplicated, and written locally and to S3
    under ``exports/important_variables/``. The two results are then merged
    into one table, enriched with per-spots-set coverage percentages and
    per-spots-set coefficients/p-values, and exported as
    ``important_variables_sorted_by_variable_name.csv``.

    Args:
        date_of_analysis: Analysis date as a 'YYYY-MM-DD' string.

    Returns:
        The exported table, sorted by 'variable' and 'interpretation'.
    """
    date_dir = date_of_analysis.replace('-', '_')
    last_month = add_month(date_of_analysis, -1)

    export_dir = 'data/'+date_dir+'/exports/important_variables/'
    s3_export_dir = '/churn_analysis_based_on_behaviour/data/'+date_dir+'/exports/important_variables/'
    group_keys = ['variable_base_name', 'variable_type', 'interpretation']

    for file_name in ['all_significant_variables_sorted_by_p_value.csv',
                      'not_significant_variables_with_p_below_0_2_sorted_by_p_value.csv']:
        per_model_exports = []
        for model_type in ['started_doing_something', 'stopped_doing_something']:
            (first_var, second_var) = important_variables.get_pairs_of_variables(
                churn_based_on_behaviour_dir='churn_analysis_based_on_behaviour/',
                date_dir=date_dir, model_type=model_type)

            df_important_vars = pd.read_csv('data/'+date_dir+'/exports/'+model_type+'/'+file_name)

            # 'started_doing_something' -> 'model_started', etc.
            df_important_vars['model'] = 'model_'+model_type.split('_')[0]

            df_important_vars = get_base_var_names(df=df_important_vars, first_var=first_var, second_var=second_var)

            per_model_exports.append(df_important_vars)

        combined_export = pd.concat(per_model_exports, axis=0)
        combined_export.reset_index(drop=True, inplace=True)

        # Behavioural variables get their interpretation from the lookup table;
        # the others keep the 'not_behavioural' marker for now.
        combined_export['interpretation'] = combined_export[['model', 'variable_type']].apply(
            lambda x: for_interpretation[x['model']][x['variable_type']]
            if x['variable_type'] != 'not_behavioural' else x['variable_type'], axis=1)

        different_sign = combined_export.groupby(group_keys)[['exp(coef) - AVERAGE', 'interpretation']].\
            apply(lambda x: check_the_vars_with_the_same_interpretation(x)).reset_index().\
            rename(columns={0: 'different_sign'})
        combined_export = combined_export.merge(different_sign, on=group_keys)

        if (combined_export['different_sign'] == True).any():
            print('THERE ARE VARIABLE THAT REPRESENT THE SAME THING BUT HAVE A DIFFERENT SIGN!!!')
        else:
            combined_export.drop(combined_export[(combined_export['model'] == 'model_stopped') &
                                                 (combined_export['variable_type'] == 'did_something_before')].index,
                                 inplace=True)

            # Non-behavioural variables that appear in both models.
            models_per_variable = combined_export[combined_export['interpretation'] == 'not_behavioural'].\
                groupby('variable')['model'].nunique()
            duplicate_vars = models_per_variable[models_per_variable > 1].reset_index()['variable'].unique()

            combined_export.reset_index(drop=True, inplace=True)

            # Skip when nothing is duplicated: the groupby/apply on an empty
            # selection does not reliably yield a 'model_to_choose_from'
            # column, which the drop below needs.
            if len(duplicate_vars) > 0:
                df_duplicate_vars = combined_export[(combined_export['variable'].isin(duplicate_vars))].\
                    groupby('variable')[['model', 'p value - AVERAGE']].apply(lambda x: model_to_choose_from(x)).\
                    reset_index().rename(columns={0: 'model_to_choose_from'})

                combined_export = combined_export.merge(df_duplicate_vars, on=['variable'], how='left')

                # Keep only the row from the model with the smallest p-value.
                combined_export.drop(combined_export[(combined_export['variable'].isin(duplicate_vars)) &
                                                     (combined_export['model'] != combined_export['model_to_choose_from'])].index,
                                     inplace=True)

        combined_export = combined_export[['variable_base_name', 'variable_type', 'model', 'variable',
                                           'interpretation', 'exp(coef) - AVERAGE', 'p value - AVERAGE']].\
            sort_values(['p value - AVERAGE'])
        combined_export.reset_index(drop=True, inplace=True)

        combined_export.loc[(combined_export['interpretation'] == 'not_behavioural'), 'interpretation'] = np.nan

        os.makedirs(export_dir, exist_ok=True)
        combined_export.to_csv(export_dir+file_name, index=False)
        bucket.store_csv_to_s3(data_frame=combined_export,
                               file_name=file_name,
                               dir=s3_export_dir)

        # The group label is added after the per-file export, so it only shows
        # up in the combined table below.
        if file_name == 'all_significant_variables_sorted_by_p_value.csv':
            sign_vars = combined_export
            sign_vars['important_variables_group'] = 'significant_vars'
        else:
            not_sign_p_below_0_2_vars = combined_export
            not_sign_p_below_0_2_vars['important_variables_group'] = 'not_significant_p_below_0_2'

    behavioural_sign_vars_base_names = sign_vars[sign_vars['interpretation'].notnull()]['variable_base_name'].unique()

    # All significant variables, plus the not-significant behavioural ones
    # whose base name also appears among the significant behavioural variables.
    important_vars = pd.concat(
        [sign_vars,
         not_sign_p_below_0_2_vars[(not_sign_p_below_0_2_vars['interpretation'].notnull()) &
                                   (not_sign_p_below_0_2_vars['variable_base_name'].
                                    isin(behavioural_sign_vars_base_names))]],
        axis=0)
    important_vars.reset_index(drop=True, inplace=True)

    # Spots sets for the coverage percentages (a hand-picked subset of the
    # spots-set x CB-option x event-date combinations).
    all_spots_sets = ['ALL_spots_with_CB_canc_req',
                      'CAN_CANCEL_spots_wo_CB_canc_req']

    cols_to_export = []
    for set_ in all_spots_sets:
        cols_to_export.append('%_of_'+set_)
        cols_to_export.append('last_month_%_of_'+set_)

        df = bucket.load_csv_from_s3(file_name='churn_analysis_based_on_behaviour/data/' + date_dir +
                                     '/exports/data_used_for_each_model/data_tv_'+set_+'.csv')
        # 'type' is recomputed for every set and is not exported.
        important_vars['type'] = important_vars['variable'].apply(lambda x: get_data_type(df, x))

        important_vars['%_of_'+set_] = important_vars[['variable', 'type']].\
            apply(lambda x: get_perc_of_spots(df, x['variable'], x['type']), axis=1)

        important_vars['last_month_%_of_'+set_] = important_vars[['variable', 'type']].\
            apply(lambda x: get_perc_of_spots_last_month(df, x['variable'], x['type'], last_month), axis=1)

    all_spots = ['ALL_spots_with_CB_cancellation_requested']
    can_cancel_spots = ['CAN_CANCEL_spots_wo_CB_cancellation_requested']

    for spots_set in all_spots+can_cancel_spots:
        # NOTE(review): the original code overwrote the non-behavioural rows
        # here with get_coef_and_p_for_a_specific_model_for_non_behavioural_vars,
        # a function that is not defined in this notebook (only a commented-out
        # draft exists), so the call raised NameError. The lookup below already
        # covers every row (it falls back to model 1 when the base name has no
        # dedicated model). Confirm this is the intended value for
        # non-behavioural variables.
        important_vars[spots_set+'_exp(coef)_and_p_value'] = \
            important_vars[['variable_base_name', 'model', 'variable']].\
            apply(lambda x: get_coef_and_p_for_a_specific_model(var_base_name=x['variable_base_name'],
                                                                var_=x['variable'],
                                                                model=x['model'],
                                                                spots_set=spots_set,
                                                                model_names=model_names, date_dir=date_dir), axis=1)

    export_columns = (['model', 'variable', 'interpretation', 'important_variables_group',
                       'exp(coef) - AVERAGE', 'p value - AVERAGE']
                      + [x+'_exp(coef)_and_p_value' for x in all_spots+can_cancel_spots]
                      + cols_to_export)
    # Build the exported view once and reuse it for the local file, the S3
    # upload and the return value.
    final_export = important_vars[export_columns].sort_values(['variable', 'interpretation'])

    final_export.to_csv(export_dir+'important_variables_sorted_by_variable_name.csv', index=False)
    bucket.store_csv_to_s3(data_frame=final_export,
                           file_name='important_variables_sorted_by_variable_name.csv',
                           dir=s3_export_dir)

    return final_export

In [2]:
import yaml
# Read the report parameters; this cell only uses the 'date_of_analysis' entry.
with open(r'./parameters/started_doing_something_report_parameters.yaml') as file:
    model_params = yaml.load(file, Loader=yaml.FullLoader)

# Run the full export for the configured date and keep the returned table
# so the cells below can inspect it.
date_of_analysis = model_params['date_of_analysis']
important_vars = main(date_of_analysis=date_of_analysis)

KeyboardInterrupt: 

In [None]:
# Debug: show the exported rows for one specific variable.
important_vars[important_vars['variable']=='had_properly_used_catering_inquiries_last_4_months']

In [None]:
# Debug: dump the returned table to a local file in the working directory.
important_vars.to_csv('test.csv')

In [None]:
# Sample variable name used by the string-splitting check in the next cell.
bla = 'had_properly_used_catering_inquiries_last_4_months'

In [None]:
# Debug: print the part of `bla` that precedes '_last_'.
# NOTE(review): the condition tests a string literal, not `bla`, so it is
# always true here regardless of what `bla` holds.
if ('last_') in 'had_properly_used_catering_inquiries_last_4_months':
    print(bla.split('_last_')[0])

In [None]:
# Debug: load one coefficients file directly, with the date directory,
# model number and spots set hard-coded.
coefs_and_p_values = \
        bucket.load_csv_from_s3(file_name='churn_analysis_based_on_behaviour/data/'+'2022_09_01'+'/exports/coefficients_and_pvalues/'+\
                               'started_doing_something/model_'+'1'+'/coef_and_pvalues_'+'CAN_CANCEL_spots_wo_CB_cancellation_requested'+'_p_below_0_2.csv')

In [None]:
# Debug: display the coefficients table loaded in the previous cell.
coefs_and_p_values