In [1]:
import pandas as pd
import numpy as np
import os
import yaml
from dateutil.relativedelta import relativedelta
import yaml

#### Helpers ####
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from helpers.s3_bucket_utils import S3BucketUtils
from helpers import db_utils
from helpers import settings

# Project helper instance (presumably wraps S3 access -- confirm in helpers.s3_bucket_utils);
# its load_csv_from_s3 method is used further down to read the input data set.
bucket = S3BucketUtils()
################

def add_month(date, m):
    """Shift a 'YYYY-MM-DD' date by ``m`` calendar months.

    Parameters
    ----------
    date : value parsed by ``pd.to_datetime`` with format '%Y-%m-%d'.
    m : number of months to add, passed to ``relativedelta(months=m)``.

    Returns
    -------
    str
        The first 10 characters of the shifted timestamp's string form.
    """
    parsed = pd.to_datetime(date, format='%Y-%m-%d')
    shifted = parsed + relativedelta(months=m)
    shifted_as_text = str(shifted)
    # only the leading date part is wanted, not the time-of-day suffix
    return shifted_as_text[0:10]


def get_last_w_months_average(df, col_name, last_w_months_avg_col_name, w):
    """Add a per-spot rolling mean of ``col_name`` over the last ``w`` rows.

    Parameters
    ----------
    df : DataFrame with a 'spot_id' column and the ``col_name`` column.
        It is modified in place (the new column is added) and also returned.
    col_name : name of the column to average.
    last_w_months_avg_col_name : name of the column that receives the result.
    w : rolling window size, in rows.

    Returns
    -------
    DataFrame
        The same ``df`` object with the extra column.

    Notes
    -----
    The window runs over rows in their existing order inside each spot, so the
    frame is assumed to be sorted by month within a spot -- TODO confirm at the
    caller. ``min_periods=0`` means the first rows of a spot are averaged over
    however many rows are available instead of yielding NaN.
    """
    # transform (not apply) keeps the original row index, so the result can be
    # assigned straight back; groupby.apply prepends the group key to the index
    # on recent pandas versions and the assignment then fails.
    df[last_w_months_avg_col_name] = df.groupby('spot_id')[col_name].transform(
        lambda x: x.rolling(window=w, min_periods=0).mean()
    )

    return df

def get_last_w_months_sum(df, col_name, last_w_months_avg_col_name, w):
    """Add a per-spot rolling sum of ``col_name`` over the last ``w`` rows.

    Parameters
    ----------
    df : DataFrame with a 'spot_id' column and the ``col_name`` column.
        It is modified in place (the new column is added) and also returned.
    col_name : name of the column to sum.
    last_w_months_avg_col_name : name of the column that receives the result
        (the parameter name says "avg", but the value stored here is a sum).
    w : rolling window size, in rows.

    Returns
    -------
    DataFrame
        The same ``df`` object with the extra column.

    Notes
    -----
    The window runs over rows in their existing order inside each spot, so the
    frame is assumed to be sorted by month within a spot -- TODO confirm at the
    caller. ``min_periods=0`` means the first rows of a spot are summed over
    however many rows are available instead of yielding NaN.
    """
    # transform (not apply) keeps the original row index, so the result can be
    # assigned straight back; groupby.apply prepends the group key to the index
    # on recent pandas versions and the assignment then fails.
    df[last_w_months_avg_col_name] = df.groupby('spot_id')[col_name].transform(
        lambda x: x.rolling(window=w, min_periods=0).sum()
    )

    return df

def get_num_of_months(df, col_name, num_of_months_col_name):
    """Attach, to every row, the number of rows of its spot where ``col_name`` > 0.

    Parameters
    ----------
    df : DataFrame with 'spot_id', 'time' and ``col_name`` columns.
    col_name : column whose positive rows are counted.
    num_of_months_col_name : name of the new per-spot count column.

    Returns
    -------
    DataFrame
        A new frame (the input is not modified) with the extra column; spots
        without any positive row get 0.
    """
    # non-null 'time' values are counted among the rows with a positive value
    months_per_spot = (
        df[df[col_name] > 0]
        .groupby('spot_id')['time']
        .count()
        .reset_index()
        .rename(columns={'time': num_of_months_col_name})
    )
    df = df.merge(months_per_spot, on='spot_id', how='left')

    # The left merge leaves NaN for spots with no positive row. Assign the
    # filled column back explicitly: an in-place fillna on the selected column
    # is a chained update that is not guaranteed to reach the frame.
    df[num_of_months_col_name] = df[num_of_months_col_name].fillna(0)

    return df

def make_dir(dir_name):
    """Create ``dir_name`` (including missing parents) if it does not exist yet.

    Calling it for a directory that already exists is a no-op.

    Raises
    ------
    FileExistsError
        If ``dir_name`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(dir_name, exist_ok=True)

In [2]:
def prepare_data_for_calculating_stats(df, var_, start_date, end_date):
    """Restrict ``df`` to one period and to the spots active before that period.

    Steps (all on a copy of ``df``):
      1. per spot, find the smallest and largest 'left_limit' among rows with
         ``var_`` > 0 and keep only rows that are not outside that span
         (spots without any positive row drop out in the inner merges);
      2. collect the spots that have a positive ``var_`` row with
         'left_limit' < ``start_date``;
      3. keep only rows of those spots that are not outside
         [``start_date``, ``end_date``];
      4. add ``var_ + '_total_num_of_months'``: the per-spot count of non-null
         'time' values among the rows that remain.

    Returns
    -------
    tuple
        ``(spots_of_interest, prepared_frame)`` where ``spots_of_interest`` is
        the array from step 2.
    """
    start_col = var_+'_start'
    end_col = var_+'_end'

    work = df.copy()

    first_active = work[work[var_]>0].groupby('spot_id')['left_limit'].min()
    work = work.merge(first_active.reset_index().rename(columns = {'left_limit':start_col}),
                      on = ['spot_id'])

    last_active = work[work[var_]>0].groupby('spot_id')['left_limit'].max()
    work = work.merge(last_active.reset_index().rename(columns = {'left_limit':end_col}),
                      on = ['spot_id'])

    outside_active_span = (work['left_limit']<work[start_col]) | (work['left_limit']>work[end_col])
    work = work[~outside_active_span]

    ## spots with a positive value before the period starts ##
    active_before_period = (work[var_]>0) & (work['left_limit']<start_date)
    spots_of_interest = work[active_before_period]['spot_id'].unique()

    outside_period = (work['left_limit']<start_date) | (work['left_limit']>end_date)
    not_of_interest = ~work['spot_id'].isin(spots_of_interest)
    work = work[~(outside_period | not_of_interest)].reset_index(drop = True)

    # total is counted AFTER the period filter, i.e. months inside the period only
    months_per_spot = work.groupby('spot_id')['time'].count().reset_index()
    months_per_spot = months_per_spot.rename(columns = {'time':var_+'_total_num_of_months'})
    work = work.merge(months_per_spot, on = ['spot_id'])

    return (spots_of_interest, work)

In [3]:
def prepare_data_for_calculating_stats_second_approach(df, var_, start_date, end_date):
    """Restrict ``df`` to one period, keeping each spot's total month count from before the period filter.

    Steps (all on a copy of ``df``):
      1. per spot, find the smallest and largest 'left_limit' among rows with
         ``var_`` > 0 and keep only rows that are not outside that span
         (spots without any positive row drop out in the inner merges);
      2. collect the spots that have a positive ``var_`` row with
         'left_limit' < ``start_date``;
      3. add ``var_ + '_total_num_of_months'``: the per-spot count of non-null
         'time' values over the rows kept by step 1 (so it is not limited to
         the period);
      4. keep only rows of the spots from step 2 that are not outside
         [``start_date``, ``end_date``].

    Returns
    -------
    tuple
        ``(spots_of_interest, prepared_frame)`` where ``spots_of_interest`` is
        the array from step 2.
    """
    start_col = var_+'_start'
    end_col = var_+'_end'

    work = df.copy()

    first_active = work[work[var_]>0].groupby('spot_id')['left_limit'].min()
    work = work.merge(first_active.reset_index().rename(columns = {'left_limit':start_col}),
                      on = ['spot_id'])

    last_active = work[work[var_]>0].groupby('spot_id')['left_limit'].max()
    work = work.merge(last_active.reset_index().rename(columns = {'left_limit':end_col}),
                      on = ['spot_id'])

    outside_active_span = (work['left_limit']<work[start_col]) | (work['left_limit']>work[end_col])
    work = work[~outside_active_span]

    ## spots with a positive value before the period starts ##
    active_before_period = (work[var_]>0) & (work['left_limit']<start_date)
    spots_of_interest = work[active_before_period]['spot_id'].unique()

    # total is counted BEFORE the period filter, over each spot's whole active span
    months_per_spot = work.groupby('spot_id')['time'].count().reset_index()
    months_per_spot = months_per_spot.rename(columns = {'time':var_+'_total_num_of_months'})
    work = work.merge(months_per_spot, on = ['spot_id'])

    outside_period = (work['left_limit']<start_date) | (work['left_limit']>end_date)
    not_of_interest = ~work['spot_id'].isin(spots_of_interest)
    work = work[~(outside_period | not_of_interest)].reset_index(drop = True)

    return (spots_of_interest, work)

In [4]:
def create_empty_export_first_approach(covid_period, all_vars_new_names, var_):
    """Return the empty first-approach result table, indexed by 'variable_name'.

    The table has one spot-count column, the 'every_1_month' percentage column
    and one 'every_<i>_months' percentage column for each i in 2..covid_period.
    It contains no rows.
    """
    window_sizes = range(2, covid_period+1) if covid_period>1 else ()

    export_cols = [
        'variable_name',
        'num_of_spots_with_var_set_to_1_at_least_once_before_the_covid_period',
        '%_of_spots_with_var_set_to_1_every_1_month',
    ]
    export_cols.extend('%_of_spots_with_var_set_to_1_every_'+str(size)+'_months'
                       for size in window_sizes)

    empty_export = pd.DataFrame(columns = export_cols)

    # the frame has no rows, so this scalar assignment adds none; it is kept so
    # the column (and the index built from it) ends up exactly as before
    empty_export['variable_name'] = all_vars_new_names[var_]

    return empty_export.set_index('variable_name')

In [5]:
def get_the_export_first_approach(df_tmp, spots_of_interest, all_vars_new_names, var_, period_length, filename, first_approach_perc_threshold):
    """Compute, save and return the first-approach summary for one variable and period.

    For every look-back window w in 1..period_length, the export holds the
    percentage of spots of interest whose
    'num_of_months_with_<name>_last_<w>_month(s)' value equals their
    '<var_>_total_num_of_months' value. 'period_to_look_at' is the first window
    whose percentage reaches ``first_approach_perc_threshold``; if no window
    does, the longest one is used.

    Parameters
    ----------
    df_tmp : prepared frame with 'spot_id', ``var_ + '_total_num_of_months'``
        and the 'num_of_months_with_...' columns for every window.
    spots_of_interest : collection of spot ids to report on.
    all_vars_new_names : mapping from raw variable name to its snake_case name.
    var_ : raw variable name (a key of ``all_vars_new_names``).
    period_length : number of look-back windows (months) to evaluate.
    filename : name of the CSV file to write.
    first_approach_perc_threshold : minimum percentage a window must reach.

    Returns
    -------
    DataFrame
        The one-row export, which is also written to
        data/<date_dir>/covid_period_analysis/first_approach/<name>/<filename>.

    Notes
    -----
    Reads the notebook-level global ``date_dir`` for the output path, so the
    parameters cell must have run first; the target directory must exist.
    When ``spots_of_interest`` is empty the percentages are NaN (instead of a
    ZeroDivisionError) and the longest window is reported.
    """
    export = create_empty_export_first_approach(covid_period=period_length, all_vars_new_names=all_vars_new_names, var_=var_)

    var_new_name = all_vars_new_names[var_]
    total_col = var_+'_total_num_of_months'

    # (months-count column in df_tmp, percentage column in the export) per window;
    # window 1 uses the singular "_month" spelling
    window_cols = [('num_of_months_with_'+var_new_name+'_last_1_month',
                    '%_of_spots_with_var_set_to_1_every_1_month')]
    for i in range(2, period_length+1):
        window_cols.append(('num_of_months_with_'+var_new_name+'_last_'+str(i)+'_months',
                            '%_of_spots_with_var_set_to_1_every_'+str(i)+'_months'))

    num_of_months_cols = [months_col for months_col, _ in window_cols]
    df_of_interest = \
    df_tmp[df_tmp['spot_id'].isin(spots_of_interest)][['spot_id', total_col]+num_of_months_cols].drop_duplicates()

    num_of_spots = len(spots_of_interest)
    export.loc[var_new_name, 'num_of_spots_with_var_set_to_1_at_least_once_before_the_covid_period'] = num_of_spots

    for months_col, perc_col in window_cols:
        # spots whose count for this window equals their total number of months
        every_w_months = df_of_interest[df_of_interest[total_col]==df_of_interest[months_col]]
        if num_of_spots:
            perc = round(100*(len(every_w_months)/num_of_spots), 2)
        else:
            # no spots to report on: the share is undefined
            perc = float('nan')
        export.loc[var_new_name, perc_col] = perc

    cols_of_interest_first_approach = \
    [x for x in export.columns if '%_of_spots' in x]

    # default to the longest window, then take the shortest one reaching the threshold
    chosen_period_first_approach = cols_of_interest_first_approach[-1]
    for col in cols_of_interest_first_approach:
        if export[col].iloc[0]>=first_approach_perc_threshold:
            chosen_period_first_approach = col
            break

    chosen_period_first_approach = \
    'last_'+chosen_period_first_approach.split('every_')[1]
    # Exact match only: a substring test for '1' would also catch
    # 'last_10_months', 'last_11_months', 'last_12_months', ...
    if chosen_period_first_approach == 'last_1_month':
        chosen_period_first_approach = 'last_month'

    export['period_to_look_at'] = chosen_period_first_approach

    export.to_csv('data/'+date_dir+'/covid_period_analysis/first_approach/'+var_new_name+'/'+filename)

    return export

In [6]:
def create_empty_export_second_approach(covid_period, all_vars_new_names, var_):
    """Return the empty second-approach result table, indexed by 'variable_name'.

    The table has one spot-count column, the 'last_month' average-percentage
    column and one 'last_<i>_months' average-percentage column for each i in
    2..covid_period. It contains no rows.
    """
    window_sizes = range(2, covid_period+1) if covid_period>1 else ()

    export_cols = [
        'variable_name',
        'num_of_spots_with_var_set_to_1_at_least_once_before_the_covid_period',
        '%_months_with_var_set_to_1_last_month_make_of_all_months_avg',
    ]
    export_cols.extend('%_months_with_var_set_to_1_last_'+str(size)+'_months_make_of_all_months_avg'
                       for size in window_sizes)

    empty_export = pd.DataFrame(columns = export_cols)

    # the frame has no rows, so this scalar assignment adds none; it is kept so
    # the column (and the index built from it) ends up exactly as before
    empty_export['variable_name'] = all_vars_new_names[var_]

    return empty_export.set_index('variable_name')

In [7]:
def get_the_export_second_approach(df_tmp, spots_of_interest, all_vars_new_names, var_, period_length, filename, second_approach_perc_threshold):
    """Compute, save and return the second-approach summary for one variable and period.

    For every look-back window of 1..period_length months, each spot of
    interest gets the percentage its 'num_of_months_with_<name>_last_<w>_month(s)'
    value makes of its '<var_>_total_num_of_months' value (rounded to 2
    decimals); the export stores the mean of those percentages (rounded again).
    'period_to_look_at' is the first window whose mean reaches
    ``second_approach_perc_threshold``, or the longest window if none does.

    The export is written to
    data/<date_dir>/covid_period_analysis/second_approach/<name>/<filename>
    and returned. ``date_dir`` is read from the notebook's global scope.
    """
    export = \
    create_empty_export_second_approach(covid_period=period_length, all_vars_new_names=all_vars_new_names, var_=var_)

    new_name = all_vars_new_names[var_]
    total_col = var_+'_total_num_of_months'

    # (months-count column in df_tmp, percentage column in the export) per window;
    # window 1 has its own spelling in both names
    window_cols = [('num_of_months_with_'+new_name+'_last_1_month',
                    '%_months_with_var_set_to_1_last_month_make_of_all_months_avg')]
    window_cols += [('num_of_months_with_'+new_name+'_last_'+str(i)+'_months',
                     '%_months_with_var_set_to_1_last_'+str(i)+'_months_make_of_all_months_avg')
                    for i in range(2, period_length+1)]

    is_of_interest = df_tmp['spot_id'].isin(spots_of_interest)
    selected_cols = ['spot_id', total_col]+[months_col for months_col, _ in window_cols]
    df_of_interest = df_tmp[is_of_interest][selected_cols].drop_duplicates()

    export.loc[new_name, 'num_of_spots_with_var_set_to_1_at_least_once_before_the_covid_period'] = len(spots_of_interest)

    for months_col, perc_col in window_cols:
        per_spot = df_of_interest[['spot_id', months_col, total_col]].drop_duplicates()
        # per-spot share, rounded row by row before averaging
        per_spot_perc = per_spot.apply(lambda row: round(100*(row[months_col]/row[total_col]), 2),
                                       axis = 1)
        export.loc[new_name, perc_col] = round(per_spot_perc.mean(), 2)

    cols_of_interest_second_approach = [c for c in export.columns if '%_months_with' in c]

    # shortest window reaching the threshold, else the longest window
    chosen_col = next(
        (c for c in cols_of_interest_second_approach
         if export[c].iloc[0]>=second_approach_perc_threshold),
        cols_of_interest_second_approach[-1],
    )

    # e.g. '..._last_3_months_make_of_all_months_avg' -> 'last_3_months'
    window_label = chosen_col.split('last_')[1].split('_make')[0]
    export['period_to_look_at'] = 'last_'+window_label

    export.to_csv('data/'+date_dir+'/covid_period_analysis/second_approach/'+new_name+'/'+filename)

    return export

In [8]:
## read the report parameters ##
# NOTE(review): FullLoader is fine for a trusted local file; yaml.safe_load would be
# the safer choice if this file could ever come from an untrusted source.
with open(r'./parameters/started_doing_something_report_parameters.yaml') as file:
    model_params = yaml.load(file, Loader=yaml.FullLoader)

date_of_analysis = model_params['date_of_analysis']
# same date with '-' replaced by '_'; used below as a directory name in data paths
date_dir = date_of_analysis.replace('-', '_')

In [9]:
## available input files; only the first one is loaded in this cell ##
data_sets = ['data_tv_ALL_spots_with_CB_wo_151617.csv',\
             'data_tv_ALL_spots_wo_CB_wo_151617.csv',\
             'data_tv_CAN_CANCEL_spots_with_CB_wo_151617.csv',\
             'data_tv_CAN_CANCEL_spots_wo_CB_wo_151617.csv']

# file name without its '_wo_151617.csv' suffix (the suffix is appended again below).
# NOTE: `filename` is re-used (overwritten) by the analysis loop further down.
filename = data_sets[0].split('_wo_151617.csv')[0]

## ALL spots with CB: load the first data set from S3 ##
df = \
bucket.load_csv_from_s3(file_name='churn_analysis/data/'+date_dir+'/'+filename+'_wo_151617.csv')

In [10]:
## columns of df to analyse; the commented-out entries are currently excluded ##
vars_of_interest = ['private.parties.submissions.total',\
                        #'PP.Changed.inquiry.status.total',\
                       'reservations.submissions.total',\
                        #'R.Changed.inquiry.status.total'
                       ]

# alias of the same list object (not a copy); the analysis loop iterates over it
all_vars = vars_of_interest

In [11]:
## read covid period limits ##
# The analysis loop indexes this mapping by the snake_case variable name and then
# by 'start_<i>' / 'end_<i>' keys.
with open(r'./parameters/covid_period.yaml') as file:
    covid_period_limits = yaml.load(file, Loader=yaml.FullLoader)

## read % thresholds to use when choosing a period to look at ##
with open(r'./parameters/perc_thresholds.yaml') as file:
    perc_thresholds = yaml.load(file, Loader=yaml.FullLoader)
first_approach_perc_threshold = perc_thresholds['first_approach_perc_threshold']
second_approach_perc_threshold = perc_thresholds['second_approach_perc_threshold']
    
# raw variable name -> snake_case name; values start as None and are filled in by the analysis loop
all_vars_new_names = dict.fromkeys(all_vars)

In [12]:
## Main analysis: for every variable and each of its configured periods, ##
## build the first- and second-approach exports and write them as CSV.   ##
for var_ in all_vars:
    ## snake_case name used for derived columns, output directories and yaml keys ##
    var_new_name = var_
    if var_ == 'QR.code.flyer.scans.total':
        var_new_name = 'other_non_contactless_menu_qr_flyer_scans'
    elif var_ == 'Consumer.job.listings.inquiry.total':
        var_new_name = 'consumer_job_listing_inquiries'
    elif 'total' in var_:
        # drops the last 6 characters, i.e. a trailing '.total'
        var_new_name = var_[0:-6]

    var_new_name = var_new_name.replace('.', '_').lower()

    all_vars_new_names[var_] = var_new_name
    var_period_limits = covid_period_limits[var_new_name]

    ## length (in months, both ends included) of every configured period ##
    # each period contributes one 'start_<i>' and one 'end_<i>' key
    num_of_periods = len(var_period_limits)//2
    covid_period_lengths = []
    for i in range(1, num_of_periods+1):
        start_date = pd.to_datetime(var_period_limits['start_'+str(i)])
        end_date = pd.to_datetime(var_period_limits['end_'+str(i)])
        period_delta = relativedelta(end_date, start_date)
        # .months alone is only the 0-11 remainder; whole years must be added,
        # otherwise a period of 12 months or more is reported too short
        months_diff = 12*period_delta.years + period_delta.months + 1
        covid_period_lengths.append(months_diff)

    ## rolling sums of the variable for every window up to the longest period ##
    for w in range(1, max(covid_period_lengths)+1):
        # window 1 uses the singular "_month_sum" spelling
        sum_col_suffix = '_month_sum' if w == 1 else '_months_sum'
        sum_col_name = var_new_name+'_last_'+str(w)+sum_col_suffix
        # drop a column left over from a previous run of this cell
        if sum_col_name in df.columns:
            df.drop(sum_col_name, axis = 1, inplace = True)
        # computed exactly once per window (previously w == 1 was computed twice,
        # the second time under a stray '_last_1_months_sum' name)
        df = get_last_w_months_sum(df=df, col_name=var_, \
                                   last_w_months_avg_col_name=sum_col_name,\
                                   w = w)

    make_dir('data/'+date_dir+'/covid_period_analysis/first_approach/'+var_new_name+'/')
    make_dir('data/'+date_dir+'/covid_period_analysis/second_approach/'+var_new_name+'/')
    for i in range(0, len(covid_period_lengths)):
        filename = 'period_'+str(i+1)+'.csv'
        period_length = covid_period_lengths[i]
        start_date = pd.to_datetime(var_period_limits['start_'+str(i+1)])
        end_date = pd.to_datetime(var_period_limits['end_'+str(i+1)])

        print(filename)
        print(str(period_length))
        print(start_date)
        print(end_date)

        # dates are passed as 'YYYY-MM-DD' strings
        (spots_of_interest, df_tmp) = \
        prepare_data_for_calculating_stats(df=df, var_=var_, start_date=str(start_date)[0:10], end_date=str(end_date)[0:10])

        print(df_tmp['left_limit'].min())
        print(df_tmp['left_limit'].max())
        print(str(len(spots_of_interest)))

        ## per spot: number of months with a positive value, for every window ##
        for w in range(1, period_length+1):
            if w == 1:
                # window 1 counts on the raw variable itself
                col_name = var_
                num_of_months_col_name = 'num_of_months_with_'+var_new_name+'_last_'+str(w)+'_month'
            else:
                col_name = var_new_name + '_last_'+str(w)+'_months_sum'
                num_of_months_col_name = 'num_of_months_with_'+var_new_name+'_last_'+str(w)+'_months'

            df_tmp = get_num_of_months(df=df_tmp, col_name=col_name, num_of_months_col_name=num_of_months_col_name)

        export_first_approach = get_the_export_first_approach(df_tmp=df_tmp, spots_of_interest=spots_of_interest,\
                                           all_vars_new_names=all_vars_new_names, var_=var_,\
                                          period_length=period_length, filename=filename, first_approach_perc_threshold=\
                                                             first_approach_perc_threshold)

        export_second_approach = get_the_export_second_approach(df_tmp=df_tmp, spots_of_interest=spots_of_interest, \
                                                                all_vars_new_names=all_vars_new_names, var_=var_,\
                                                                period_length=period_length, filename=filename,\
                                                               second_approach_perc_threshold=second_approach_perc_threshold)

period_1.csv
12
2020-03-01 00:00:00
2021-02-01 00:00:00
2020-03-01
2021-02-01
391
period_1.csv
3
2020-03-01 00:00:00
2020-05-01 00:00:00
2020-03-01
2020-05-01
100
period_2.csv
1
2020-12-01 00:00:00
2020-12-01 00:00:00
2020-12-01
2020-12-01
217
