In [1]:
import pandas as pd
import numpy as np

#### Helpers ####
import os
import sys
sys.path.insert(0, os.path.abspath('../'))
from helpers.s3_bucket_utils import S3BucketUtils
from helpers import db_utils
from helpers import settings

# Client used further down to load the input CSV files from S3.
bucket = S3BucketUtils()
################

# The analysis date is typed in by the user. Surrounding whitespace (easy to
# introduce when pasting) is stripped because the value becomes a path
# component of both the S3 key and the local export directory.
date_of_analysis = input('Date of analysis: ').strip()
# Same date with '-' replaced by '_', the form used in those paths.
date_dir = date_of_analysis.replace('-', '_')

Date of analysis: 2021-12-01


In [2]:
# File names of the data sets summarised by this notebook. The part before
# '_wo_151617' is later used as the row label of each data set in `export`.
data_sets = [
    'data_tv_ALL_spots_with_CB_wo_151617.csv',
    'data_tv_ALL_spots_wo_CB_wo_151617.csv',
    'data_tv_CAN_CANCEL_spots_with_CB_wo_151617.csv',
    'data_tv_CAN_CANCEL_spots_wo_CB_wo_151617.csv',
]

In [3]:
def get_months_with_properly_using_pps_perc(spot):
    """Share of a spot's rows with submissions in which inquiries were properly used.

    Parameters
    ----------
    spot : pandas.DataFrame
        Rows of a single spot. Must contain the columns
        'had_private_parties_submissions_last_2_months' and
        'had_properly_used_private_parties_inquiries_last_2_months'.

    Returns
    -------
    float
        100 * (rows with the "properly used" flag == 1) /
        (rows with the "submissions" flag == 1), rounded to 2 decimals;
        NaN when no row has the "submissions" flag set.
    """
    submissions_flag = spot['had_private_parties_submissions_last_2_months'] == 1
    proper_use_flag = spot['had_properly_used_private_parties_inquiries_last_2_months'] == 1

    rows_with_submissions = int(submissions_flag.sum())
    # Nothing to divide by: the percentage is undefined for this spot.
    if rows_with_submissions <= 0:
        return np.nan

    rows_with_proper_use = int(proper_use_flag.sum())
    return round(100*(rows_with_proper_use/rows_with_submissions), 2)

In [4]:
# Names of the flag columns used by the summary loop below.

# Flags for private-party submissions.
had_inquiries_var = (
    'had_private_parties_submissions_last_2_months'
)
had_inquiries_and_stopped_var = (
    'had_private_parties_submissions_before_and_didnt_last_2_months'
)

# Flags for properly used private-party inquiries.
properly_used_inquiries_var = (
    'had_properly_used_private_parties_inquiries_last_2_months'
)
properly_used_inquiries_and_stopped_var = (
    'had_properly_used_private_parties_inquiries_before_and_didnt_last_2_months'
)

In [5]:
# One row per data set, labelled with the file name up to '_wo_151617'.
summary_rows = [name.split('_wo_151617')[0] for name in data_sets]

# One column per metric filled in by the summary loop.
summary_columns = [
    'had_pps_before',
    'had_pps_last_2_months',
    'had_pps_before_and_didnt_last_2_months',
    'had_pps_before_stopped_and_resumed',
    'had_properly_used_pps_before',
    'had_properly_used_pps_last_2_months',
    'had_properly_used_pps_before_and_didnt_last_2_months',
    'had_properly_used_pps_before_stopped_and_resumed',
    'had_pps_but_stopped_properly_using_them',
    'months_with_properly_using_pps_on_average_%',
]

# Empty (all-NaN) result table; cells are assigned one by one later on.
export = pd.DataFrame(index=summary_rows, columns=summary_columns)

In [6]:
def _percent(part, whole):
    """Return ``100 * part / whole`` rounded to 2 decimals, or NaN when ``whole`` is 0.

    Returning NaN (as get_months_with_properly_using_pps_perc does) keeps an
    empty cohort from aborting the whole export with ZeroDivisionError.
    """
    if whole == 0:
        return np.nan
    return round(100*(part/whole), 2)


def _attach_left_limit(frame, flag_col, agg, suffix):
    """Left-merge onto ``frame`` the per-spot ``agg`` ('min' or 'max') of
    ``left_limit`` over the rows where ``flag_col`` == 1.

    The new column is named ``flag_col + suffix``; spots without any flagged
    row get NaN in it.
    """
    per_spot = (
        frame[frame[flag_col] == 1]
        .groupby('spot_id')['left_limit']
        .agg(agg)
        .reset_index()
        .rename(columns={'left_limit': flag_col + suffix})
    )
    return frame.merge(per_spot, on='spot_id', how='left')


# Fill one row of `export` per data set. Most cells hold a tuple
# (number of spots, percentage relative to a reference cohort).
for filename in [x.split('_wo_151617')[0] for x in data_sets]:
    df = bucket.load_csv_from_s3(
        file_name='churn_analysis_based_on_behaviour/data/'+date_dir+'/'+filename+'_wo_151617.csv')

    # Per spot: the largest `left_limit` among rows with the "last 2 months"
    # flag set, and the smallest `left_limit` among rows with the
    # "before and didn't last 2 months" flag set.
    # NOTE(review): `left_limit` presumably marks the start of each row's
    # observation window - confirm against the data preparation step.
    df = _attach_left_limit(df, had_inquiries_var, 'max', '_last_month')
    df = _attach_left_limit(df, properly_used_inquiries_var, 'max', '_last_month')
    df = _attach_left_limit(df, had_inquiries_and_stopped_var, 'min', '_first_month')
    df = _attach_left_limit(df, properly_used_inquiries_and_stopped_var, 'min', '_first_month')

    # Distinct spots having at least one row with each flag set.
    had_pps_before = df[df['had_private_parties_submissions_before']>0]['spot_id'].unique()
    had_pps_last_2_months = df[df['had_private_parties_submissions_last_2_months']>0]['spot_id'].unique()
    had_pps_before_and_didnt_last_2_months = \
        df[df['had_private_parties_submissions_before_and_didnt_last_2_months']>0]['spot_id'].unique()

    had_properly_used_pps_before = \
        df[df['had_properly_used_private_parties_inquiries_before']>0]['spot_id'].unique()
    had_properly_used_pps_last_2_months = \
        df[df['had_properly_used_private_parties_inquiries_last_2_months']>0]['spot_id'].unique()
    had_properly_used_pps_before_and_didnt_last_2_months = \
        df[df['had_properly_used_private_parties_inquiries_before_and_didnt_last_2_months']>0]['spot_id'].unique()

    # Spots with a row where submissions are flagged while proper use is
    # flagged as stopped.
    had_pps_but_stopped_properly_using_them = \
        df[(df['had_properly_used_private_parties_inquiries_before_and_didnt_last_2_months']==1)&\
           (df['had_private_parties_submissions_last_2_months']==1)]['spot_id'].unique()

    # "Stopped and resumed": the latest flagged month is later than the first
    # month flagged as stopped. Comparisons against NaN are False, so spots
    # missing either value are not counted.
    had_properly_used_pps_before_stopped_and_resumed = \
        df[df[properly_used_inquiries_var+'_last_month'] >
           df[properly_used_inquiries_and_stopped_var+'_first_month']]['spot_id'].nunique()

    had_pps_before_stopped_and_resumed = \
        df[df[had_inquiries_var+'_last_month'] >
           df[had_inquiries_and_stopped_var+'_first_month']]['spot_id'].nunique()

    export.loc[filename, 'had_pps_before'] = len(had_pps_before)
    export.loc[filename, 'had_pps_last_2_months'] = len(had_pps_last_2_months)
    export.loc[filename, 'had_pps_before_and_didnt_last_2_months'] = \
        (len(had_pps_before_and_didnt_last_2_months),
         _percent(len(had_pps_before_and_didnt_last_2_months), len(had_pps_before)))

    export.loc[filename, 'had_properly_used_pps_before'] = \
        (len(had_properly_used_pps_before),
         _percent(len(had_properly_used_pps_before), len(had_pps_before)))
    export.loc[filename, 'had_properly_used_pps_last_2_months'] = \
        (len(had_properly_used_pps_last_2_months),
         _percent(len(had_properly_used_pps_last_2_months), len(had_pps_last_2_months)))
    export.loc[filename, 'had_properly_used_pps_before_and_didnt_last_2_months'] = \
        (len(had_properly_used_pps_before_and_didnt_last_2_months),
         _percent(len(had_properly_used_pps_before_and_didnt_last_2_months),
                  len(had_properly_used_pps_before)))

    export.loc[filename, 'had_pps_but_stopped_properly_using_them'] = \
        (len(had_pps_but_stopped_properly_using_them),
         _percent(len(had_pps_but_stopped_properly_using_them),
                  len(had_properly_used_pps_before_and_didnt_last_2_months)))

    export.loc[filename, 'had_pps_before_stopped_and_resumed'] = \
        (had_pps_before_stopped_and_resumed,
         _percent(had_pps_before_stopped_and_resumed,
                  len(had_pps_before_and_didnt_last_2_months)))

    export.loc[filename, 'had_properly_used_pps_before_stopped_and_resumed'] = \
        (had_properly_used_pps_before_stopped_and_resumed,
         _percent(had_properly_used_pps_before_stopped_and_resumed,
                  len(had_properly_used_pps_before_and_didnt_last_2_months)))

    # One percentage per spot, indexed by spot_id.
    per_spot_perc = df.groupby('spot_id')[[had_inquiries_var, properly_used_inquiries_var]]\
        .apply(get_months_with_properly_using_pps_perc)

    # Map by spot_id rather than assigning the Series directly: direct
    # assignment aligns on df's row index, not on spot_id, and would attach
    # the values to unrelated rows.
    df['months_with_properly_using_pps_perc'] = df['spot_id'].map(per_spot_perc)

    # Average over spots with a defined percentage (mean() skips NaN).
    export.loc[filename, 'months_with_properly_using_pps_on_average_%'] = \
        round(per_spot_perc.mean(), 2)

In [7]:
# Write the summary with metrics as rows and data sets as columns.
# The target directory is created first so that a run for a new analysis
# date does not fail because the folder does not exist yet.
export_dir = 'data/'+date_dir+'/exports'
os.makedirs(export_dir, exist_ok=True)
export.transpose().to_csv(export_dir+'/had_PPs_vs_properly_used_PPs.csv')