In [134]:
import pandas as pd
import numpy as np

# Mount Google Drive so the notebook can read and write the project files
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [135]:
# Read the cleaned complaint workbook from the mounted Drive folder
path = '/content/drive/My Drive/Colab Notebooks/Insurance_Investigation/cleaned_insurance_complaint_data.xlsx'
insurance = pd.read_excel(io=path)

In [136]:
# Keep only complaints filed against BCBS that were received on or after
# 1 May 2020 (i.e. drop other companies and the pre-COVID period).
# The cutoff is written as an ISO-8601 date so it cannot be misread as
# day/month, and the column is coerced with pd.to_datetime so the comparison
# is always chronological rather than string-based.
is_bcbs = insurance['Complaint_filed_against'] == 'BCBS'
is_covid_era = pd.to_datetime(insurance['Received_date']) >= pd.to_datetime('2020-05-01')
# .copy() yields an independent frame, so later column assignments on
# `insurance` cannot raise SettingWithCopyWarning.
insurance = insurance.loc[is_bcbs & is_covid_era].copy()

In [None]:
# save reduced table for tableau
ready = insurance

ready.to_excel('tableau_insurance_complaint_data.xlsx')
!cp tableau_insurance_complaint_data.xlsx "/content/drive/My Drive/Colab Notebooks/Insurance_Investigation/"

In [None]:
# Print a summary (column dtypes and non-null counts) of the filtered frame
insurance.info()

In [None]:
from itertools import count  # NOTE(review): not used in this cell; kept in case another cell relies on it
# Time Graph table: number of complaints received per day, split by
# confirmation status. Counting the non-null 'Main_Reason' values gives the
# number of complaints in each (date, Confirmed_complaint) bucket.
sa = insurance[['Received_date', 'Confirmed_complaint', 'Main_Reason']]
sa = pd.pivot_table(sa, index='Received_date', columns='Confirmed_complaint', values='Main_Reason', aggfunc='count')

# Add an all-zero row for every calendar day in the reporting window that had
# no complaints at all.
# NOTE(review): any date outside this hard-coded window is dropped by the reindex.
idx = pd.date_range('2020-05-01', '2023-03-29')
sa = sa.reindex(idx, fill_value=0)
# The dates stay in the index: a bare `sa.reset_index()` returns a new frame
# and leaves `sa` untouched, so that discarded call has been removed.

# Days with complaints of only one status still hold NaN for the other status;
# zero those, then add the per-day total across all statuses.
sa = sa.fillna(0)
sa['Total'] = sa.sum(axis=1)

In [None]:
# Save Time graph in drive
sa.to_excel('timegraph_insurance_complaint_data.xlsx')
!cp timegraph_insurance_complaint_data.xlsx "/content/drive/My Drive/Colab Notebooks/Insurance_Investigation/"

In [None]:
## Survival Analysis with Claim Length
# For every claim length L, compute the fraction of confirmed ('Yes') and of
# unconfirmed ('No') complaints whose claim length is at least L. The result
# is a long table with columns Claim_Length / Confirmed / Perc.
surv = insurance[['Claim_Length', 'Confirmed_complaint', 'Main_Reason']]
surv = pd.pivot_table(surv, index='Claim_Length', columns='Confirmed_complaint', values='Main_Reason', aggfunc='count')
# int() so range() below also works when the column was read as a float dtype
m = int(insurance['Claim_Length'].max())

# Fill in missing claim lengths so every value from 0 to the maximum has a row
nums = list(range(0, m + 1))
surv = surv.reindex(nums, fill_value=0)
surv = surv.reset_index()

# Guarantee both outcome columns exist even if one never occurs in the data,
# otherwise the lookups below would raise KeyError
for outcome in ('Yes', 'No'):
    if outcome not in surv.columns:
        surv[outcome] = 0

# Get running totals
surv = surv.fillna(0)
perc_y = pd.DataFrame(surv['Claim_Length'])
perc_y['Confirmed'] = 'Y'
perc_x = pd.DataFrame(surv['Claim_Length'])
perc_x['Confirmed'] = 'N'
# Reverse cumulative sum: row L holds the count of complaints with length >= L
surv['Total Yes'] = surv.loc[::-1, 'Yes'].cumsum()[::-1]
surv['Total No'] = surv.loc[::-1, 'No'].cumsum()[::-1]
# Row 0 holds the count of all complaints of that outcome, so dividing by it
# turns the running totals into surviving fractions
perc_y['Perc'] = surv['Total Yes'] / surv['Total Yes'].iloc[0]
perc_x['Perc'] = surv['Total No'] / surv['Total No'].iloc[0]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
# outcome tables with the same result
perc_y = pd.concat([perc_y, perc_x], ignore_index=True)
perc_y

In [158]:
# Save Survival Analysis with Claim Length in drive
perc_y.to_excel('survival_insurance_complaint_data.xlsx')
!cp survival_insurance_complaint_data.xlsx "/content/drive/My Drive/Colab Notebooks/Insurance_Investigation/"

In [None]:
## Survival Analysis with Number of Stages
# For every stage count S, compute the fraction of confirmed ('Yes') and of
# unconfirmed ('No') complaints that went through at least S stages. The
# result is a long table with columns Num_Stages / Confirmed / Perc.
survs = insurance[['Num_Stages', 'Confirmed_complaint', 'Main_Reason']]
survs = pd.pivot_table(survs, index='Num_Stages', columns='Confirmed_complaint', values='Main_Reason', aggfunc='count')
# int() so range() below also works when the column was read as a float dtype
mm = int(insurance['Num_Stages'].max())

# Fill in missing stage counts so every value from 0 to the maximum has a row
numss = list(range(0, mm + 1))
survs = survs.reindex(numss, fill_value=0)
survs = survs.reset_index()

# Guarantee both outcome columns exist even if one never occurs in the data,
# otherwise the lookups below would raise KeyError
for outcome in ('Yes', 'No'):
    if outcome not in survs.columns:
        survs[outcome] = 0

# Get running totals
survs = survs.fillna(0)
p_y = pd.DataFrame(survs['Num_Stages'])
p_y['Confirmed'] = 'Y'
p_x = pd.DataFrame(survs['Num_Stages'])
p_x['Confirmed'] = 'N'
# Reverse cumulative sum: row S holds the count of complaints with >= S stages
survs['Total Yes'] = survs.loc[::-1, 'Yes'].cumsum()[::-1]
survs['Total No'] = survs.loc[::-1, 'No'].cumsum()[::-1]
# Row 0 holds the count of all complaints of that outcome, so dividing by it
# turns the running totals into surviving fractions
p_y['Perc'] = survs['Total Yes'] / survs['Total Yes'].iloc[0]
p_x['Perc'] = survs['Total No'] / survs['Total No'].iloc[0]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
# outcome tables with the same result
p_y = pd.concat([p_y, p_x], ignore_index=True)
p_y

In [161]:
# Save Survival Analysis with Number of Stages in drive
p_y.to_excel('survival_stag_insurance_complaint_data.xlsx')
!cp survival_stag_insurance_complaint_data.xlsx "/content/drive/My Drive/Colab Notebooks/Insurance_Investigation/"

In [None]:
## Share of confirmed vs. unconfirmed complaints for the most common main reasons

# The seven main reasons with the highest complaint counts
tops = insurance['Main_Reason'].value_counts().nlargest(7)
names = tops.index

# Restrict to complaints citing one of those reasons, keeping only the
# columns the pivot below needs
perc = insurance.loc[
    insurance['Main_Reason'].isin(names),
    ['Main_Reason', 'Confirmed_complaint', 'Complaint_filed_against'],
]

# Count complaints per (reason, confirmation status); combinations that never
# occur become 0 instead of NaN
p = perc.pivot_table(
    index='Main_Reason',
    columns='Confirmed_complaint',
    values='Complaint_filed_against',
    aggfunc='count',
).fillna(0)

# Row total, then the confirmed share and its complement
p['Total'] = p.sum(axis=1)
p['Perc_Yes'] = p['Yes'].div(p['Total'])
p['Perc_No'] = 1 - p['Perc_Yes']

# Display only: the result is not assigned, so `p` keeps Main_Reason as its index
p.reset_index()

In [None]:
# Save Percent in drive
p.to_excel('perc_insurance_complaint_data.xlsx')
!cp perc_insurance_complaint_data.xlsx "/content/drive/My Drive/Colab Notebooks/Insurance_Investigation/"