# Insurance Complaints Data Cleaning

The data below is cleaned so that it can be used to build useful visualizations and/or models for Blue Cross and Blue Shield of Texas (BCBSTX). The cleaned data will be used to explore claim trends and, possibly, to predict the success of a complaint.

In [None]:
import pandas as pd
import numpy as np

# Show all values when printing NumPy arrays: raising the summarization
# threshold to sys.maxsize stops long arrays from being abbreviated with "...".
import sys
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Set up the connection to Google Drive (Colab only).
# The drive is mounted at /content/drive, which is where the raw CSV is read from.
from google.colab import drive
drive.mount('/content/drive')

In [63]:
# Load the raw complaints export from the mounted Drive into a DataFrame.
# `insurance` is the working frame that every following cell reads and reassigns.
path = '/content/drive/My Drive/Colab Notebooks/Insurance_Investigation/Insurance_complaints__All_data_raw.csv'
insurance = pd.read_csv(path)

In [None]:
# Dimensions of the raw data as (rows, columns).
insurance.shape

# Large file: it will need to be trimmed down before analysis.

In [None]:
# Preview the first rows to see what the columns and values look like.
insurance.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
insurance.info()

# Findings / to-do:
# - Change the data types of the date columns.
# - Reason complaint filed has some null values, along with complaint type and how resolved.
# - Get rid of spaces in the column names to make it easier to call on values.

In [64]:
# Replace every space in the column names with an underscore so that
# columns are easier to reference in the cells below.
insurance.columns = insurance.columns.str.replace(' ', '_', regex=False)

In [65]:
# List the distinct coverage types before filtering. The value is printed
# explicitly because a notebook only displays a cell's *last* expression,
# so a bare `.unique()` in the middle of the cell would be silently discarded.
print(insurance['Coverage_type'].unique())

# Shrink the data set to only include Accident and Health coverage.
# .copy() makes the result an independent DataFrame, so the column
# assignments in later cells modify it directly instead of a slice of the raw
# data (which is what triggers pandas' SettingWithCopyWarning).
insurance = insurance.loc[insurance['Coverage_type'] == 'Accident and Health'].copy()

In [None]:
# Investigate Complaint_filed_against: print every distinct company name in
# sorted order so that near-duplicate spellings end up next to each other.
company_names = insurance['Complaint_filed_against'].unique()
sorted_company_names = np.sort(company_names)
print(sorted_company_names)

In [66]:
# Normalise the company names and keep only BCBSTX and its competitors.
#
# Many names are listed twice, once with capitalised first letters and once in
# all caps, so names are title-cased before matching. Values that already are
# one of the short labels below are left untouched: title-casing would turn
# 'BCBS' into 'Bcbs', which matches nothing in the keep-list, so re-running
# this cell would otherwise silently drop every BCBS row.

# Keyword found in a company name (case-insensitive) -> short label it is renamed to.
# The health insurance companies end up as BCBS, United Health, Aetna, Cigna and Humana.
brand_labels = {
    'United': 'United Health',
    'Blue': 'BCBS',
    'Cigna': 'Cigna',
    'Humana': 'Humana',
    'Aetna': 'Aetna',
}

# Names (after title-casing) that are kept.
# Groups that are obviously not based in Texas are omitted.
companies_to_keep = [
    'BCBS',
    'Humana',
    'Aetna',
    'United Health',
    'Cigna',
    'Blue Cross And Blue Shield Of Texas, Inc.',
    'Blue Cross And Blue Shield Of Texas, A Division Of Health Care Service Corporation',
    'Blue Cross And Blue Shield Of Texas',
    'Blue Cross Medicare Advantage',
    'Humana Claims',
    'Humana Health Plan Of Texas',
    'Humana Health Plan Of Texas, Inc',
    'Humana Health Plan Of Texas, Inc.',
    'Humana Health Plan, Inc.',
    'Humana Insurance Company',
    'Aetna Better Health Of Texas Inc.',
    'Aetna Health Inc.',
    'Aetna Health Insurance Company',
    'Aetna Health And Life Insurance Company',
    'Cigna Healthcare Of Texas, Inc.',
    'Cigna Healthcare Of Texas,Inc.',
    'Cigna Health & Life Insurance Company',
    'Cigna Health And Life Insurance Company',
    'Cigna Health Management, Inc.',
    'United Healthcare Insurance Company',
    'United Healthcare Of Texas, Inc.',
    'United Healthcare Services',
    'United Healthcare Services Inc',
    'United Healthcare Services, Inc',
    'United Healthcare Services, Inc.',
    'Unitedhealthcare Benefits Of Texas, Inc.',
    'Unitedhealthcare Community Plan Of Texas, L.L.C.',
    'Unitedhealthcare Community Plan Of Texas, Llc',
    'Unitedhealthcare Insurance Company',
    'Unitedhealthcare Insurance Company Of America',
    'Unitedhealthcare Of Texas, Inc.',
    'Unitedhealthcare Services',
    'Unitedhealthcare Services, Inc.',
]

# Title-case everything except values that already are a short label.
raw_names = insurance['Complaint_filed_against']
already_normalised = raw_names.isin(list(brand_labels.values()))
clean_names = raw_names.where(already_normalised, raw_names.str.title())

# assign() and .copy() both return new frames, so nothing below writes into a
# slice of an earlier DataFrame (avoids pandas' SettingWithCopyWarning).
insurance = insurance.assign(Complaint_filed_against=clean_names)
insurance = insurance[insurance['Complaint_filed_against'].isin(companies_to_keep)].copy()

# Collapse every spelling variant into its short label.
for keyword, label in brand_labels.items():
    is_brand = insurance['Complaint_filed_against'].str.contains(keyword, case=False)
    insurance.loc[is_brand, 'Complaint_filed_against'] = label

In [None]:
# Re-check dtypes and non-null counts after filtering.
# Reduced dataset to 60455 entries
insurance.info()

In [67]:
# Parse both date columns into pandas datetime values so that date
# arithmetic and the .dt accessor work in the cells below.
for date_column in ('Received_date', 'Closed_date'):
    insurance[date_column] = pd.to_datetime(insurance[date_column])

In [68]:
# Drop rows with a null in how resolved, complaint type or reason complaint
# filed. Dropping is acceptable because there are only a few such rows.
required_columns = ['How_resolved', 'Complaint_type', 'Reason_complaint_filed']
insurance = insurance.dropna(subset=required_columns)

In [69]:
# Break the reasons down into a broad category and a sub category.
# Reason_complaint_filed is split on ';' into at most three pieces (n=2);
# only the first two pieces are used.
spl = insurance['Reason_complaint_filed'].str.split(pat=';', n=2, expand=True)

# Guarantee that columns 0 and 1 exist. If no reason contains a ';' (or the
# frame is empty) the split yields fewer columns and indexing them directly
# would raise a KeyError; reindex() fills the missing ones with NaN instead.
reason_parts = spl.reindex(columns=[0, 1])

insurance['Main_Reason'] = reason_parts[0]

# Make a missing sub category explicit as the string 'None'.
insurance['Sub_Reason'] = reason_parts[1].fillna('None')

# Count how many stages the reason for the complaint has
# (number of ';' separators plus one).
insurance['Num_Stages'] = insurance['Reason_complaint_filed'].str.count(';') + 1

In [70]:
# Summarize the number of 'Others': the count of ';' separators in
# Others_involved plus one, or 0 when Others_involved is null.
separator_count = insurance['Others_involved'].str.count(';')
insurance['Num_Others'] = separator_count.add(1).fillna(0)

In [71]:
received = insurance['Received_date']
closed = insurance['Closed_date']

# Month number of the date the complaint was received
insurance['Month'] = received.dt.month

# Whole days elapsed between Received_date and Closed_date
insurance['Claim_Length'] = closed.sub(received).dt.days

In [None]:
# Check dtypes and non-null counts after adding the derived columns.
insurance.info()

In [72]:
# Remove the columns that are no longer going to be used. They are either
# identifiers for the company or contain strings that are not necessary for
# the analysis at hand.
unused_columns = [
    'Keywords',
    'Complainant_type',
    'Respondent_Role',
    'Respondent_ID',
    'Others_involved',
    'Coverage_type',
    'How_resolved',
    'Complaint_number',
]
insurance = insurance.drop(columns=unused_columns)

In [None]:
# Check the data: remaining columns, dtypes and non-null counts after the drop.
insurance.info()

In [None]:
# Check if amounts look reasonable.
# include='all' summarizes the non-numeric columns as well as the numeric ones.
insurance.describe(include='all')

In [None]:
# Export the cleaned data as an .xlsx workbook for Tableau.
# The file is written with a relative path, i.e. into the current working directory.
output_file = 'cleaned_insurance_complaint_data.xlsx'
insurance.to_excel(output_file)

In [79]:
# Shell command (notebook `!` syntax): copy the exported workbook into the
# project folder on the mounted Google Drive.
!cp cleaned_insurance_complaint_data.xlsx "/content/drive/My Drive/Colab Notebooks/Insurance_Investigation/"