In [None]:
# import required packages/libraries
import pandas as pd
import numpy as np
# Load the claim report CSV (path is relative to the notebook's working directory).
claim_report_path = "../Claim_Report_Sept2018.csv"
df = pd.read_csv(claim_report_path)

In [None]:
# Preview the first five rows with the raw column names as read from the CSV.
df.head(n=5)

### First, we need to rename the columns to get a better understanding of the attributes

In [None]:
# Readable labels for the terse source headers.
# Keys must match the CSV column names exactly; keys that are not present are ignored by rename().
readable_column_names = {
    'claimid': 'Claim ID',
    'clm mspinsurancetypecode': 'Claim MSPQ Insurance Type Code',
    'clm mspinsurancetypename': 'Claim MSPQ Insurance Type Name',
    'srvday': 'Claim Service Day',
    'srvmnth': 'Claim Service Month',
    'patient status': 'Patient Claim Status',
    'prmry clm dgnss cd st': 'Primary Claim Diagnosis Code Set',
    'primary status': 'Primary Claim Status',
    'prmry clm typ': 'Primary Claim Type',
    'pri clm sbmssn cnt': 'Primary Claim Submission Count',
    'scndry clm dgnss cd st': 'Secondary Claim Diagnosis Code Set',
    'secondary status': 'Secondary Claim Status',
    'scndry clm typ': 'Secondary Claim Type',
    'sec clm sbmssn cnt': 'Secondary Claim Submission Count',
    'patient age': 'Patient Age',
    'ptnt age mnths': 'Patient Age (Months)',
    'patientdob': 'Patient Date of Birth',
    'patientid': 'Patient ID',
    'patient name': 'Patient Name',
    'race': 'Patient Race',
    'pat sex orientation': 'Patient Sexual Orientation',
    'patient state': 'Patient State',
    'patient city': 'Patient City',
    'status': 'Patient Status',
}

# Rebind df to the renamed frame.
df = df.rename(columns=readable_column_names)

In [None]:
# Preview the first five rows again to confirm the new column labels.
df.head(n=5)

### Now it looks better. Let's go ahead and see the data types and the total number of records

In [None]:
# Print the row count plus each column's dtype and non-null count.
df.info()

#### So the total number of records is approx 11112 and the datatypes are float. Let's explore further and see how many missing values are present in each column

In [None]:
# Count the missing values in every column.
df.isna().sum()

#### From above we can see that every column except Claim ID has missing values:
    - Claim MSPQ Insurance Type Code        11112[No Records]
    - Claim MSPQ Insurance Type Name        11112[No Records]
    - Claim Service Day                         1[Only one record missing]
    - Claim Service Month                       1[Only one record missing]
    - Patient Claim Status                      1[Only one record missing]
    - Primary Claim Diagnosis Code Set         75[75 records missing]
    - Primary Claim Status                      1[Only one record missing]
    - Primary Claim Type                       75[75 records missing]
    - Primary Claim Submission Count            1[Only one record missing]
    - Secondary Claim Diagnosis Code Set     9808[9808 records missing]
    - Secondary Claim Status                    1[Only one record missing]
    - Secondary Claim Type                   9808[9808 records missing]
    - Secondary Claim Submission Count          1[Only one record missing]
    - Patient Age                               1[Only one record missing]
    - Patient Age (Months)                      1[Only one record missing]
    - Patient Date of Birth                     1[Only one record missing]
    - Patient ID                                1[Only one record missing]
    - Patient Name                              1[Only one record missing]
    - Patient Race                            411[411 records missing]
    - Patient Sexual Orientation            11112[11112 records missing]
    - Patient State                             1[Only one record missing]
    - Patient City                              1[Only one record missing]
    - Patient Status                            1[Only one record missing]


So there is one claim that has no data at all, since many columns have exactly 1 record missing. Let's figure out which record it is and delete that record from the data

In [None]:
# Only the end of the frame is of interest here (looking for the empty record), so show the
# last rows instead of dumping the whole table, which also contains patient names and
# dates of birth.
df.tail()

### We can see that the last row has no data. Let's delete that row

In [None]:
# Remove records that carry no data apart from the claim identifier (the empty trailing row).
# Filtering on content rather than slicing off "the last row" keeps this cell idempotent:
# running it a second time does not silently delete a valid claim.
payload_columns = df.columns.difference(['Claim ID']).tolist()
df = df.dropna(how='all', subset=payload_columns)

In [None]:
# Confirm the empty trailing record is gone; the last few rows are enough for that,
# and avoid printing the whole table of patient-level data.
df.tail()

#### So now we don't have any wrong record in the dataframe

#### Lets take count of missing values again

In [None]:
# Recount the missing values per column after removing the empty record.
df.isna().sum()

#### Now we have accurate column missing value count

## Handle Missing Data

#### 1. As we can see, the columns Claim MSPQ Insurance Type Code, Claim MSPQ Insurance Type Name and Patient Sexual Orientation have no values at all. So there is no need for these columns and we can drop them


In [None]:
# Columns selected for removal; the preceding missing-value counts show them as entirely empty.
empty_columns = [
    'Claim MSPQ Insurance Type Code',
    'Claim MSPQ Insurance Type Name',
    'Patient Sexual Orientation',
]
df = df.drop(columns=empty_columns)

In [None]:
# A short preview is enough to confirm the three columns are gone; avoid rendering the
# whole table of patient-level records.
df.head()

In [None]:
# Missing values per column for the remaining columns.
df.isna().sum()

#### Now, to proceed further, let's draw a heatmap to find the correlation between the different variables [columns]

In [None]:
# import some more packages/libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Large canvas so that every column label of the correlation matrix stays readable;
# adjust figsize if the dataset has more or fewer columns.
plt.figure(figsize=(20, 10))

# Correlation is only defined for numeric data. Select the numeric columns explicitly so
# that non-numeric columns cannot make corr() raise on recent pandas versions.
numeric_columns = df.select_dtypes(include='number')

# Plot a heatmap of the pairwise correlation matrix of the numeric columns.
sns.heatmap(numeric_columns.corr(), cmap='magma')
plt.show()

## From above we can see that:
### - Primary Claim Submission Count has a high correlation with Secondary Claim Submission Count
### - Patient Age and Patient ID have a high correlation with Secondary Claim Submission Count
### - Patient Age has a slightly lower correlation with Primary Claim Submission Count than with Secondary Claim Submission Count

#### Now lets see percentage distribution for remaining missing values in columns, so that we can replace those with correct % values
- Secondary Claim Diagnosis Code Set    9807
- Secondary Claim Type                  9807
- Primary Claim Diagnosis Code Set        74
- Primary Claim Type                      74
- Patient Race                           410

In [None]:
# Percentage share of each value among the non-missing secondary diagnosis code sets.
secondary_code_set = df['Secondary Claim Diagnosis Code Set']
secondary_code_set.value_counts(normalize=True).mul(100)

In [None]:
# Percentage share of each value among the non-missing primary diagnosis code sets.
# Uses value_counts(normalize=True) like the neighbouring cells; it divides by the number of
# non-missing entries, which is what the manual value_counts() / count() computed.
df['Primary Claim Diagnosis Code Set'].value_counts(normalize=True) * 100

In [None]:
# Percentage share of each value among the non-missing secondary claim types.
secondary_claim_type = df['Secondary Claim Type']
secondary_claim_type.value_counts(normalize=True).mul(100)

In [None]:
# Percentage share of each value among the non-missing primary claim types.
primary_claim_type = df['Primary Claim Type']
primary_claim_type.value_counts(normalize=True).mul(100)

So we can replace all missing Primary and Secondary Claim Diagnosis Code Set values with "ICD -10", and all missing Primary and Secondary Claim Type values with "Professional"

In [None]:
# Fill the missing secondary diagnosis code sets with 'ICD -10'.
# NOTE(review): most values in this column are missing; a blank may mean "no secondary
# claim" rather than an unknown code set — confirm that imputing is appropriate.
target_column = 'Secondary Claim Diagnosis Code Set'
df[target_column] = df[target_column].fillna('ICD -10')

In [None]:
# Fill the missing primary diagnosis code sets with 'ICD -10'.
target_column = 'Primary Claim Diagnosis Code Set'
df[target_column] = df[target_column].fillna('ICD -10')

In [None]:
# Fill the missing secondary claim types with 'Professional'.
# NOTE(review): most values in this column are missing — confirm that a blank does not
# simply mean that no secondary claim exists.
target_column = 'Secondary Claim Type'
df[target_column] = df[target_column].fillna('Professional')

In [None]:
# Fill the missing primary claim types with 'Professional'.
target_column = 'Primary Claim Type'
df[target_column] = df[target_column].fillna('Professional')

#### Let's take a null count again

In [None]:
# Missing values per column after filling the claim code-set and claim-type columns.
df.isna().sum()

#### Now we are only left with Patient Race. Let's see the distribution of patient race values

In [None]:
# Percentage share of each race among the records where race is recorded.
patient_race = df['Patient Race']
patient_race.value_counts(normalize=True).mul(100)


### From above we can see race types appearing closer to 0.01% values
    - Middle Eastern or North African     0.009345
    - Italian                             0.009345
    - Haitian                             0.009345
    - Polish                              0.009345
    - Arab                                0.009345

#### We will only consider below values , if we decide to replace missing.  
    - White                              62.424073
    - Patient Declined                   21.427904
    - Other Race                          5.457434
    - Asian                               2.943650
    - Black or African American           2.616578
    - Black                               2.504439
    - Asian Indian                        1.597981
    - African American                    0.663489
    - Dominican                           0.317727



In [None]:
# Apply the font scale BEFORE drawing. sns.set() changes matplotlib's rcParams, which are
# picked up by plots created afterwards; calling it after countplot() meant the text size
# of this figure depended on whether some other cell had already run.
sns.set(font_scale=3)

fig = plt.figure(figsize=(40, 20))
sns.countplot(data=df, y='Patient Race').set_title("Patient Race Count")

# Write the file before show() so the saved image does not depend on how the backend
# treats a figure once it has been displayed.
fig.savefig("Patient_Race_Count.png")
plt.show()

## Analysis 1: Lets see Patient Age distribution for claim type

In [None]:
# Percentage share of each distinct patient age.
patient_age = df['Patient Age']
patient_age.value_counts(normalize=True).mul(100)

In [None]:
# Quick histogram of patient age using pandas' default binning.
patient_age = df['Patient Age']
patient_age.hist()

In [None]:
# Histogram of patient age in 20-year buckets from 0 to 120, with ticks on the bucket edges.
age_bin_edges = [0, 20, 40, 60, 80, 100, 120]

age_fig, age_ax = plt.subplots()
age_ax.hist(df['Patient Age'].values, bins=age_bin_edges)
age_ax.set_xlabel("Age range")
age_ax.set_ylabel("Count")
age_ax.set_xticks(age_bin_edges)
plt.show()

## Analysis 2: Patient Age distribution as per State


In [None]:
# Apply the font scale BEFORE drawing so that it takes effect for this figure regardless
# of which cells ran earlier (sns.set() only influences plots created after it).
sns.set(font_scale=3)

fig = plt.figure(figsize=(40, 20))
# One box per state showing the spread of patient ages.
sns.boxplot(x="Patient State", y="Patient Age", data=df, palette="coolwarm")
plt.show()

## Analysis 3: Patient race distribution as per Age

In [None]:
# Apply the font scale BEFORE drawing so that it takes effect for this figure regardless
# of which cells ran earlier (sns.set() only influences plots created after it).
sns.set(font_scale=3)

fig = plt.figure(figsize=(70, 20))
# One violin per race showing the distribution of patient ages.
sns.violinplot(x="Patient Race", y="Patient Age", data=df, palette='rainbow')
plt.show()

## Analysis 4: Patient race distribution as per State

In [None]:
import plotly as py
import plotly.graph_objs as go

# A choropleth colours each location by ONE numeric value. The raw 'Patient Race' column is
# text with one entry per claim, so it cannot be passed as z. Aggregate per state instead:
# the number of claims drives the colour and the most common race is shown on hover.
race_by_state = df.groupby('Patient State')['Patient Race']
claims_per_state = race_by_state.size()


def _most_common_race(races):
    """Return the most frequent non-missing value in `races`, or 'Unknown' if there is none."""
    modes = races.mode()
    return modes.iloc[0] if len(modes) else 'Unknown'


# Align on the same state order as the counts so locations, z and text line up.
top_race_per_state = race_by_state.agg(_most_common_race).reindex(claims_per_state.index)

data = dict(
    type='choropleth',
    # assumes 'Patient State' holds two-letter state abbreviations, as required by
    # locationmode='USA-states' — TODO confirm against the data
    locations=claims_per_state.index.tolist(),
    locationmode='USA-states',
    colorscale=[[0, 'green'], [0.5, 'red'], [1.0, 'rgb(0, 0, 255)']],
    z=claims_per_state.tolist(),
    text=['Most common race: {}'.format(race) for race in top_race_per_state],
    colorbar=dict(title='Claims'),
)

lyt = dict(geo=dict(scope='usa'))
# Named race_map rather than `map` so the built-in map() is not shadowed in the kernel.
race_map = go.Figure(data=[data], layout=lyt)
py.offline.plot(race_map)

## Analysis 5: Patient Claim Status Distribution

In [None]:
# Apply the font scale BEFORE drawing so that it takes effect for this figure regardless
# of which cells ran earlier (sns.set() only influences plots created after it).
sns.set(font_scale=3)

fig = plt.figure(figsize=(40, 20))
sns.countplot(data=df, x='Patient Claim Status').set_title("Patient Count as per Claim Status ")

# Write the file before show() so the saved image does not depend on how the backend
# treats a figure once it has been displayed.
fig.savefig("Patient_Claim_Status_Count.png")
plt.show()

In [None]:
# Pie chart of the share of each claim status, with the percentage printed on each wedge.
claim_status_counts = df['Patient Claim Status'].value_counts()
claim_status_counts.plot(kind='pie', autopct='%1.1f%%', textprops={'fontsize': 5})

In [None]:
from matplotlib import font_manager as fm

# Compute the counts once so that the wedges and the legend labels come from the same
# Series and are therefore guaranteed to be in the same order.
status_counts = df['Patient Claim Status'].value_counts()

fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
# Without autopct, ax.pie() returns two items: the wedge patches and the label texts.
wedges, label_texts = ax.pie(status_counts, textprops=dict(color="w"))

small_font = fm.FontProperties()
small_font.set_size('xx-small')

# Legend entries are read from the data rather than a hard-coded list: value_counts()
# orders statuses by frequency, so a fixed list can attach the wrong name to a wedge
# (or drop statuses) whenever the data changes.
ax.legend(wedges, status_counts.index.astype(str).tolist(),
          title="Status",
          loc="center left",
          bbox_to_anchor=(1, 0, 0.5, 1))

# NOTE(review): this changes the global default font size for the rest of the kernel
# session, not just for this figure — confirm that is intended.
plt.rcParams['font.size'] = 2
plt.setp(label_texts, fontproperties=small_font)

ax.set_title("Patient Claim Status Count")

plt.show()