In [None]:
# import required packages/libraries
import pandas as pd
import numpy as np
# load the transactions CSV (path is relative to the notebook's working directory) into a dataframe
data = pd.read_csv(filepath_or_buffer="../AthenaTRANSACTIONS_LESS_2018.csv")

In [None]:
# widen the text display to 200 characters and lift the cap on the number
# of columns shown, so that no column of a dataframe is hidden
pd.options.display.width = 200
pd.options.display.max_columns = None

# preview the first 5 rows of the dataframe
data.head()

In [None]:
# summarise the dataframe: column names, datatypes, non-null counts and number of records
data.info()

In [None]:
# count the missing (NaN) values in each column; columns without missing data show 0
data.isna().sum()

## Handle Missing Data
#### More than about 65% of the values are empty/missing in:
- icd10transactiondiagcode2,        
- icd10transactiondiagcode3         
- icd10transactiondiagcode4          
- icd9transactiondiagcode1           
- icd9transactiondiagcode2           
- icd9transactiondiagcode3          
- icd9transactiondiagcode4
#### Let's drop those columns 

In [None]:
# diagnosis-code columns that are mostly empty and therefore removed
columns = [
    'icd10transactiondiagcode2',
    'icd10transactiondiagcode3',
    'icd10transactiondiagcode4',
    'icd9transactiondiagcode1',
    'icd9transactiondiagcode2',
    'icd9transactiondiagcode3',
    'icd9transactiondiagcode4',
]

# Drop the columns from the dataframe. errors='ignore' skips any column that is
# already gone, so re-running this cell no longer raises a KeyError, and
# reassigning (instead of inplace=True) leaves the cell safe to execute repeatedly.
data = data.drop(columns=columns, errors='ignore')


In [None]:
# verify that the columns are dropped by previewing the first 5 rows again
data.head()

In [None]:
# import the plotting packages/libraries
import seaborn as sns
import matplotlib.pyplot as plt

# play with the figsize until the plot is big enough to plot all the columns
# of your dataset, or the way you desire it to look like otherwise
plt.figure(figsize=(40, 30))

# Plot a heatmap of the correlation matrix of the columns in the dataframe.
# Correlation is only defined for numeric data, so the numeric/boolean columns
# are selected explicitly: pandas >= 2.0 no longer drops non-numeric columns
# inside DataFrame.corr() and raises on them instead (the dataframe presumably
# still holds text columns such as the diagnosis code -- TODO confirm).
sns.heatmap(data.select_dtypes(include=['number', 'bool']).corr(), cmap='BuGn')

#### 1. From the data it seems that Patient ID and Enterprise ID are the same, and they don't have any correlation with the other variable columns. But we will keep them to see how they are related to claims.

#### 2. From the correlation matrix we can see that
- total rvu                            
- work rvu                          
- practice expense rvu               
- adj work rvu                         
- adj total rvu 

#### These columns are highly correlated, so we need to find a way to handle their missing data.



In [None]:
# box plot of 'total rvu' to inspect its spread and outliers
sns.boxplot(data=data, x='total rvu')

In [None]:
# box plot of 'work rvu' to inspect its spread and outliers
sns.boxplot(data=data, x='work rvu')

In [None]:
# box plot of 'practice expense rvu' to inspect its spread and outliers
sns.boxplot(data=data, x='practice expense rvu')


In [None]:
# box plot of 'adj work rvu' to inspect its spread and outliers
sns.boxplot(data=data, x='adj work rvu')


In [None]:
# box plot of 'adj total rvu' to inspect its spread and outliers
sns.boxplot(data=data, x='adj total rvu')

#### From the box plots we can see that the majority of values fall within a narrow range and only a few are outliers.
#### Let's replace the missing values of each column with that column's median value, which is not skewed by the outliers.

In [None]:
# impute the missing 'total rvu' entries with the column median
median_total_rvu = data['total rvu'].median()
data = data.fillna({'total rvu': median_total_rvu})

In [None]:
# impute the missing 'work rvu' entries with the column median
median_work_rvu = data['work rvu'].median()
data = data.fillna({'work rvu': median_work_rvu})

In [None]:
# impute the missing 'practice expense rvu' entries with the column median
median_pra_exp_rvu = data['practice expense rvu'].median()
data = data.fillna({'practice expense rvu': median_pra_exp_rvu})

In [None]:
# impute the missing 'adj work rvu' entries with the column median
median_adj_work_rvu = data['adj work rvu'].median()
data = data.fillna({'adj work rvu': median_adj_work_rvu})

In [None]:
# impute the missing 'adj total rvu' entries with the column median
median_adj_total_rvu = data['adj total rvu'].median()
data = data.fillna({'adj total rvu': median_adj_total_rvu})

In [None]:
# summary statistics computed only over the rows that have no missing value in any column
# (dropna() returns a filtered copy here; `data` itself is not modified)
data.dropna().describe()

In [None]:
# re-count the missing values per column to verify that the imputed columns have no nulls left
data.isna().sum()

#### The icd10transactiondiagcode1 column still has 2 missing values, which is OK for now.

In [None]:
# give every column a descriptive, human-readable name to understand the data better;
# the result is assigned back to `data` instead of renaming in place
data = data.rename(
    columns={
        'srvday': 'Claim Service Day',
        'claimid': 'Claim ID',
        'proccode': 'Procedure Code',
        'allowed': 'Sum of Actual Allowed Amounts',
        'adjall': 'Sum of All Adjustments',
        'all chgs': 'Sum of All Charges',
        'contract': 'Sum of Contractual Adjustments',
        'current chg': 'Sum of Current Charges',
        'expected': 'Sum of Expected Allowed Amounts',
        'late chg': 'Sum of Late Charges',
        'net pmt': 'Sum of Net Payments',
        'netreceivable': 'Sum of Net Receivable',
        'net xfer': 'Sum of Net Transfers',
        'adjoth': 'Sum of Other Adjustments',
        'pmt': 'Sum of Payments',
        'refund': 'Sum of Refunds',
        'total rvu': 'Total RVU',
        'work rvu': 'Work RVU',
        '#chg': 'Sum of Charges',
        'chg units sum': 'Sum of Charge Units',
        'practice expense rvu': 'Practice Expense RVU',
        'adj work rvu': 'Adjusted Work RVU',
        'adj total rvu': 'Adjusted Total RVU',
        'trnsfr type': 'Transfer Type',
        'procclass': 'Procedure Classification',
        'ins pkg name': 'Insurance Package Name',
        'procgroup': 'Procedure Code Group',
        'rndrng prvdr': 'Rendering Provider',
        'patient dprtmnt': 'Patient Department',
        'svc dprtmnt': 'Service Department',
        'patientid': 'Patient ID',
        'enterpriseid': 'Patient Enterprise ID',
    }
)

In [None]:
# preview the first 5 rows to check the renamed column headers
data.head()

# Exploratory Data Analysis

## Analysis 1:
#### Find the most frequently appearing Claim IDs, and then find the most frequently appearing Procedure Codes for those claims

In [None]:
# Total the numeric columns for every (Patient ID, Claim ID) pair.
# numeric_only=True is passed explicitly because pandas >= 2.0 no longer drops
# non-numeric columns from GroupBy.sum() by default; without it the text columns
# would be pulled into the aggregation.
data.groupby(['Patient ID', 'Claim ID']).sum(numeric_only=True)

### It seems that the most frequently appearing Claim IDs are:
1162926, 1159569, 1176159, 1194841, 1203331, 1212531, 1229473




### Let's see more about these Claim IDs:

In [None]:
# the claim IDs singled out in the note above
claim_id_array = [
    1162926,
    1159569,
    1176159,
    1194841,
    1203331,
    1212531,
    1229473,
]

# keep only the rows whose Claim ID is one of the selected claims
max_claim = data[data['Claim ID'].isin(claim_id_array)]
max_claim

In [None]:
# Total the numeric columns for every (Procedure Code, Claim ID) pair of the selected claims.
# numeric_only=True is passed explicitly because pandas >= 2.0 no longer drops
# non-numeric columns from GroupBy.sum() by default.
max_claim.groupby(['Procedure Code', 'Claim ID']).sum(numeric_only=True)

In [None]:
# Bar chart of how often each Procedure Code occurs among the selected claims.
fig = plt.figure(figsize=(20, 8))

# Pass the series through the `x` keyword: recent seaborn releases accept only
# `data` positionally, so a positional series is no longer treated as the
# variable to count.
sns.countplot(x=max_claim['Procedure Code'])

# Save before plt.show(): depending on the backend, show() may release the
# figure, and saving afterwards can write an empty image.
fig.savefig("Procedure_Code_Count.png")
plt.show()

## Analysis 2

#### Find the Patient IDs that have late charges, and how much the amount is

In [None]:
# Patient ID of every row whose late-charge total is not zero
data.loc[data['Sum of Late Charges'].ne(0), 'Patient ID']