In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read only the columns this notebook actually analyses.
# Loading every column of this file ran out of memory (MemoryError), so the
# read is restricted to the diagnosis code, provider type and age bracket.
# Note: df.shape / df.nunique() / df.dtypes will therefore describe these
# three columns only; the row count is unaffected.
claim_columns = [
    'CAR_LINE_ICD9_DGNS_CD',
    'CAR_LINE_PRVDR_TYPE_CD',
    'BENE_AGE_CAT_CD',
]
df = pd.read_csv(
    'gs://datacamp-202518.appspot.com/data/claims/beneficiary/2008_BSA_Carrier_Line_Items_PUF.csv',
    usecols=claim_columns,
    # Keep the diagnosis code as text on every row so the later `.str`
    # filtering of "V"/"E" codes sees strings rather than a mix of types.
    dtype={'CAR_LINE_ICD9_DGNS_CD': str},
)

MemoryError: Unable to allocate 517. MiB for an array with shape (67735075,) and data type object

In [None]:
# (number of rows, number of columns) of the loaded claims frame
df.shape

In [None]:
# count of distinct values in each column
df.nunique()

In [None]:
# dtype pandas assigned to each column
df.dtypes

In [None]:
letter_codes = ['V','E']
# Regex alternation ("V|E") that matches a code containing any listed letter.
letter_pattern = '|'.join(letter_codes)
# True where the diagnosis code contains "V" or "E"; a missing code counts
# as "no match" (na=False), so it survives the filter below.
has_letter_code = df.CAR_LINE_ICD9_DGNS_CD.str.contains(letter_pattern, na=False)
# keep every row whose code does NOT carry a letter
onlynumbers = df[~has_letter_code]

In [None]:
# Is the diagnosis-code column stored as a numeric dtype yet?
print(onlynumbers["CAR_LINE_ICD9_DGNS_CD"].dtype)

In [None]:
# Ok, not a number yet.
# Convert the whole column with one vectorized call instead of applying
# pd.to_numeric to each element; codes that cannot be parsed become NaN
# (errors='coerce').
# Rebinding through .assign() gives the column the new numeric dtype and
# avoids writing into a filtered slice (SettingWithCopyWarning); newer pandas
# would otherwise set `.loc[:, col] = ...` in place and keep the object dtype.
onlynumbers = onlynumbers.assign(
    CAR_LINE_ICD9_DGNS_CD=pd.to_numeric(
        onlynumbers["CAR_LINE_ICD9_DGNS_CD"], errors='coerce'
    )
)

In [None]:
# Re-check the column dtype after the conversion step.
print(onlynumbers["CAR_LINE_ICD9_DGNS_CD"].dtype)

In [None]:
# distinct diagnosis codes left after the letter-code filter
print(onlynumbers["CAR_LINE_ICD9_DGNS_CD"].unique())

- - -

## Ok, those are numbers, so I can find the ones in the ICD9 code range I want 

> Let's get the ICD codes between 580 & 629 (from [this reference](https://www.dropbox.com/s/z4xqytytdppjuzw/2010_BSA_Carrier_Line_Items_PUF_DataDic.pdf?dl=0))


In [None]:
# Rows whose ICD-9 code lies in 580..629 (Series.between includes both ends).
in_target_range = onlynumbers["CAR_LINE_ICD9_DGNS_CD"].between(580, 629)
uticodes = onlynumbers[in_target_range]
print(uticodes["CAR_LINE_ICD9_DGNS_CD"].unique())

In [None]:
# share of all loaded rows that fall in the 580-629 code range, in percent
uti_share_pct = (len(uticodes) / len(df)) * 100
print("{} percent of dataset are *Diseases of the Urinary Tract System* ".format(uti_share_pct))

## So now let's look at the distribution of these diagnoses across the various provider types


In [None]:
# One subset of the genitourinary claims per provider-type code.
# NOTE(review): the code-to-name mapping (1, 0, 3, 5) is taken from the
# variable names only; confirm it against the data dictionary.
prvdr_type = uticodes["CAR_LINE_PRVDR_TYPE_CD"]
solo_hcp = uticodes[prvdr_type == 1]
small_clinic = uticodes[prvdr_type == 0]
hospital = uticodes[prvdr_type == 3]
large_clinic = uticodes[prvdr_type == 5]

In [None]:
# sanity check: the subset should contain a single provider-type code
small_clinic["CAR_LINE_PRVDR_TYPE_CD"].value_counts()

In [None]:
# rows per provider-type code across all genitourinary claims
uticodes["CAR_LINE_PRVDR_TYPE_CD"].value_counts()

In [None]:
sns.set(font_scale=1.4)
# Vertical bar chart: one bar per provider-type code, height = row count.
provider_counts = uticodes["CAR_LINE_PRVDR_TYPE_CD"].value_counts()
ax = provider_counts.plot(kind='bar', figsize=(7, 6), rot=0)
ax.set_xlabel("Type of Healthcare Provider", labelpad=14)
ax.set_ylabel("Patients with GENITOURINARY diagnoses", labelpad=14)
ax.set_title("GENITOURINARY diagnoses, by Type of Provider", y=1.02)

In [None]:
# how many distinct ICD-9 codes appear in the genitourinary subset
uticodes["CAR_LINE_ICD9_DGNS_CD"].nunique()

In [None]:
sns.set(font_scale=1.4)
# Horizontal bar chart: one bar per ICD-9 code, length = number of rows.
uticodes.CAR_LINE_ICD9_DGNS_CD.value_counts().plot(kind='barh', figsize=(6, 18), rot=0);
plt.ylabel("GENITOURINARY ICD-9 code", labelpad=14)
plt.xlabel("Patients with GENITOURINARY diagnoses", labelpad=14)
# The title previously ended in a dangling "from "; name the breakdown instead.
plt.title("Total Count of GENITOURINARY diagnoses, by ICD-9 code", y=1.02);

### Ok, very well

> **Next Steps**: Get a Lookup Table of the ICD-9 codes listed above

- - -

In [None]:
# Lookup table for the genitourinary ICD-9 codes.
lookup_csv = 'gs://datacamp-202518.appspot.com/data/claims/beneficiary/genitourinary-icd_codes.csv'
genitourinary = pd.read_csv(lookup_csv)
# show the column dtypes so they can be compared with the claims frame
genitourinary.dtypes

In [None]:
# column dtypes of the claims subset, for comparison with the lookup table's
# dtypes before the two are merged on CAR_LINE_ICD9_DGNS_CD
uticodes.dtypes

In [None]:
# distinct ICD-9 codes present in the lookup table
genitourinary["CAR_LINE_ICD9_DGNS_CD"].nunique()

In [None]:
# distinct ICD-9 codes present in the claims subset
uticodes["CAR_LINE_ICD9_DGNS_CD"].nunique()

In [None]:
# Attach the lookup-table columns to every claim row.
# A left join keeps exactly the claim rows: an outer join would also add one
# row for each lookup code that has no claims at all, and counting rows per
# DIAGNOSIS_DESCRIPTION afterwards would then show those diagnoses with a
# count of 1 instead of 0.
graph = pd.merge(uticodes, genitourinary, on='CAR_LINE_ICD9_DGNS_CD', how='left')

In [None]:
sns.set(font_scale=1.4)
# Horizontal bar chart: one bar per diagnosis description, length = row count.
description_counts = graph["DIAGNOSIS_DESCRIPTION"].value_counts()
ax = description_counts.plot(kind='barh', figsize=(12, 22), rot=0);
# plt.ylabel("GENITOURINARY diagnosis", labelpad=14)
ax.set_xlabel("Number of Patients with GENITOURINARY diagnoses\n (ICD-9 codes 580-629)", labelpad=14)
# plt.title("Diagnosis count of GENITOURINARY diseases", y=1.02);

In [None]:
sns.set(font_scale=1.4)
# Vertical bar chart: one bar per age-bracket code, height = number of rows.
uticodes.BENE_AGE_CAT_CD.value_counts().plot(kind='bar', figsize=(7, 6), rot=0)
plt.xlabel("Age bracket", labelpad=14)
plt.ylabel("No. Patients with GENITOURINARY diagnoses", labelpad=14)
# Title offset matches the other charts (y=1.02); y=1.2 placed it far above
# the axes. The trailing semicolon suppresses the Text(...) repr in the output.
plt.title("GENITOURINARY diagnoses, by Age of Patient", y=1.02);