In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# read full dataframe
df = pd.read_csv('gs://datacamp-202518.appspot.com/data/claims/beneficiary/2008_BSA_Carrier_Line_Items_PUF.csv')

In [3]:
df.shape

(67735075, 11)

# ⇪
# That's 67 million rows of data, from the **"2008 Basic Standalone Carrier Line Items"** [*(found here)*](https://www.cms.gov/Research-Statistics-Data-and-Systems/Downloadable-Public-Use-Files/BSAPUFS/Carrier_Line_Items)

In [4]:
df.nunique()

CAR_LINE_ID                  67735075
BENE_SEX_IDENT_CD                   2
BENE_AGE_CAT_CD                     6
CAR_LINE_ICD9_DGNS_CD             923
CAR_LINE_HCPCS_CD                4736
CAR_LINE_BETOS_CD                  99
CAR_LINE_SRVC_CNT                 156
CAR_LINE_PRVDR_TYPE_CD              6
CAR_LINE_CMS_TYPE_SRVC_CD          22
CAR_LINE_PLACE_OF_SRVC_CD          27
CAR_HCPCS_PMT_AMT                  76
dtype: int64

In [5]:
letter_codes = ['V','E']
onlynumbers = df.loc[
    # match anything that does not have "V" or "E" codes
    ~df.CAR_LINE_ICD9_DGNS_CD.str.contains('|'.join(letter_codes), na=False)
    ,:]

In [6]:
# check whether those numbers are actually numbers
print (onlynumbers.CAR_LINE_ICD9_DGNS_CD.dtypes)

object


In [7]:
# Ok, not a number yet.
onlynumbers.loc[:, "CAR_LINE_ICD9_DGNS_CD"] = onlynumbers.loc[:, "CAR_LINE_ICD9_DGNS_CD"].apply(pd.to_numeric, errors = 'coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [8]:
# did that change it?
print (onlynumbers.CAR_LINE_ICD9_DGNS_CD.dtypes)

float64


In [9]:
print (onlynumbers.CAR_LINE_ICD9_DGNS_CD.unique())

[739. 465. 959. 472. 394. 272. 592. 486. 585. 429. 789. 204. 110. 496.
 696. 414. 402. 428. 327. 403. 273. 162. 491. 288. 202. 715. 285. 924.
 295. 462. 250. 707. 424. 401. 786. 451. 311. 362. 596. 618. 490. 368.
 724. 356. 703. 157. 284. 998. 599. 722. 238. 854. 435. 365. 553. 153.
 719. 185. 266. 600. 276. 436. 338. 518. 729. 309. 725.   8. 342. 174.
 702. 738. 627. 188. 413. 466. 690. 427. 733. 441. 244. 785. 366. 780.
 296. 584. 434. 211. 536. 784. 411. 820. 396. 371. 781. 482. 721. 709.
 281. 242. 726. 303. 616. 388. 290. 456. 788. 588. 723. 997. 440. 433.
 790. 173. 562. 195. 305. 787. 836. 996. 593. 426. 287. 372. 625. 575.
 682. 692. 735. 492. 794. 354. 531. 331. 203. 256. 824. 493. 458. 473.
 782. 569. 268. 714. 512. 511.  42. 280. 823. 578.  53. 727. 793. 530.
 294. 373. 300. 425. 146. 216. 201. 796. 922. 611. 410. 350. 710. 357.
 917. 535.  88. 307. 686. 808. 275. 154. 573. 274. 233. 286. 514. 340.
 716. 528. 698.  38. 333. 720. 558. 537. 189. 586. 783. 183. 507. 515.
 332. 

- - -

## Ok, those are numbers, so I can find the ones in the ICD9 code range I want 

> Let's get the ICD codes between 580 & 629 (from [this reference](https://www.dropbox.com/s/z4xqytytdppjuzw/2010_BSA_Carrier_Line_Items_PUF_DataDic.pdf?dl=0))


In [10]:
uticodes = onlynumbers.loc[
    # match anything between 
    onlynumbers.CAR_LINE_ICD9_DGNS_CD.between(580,629)
    ,:]    
print (uticodes.CAR_LINE_ICD9_DGNS_CD.unique())

[592. 585. 596. 618. 599. 600. 627. 584. 616. 588. 593. 625. 611. 586.
 598. 595. 597. 602. 604. 620. 607. 610. 623. 608. 601. 590. 583. 591.
 626. 621. 603. 622. 594. 614. 624. 582. 580. 617. 605. 587. 581. 619.
 615. 589. 606. 628. 629.]


In [12]:
print ("{} percent of dataset are *Diseases of the Urinary Tract System* ".format((uticodes.shape[0]/df.shape[0])*100))

5.695647343713726 percent of dataset are *Diseases of the Urinary Tract System* 


In [14]:
uticodes.to_csv('/home/jupyter/data/2008_genitourinary_beneficiaries.csv', index=None)

## So how let's look at the distribution of these diagnoses across the various types of healthcare clinics



In [None]:
solo_hcp = uticodes.loc[uticodes.CAR_LINE_PRVDR_TYPE_CD == 1,:]
small_clinic = uticodes.loc[uticodes.CAR_LINE_PRVDR_TYPE_CD == 0,:]
hospital = uticodes.loc[uticodes.CAR_LINE_PRVDR_TYPE_CD == 3,:]
large_clinic = uticodes.loc[uticodes.CAR_LINE_PRVDR_TYPE_CD == 5,:]

In [None]:
small_clinic.CAR_LINE_PRVDR_TYPE_CD.value_counts()

In [None]:
uticodes.CAR_LINE_PRVDR_TYPE_CD.value_counts()

In [None]:
sns.set(font_scale=1.4)
uticodes.CAR_LINE_PRVDR_TYPE_CD.value_counts().plot(kind='bar', figsize=(7, 6), rot=0)
plt.xlabel("Type of Healthcare Provider", labelpad=14)
plt.ylabel("Patients with GENITOURINARY diagnoses", labelpad=14)
plt.title("GENITOURINARY diagnoses, by Type of Provider", y=1.02)

In [None]:
uticodes.CAR_LINE_ICD9_DGNS_CD.nunique()

In [None]:
sns.set(font_scale=1.4)
uticodes.CAR_LINE_ICD9_DGNS_CD.value_counts().plot(kind='barh', figsize=(6, 18), rot=0);
plt.ylabel("GENITOURINARY ICD-9 code", labelpad=14)
plt.xlabel("Patients with GENITOURINARY diagnoses", labelpad=14)
plt.title("Total Count of GENITOURINARY diagnoses from ", y=1.02);

### Ok, very well

> **Next Steps**: Get a Lookup Table of the ICD-9 codes listed above

- - -

In [None]:
genitourinary = pd.read_csv('gs://datacamp-202518.appspot.com/data/claims/beneficiary/genitourinary-icd_codes.csv')
genitourinary.dtypes

In [None]:
uticodes.dtypes

In [None]:
genitourinary.CAR_LINE_ICD9_DGNS_CD.nunique()

In [None]:
uticodes.CAR_LINE_ICD9_DGNS_CD.nunique()

In [None]:
graph = pd.merge(uticodes,genitourinary, on='CAR_LINE_ICD9_DGNS_CD', how='outer')

In [None]:
sns.set(font_scale=1.4)
graph.DIAGNOSIS_DESCRIPTION.value_counts().plot(kind='barh', figsize=(12, 22), rot=0);
# plt.ylabel("GENITOURINARY diagnosis", labelpad=14)
plt.xlabel("Number of Patients with GENITOURINARY diagnoses\n (ICD-9 codes 580-629)", labelpad=14)
# plt.title("Diagnosis count of GENITOURINARY diseases", y=1.02);

In [None]:
sns.set(font_scale=1.4)
uticodes.BENE_AGE_CAT_CD.value_counts().plot(kind='bar', figsize=(7, 6), rot=0)
plt.xlabel("Age bracket", labelpad=14)
plt.ylabel("No. Patients with GENITOURINARY diagnoses", labelpad=14)
plt.title("GENITOURINARY diagnoses, by Age of Patient", y=1.2)