In [1]:
import pandas as pd

In [2]:
# Load the tab-separated prescriber file and the cleaned manufacturer list.
prescriptions = pd.read_csv(
    r'../data_unzipped/PartD_13/PARTD_PRESCRIBER_PUF_NPI_DRUG_13.tab',
    sep='\t',
)
drugs = pd.read_csv(r'../drug_manufacturers_cleaned.csv')

In [3]:
# Preview the first five rows to check the columns parsed as expected.
prescriptions.head(n=5)

Unnamed: 0,NPI,NPPES_PROVIDER_LAST_ORG_NAME,NPPES_PROVIDER_FIRST_NAME,NPPES_PROVIDER_CITY,NPPES_PROVIDER_STATE,SPECIALTY_DESC,DESCRIPTION_FLAG,DRUG_NAME,GENERIC_NAME,BENE_COUNT,TOTAL_CLAIM_COUNT,TOTAL_DAY_SUPPLY,TOTAL_DRUG_COST,BENE_COUNT_GE65,BENE_COUNT_GE65_REDACT_FLAG,TOTAL_CLAIM_COUNT_GE65,GE65_REDACT_FLAG,TOTAL_DAY_SUPPLY_GE65,TOTAL_DRUG_COST_GE65
0,1821285826,21ST CENTURY ONCOLOGY LLC,,AVENTURA,FL,Urology,S,TAMSULOSIN HCL,TAMSULOSIN HCL,,12,360,142.14,,*,,*,,
1,1093969024,"A PRESTIGE WELLNESS CENTER, PA",,DORAL,FL,Internal Medicine,S,PANTOPRAZOLE SODIUM,PANTOPRAZOLE SODIUM,,12,360,196.37,,*,,*,,
2,1518048750,A VILES,OMAR,MINEOLA,NY,Pediatric Medicine,S,VENLAFAXINE HCL ER,VENLAFAXINE HCL,,12,360,161.69,0.0,,0.0,,0.0,0.0
3,1952310666,A'BODJEDI,ENENGE,STAMFORD,CT,Psychiatry,S,ABILIFY,ARIPIPRAZOLE,,14,420,12705.73,0.0,,0.0,,0.0,0.0
4,1952310666,A'BODJEDI,ENENGE,STAMFORD,CT,Psychiatry,S,ALENDRONATE SODIUM,ALENDRONATE SODIUM,,17,480,107.83,,#,,#,,


From earlier analysis we know that the top 225 drugs cover 90% of all claims.

Let's try to tie the drug names to the manufacturers who make them.

In [4]:
# Number of distinct DRUG_NAME values (a missing value, if present, counts as one).
len(pd.unique(prescriptions['DRUG_NAME']))

2738

In [5]:
# Number of distinct brand_name values in the manufacturer list
# (a missing value, if present, counts as one).
drugs['brand_name'].unique().shape[0]

39188

In [6]:
# Upper-case the brand names so they can be compared with DRUG_NAME, whose
# values appear in upper case in the preview. Re-running this cell is harmless.
drugs['brand_name'] = drugs['brand_name'].str.upper()

In [7]:
# Distinct brand names after upper-casing; compare with the count before
# normalisation to see whether case differences were hiding duplicates.
drugs['brand_name'].drop_duplicates().shape[0]

39188

In [8]:
# Share of distinct prescribed drug names that also appear as a brand name in
# the manufacturer list.
#
# Missing values are dropped on both sides first: `isin` treats NaN as equal
# to NaN, and `.unique()` keeps NaN, so without `dropna()` a missing name
# would be counted both as a match and as a distinct prescribed drug.
prescribed_drug_names = prescriptions['DRUG_NAME'].dropna().unique()
catalog_brand_names = drugs['brand_name'].dropna().drop_duplicates()

# Both sides are de-duplicated, so the number of True values is the size of
# the intersection of the two name sets.
int(catalog_brand_names.isin(prescribed_drug_names).sum()) / len(prescribed_drug_names)

0.70233747260774293

In [9]:
# Share of distinct prescribed generic names that also appear as a generic
# name in the manufacturer list.
#
# The manufacturer-side names are upper-cased before matching, because the
# GENERIC_NAME values in the prescription preview are upper case; without this
# the comparison is case-sensitive and can under-count matches.
# Missing values are dropped on both sides because `isin` treats NaN as equal
# to NaN and `.unique()` keeps NaN.
prescribed_generic_names = prescriptions['GENERIC_NAME'].dropna().unique()
catalog_generic_names = (
    drugs['generic_name']
    .str.upper()
    .dropna()
    .drop_duplicates()
)

int(catalog_generic_names.isin(prescribed_generic_names).sum()) / len(prescribed_generic_names)

0.53367217280813217

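So based solely on the brand name we've got pretty good coverage of the prescriptions: roughly 70% of the distinct prescribed drug names match a brand name in the manufacturer list. Note that this is coverage of distinct drug names, not of claims.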

The interesting problem here is that a single brand name drug can be made by multiple manufacturers.

In [10]:
# The five brand names with the most rows in the manufacturer list.
drugs['brand_name'].value_counts().sort_values(ascending=False).head(5)

OXYGEN                     664
IBUPROFEN                  505
GABAPENTIN                 294
METFORMIN HYDROCHLORIDE    247
LISINOPRIL                 239
Name: brand_name, dtype: int64

In [11]:
# Number of brand names that occur on exactly one row of the manufacturer list.
# NOTE(review): this counts rows per brand name; it equals "one manufacturer"
# only if the file holds one row per (brand, manufacturer) pair. Confirm.
int(drugs['brand_name'].value_counts().eq(1).sum())

30514

In [12]:
# Number of brand names that occur on more than one row of the manufacturer list.
# NOTE(review): multiple rows are read here as multiple manufacturers; confirm
# the file does not repeat a manufacturer for the same brand name.
int(drugs['brand_name'].value_counts().ne(1).sum())

8673

The good news is that the majority of brand name drugs have a single manufacturer. The real question is: how likely is it that a manufacturer making payments to doctors is the only manufacturer of the drug in question? For drugs sold under the manufacturer's own brand name, that may well be likely.

Let's see what the counts are like on the drugs that exist in the prescription dataset.

In [13]:
# Restrict the manufacturer list to brand names that occur as a prescribed
# DRUG_NAME, then count the brand names that appear on exactly one row.
is_prescribed = drugs['brand_name'].isin(prescriptions['DRUG_NAME'].unique())
int(drugs.loc[is_prescribed, 'brand_name'].value_counts().eq(1).sum())

674

In [14]:
# Same restriction to brand names that occur as a prescribed DRUG_NAME, this
# time counting the brand names that appear on more than one row.
is_prescribed = drugs['brand_name'].isin(prescriptions['DRUG_NAME'].unique())
int(drugs.loc[is_prescribed, 'brand_name'].value_counts().ne(1).sum())

1248

So the majority of the prescribed drug names that we could match (1,248 of 1,922) have multiple manufacturers.

In [15]:
# Collapse the prescription rows to one row per (NPI, DRUG_NAME) pair,
# summing the claim count, day supply and drug cost within each pair.
prescription_set = (
    prescriptions
    .groupby(['NPI', 'DRUG_NAME'])[
        ['TOTAL_CLAIM_COUNT', 'TOTAL_DAY_SUPPLY', 'TOTAL_DRUG_COST']
    ]
    .sum()
)

In [16]:
# Preview the aggregated totals; the index is the (NPI, DRUG_NAME) pair.
prescription_set.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,TOTAL_CLAIM_COUNT,TOTAL_DAY_SUPPLY,TOTAL_DRUG_COST
NPI,DRUG_NAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1003000126,ISOSORBIDE MONONITRATE ER,11,307,171.59
1003000126,LEVOFLOXACIN,26,165,227.1
1003000126,LISINOPRIL,19,570,100.37
1003000126,METOPROLOL TARTRATE,30,916,154.65
1003000126,PREDNISONE,14,133,44.72


In [17]:
# Optional export of the per-(NPI, DRUG_NAME) totals. The call is commented
# out, so running this cell writes nothing; uncomment it to save the CSV.
# prescription_set.to_csv(r'prescription_data_npi_13.csv')