In [None]:
import pandas as pd
# Small demo frame: a text column plus a numeric column with one missing value.
data = pd.DataFrame({
    'letter': ['a', 'B', 'C c'],
    'number': [1, None, 3],
})

In [None]:
data

Unnamed: 0,letter,number
0,a,1.0
1,B,
2,C c,3.0


---
### Fill NaN with a fixed value

In [None]:
# Replace each missing value in `number` with the constant 0; the result is
# displayed, `data` itself is not modified.
data['number'].fillna(value=0)

---
### Fill NaN with a calculated value

In [None]:
# Use the column's own mean as the replacement for its missing values.
data['number'].fillna(value=data['number'].mean())

---
### Fill NaN with another column

In [None]:
# Where `number` is missing, fall back to the value of `letter` on the same row.
data['number'].fillna(value=data['letter'])

In [None]:
import pandas as pd
# Demo frame in which `region` is only filled on the first row of each group
# and missing (pd.NA) on the rows that follow.
states = pd.DataFrame({
    'region': ['1', pd.NA, pd.NA, pd.NA, '2', pd.NA],
    'state': ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO'],
    'population': [703423, 5634923, 3029341, 2317412, 13493821, 5434124],
})

In [None]:
states

Unnamed: 0,region,state,population
0,1.0,AK,703423
1,,AL,5634923
2,,AR,3029341
3,,AZ,2317412
4,2.0,CA,13493821
5,,CO,5434124


In [None]:
# Fill missing regions from the `state` column on the same row.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: that form is chained assignment, which warns in
# pandas 2.x and no longer updates `states` once Copy-on-Write is enabled.
states['region'] = states['region'].fillna(states['state'])

In [None]:
states

Unnamed: 0,region,state,population
0,1,AK,703423
1,AL,AL,5634923
2,AR,AR,3029341
3,AZ,AZ,2317412
4,2,CA,13493821
5,CO,CO,5434124


In [None]:
import pandas as pd
# Re-create the demo frame so `region` has its missing values (pd.NA) again.
states = pd.DataFrame({
    'region': ['1', pd.NA, pd.NA, pd.NA, '2', pd.NA],
    'state': ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO'],
    'population': [703423, 5634923, 3029341, 2317412, 13493821, 5434124],
})

In [None]:
states

Unnamed: 0,region,state,population
0,1.0,AK,703423
1,,AL,5634923
2,,AR,3029341
3,,AZ,2317412
4,2.0,CA,13493821
5,,CO,5434124


In [None]:
# Forward-fill: carry the last seen region down into the missing rows below it.
# fillna(method='ffill') is deprecated in favour of .ffill(), and the in-place
# call on a selected column is chained assignment (it stops updating `states`
# under Copy-on-Write), so the result is assigned back explicitly.
states['region'] = states['region'].ffill()

In [None]:
states

Unnamed: 0,region,state,population
0,1,AK,703423
1,1,AL,5634923
2,1,AR,3029341
3,1,AZ,2317412
4,2,CA,13493821
5,2,CO,5434124


---
---

### String Functions

In [None]:
# Demo frame for the string functions: mixed-case text, one value with a space.
data = pd.DataFrame({
    'letter': ['a', 'B', 'Cat c'],
    'number': [1, None, 3],
})

In [None]:
data

Unnamed: 0,letter,number
0,a,1.0
1,B,
2,Cat c,3.0


In [None]:
# Lower-case every value in `letter` (result displayed only; `data` is not modified).
data['letter'].str.lower()

0        a
1        b
2    cat c
Name: letter, dtype: object

In [None]:
# Upper-case every value in `letter` (result displayed only; `data` is not modified).
data['letter'].str.upper()

0        A
1        B
2    CAT C
Name: letter, dtype: object

In [None]:
# Title-case every value: the first letter of each word is capitalised.
data['letter'].str.title()

0        A
1        B
2    Cat C
Name: letter, dtype: object

In [None]:
# Strip everything up to and including the last space, keeping the final word.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave every value unchanged.
data['letter'].str.replace(r'^.* ', '', regex=True)

0    a
1    B
2    c
Name: letter, dtype: object

In [None]:
# Index into each string: -1 picks the last character of every value.
data['letter'].str[-1]

0    a
1    B
2    c
Name: letter, dtype: object

# Situation / Problem

In this data set, we want to group by the Proprietary Name.  How many drugs are represented here?

* Does every row have a proprietary name? - If not, use the non-proprietary name instead
* Are they formatted consistently? - Make them all upper case or lower case
* Are they sometimes more specific than we want? - Look for extra stuff like units of measure

In [None]:
import pandas as pd

In [None]:
# Load the tab-separated NDC product listing into a DataFrame.
ndc = pd.read_csv('/data/ndc.txt', sep='\t')

In [None]:
ndc

Unnamed: 0,PRODUCTID,PRODUCTNDC,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,STARTMARKETINGDATE,ENDMARKETINGDATE,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
0,0002-0800_4bb5d1cb-0fa7-48c7-9f6d-8d45f9b91649,0002-0800,HUMAN OTC DRUG,Sterile Diluent,,diluent,"INJECTION, SOLUTION",SUBCUTANEOUS,19870710,,NDA,NDA018781,Eli Lilly and Company,WATER,1,mL/mL,,,N,20191231.0
1,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,HUMAN PRESCRIPTION DRUG,,,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,20120601,,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0
2,0002-1407_14757f9d-f641-4836-acf3-229265588d1d,0002-1407,HUMAN PRESCRIPTION DRUG,Quinidine Gluconate,,Quinidine Gluconate,SOLUTION,INTRAVENOUS,19500712,,NDA,NDA007529,Eli Lilly and Company,QUINIDINE GLUCONATE,80,mg/mL,"Antiarrhythmic [EPC],Cytochrome P450 2D6 Inhib...",,N,20191231.0
3,0002-1433_4468578a-47d2-488e-9fd4-a8322070392f,0002-1433,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,20140918,,BLA,BLA125469,Eli Lilly and Company,DULAGLUTIDE,.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep...",,N,20201231.0
4,0002-1434_4468578a-47d2-488e-9fd4-a8322070392f,0002-1434,HUMAN PRESCRIPTION DRUG,Trulicity,,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,20140918,,BLA,BLA125469,Eli Lilly and Company,DULAGLUTIDE,1.5,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep...",,N,20201231.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,0003-0852_9bc9427b-2cca-466c-b41e-d47ce4540aa6,0003-0852,HUMAN PRESCRIPTION DRUG,SPRYCEL,,dasatinib,TABLET,ORAL,20080530,,NDA,NDA021986,"E.R. Squibb & Sons, L.L.C.",DASATINIB,100,mg/1,"Kinase Inhibitor [EPC],Protein Kinase Inhibito...",,N,20201231.0
90,0003-0855_9bc9427b-2cca-466c-b41e-d47ce4540aa6,0003-0855,HUMAN PRESCRIPTION DRUG,SPRYCEL,,dasatinib,TABLET,ORAL,20101028,,NDA,NDA021986,"E.R. Squibb & Sons, L.L.C.",DASATINIB,80,mg/1,"Kinase Inhibitor [EPC],Protein Kinase Inhibito...",,N,20201231.0
91,0003-0857_9bc9427b-2cca-466c-b41e-d47ce4540aa6,0003-0857,HUMAN PRESCRIPTION DRUG,SPRYCEL,,dasatinib,TABLET,ORAL,20101028,,NDA,NDA021986,"E.R. Squibb & Sons, L.L.C.",DASATINIB,140,mg/1,"Kinase Inhibitor [EPC],Protein Kinase Inhibito...",,N,20201231.0
92,0003-0893_391cf576-1ae6-4a11-bf14-8e5a135cd5dd,0003-0893,HUMAN PRESCRIPTION DRUG,eliquis,,apixaban,"TABLET, FILM COATED",ORAL,20121228,,NDA,NDA202155,"E.R. Squibb & Sons, L.L.C.",APIXABAN,2.5,mg/1,"Factor Xa Inhibitor [EPC],Factor Xa Inhibitors...",,N,20201231.0


In [None]:
# STEP 1: Are any of the proprietary names blank?

blank_filter = ndc['PROPRIETARYNAME'].isnull()  # True where the name is missing
blanks = ndc.loc[blank_filter]
blanks.shape

(7, 20)

In [None]:
blanks

Unnamed: 0,PRODUCTID,PRODUCTNDC,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,STARTMARKETINGDATE,ENDMARKETINGDATE,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
1,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,HUMAN PRESCRIPTION DRUG,,,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,20120601,,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0
5,0002-1436_ad6f74e8-b0ef-4a96-9249-c1225c5cd6a7,0002-1436,HUMAN PRESCRIPTION DRUG,,,galcanezumab,"INJECTION, SOLUTION",SUBCUTANEOUS,20180927,,BLA,BLA761063,Eli Lilly and Company,GALCANEZUMAB,120,mg/mL,,,N,20201231.0
10,0002-3228_f1ee27ad-c38e-430a-b027-0b7ebad7403e,0002-3228,HUMAN PRESCRIPTION DRUG,,,Atomoxetine hydrochloride,CAPSULE,ORAL,20021126,,NDA,NDA021411,Eli Lilly and Company,ATOMOXETINE HYDROCHLORIDE,25,mg/1,"Norepinephrine Reuptake Inhibitor [EPC],Norepi...",,N,20201231.0
11,0002-3229_f1ee27ad-c38e-430a-b027-0b7ebad7403e,0002-3229,HUMAN PRESCRIPTION DRUG,,,Atomoxetine hydrochloride,CAPSULE,ORAL,20021126,,NDA,NDA021411,Eli Lilly and Company,ATOMOXETINE HYDROCHLORIDE,40,mg/1,"Norepinephrine Reuptake Inhibitor [EPC],Norepi...",,N,20201231.0
13,0002-3231_89af030d-8f23-4e54-92f0-724df4d81712,0002-3231,HUMAN PRESCRIPTION DRUG,,,Olanzapine and Fluoxetine hydrochloride,CAPSULE,ORAL,20031224,,NDA,NDA021520,Eli Lilly and Company,OLANZAPINE; FLUOXETINE HYDROCHLORIDE,6; 25,mg/1; mg/1,"Atypical Antipsychotic [EPC],Serotonin Reuptak...",,N,20201231.0
14,0002-3232_89af030d-8f23-4e54-92f0-724df4d81712,0002-3232,HUMAN PRESCRIPTION DRUG,,,Olanzapine and Fluoxetine hydrochloride,CAPSULE,ORAL,20031224,20190731.0,NDA,NDA021520,Eli Lilly and Company,OLANZAPINE; FLUOXETINE HYDROCHLORIDE,12; 25,mg/1; mg/1,"Atypical Antipsychotic [EPC],Serotonin Reuptak...",,N,
28,0002-4182_1606b529-d77d-41d8-9379-094caf0241c2,0002-4182,HUMAN PRESCRIPTION DRUG,,,baricitinib,"TABLET, FILM COATED",ORAL,20180531,,NDA,NDA207924,Eli Lilly and Company,BARICITINIB,2,mg/1,"Janus Kinase Inhibitor [EPC],Janus Kinase Inhi...",,N,20191231.0


In [None]:
# Use the non-proprietary name wherever the proprietary name is missing.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: that form is chained assignment, which warns in
# pandas 2.x and no longer updates `ndc` once Copy-on-Write is enabled.
ndc['PROPRIETARYNAME'] = ndc['PROPRIETARYNAME'].fillna(ndc['NONPROPRIETARYNAME'])

In [None]:
# Re-run the blank check after the fill and show how many rows are still missing a name.
blanks = ndc.loc[ndc['PROPRIETARYNAME'].isnull()]
blanks.shape

(0, 20)

In [None]:
# Spot-check one product whose proprietary name was blank before the fill.
ndc.loc[ndc['PROPRIETARYNAME'] == 'Florbetapir F 18']

Unnamed: 0,PRODUCTID,PRODUCTNDC,PRODUCTTYPENAME,PROPRIETARYNAME,PROPRIETARYNAMESUFFIX,NONPROPRIETARYNAME,DOSAGEFORMNAME,ROUTENAME,STARTMARKETINGDATE,ENDMARKETINGDATE,MARKETINGCATEGORYNAME,APPLICATIONNUMBER,LABELERNAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES,DEASCHEDULE,NDC_EXCLUDE_FLAG,LISTING_RECORD_CERTIFIED_THROUGH
1,0002-1200_957ee1b5-dfa7-4e3f-96e1-6bed1ffc0abe,0002-1200,HUMAN PRESCRIPTION DRUG,Florbetapir F 18,,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,20120601,,NDA,NDA202008,Eli Lilly and Company,FLORBETAPIR F-18,51,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em...",,N,20191231.0


In [None]:
blanks

In [None]:
# STEP 2: Check the formats - how many are all caps, how many are all lowercase

# Names made up only of upper-case letters, digits, spaces and hyphens.
# Raw string so that `\-` reaches the regex engine unchanged instead of being
# an invalid Python string escape (a SyntaxWarning on recent Python versions).
all_caps = ndc[ndc['PROPRIETARYNAME'].str.contains(r'^[A-Z0-9 \-]+$')]
all_caps.shape

In [None]:
# Names made up only of lower-case letters, digits, spaces and hyphens.
# Raw string so that `\-` reaches the regex engine unchanged instead of being
# an invalid Python string escape (a SyntaxWarning on recent Python versions).
all_lc = ndc[ndc['PROPRIETARYNAME'].str.contains(r'^[a-z0-9 \-]+$')]
all_lc.shape

In [None]:
# Alternative check: a name counts as lower case when lower-casing it changes nothing.
is_lower = ndc['PROPRIETARYNAME'].str.lower() == ndc['PROPRIETARYNAME']
ndc.loc[is_lower].shape

In [None]:
# Let's just make them all lower case
# (this overwrites the PROPRIETARYNAME column, so every later cell sees the
# lower-cased names)

ndc['PROPRIETARYNAME'] = ndc['PROPRIETARYNAME'].str.lower()

In [None]:
# Select the rows whose name equals its own lower-cased form and show the count.
all_lc = ndc.loc[ndc['PROPRIETARYNAME'].str.lower() == ndc['PROPRIETARYNAME']]
all_lc.shape

In [None]:
ndc.shape

In [None]:
# STEP 3: Look for any that might have strange characters or numbers

# Match names containing a digit, a hyphen, a backslash or a forward slash.
# Raw string: the regex is the same as before, but it no longer relies on the
# invalid Python string escapes `\-` and `\/` (SyntaxWarning on recent Python).
weird = ndc[ndc['PROPRIETARYNAME'].str.contains(r'[0-9\-\\/]')]
weird.shape

In [None]:
weird

In [None]:
# Let's assume kenalog-40 and kenalog-10 should be grouped together

In [None]:
# Drop a hyphen followed by digits (e.g. "kenalog-40" -> "kenalog") so the
# variants group under one name.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, so the original call replaced nothing. A raw
# string also avoids the invalid `\-` string escape.
ndc['PROPRIETARYNAME'] = ndc['PROPRIETARYNAME'].str.replace(r'-[0-9]+', '', regex=True)

In [None]:
# Show the rows whose name contains "kenalog" to inspect the result of the clean-up.
ndc.loc[ndc['PROPRIETARYNAME'].str.contains('kenalog')]