In [1]:
import pandas as pd
import numpy as np
import clean_df as cd
import nltk

In [2]:
# Read the raw name list and discard every row that has a missing value.
df = pd.read_csv('ndc.csv').dropna()
print(df)

# Lower-case the names, then report how many distinct names remain.
df['name'] = df['name'].str.lower()
print(df['name'].nunique(dropna=False))

                              name
0                       Row Labels
1                SPIRONOLACTONE % 
2        -L'Oreal USA Products Inc
3                 .Cardinal Health
4     .Church & Dwight Canada Corp
...                            ...
7022                        ZYGONE
7023    Zyla Life Sciences US Inc.
7024                        #NAME?
7025                       (blank)
7026                   Grand Total

[7027 rows x 1 columns]
6701


In [3]:
# Run every lower-cased name through cd.clean_str and keep the result in its
# own column so the original text stays available for comparison.
# NOTE(review): clean_str is defined in clean_df; judging by the printed frame
# it drops spaces and punctuation -- confirm against the helper itself.
df['clean'] = df['name'].map(cd.clean_str)
print(df)

# Number of distinct values left after cleaning.
print(df['clean'].nunique(dropna=False))

                              name                   clean
0                       row labels               rowlabels
1                spironolactone %           spironolactone
2        -l'oreal usa products inc    lorealusaproductsinc
3                 .cardinal health          cardinalhealth
4     .church & dwight canada corp  churchdwightcanadacorp
...                            ...                     ...
7022                        zygone                  zygone
7023    zyla life sciences us inc.   zylalifesciencesusinc
7024                        #name?                    name
7025                       (blank)                   blank
7026                   grand total              grandtotal

[7027 rows x 2 columns]
5658


In [4]:
# Derive the '-company' column by passing each cleaned name through
# cd.filter_company_name.
# NOTE(review): presumably this strips company-type wording; the helper's
# exact rules live in clean_df and are not visible here.
df['-company'] = df['clean'].map(cd.filter_company_name)
print(df)

# Number of distinct values left after this filter.
print(df['-company'].nunique(dropna=False))

                              name                   clean  \
0                       row labels               rowlabels   
1                spironolactone %           spironolactone   
2        -l'oreal usa products inc    lorealusaproductsinc   
3                 .cardinal health          cardinalhealth   
4     .church & dwight canada corp  churchdwightcanadacorp   
...                            ...                     ...   
7022                        zygone                  zygone   
7023    zyla life sciences us inc.   zylalifesciencesusinc   
7024                        #name?                    name   
7025                       (blank)                   blank   
7026                   grand total              grandtotal   

                    -company  
0                  rowlabels  
1             spironolactone  
2       lorealusaproductsinc  
3             cardinalhealth  
4     churchdwightcanadacorp  
...                      ...  
7022                  zygone  
7023   

In [5]:
# Build the states data once; the following step hands it to
# cd.filter_states_info for every row.
# NOTE(review): the structure returned by prep_states_info is not visible
# here -- confirm in clean_df.
states = cd.prep_states_info()

In [6]:
# Apply cd.filter_states_info to each '-company' value, reusing the shared
# `states` data built earlier.
df['-states'] = df['-company'].map(
    lambda company: cd.filter_states_info(company, states)
)
print(df)

# Number of distinct values left after this filter.
print(df['-states'].nunique(dropna=False))

                              name                   clean  \
0                       row labels               rowlabels   
1                spironolactone %           spironolactone   
2        -l'oreal usa products inc    lorealusaproductsinc   
3                 .cardinal health          cardinalhealth   
4     .church & dwight canada corp  churchdwightcanadacorp   
...                            ...                     ...   
7022                        zygone                  zygone   
7023    zyla life sciences us inc.   zylalifesciencesusinc   
7024                        #name?                    name   
7025                       (blank)                   blank   
7026                   grand total              grandtotal   

                    -company                 -states  
0                  rowlabels               rowlabels  
1             spironolactone          spironolactone  
2       lorealusaproductsinc    lorealusaproductsinc  
3             cardinalhealth       

In [7]:
# Fetch the common-word collection once, then run every '-states' value
# through cd.filter_common_words with it.
common_words = cd.get_words()
df['-common'] = df['-states'].map(
    lambda text: cd.filter_common_words(text, common_words)
)

# Confirm that all intermediate columns are now present.
print(df.columns)

Index(['name', 'clean', '-company', '-states', '-common'], dtype='object')


In [8]:
# For each row, pass the original name and every intermediate form (in
# pipeline order) to cd.get_shortest_non_empty and store its pick as 'short'.
# NOTE(review): the selection rule is implemented in clean_df -- confirm there.
df['short'] = df.apply(
    lambda rec: cd.get_shortest_non_empty(
        *(rec[col] for col in ('name', 'clean', '-company', '-states', '-common'))
    ),
    axis=1,
)

In [9]:
# Quick probe of NLTK's Porter stemmer on a plural token.
p = nltk.stem.PorterStemmer()
p.stem('products')

'product'

# Check whether each token (and its plural form) exists in the common-words list

In [10]:
# Print, one result per line, whether each sample token is in common_words.
print(
    *(word in common_words for word in ('product', 'products', 'supply', 'welding', 'a')),
    sep='\n',
)

True
False
True
True
True


In [11]:
# Render every row of the frame for manual inspection.
# option_context lifts the row limit only for this display call and restores
# the previous 'display.max_rows' value on exit, even if rendering raises, so
# the setting cannot leak into later cells or clobber a custom value.
with pd.option_context('display.max_rows', None):
    display(df)

Unnamed: 0,name,clean,-company,-states,-common,short
0,row labels,rowlabels,rowlabels,rowlabels,rowlabels,row labels
1,spironolactone %,spironolactone,spironolactone,spironolactone,spironolactone,spironolactone %
2,-l'oreal usa products inc,lorealusaproductsinc,lorealusaproductsinc,lorealusaproductsinc,lorealusaproductsinc,-l'oreal usa products inc
3,.cardinal health,cardinalhealth,cardinalhealth,cardinalhealth,cardinalhealth,.cardinal health
4,.church & dwight canada corp,churchdwightcanadacorp,churchdwightcanadacorp,churchdwightcanadacorp,churchdwightcanadacorp,.church & dwight canada corp
5,{preferred pharmaeutials inc.,preferredpharmaeutialsinc,preferredpharmaeutialsinc,preferredpharmaeutialsinc,preferredpharmaeutialsinc,preferredpharmaeutialsinc
6,"†wal-mart stores, inc.†",walmartstoresinc,walmartstoresinc,walmartstoresinc,walmartstoresinc,walmartstoresinc
7,veterans health,veteranshealth,veteranshealth,veteranshealth,veteranshealth,veterans health
8,medco,medco,medco,medco,medco,medco
9,"st class pharmaceuticals, inc.",stclasspharmaceuticalsinc,stclasspharmaceuticalsinc,stclasspharmaceuticalsinc,stclasspharmaceuticalsinc,"st class pharmaceuticals, inc."


In [12]:
# Show any row whose 'short' value is just a dash.
print(df.loc[df['short'] == '-'])

# Collect the distinct 'short' values and report how many there are.
total_companies = set(df['short'])
print(len(total_companies))

   name clean -company -states -common short
30    -                                    -
6698


In [13]:
# One-column frame holding the distinct short names in sorted order.
ndf = pd.DataFrame({'name': sorted(total_companies)})
print(ndf)

                                name
0                               corp
1       degrees pharmaceuticals, llc
2                     essentials ltd
3                        innovations
4                              medco
...                              ...
6693   zydus pharmaceuticals usa inc
6694  zydus pharmaceuticals usa inc.
6695      zydus technologies limited
6696                          zygone
6697      zyla life sciences us inc.

[6698 rows x 1 columns]


In [14]:
# Persist the de-duplicated names without the index column.
ndf.to_csv('ndf_cleaned.csv', index=False)

# Fraction of the original rows removed by de-duplication.
print('%.2f reduction' % ((df.shape[0] - ndf.shape[0]) / df.shape[0]))

0.05 reduction
