### Medicine quantity data cleaning

This notebook utilizes Pandas extract and string split functions for data cleaning

In [1]:
import pandas as pd

#### Loading the Dataset

In [2]:
medicine_data = pd.read_excel('data/medicine_quantity.xlsx')

In [3]:
medicine_data.head()

Unnamed: 0,Description,Quantity
0,lotion Benzylbenzoate lotion,0Bottle
1,Methylated spirit 100ml,0Bottle
2,susp Magnessium Trisilicate 200ml,0Bottle
3,Susp. Amoxicillin 125mg/5ml,0Bottle
4,Susp. Erythromycin 125mg/5ml,0Bottle


In [4]:
medicine_data.shape

(2279, 2)

#### Aim: To segragate the 'Quantity' column into 'Quantity' and 'Units'

In [5]:
quants = medicine_data['Quantity'].str.extract('(\d{1,})') # Matches 1 or more digit

In [6]:
quants

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
2274,20
2275,20
2276,10
2277,1


In [7]:
units = medicine_data['Quantity'].str.strip('\d{1,2000}') # get only the units

In [8]:
units

0       Bottle
1       Bottle
2       Bottle
3       Bottle
4       Bottle
         ...  
2274    Bottle
2275    Bottle
2276    Bottle
2277       Pcs
2278     Pairs
Name: Quantity, Length: 2279, dtype: object

In [9]:
# putting it all together

medicine_data['Quantity'] = quants
medicine_data['Measure'] = units

In [10]:
medicine_data

Unnamed: 0,Description,Quantity,Measure
0,lotion Benzylbenzoate lotion,0,Bottle
1,Methylated spirit 100ml,0,Bottle
2,susp Magnessium Trisilicate 200ml,0,Bottle
3,Susp. Amoxicillin 125mg/5ml,0,Bottle
4,Susp. Erythromycin 125mg/5ml,0,Bottle
...,...,...,...
2274,Syp Ascorbic acid,20,Bottle
2275,syr Cough Syrup (P) 100ml,20,Bottle
2276,syr Cough Syrup (A) 100ml,10,Bottle
2277,Cotton Wool 100g,1,Pcs


#### Saving the results

In [11]:
medicine_data.to_csv('data/cleaned_medicine_quantity.csv', index = False)