# 1. Processing Ayurvedic Formula

This part process Ayurvedic-formula data

In [1]:
import pandas as pd

In [2]:
file_01 = r'E:\research\ayurvedic-hiv\data\original\AyurvedicFormula.xls'

In [3]:
df_ayurvedic = pd.read_excel(file_01, sheet_name='Sheet1')
df_ayurvedic

Unnamed: 0,Name (Semi-solids),Ingredient(10000gm),Non plant ingredient,Action,Application,Comments,Bacteria
0,Anqaruya Kabir,"Pellitory/Pyrethrum Root, Anacyclus pyrethrum:...","Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,0.0
1,Atisar Jog (50gm),"Ginger, Zingiber officinale: 10gm; Monk's Hood...",,Curative diarrhoea of kids.,,,1.0
2,Anantadikwath (240gm),"Shariva, Hemidesmus indicus: 20gm; Black shari...",,"Curative curbancle, diarrhoea,Blood poisoning,...",,,1.0
3,Amritadi Kwath (300gm),"Gurchi, Encyclopaedia Metallum: 30gm; Vas aka,...",,"Curative of Urticaria, Erisipelas, Leprosy, Po...",,,1.0
4,Abhayadi Kwath (240gm),"Chebulic Myrobalan, Tenninalia chebula: 20gm; ...",,"Curative of Drospy, Fever,Burning filling, Ery...",,,1.0
...,...,...,...,...,...,...,...
289,Hingastak Churna (800gm),"Asafoetida, Femia foetida: 100gm; Caraway, Car...","Rock Salt, Sodium Chloride: 100gm;","Curative for Digestive, Inflammation and Arthr...",,,1.0
290,Panchamrita Louha Mandur (500gm),"(Trifola)-Chebulic Myrobalan, Terminalia chebu...","(Lauha bhashma)Calcined iron, 10gm; Copper ash...","Curative for Anemea, jaundice and Dropsy.",,,0.0
291,Punarnaba Mandur (683.84gm),"Small Caltrops, Tribulus terresrris: 3.84gm; H...","Ferric oxied calx, 480gm;","Curative for Anemea, jaundice and Dropsy.",,,0.0
292,Dashanga Pralepa (200gm),"Albi:zia lebbek: 10gm; Liqourice, Glycyrrhiza ...","Ghee, Bas taurus (milk butter): 100gm;","Curative for Erysipelas, Skin diseases and Lep...",,,1.0


## 1.1 Extract the ingredients

In [4]:
# Function to extract all ingredients from a text string
def extract_ingredient_details(text):
    ingredients = []
    if isinstance(text, str):
        for item in text.split(';'):
            if item.strip():
                parts = item.split(':')
                name_parts = parts[0].split(',', 1)  # Split only at first comma
                common_name = name_parts[0].strip()
                scientific_name = name_parts[1].strip() if len(name_parts) > 1 else ""
                quantity = parts[1].strip() if len(parts) > 1 else ""
                ingredients.append({
                    "common_name": common_name,
                    "scientific_name": scientific_name,
                    "quantity": quantity
                })
    return ingredients

In [5]:
# Get all ingredients from all rows
all_ingredients = []
for _, row in df_ayurvedic.iterrows():
    ingredients = extract_ingredient_details(row['Ingredient(10000gm)'])
    all_ingredients.extend(ingredients)

In [6]:
# Get all unique common names
common_names = set(ingredient["common_name"] for ingredient in all_ingredients)
print(f"Total unique ingredients (by common name): {len(common_names)}")

Total unique ingredients (by common name): 464


In [7]:
# Get all unique scientific names
scientific_names = set(ingredient["scientific_name"] for ingredient in all_ingredients if ingredient["scientific_name"])
print(f"Total unique ingredients (by scientific name): {len(scientific_names)}")

Total unique ingredients (by scientific name): 294


In [8]:
# Create tuples from common and scientific names (tuples are hashable)
unique_combinations = set((ingredient["common_name"], ingredient["scientific_name"]) 
                          for ingredient in all_ingredients)
print(f"Total unique ingredient combinations: {len(unique_combinations)}")

Total unique ingredient combinations: 531


# 1.2 Give code to each formula

In [9]:
df_ayurvedic['Code'] = ['F{:03d}'.format(i+1) for i in range(len(df_ayurvedic))]
df_ayurvedic.set_index('Code', inplace=True)
df_ayurvedic

Unnamed: 0_level_0,Name (Semi-solids),Ingredient(10000gm),Non plant ingredient,Action,Application,Comments,Bacteria
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F001,Anqaruya Kabir,"Pellitory/Pyrethrum Root, Anacyclus pyrethrum:...","Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,0.0
F002,Atisar Jog (50gm),"Ginger, Zingiber officinale: 10gm; Monk's Hood...",,Curative diarrhoea of kids.,,,1.0
F003,Anantadikwath (240gm),"Shariva, Hemidesmus indicus: 20gm; Black shari...",,"Curative curbancle, diarrhoea,Blood poisoning,...",,,1.0
F004,Amritadi Kwath (300gm),"Gurchi, Encyclopaedia Metallum: 30gm; Vas aka,...",,"Curative of Urticaria, Erisipelas, Leprosy, Po...",,,1.0
F005,Abhayadi Kwath (240gm),"Chebulic Myrobalan, Tenninalia chebula: 20gm; ...",,"Curative of Drospy, Fever,Burning filling, Ery...",,,1.0
...,...,...,...,...,...,...,...
F290,Hingastak Churna (800gm),"Asafoetida, Femia foetida: 100gm; Caraway, Car...","Rock Salt, Sodium Chloride: 100gm;","Curative for Digestive, Inflammation and Arthr...",,,1.0
F291,Panchamrita Louha Mandur (500gm),"(Trifola)-Chebulic Myrobalan, Terminalia chebu...","(Lauha bhashma)Calcined iron, 10gm; Copper ash...","Curative for Anemea, jaundice and Dropsy.",,,0.0
F292,Punarnaba Mandur (683.84gm),"Small Caltrops, Tribulus terresrris: 3.84gm; H...","Ferric oxied calx, 480gm;","Curative for Anemea, jaundice and Dropsy.",,,0.0
F293,Dashanga Pralepa (200gm),"Albi:zia lebbek: 10gm; Liqourice, Glycyrrhiza ...","Ghee, Bas taurus (milk butter): 100gm;","Curative for Erysipelas, Skin diseases and Lep...",,,1.0


In [10]:
# Save the coded DataFrame to a csv file
df_ayurvedic.to_csv(r'E:\research\ayurvedic-hiv\data\processed\01_formula-coded.csv')

## 1.3 Cleaning Ayurvedic Formula data

This part is trying to clean formula where it does not have ingredient or no bacteria indication

In [11]:
df_ayurvedic.isna().sum()

Name (Semi-solids)        0
Ingredient(10000gm)       7
Non plant ingredient     73
Action                    1
Application             292
Comments                293
Bacteria                  1
dtype: int64

In [12]:
df_ayurvedic[df_ayurvedic['Ingredient(10000gm)'].isna()]

Unnamed: 0_level_0,Name (Semi-solids),Ingredient(10000gm),Non plant ingredient,Action,Application,Comments,Bacteria
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F110,Gulma Bajrini Bati (60gm),,"Mercury sulphate: 20gm; Copper, ash powder, 10...",Curative for Chronic enlargement of the spleen...,,,0.0
F125,Churamoni Ras (180gm),,"Calcined Cinnabar, 20gm; Coral Calx, 20gm; Gol...",Curative fever and symptoms of fever.,,,0.0
F247,Panchamrita Parpati (3.1kg),,"Sulphur, 1.6kg; Mercury, Hydrargyrum Metalicum...","Curative Chronic diarrhoea, Amobasis, Invigora...",,,0.0
F248,Bijay Parpati (3.2kg),,"Purified Sulphur with Eclipta alba juice, 1.6k...","Curative for Amobasis, Rheumatoid arthritis, D...",,,0.0
F249,Louha Parpati (1.2kg),,"Cinnabar, Compound of Mercury and Sulphur: 800...","Curative for Puerperal disease, Dudenum, Aneme...",,,1.0
F250,Suvra Parpati (110gm),,"Salt Petre, Potassium Nitrate: 80gm; Alum, Pot...","Curative for Acidity, Indigestion, Worm, Vomit...",,,1.0
F251,Sweta Parpati (570gm),,"Purified counch, 480gm; Purified Alum, Potassi...","Curative for Acidity, indigestion and Acute pain.",,,0.0


In [13]:
df_ayurvedic_clean = df_ayurvedic.dropna(subset=['Ingredient(10000gm)', 'Bacteria'])
df_ayurvedic_clean

Unnamed: 0_level_0,Name (Semi-solids),Ingredient(10000gm),Non plant ingredient,Action,Application,Comments,Bacteria
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F001,Anqaruya Kabir,"Pellitory/Pyrethrum Root, Anacyclus pyrethrum:...","Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,0.0
F002,Atisar Jog (50gm),"Ginger, Zingiber officinale: 10gm; Monk's Hood...",,Curative diarrhoea of kids.,,,1.0
F003,Anantadikwath (240gm),"Shariva, Hemidesmus indicus: 20gm; Black shari...",,"Curative curbancle, diarrhoea,Blood poisoning,...",,,1.0
F004,Amritadi Kwath (300gm),"Gurchi, Encyclopaedia Metallum: 30gm; Vas aka,...",,"Curative of Urticaria, Erisipelas, Leprosy, Po...",,,1.0
F005,Abhayadi Kwath (240gm),"Chebulic Myrobalan, Tenninalia chebula: 20gm; ...",,"Curative of Drospy, Fever,Burning filling, Ery...",,,1.0
...,...,...,...,...,...,...,...
F290,Hingastak Churna (800gm),"Asafoetida, Femia foetida: 100gm; Caraway, Car...","Rock Salt, Sodium Chloride: 100gm;","Curative for Digestive, Inflammation and Arthr...",,,1.0
F291,Panchamrita Louha Mandur (500gm),"(Trifola)-Chebulic Myrobalan, Terminalia chebu...","(Lauha bhashma)Calcined iron, 10gm; Copper ash...","Curative for Anemea, jaundice and Dropsy.",,,0.0
F292,Punarnaba Mandur (683.84gm),"Small Caltrops, Tribulus terresrris: 3.84gm; H...","Ferric oxied calx, 480gm;","Curative for Anemea, jaundice and Dropsy.",,,0.0
F293,Dashanga Pralepa (200gm),"Albi:zia lebbek: 10gm; Liqourice, Glycyrrhiza ...","Ghee, Bas taurus (milk butter): 100gm;","Curative for Erysipelas, Skin diseases and Lep...",,,1.0


In [14]:
# Get all ingredients from all rows
all_ingredients_cleaned = []
for _, row in df_ayurvedic_clean.iterrows():
    ingredients = extract_ingredient_details(row['Ingredient(10000gm)'])
    all_ingredients_cleaned.extend(ingredients)
all_ingredients_cleaned

[{'common_name': 'Pellitory/Pyrethrum Root',
  'scientific_name': 'Anacyclus pyrethrum',
  'quantity': '23.81'},
 {'common_name': 'Small Fene',
  'scientific_name': 'Nigella sativa',
  'quantity': '23.81'},
 {'common_name': 'Saussurea',
  'scientific_name': 'Saussurea lappa',
  'quantity': '23.81'},
 {'common_name': 'Pepper (Black or white)',
  'scientific_name': 'Piper nigrum',
  'quantity': '23.81'},
 {'common_name': 'Long Pepper',
  'scientific_name': 'Piper longum',
  'quantity': '23.81'},
 {'common_name': 'Sweet Flag',
  'scientific_name': 'Acarus calamus',
  'quantity': '23.81'},
 {'common_name': 'Garden Rue',
  'scientific_name': 'Ruta graveolens',
  'quantity': '11.90'},
 {'common_name': 'Pakhan bed',
  'scientific_name': 'Bergenia ligulata',
  'quantity': '11.90'},
 {'common_name': 'Asafoetida',
  'scientific_name': 'Femia foetida',
  'quantity': '11.90'},
 {'common_name': 'European Birthwort (Round)',
  'scientific_name': 'Aristolochia rotunda',
  'quantity': '11.90'},
 {'com

In [15]:
# Get all unique common names
common_names_cleaned = set(ingredient["common_name"] for ingredient in all_ingredients)
print(f"Total unique ingredients (by common name): {len(common_names_cleaned)}")

# Get all unique scientific names
scientific_names_cleaned = set(ingredient["scientific_name"] for ingredient in all_ingredients if ingredient["scientific_name"])
print(f"Total unique ingredients (by scientific name): {len(scientific_names_cleaned)}")

# Create tuples from common and scientific names (tuples are hashable)
unique_combinations_cleaned = set((ingredient["common_name"], ingredient["scientific_name"]) 
                          for ingredient in all_ingredients)
print(f"Total unique ingredient combinations: {len(unique_combinations_cleaned)}")

Total unique ingredients (by common name): 464
Total unique ingredients (by scientific name): 294
Total unique ingredient combinations: 531


In [16]:
df_ayurvedic_clean.isna().sum()

Name (Semi-solids)        0
Ingredient(10000gm)       0
Non plant ingredient     72
Action                    0
Application             284
Comments                285
Bacteria                  0
dtype: int64

### 1.3.1 Change Bacteria data type

In [17]:
df_ayurvedic_clean['Bacteria'] = df_ayurvedic_clean['Bacteria'].astype(bool)
df_ayurvedic_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ayurvedic_clean['Bacteria'] = df_ayurvedic_clean['Bacteria'].astype(bool)


Unnamed: 0_level_0,Name (Semi-solids),Ingredient(10000gm),Non plant ingredient,Action,Application,Comments,Bacteria
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F001,Anqaruya Kabir,"Pellitory/Pyrethrum Root, Anacyclus pyrethrum:...","Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,False
F002,Atisar Jog (50gm),"Ginger, Zingiber officinale: 10gm; Monk's Hood...",,Curative diarrhoea of kids.,,,True
F003,Anantadikwath (240gm),"Shariva, Hemidesmus indicus: 20gm; Black shari...",,"Curative curbancle, diarrhoea,Blood poisoning,...",,,True
F004,Amritadi Kwath (300gm),"Gurchi, Encyclopaedia Metallum: 30gm; Vas aka,...",,"Curative of Urticaria, Erisipelas, Leprosy, Po...",,,True
F005,Abhayadi Kwath (240gm),"Chebulic Myrobalan, Tenninalia chebula: 20gm; ...",,"Curative of Drospy, Fever,Burning filling, Ery...",,,True
...,...,...,...,...,...,...,...
F290,Hingastak Churna (800gm),"Asafoetida, Femia foetida: 100gm; Caraway, Car...","Rock Salt, Sodium Chloride: 100gm;","Curative for Digestive, Inflammation and Arthr...",,,True
F291,Panchamrita Louha Mandur (500gm),"(Trifola)-Chebulic Myrobalan, Terminalia chebu...","(Lauha bhashma)Calcined iron, 10gm; Copper ash...","Curative for Anemea, jaundice and Dropsy.",,,False
F292,Punarnaba Mandur (683.84gm),"Small Caltrops, Tribulus terresrris: 3.84gm; H...","Ferric oxied calx, 480gm;","Curative for Anemea, jaundice and Dropsy.",,,False
F293,Dashanga Pralepa (200gm),"Albi:zia lebbek: 10gm; Liqourice, Glycyrrhiza ...","Ghee, Bas taurus (milk butter): 100gm;","Curative for Erysipelas, Skin diseases and Lep...",,,True


In [18]:
# Save the cleaned formula DataFrame to a csv file
df_ayurvedic_clean.to_csv(r'E:\research\ayurvedic-hiv\data\cleaned\01_formula-cleaned.csv')

## 1.4 Split the ingredients one per row

In [19]:
# Split the Ingredients into separate rows and include additional columns
rows = []
for _, row in df_ayurvedic_clean.reset_index().iterrows():
    code = row['Code']
    name = row['Name (Semi-solids)']
    ingredients = str(row['Ingredient(10000gm)']).split(';')
    non_plant_ingredient = row['Non plant ingredient']
    action = row['Action']
    application = row['Application']
    comments = row['Comments']
    bacteria = row['Bacteria']

    for ingredient in ingredients:
        # Split the ingredient into common name, scientific name, and quantity
        if ':' in ingredient:
            parts = ingredient.split(',')
            # THIS PART CAN REDUCE THE AMOUNT OF DATA
            if len(parts) == 2:
                common_name = parts[0].strip()
                remaining_parts = parts[1].split(':')
                if len(remaining_parts) == 2:
                    scientific_name = remaining_parts[0].strip()
                    quantity = remaining_parts[1].strip()

                    # Append only if all required fields are present
                    if common_name and scientific_name and quantity:
                        rows.append([
                            code, name, common_name, scientific_name, quantity,
                            non_plant_ingredient, action, application, comments, bacteria
                        ])
rows

[['F001',
  'Anqaruya Kabir',
  'Pellitory/Pyrethrum Root',
  'Anacyclus pyrethrum',
  '23.81',
  'Ghee, Bos taurus (milk butter): 19.05; Honey, Apis mellifera(secretted liquid): rest amount;',
  'Increasing strenth of nerves, Stimulating, Removing obstructions',
  'Paralysis of face,Paralysis, Indigestion',
  'Harmful for Pregnancy.',
  False],
 ['F001',
  'Anqaruya Kabir',
  'Small Fene',
  'Nigella sativa',
  '23.81',
  'Ghee, Bos taurus (milk butter): 19.05; Honey, Apis mellifera(secretted liquid): rest amount;',
  'Increasing strenth of nerves, Stimulating, Removing obstructions',
  'Paralysis of face,Paralysis, Indigestion',
  'Harmful for Pregnancy.',
  False],
 ['F001',
  'Anqaruya Kabir',
  'Saussurea',
  'Saussurea lappa',
  '23.81',
  'Ghee, Bos taurus (milk butter): 19.05; Honey, Apis mellifera(secretted liquid): rest amount;',
  'Increasing strenth of nerves, Stimulating, Removing obstructions',
  'Paralysis of face,Paralysis, Indigestion',
  'Harmful for Pregnancy.',
  Fa

In [20]:
# Create a DataFrame from the rows
columns = [
    'Code', 'Name', 'Common Name', 'Scientific Name', 'Quantity',
    'Non plant ingredient', 'Action', 'Application', 'Comments', 'Bacteria'
]
df_ayurvedic_clean_split = pd.DataFrame(rows, columns=columns)
df_ayurvedic_clean_split

Unnamed: 0,Code,Name,Common Name,Scientific Name,Quantity,Non plant ingredient,Action,Application,Comments,Bacteria
0,F001,Anqaruya Kabir,Pellitory/Pyrethrum Root,Anacyclus pyrethrum,23.81,"Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,False
1,F001,Anqaruya Kabir,Small Fene,Nigella sativa,23.81,"Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,False
2,F001,Anqaruya Kabir,Saussurea,Saussurea lappa,23.81,"Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,False
3,F001,Anqaruya Kabir,Pepper (Black or white),Piper nigrum,23.81,"Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,False
4,F001,Anqaruya Kabir,Long Pepper,Piper longum,23.81,"Ghee, Bos taurus (milk butter): 19.05; Honey, ...","Increasing strenth of nerves, Stimulating, Rem...","Paralysis of face,Paralysis, Indigestion",Harmful for Pregnancy.,False
...,...,...,...,...,...,...,...,...,...,...
2171,F294,Darusataka Lep (60gm),Devdar,Cedrus deodara,10gm,"Rock Salt, Sodium Chloride: 10gm;",Curative for Acute pain and Stomach ache.,,,False
2172,F294,Darusataka Lep (60gm),Sweet flag,Acarus calamus,10gm,"Rock Salt, Sodium Chloride: 10gm;",Curative for Acute pain and Stomach ache.,,,False
2173,F294,Darusataka Lep (60gm),Costus root,Saussurea hypoleuca,10gm,"Rock Salt, Sodium Chloride: 10gm;",Curative for Acute pain and Stomach ache.,,,False
2174,F294,Darusataka Lep (60gm),Sowa Seeds,Anethum sowa,10gm,"Rock Salt, Sodium Chloride: 10gm;",Curative for Acute pain and Stomach ache.,,,False


In [21]:
df_ayurvedic_clean_split.isna().sum()

Code                       0
Name                       0
Common Name                0
Scientific Name            0
Quantity                   0
Non plant ingredient     630
Action                     0
Application             2160
Comments                2162
Bacteria                   0
dtype: int64

In [22]:
cn = len(df_ayurvedic_clean_split['Common Name'])
print(f"Total common names in split DataFrame: {cn}")
cn_unique = df_ayurvedic_clean_split['Common Name'].unique()
print(f"Total unique common names in split DataFrame: {len(cn_unique)}\n")

sn = len(df_ayurvedic_clean_split['Scientific Name'])
print(f"Total scientific names in split DataFrame: {sn}")
sn_unique = df_ayurvedic_clean_split['Scientific Name'].unique()
print(f"Total unique scientific names in split DataFrame: {len(sn_unique)}\n")

ay = len(df_ayurvedic_clean_split['Name'])
print(f"Total Ayurvedic formulas in split DataFrame: {ay}")
ay_unique = df_ayurvedic_clean_split['Name'].unique()
print(f"Total unique Ayurvedic formulas in split DataFrame: {len(ay_unique)}\n")


Total common names in split DataFrame: 2176
Total unique common names in split DataFrame: 361

Total scientific names in split DataFrame: 2176
Total unique scientific names in split DataFrame: 268

Total Ayurvedic formulas in split DataFrame: 2176
Total unique Ayurvedic formulas in split DataFrame: 280



In [23]:
# Save the split DataFrame to a csv file
df_ayurvedic_clean_split.to_csv(r'E:\research\ayurvedic-hiv\data\processed\02_formula-cleaned-split.csv', index=False)