# Alternative Medicine Finder - Application

This project is designed to find alternative medicines that contain the same medicinal drug components as a given medicine. This is a real-world problem that I have often faced whenever I visited a pharmacy/medical store. Although the pharmacist can suggest alternative medicines, there should be a platform where a medicine can be searched to find its alternatives.

## Data Extraction Process

In [None]:
import csv
import requests
from bs4 import BeautifulSoup

def scrape_links(url, timeout=30):
    """Collect the ``href`` of every ``<a>`` nested inside an ``<h2>`` on a page.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the whole crawl.

    Returns:
        A list of href strings in document order. The list is empty when the
        request raises, the status code is not 200, or no ``<h2>`` contains a
        link with an ``href``. Failures are reported with ``print`` rather
        than raised.
    """
    extracted_links = []
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return extracted_links

        soup = BeautifulSoup(response.content, 'html.parser')
        for h2 in soup.find_all('h2'):
            for link in h2.find_all('a'):
                href = link.get('href')
                # Anchors without an href would otherwise be stored as None
                # and end up as empty rows in the links CSV.
                if href:
                    extracted_links.append(href)
    except Exception as e:
        # Deliberately broad: this is a best-effort crawl, so any failure on
        # one page is reported and the links gathered so far are returned.
        print(f"Error scraping {url}: {e}")
    return extracted_links

def fetch_and_store_data(url_list):
    """Scrape every page in ``url_list`` and save the links found to a CSV.

    Writes ``extractedlinks.csv`` in the current working directory with a
    single ``URL`` column: a header row followed by one row per link returned
    by ``scrape_links``.

    Args:
        url_list: Iterable of page URLs to scrape.
    """
    with open('extractedlinks.csv', 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['URL'])
        writer.writeheader()
        for url in url_list:
            # scrape_links downloads the page itself and reports failures, so
            # fetching the page here first only doubled the traffic and let a
            # single network error abort the whole run.
            for extracted_link in scrape_links(url):
                writer.writerow({'URL': extracted_link})

def extract_info_from_link(link, timeout=30):
    """Download a page and pull two text fields out of it.

    Args:
        link: URL of the page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so one
            unresponsive page cannot stall a run over thousands of links.

    Returns:
        A ``(name_h1, name_a_d_text)`` tuple:

        * ``name_h1`` - stripped text of the first ``<h1>``, or ``"N/A"`` when
          the page has none.
        * ``name_a_d_text`` - stripped text of the first ``<a class="d block">``
          inside the first ``<div class="d flex-column">``, or ``"N/A"`` when
          either element is missing.

        ``(None, None)`` is returned when the status code is not 200 or any
        exception is raised; the failure is reported with ``print``.
    """
    try:
        response = requests.get(link, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {link}. Status code: {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extracting name from <h1> heading
        h1_heading = soup.find('h1')
        name_h1 = h1_heading.text.strip() if h1_heading else "N/A"

        # Extracting name field of <div class="d flex-column">
        # NOTE(review): a class_ string containing a space is matched against
        # the exact class attribute value - confirm 'd flex-column' and
        # 'd block' are the literal values used by the site.
        name_a_d_text = "N/A"
        div_d = soup.find('div', class_='d flex-column')
        if div_d:
            a_tag = div_d.find('a', class_='d block')
            if a_tag:
                name_a_d_text = a_tag.text.strip()

        return name_h1, name_a_d_text
    except Exception as e:
        # Deliberately broad: one bad page must not stop the whole extraction.
        print(f"An error occurred while processing {link}: {e}")
        return None, None

def main():
    """Run the full extraction: collect medicine links, then scrape each one.

    First builds the 26 index URLs (``.../all-medicines/a`` to ``.../z``) and
    passes them to ``fetch_and_store_data``, which writes
    ``extractedlinks.csv``. Then reads that file back and, for every link,
    writes a ``Link, Medicine, Medicinal Drug`` row to ``output_data.csv``
    using the values returned by ``extract_info_from_link``.
    """
    # Generate list of URLs from 'a' to 'z'
    base_url = "https://dawaai.pk/all-medicines/"
    url_list = [base_url + chr(ord('a') + i) for i in range(26)]

    # Fetch and store data
    fetch_and_store_data(url_list)

    input_csv = "extractedlinks.csv"
    output_csv = "output_data.csv"

    # The output is written as UTF-8 so scraped names with non-ASCII
    # characters cannot fail on platforms whose default encoding is narrower.
    with open(input_csv, 'r', newline='') as f_input, \
            open(output_csv, 'w', newline='', encoding='utf-8') as f_output:
        csv_reader = csv.reader(f_input)
        csv_writer = csv.writer(f_output)

        # Writing header for output CSV
        csv_writer.writerow(['Link', 'Medicine', 'Medicinal Drug'])

        # The links file starts with a 'URL' header row; without skipping it
        # the header itself was treated as a link and written to the output.
        next(csv_reader, None)

        for row in csv_reader:
            # Blank lines come back as empty lists, and empty cells cannot be
            # fetched, so both are skipped instead of raising or wasting a request.
            if not row or not row[0].strip():
                continue
            link = row[0]  # The link is in the first (only) column of the CSV
            name_h1, name_a_d = extract_info_from_link(link)
            csv_writer.writerow([link, name_h1, name_a_d])


if __name__ == "__main__":
    main()


The above code produces a CSV file containing the link to each medicine, its name, and the medicinal drug it contains, so the file has 3 columns in total.

The data extracted from this single website amounts to 17000 entries.



## Data Cleaning Process

The output_data.csv file provides us with 3 columns. We need to get rid of the links column and then clean the data in the other 2 columns by removing any unwanted characters. Additionally, we have to remove the duplicates present in the data to get our final output file. Please refer to the code below, which performs all the steps explained above.

In [1]:
import pandas as pd
from google.colab import files

# Cleans the scraped output_data.csv and writes output_data_final.csv.
# If the file is in the same directory you can use it directly, otherwise you can upload it.

# Upload CSV files from local system
print("Upload CSV files:")
uploaded = files.upload()

mdata = pd.read_csv('output_data.csv')
mdata = mdata.drop(columns=['Link'])

# The extraction step writes the headers 'Medicine' and 'Medicinal Drug'.
# Rename them to the names used below; rename() leaves the frame untouched
# if the columns are already called 'Medicines' and 'Drugs'.
mdata = mdata.rename(columns={'Medicine': 'Medicines', 'Medicinal Drug': 'Drugs'})

# Separate the medicine names from any non-required information (modify the code according to your data).
# Everything from the first digit to the end of the string is removed,
# then the whitespace left behind is stripped.
mdata['Medicines'] = mdata['Medicines'].str.replace(r'\d.*$', '', regex=True).str.strip()

# For the medicinal drug column, any text enclosed in parentheses is removed,
# then the whitespace left behind is stripped.
mdata['Drugs'] = mdata['Drugs'].str.replace(r'\(.*\)', '', regex=True).str.strip()

# Drop/remove any duplicate rows. Stripping above matters here: values that
# differ only by leftover whitespace would otherwise not count as duplicates.
mdata = mdata.drop_duplicates()

# Export the cleaned DataFrame to a CSV file
mdata.to_csv('output_data_final.csv', index=False)

Upload CSV files:


KeyboardInterrupt: 