In [1]:
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd

In [2]:

def scrape_data(urls, name_classes, address_classes, phone_classes, no_of_attorneys_classes, referers):
    all_data = []  # List to store data from all URLs

    

    for i, url in enumerate(urls):
        try:
            referer = referers[i]
            headers = {'Referer': referer}
            response = requests.get(url, headers=headers)
            response.raise_for_status()  # Raise an HTTPError for bad responses

            bsobj = soup(response.content, 'html.parser')

            # Extract names
            name_list = []
            for name_class in name_classes[i]:
                names = bsobj.select(f'.{name_class}')
                if names:
                    name_list = [name.text.strip() for name in names if name.text.strip()]
                    break
            if not name_list:
                name_list = ["N/A"]

            # Extract addresses
            addr_list = []
            for address_class in address_classes[i]:
                addresses = bsobj.select(f'.{address_class}')
                if addresses:
                    addr_list = [address.text.strip().replace('<br>', ' ') for address in addresses if address.text.strip()]
                    break
            if not addr_list:
                addr_list = ["N/A"]

            # Extract phone numbers
            phone_list = []
            for phone_class in phone_classes[i]:
                phones = bsobj.select(f'.{phone_class} span')
                if phones:
                    phone_list = [phone.text.strip() for phone in phones if phone.text.strip()]
                    break
            if not phone_list:
                phone_list = ["N/A"]

            # Extract number of attorneys
            no_of_attorneys_list = []
            for no_of_attorneys_class in no_of_attorneys_classes[i]:
                no_of_attorneys = bsobj.select(f'.{no_of_attorneys_class} .SafeHtml_root__mAjEc')
                if no_of_attorneys:
                    no_of_attorneys_list = [attorney.text.strip() for attorney in no_of_attorneys if attorney.text.strip()]
                    break
            if not no_of_attorneys_list:
                no_of_attorneys_list = ["N/A"]

      

            # Make sure the lists have the same length
            min_length = min(len(name_list), len(addr_list), len(phone_list), len(no_of_attorneys_list))

            # Create a DataFrame for current URL data
            df = pd.DataFrame({
                'Law Firm': name_list[:min_length],
                'Address': addr_list[:min_length],
                'Contact': phone_list[:min_length],
                'Website URL': [url] * min_length,
                'No. Of Attorneys': no_of_attorneys_list[:min_length],
                
            })

            all_data.append(df)  # Append current URL's data DataFrame to the list

        except requests.RequestException as e:
            # Handle exceptions for failed requests
            print(f"Failed to fetch data from {url}: {str(e)}")

    if all_data:
        # Concatenate all DataFrames in the list into a single DataFrame
        result_df = pd.concat(all_data, ignore_index=True)
        
        # Save the result DataFrame to a CSV file
        csv_filename = 'law_firm_data.csv'
        result_df.to_csv(csv_filename, index=False)
        print(f"Data saved to {csv_filename}")
        
        return result_df
    else:
        return None

# Example usage
urls = [
    'https://vault.com/company-profiles/law/cravath-swaine-moore-llp',
    'https://vault.com/company-profiles/law/wachtell-lipton-rosen-katz',
    'https://vault.com/company-profiles/law/skadden-arps-slate-meagher-flom-llp-and-affiliates',
    'https://vault.com/company-profiles/law/latham-watkins-llp',
    'https://vault.com/company-profiles/law/sullivan-cromwell-llp',
    'https://vault.com/company-profiles/law/kirkland-ellis',
]


name_classes = [
    ['d-flex align-items-center justify-content-between', 'text-near-black'],
    ['d-flex align-items-center justify-content-between', 'text-near-black'],
    ['d-flex align-items-center justify-content-between', 'text-near-black'],
    ['d-flex align-items-center justify-content-between', 'text-near-black'],
    ['d-flex align-items-center justify-content-between', 'text-near-black'],
    ['d-flex align-items-center justify-content-between', 'text-near-black']
]
address_classes = [
    ['SafeHtml_root__mAjEc'],
    ['SafeHtml_root__mAjEc'],
    ['SafeHtml_root__mAjEc'],
    ['SafeHtml_root__mAjEc'],
    ['SafeHtml_root__mAjEc'],
    ['SafeHtml_root__mAjEc']
]
phone_classes = [
    ['text-nowrap.d-flex.gap-1'],
    ['text-nowrap.d-flex.gap-1'],
    ['text-nowrap.d-flex.gap-1'],
    ['text-nowrap.d-flex.gap-1'],
    ['text-nowrap.d-flex.gap-1'],
    ['text-nowrap.d-flex.gap-1']
]
no_of_attorneys_classes = [
    ['Stats_statsItem__BZ6ZC'],
    ['Stats_statsItem__BZ6ZC'],
    ['Stats_statsItem__BZ6ZC'],
    ['Stats_statsItem__BZ6ZC'],
    ['Stats_statsItem__BZ6ZC'],
    ['Stats_statsItem__BZ6ZC']
]

referers = [
    'https://search.brave.com/',
    'https://search.brave.com/',
    'https://search.brave.com/',
    'https://search.brave.com/',
    'https://search.brave.com/',
    'https://search.brave.com/'
]

# Call the function with example data
scrape_data(urls, name_classes, address_classes, phone_classes, no_of_attorneys_classes, referers)


Data saved to law_firm_data.csv


Unnamed: 0,Law Firm,Address,Contact,Website URL,No. Of Attorneys
0,"Cravath, Swaine & Moore LLP","Two Manhattan West375 Ninth AvenueNew York, NY...",(212) 474-1000,https://vault.com/company-profiles/law/cravath...,1K - 1.5K
1,"Wachtell, Lipton, Rosen & Katz","51 West 52nd St.New York, NY 10019",(212) 403-1000,https://vault.com/company-profiles/law/wachtel...,250 - 500
2,Skadden,"One Manhattan WestNew York, NY 10001-8602",(212) 735-3000,https://vault.com/company-profiles/law/skadden...,3.5K - 4K
3,Latham & Watkins,"1271 Avenue of the AmericasNew York, NY 10020",(212) 906-1200,https://vault.com/company-profiles/law/latham-...,3K - 3.5K
4,Sullivan & Cromwell LLP,"125 Broad StreetNew York, NY 10004",(212) 558-4000,https://vault.com/company-profiles/law/sulliva...,1K - 1.5K
5,Kirkland & Ellis,"333 West Wolf Point Plaza Chicago, IL 60654",(312) 862-2000,https://vault.com/company-profiles/law/kirklan...,3.5K - 4K
