In [20]:
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd

# Total pages variable (ensure it's defined somewhere in your code)
total_pages = 500  # Example value; replace with the actual number of pages
final_list = []

# Initialize lists for the entire scraping process
company_name = []
company_details = []
company_rating = []
company_review = []
company_salary = []
company_interview = []
jobs_count = []
company_benefits = []

# Iterate through all pages if total_pages is greater than 0
for page in range(1, total_pages + 1):
    url = f"https://www.ambitionbox.com/list-of-companies?page={page}"
    
    # Use the Chrome WebDriver (make sure you have it installed)
    driver = webdriver.Chrome()
    
    try:
        # Navigate to the URL
        driver.get(url)
        
        # Clear cookies
        driver.delete_all_cookies()
        
        # Get the page source after the JavaScript has executed
        html = driver.page_source
        
        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        
        # Extract company names, details, and ratings
        for card in soup.find_all('div', class_='companyCardWrapper__metaInformation'):
            # Handle missing elements gracefully
            name = card.find('h2', class_='companyCardWrapper__companyName')
            details = card.find('div', class_='companyCardWrapper__secondaryInformation')
            rating = card.find('div', class_='companyCardWrapper__companyRating')

            company_name.append(name.text.strip() if name else np.nan)
            company_details.append(details.text.strip() if details else np.nan)
            company_rating.append(rating.text.strip() if rating else np.nan)

        # Extract reviews, salaries, interviews, jobs, and benefits
        for info in soup.find_all('div', class_='companyCardWrapper__tertiaryInformation'):
            action_count_spans = info.find_all('span', class_='companyCardWrapper__ActionCount')
            
            # Extract review count
            company_review.append(action_count_spans[0].text.strip() if action_count_spans[0].text.strip() else np.nan)
            
            # Extract salary count
            company_salary.append(action_count_spans[1].text.strip() if action_count_spans[1].text.strip() else np.nan)
            
            # Extract interview count
            company_interview.append(action_count_spans[2].text.strip() if action_count_spans[2].text.strip() else np.nan)

            # Extract jobs count
            jobs_count.append(action_count_spans[3].text.strip() if action_count_spans[3].text.strip() else np.nan)

            # Extract benefits
            company_benefits.append(action_count_spans[4].text.strip() if action_count_spans[4].text.strip() else np.nan)

        # Print the extracted information for the current page (for debugging purposes)
        print(f"Page: {page}")
        print("Company Names:", company_name[0], ':', company_name[-1])
        # print("Company Details:", company_details[0], ':', company_name[-1])
        print("Company Ratings:", company_rating[0], ':', company_rating[-1])
        print("Company Reviews:", company_review[0], ':', company_review[-1])
        print("Company Salaries:", company_salary[0], ':', company_salary[-1])
        print("Interview Counts:", company_interview[0], ':', company_interview[-1])
        print("Jobs Count:", jobs_count[0], ':', jobs_count[-1])
        print("Company Benefits:", company_benefits[0], ':', company_benefits[-1])

        # Add a sleep interval to avoid overwhelming the server with requests
        time.sleep(1)
        
    except Exception as e:
        print(f"An error occurred on page {page}: {e}")
    
    finally:
        # Close the browser
        driver.quit()

# Combine all the data into a single DataFrame
ambition_dict = {
    'company_name': company_name,
    'company_details': company_details,
    'company_rating': company_rating,
    'company_review': company_review,
    'company_salary': company_salary,
    'company_interview': company_interview,
    'jobs_count': jobs_count,
    'company_benefits': company_benefits
}

df = pd.DataFrame(ambition_dict)
final_list.append(df)

# Concatenate all DataFrames in the list
if final_list:
    final = pd.concat(final_list, ignore_index=True)
    # Save the final DataFrame to a CSV file
    final.to_csv('/home/rakesh/Desktop/ambitionbox_data.csv', index=False)
    print("Data saved to ambitionbox_data.csv")
else:
    print("No data to concatenate into the final DataFrame.")


Page: 1
Company Names: TCS : iEnergizer
Company Ratings: 3.8 : 4.7
Company Reviews: 79.2k : 17.5k
Company Salaries: 8.8L : 19.7k
Interview Counts: 8.9k : 412
Jobs Count: 883 : 86
Company Benefits: 11.7k : 545
Page: 2
Company Names: TCS : Bajaj Finserv
Company Ratings: 3.8 : 4.0
Company Reviews: 79.2k : 8.1k
Company Salaries: 8.8L : 26.6k
Interview Counts: 8.9k : 442
Jobs Count: 883 : 7
Company Benefits: 11.7k : 758
Page: 3
Company Names: TCS : Sutherland Global Services
Company Ratings: 3.8 : 3.7
Company Reviews: 79.2k : 6.2k
Company Salaries: 8.8L : 28.7k
Interview Counts: 8.9k : 521
Jobs Count: 883 : 238
Company Benefits: 11.7k : 968
Page: 4
Company Names: TCS : JSW Steel
Company Ratings: 3.8 : 4.0
Company Reviews: 79.2k : 5k
Company Salaries: 8.8L : 25.8k
Interview Counts: 8.9k : 502
Jobs Count: 883 : 50
Company Benefits: 11.7k : 640
Page: 5
Company Names: TCS : First Source
Company Ratings: 3.8 : 3.8
Company Reviews: 79.2k : 4.2k
Company Salaries: 8.8L : 21.1k
Interview Counts: 8.9