### UP Legislative Assembly (18)

https://uplegisassembly.gov.in/Members/main_members_en.aspx#/ElectedMembers/18

In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import time
import random
import logging
import json
import os
from tqdm import tqdm

In [2]:
# Configure the root logger: records at INFO level and above are written to
# 'mla_scraping.log', each line carrying a timestamp, the level name and the message.
logging.basicConfig(
    filename='mla_scraping.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [3]:
class MLAFirefoxScraper:
    """Scrape UP Legislative Assembly (assembly 18) member profile pages.

    Profiles are fetched one ID at a time with a headless Firefox driver,
    appended to a CSV file, and progress is recorded in a JSON state file so
    an interrupted run can be resumed.

    Attributes:
        output_file: Path of the CSV file rows are written to.
        state_file: Path of the JSON file holding resume state.
        start_id / end_id: Inclusive range of ``mla_id`` values to visit.
        driver: Selenium Firefox driver (created in ``__init__``; set to
            ``None`` once ``run`` has closed it).
    """

    # Hindi row labels as they appear on the profile page -> CSV column names.
    LABEL_MAP = {
        'पिता का नाम': 'father_name',
        'जन्‍म तिथि': 'dob',
        'जन्‍म स्थान': 'birth_place',
        'धर्म': 'religion',
        'जाति': 'caste_category',
        'शिक्षा': 'qualification',
        'विवाह तिथि': 'marriage_date',
        'पत्‍नी का नाम': 'spouse_name',
        'सन्तान': 'children',
        'व्‍यवसाय': 'occupation',
        'मुख्यावास': 'permanent_address',
        'अस्थाई पता': 'temporary_address',
        'मोबाइल नं0': 'mobile_number',
        'ई-मेल': 'email'
    }

    # Fixed CSV schema. Rows are appended without a header, so every row must
    # have exactly these columns in this order or values shift under the wrong
    # header whenever a profile lacks a field.
    COLUMNS = ['mla_id', *LABEL_MAP.values(), 'political_career']

    def __init__(self):
        self.output_file = 'mla_profiles.csv'
        self.state_file = 'scraper_state.json'
        self.start_id = 18001
        self.end_id = 18413
        self.driver = self.setup_driver()

    def setup_driver(self):
        """Create and return a headless Firefox WebDriver.

        The window size is set to 1920x1080, download-related preferences are
        set, and the user agent string is overridden.
        """
        firefox_options = Options()
        firefox_options.add_argument('--headless')  # Run in headless mode
        firefox_options.add_argument('--width=1920')
        firefox_options.add_argument('--height=1080')

        # Download handling preferences and a fixed desktop user agent
        firefox_options.set_preference("browser.download.folderList", 2)
        firefox_options.set_preference("browser.download.manager.showWhenStarting", False)
        firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip")
        firefox_options.set_preference("general.useragent.override",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0")

        return webdriver.Firefox(options=firefox_options)

    def load_state(self):
        """Load the last saved scraper state.

        Returns:
            The dict stored in ``state_file``. If the file is missing or
            cannot be read/parsed, a fresh state
            ``{'last_processed_id': start_id - 1, 'processed_ids': []}``.
        """
        if os.path.exists(self.state_file):
            try:
                with open(self.state_file, 'r') as f:
                    state = json.load(f)
                logging.info(f"Loaded state: Last processed MLA ID = {state.get('last_processed_id')}")
                return state
            except Exception as e:
                logging.error(f"Error loading state file: {e}")
        return {'last_processed_id': self.start_id - 1, 'processed_ids': []}

    def save_state(self, last_id, processed_ids):
        """Persist the current progress to ``state_file``.

        Args:
            last_id: Highest MLA ID processed so far.
            processed_ids: Iterable of all successfully processed MLA IDs.

        Errors are logged, not raised.
        """
        tmp_path = f"{self.state_file}.tmp"
        try:
            # Write to a sibling temp file and swap it in, so an interrupt in
            # the middle of the write cannot leave a truncated state file
            # (which would make the next run restart from the first ID).
            with open(tmp_path, 'w') as f:
                json.dump({
                    'last_processed_id': last_id,
                    'processed_ids': list(processed_ids),
                    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
                }, f)
            os.replace(tmp_path, self.state_file)
        except Exception as e:
            logging.error(f"Error saving state: {e}")

    def _extract_basic_fields(self, data):
        """Fill ``data`` with the labelled fields found on the loaded page."""
        # Fetch the rows and their text once instead of once per label; every
        # element/text access is a round trip to the browser.
        rows = self.driver.find_elements(By.TAG_NAME, "tr")
        row_texts = [(row, row.text) for row in rows]

        for label, field in self.LABEL_MAP.items():
            for row, text in row_texts:
                # NOTE(review): this is a substring match on the whole row
                # text, so a row that merely mentions the label (or an outer
                # row wrapping a nested table) can match first — confirm
                # against the page markup.
                if label not in text:
                    continue
                spans = row.find_elements(By.TAG_NAME, "span")
                if spans:
                    data[field] = spans[0].text.strip()
                    break
                # A matching row without a <span> is skipped; keep looking at
                # later rows instead of giving up on this label.

    def _extract_political_career(self):
        """Return the political career table as a JSON string, or None if absent."""
        try:
            career_table = self.driver.find_element(By.ID, "lbl_political_career")
        except NoSuchElementException:
            return None

        career_info = []
        for row in career_table.find_elements(By.TAG_NAME, "tr"):
            cells = row.find_elements(By.TAG_NAME, "td")
            # Only two-cell rows are (period, description) entries.
            if len(cells) == 2:
                career_info.append({
                    'period': cells[0].text.strip(),
                    'description': cells[1].text.strip()
                })
        # ensure_ascii=False keeps the Hindi text readable in the CSV.
        return json.dumps(career_info, ensure_ascii=False)

    def scrape_mla_profile(self, mla_id):
        """Scrape one MLA profile page.

        Args:
            mla_id: Numeric member ID used in the profile URL.

        Returns:
            A dict with one key per entry in ``COLUMNS`` (fields not found on
            the page are ``None``), or ``None`` if the page timed out or any
            other error occurred (the error is logged).
        """
        url = f"https://uplegisassembly.gov.in/Members/mla_profile_hi.aspx?mla_id={mla_id}&assembly_no=18"

        try:
            self.driver.get(url)
            # Wait for the main content to load
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.ID, "tr_fathername"))
            )

            # Add a small delay to ensure everything loads
            time.sleep(2)

            # Start with every column present so each scraped row has the
            # same shape regardless of which fields the profile contains.
            data = dict.fromkeys(self.COLUMNS)
            data['mla_id'] = mla_id

            self._extract_basic_fields(data)
            data['political_career'] = self._extract_political_career()

            return data

        except TimeoutException:
            logging.error(f"Timeout while scraping MLA ID {mla_id}")
            return None
        except Exception as e:
            logging.error(f"Error scraping MLA ID {mla_id}: {str(e)}")
            return None

    def write_to_csv(self, data, first_write=False):
        """Write one profile row to ``output_file``.

        The row is always laid out according to ``COLUMNS``; keys missing
        from ``data`` become empty cells, and keys not in ``COLUMNS`` are
        dropped with a logged warning.

        Args:
            data: Mapping of column name to value for a single profile.
            first_write: If True, (over)write the file and emit the header.
                Otherwise append; the header is emitted only if the file is
                missing or empty.

        Returns:
            True on success, False if writing failed (the error is logged).
        """
        try:
            extra_keys = [key for key in data if key not in self.COLUMNS]
            if extra_keys:
                logging.warning(f"Ignoring unexpected fields when writing CSV: {extra_keys}")

            df = pd.DataFrame([data]).reindex(columns=self.COLUMNS)

            if first_write:
                mode, header = 'w', True
            else:
                mode = 'a'
                # Appending to a missing/empty file would otherwise produce a
                # headerless CSV.
                header = (not os.path.exists(self.output_file)
                          or os.path.getsize(self.output_file) == 0)

            df.to_csv(self.output_file, index=False, encoding='utf-8-sig',
                      mode=mode, header=header)
            return True
        except Exception as e:
            logging.error(f"Error writing to CSV: {e}")
            return False

    def run(self, retry_failed=False):
        """Scrape every pending MLA ID and append the results to the CSV.

        Args:
            retry_failed: If False (default), resume right after the last
                processed ID recorded in the state file, so IDs that failed
                earlier are not revisited. If True, walk the whole
                ``start_id``..``end_id`` range and scrape every ID that is
                not recorded as processed.

        The browser is always closed when this method returns.
        """
        state = self.load_state()
        last_processed_id = state.get('last_processed_id', self.start_id - 1)
        processed_ids = set(state.get('processed_ids', []))
        first_write = not os.path.exists(self.output_file)
        completed = False

        # A previous run() closes the driver; start a new one if needed.
        if getattr(self, 'driver', None) is None:
            self.driver = self.setup_driver()

        try:
            if retry_failed:
                start_from = self.start_id
            else:
                start_from = max(last_processed_id + 1, self.start_id)
            print(f"Starting from MLA ID: {start_from}")

            for mla_id in tqdm(range(start_from, self.end_id + 1)):
                if mla_id in processed_ids:
                    continue

                try:
                    data = self.scrape_mla_profile(mla_id)
                    if data:
                        success = self.write_to_csv(data, first_write)
                        if success:
                            first_write = False
                            processed_ids.add(mla_id)
                            # Never move the resume point backwards when an
                            # earlier ID is retried.
                            last_processed_id = max(last_processed_id, mla_id)
                            self.save_state(last_processed_id, processed_ids)
                            print(f"Successfully scraped MLA ID: {mla_id}")
                    else:
                        print(f"Failed to scrape MLA ID: {mla_id}")

                    # Random delay between requests (3-5 seconds)
                    time.sleep(3 + random.random() * 2)

                except Exception as e:
                    logging.error(f"Error processing MLA ID {mla_id}: {str(e)}")
                    continue

            completed = True

        except KeyboardInterrupt:
            print("\nScraping interrupted!")
        except Exception as e:
            logging.error(f"Unexpected error: {str(e)}")
            print("\nScraping stopped by an unexpected error (see log).")
        finally:
            try:
                self.driver.quit()
            except Exception as e:
                # Closing is best-effort, but record why it failed.
                logging.warning(f"Error closing browser: {e}")
            self.driver = None

        # Only claim completion when the loop actually reached the last ID.
        if completed:
            print("\nScraping completed!")
        print(f"Results saved to: {self.output_file}")

In [4]:
if __name__ == "__main__":
    # Same file-logging setup as the logging cell near the top of the notebook
    # (INFO and above -> 'mla_scraping.log'); repeated here so this cell also
    # configures logging when it is the first thing executed.
    logging.basicConfig(
        filename='mla_scraping.log',
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )

    # Constructing the scraper starts the Firefox driver; run() then walks
    # the MLA ID range and closes the browser when it finishes.
    scraper = MLAFirefoxScraper()
    scraper.run()

Starting from MLA ID: 18399


  0%|                                                    | 0/15 [00:00<?, ?it/s]

Successfully scraped MLA ID: 18399


  7%|██▉                                         | 1/15 [00:09<02:13,  9.55s/it]

Successfully scraped MLA ID: 18400


 13%|█████▊                                      | 2/15 [00:18<01:57,  9.05s/it]

Successfully scraped MLA ID: 18401


 20%|████████▊                                   | 3/15 [00:26<01:43,  8.65s/it]

Successfully scraped MLA ID: 18402


 27%|███████████▋                                | 4/15 [00:35<01:38,  8.99s/it]

Successfully scraped MLA ID: 18403


 33%|██████████████▋                             | 5/15 [00:43<01:24,  8.44s/it]

Successfully scraped MLA ID: 18404


 40%|█████████████████▌                          | 6/15 [00:52<01:16,  8.52s/it]

Successfully scraped MLA ID: 18405


 47%|████████████████████▌                       | 7/15 [01:01<01:10,  8.83s/it]

Successfully scraped MLA ID: 18406


 53%|███████████████████████▍                    | 8/15 [01:09<01:00,  8.60s/it]

Successfully scraped MLA ID: 18407


 60%|██████████████████████████▍                 | 9/15 [01:19<00:53,  8.90s/it]

Successfully scraped MLA ID: 18408


 67%|████████████████████████████▋              | 10/15 [01:28<00:45,  9.08s/it]

Successfully scraped MLA ID: 18409


 73%|███████████████████████████████▌           | 11/15 [01:36<00:34,  8.73s/it]

Successfully scraped MLA ID: 18410


 80%|██████████████████████████████████▍        | 12/15 [01:45<00:26,  8.79s/it]

Successfully scraped MLA ID: 18411


 87%|█████████████████████████████████████▎     | 13/15 [01:54<00:17,  8.96s/it]

Successfully scraped MLA ID: 18412


 93%|████████████████████████████████████████▏  | 14/15 [02:03<00:08,  8.93s/it]

Successfully scraped MLA ID: 18413


100%|███████████████████████████████████████████| 15/15 [02:12<00:00,  8.82s/it]



Scraping completed!
Results saved to: mla_profiles.csv
