In [1]:
!pip install loguru
!pip install ddddocr

Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: loguru
Successfully installed loguru-0.7.3
Collecting ddddocr
  Downloading ddddocr-1.5.6-py3-none-any.whl.metadata (17 kB)
Collecting onnxruntime (from ddddocr)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting coloredlogs (from onnxruntime->ddddocr)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->ddddocr)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading ddddocr-1.5.6-py3-none-any.whl (75.9 MB)
[2K   [90m━

In [2]:
!unzip /content/Firm.zip

Archive:  /content/Firm.zip
  inflating: Firm lists/2025.01.07.xlsx  
  inflating: Firm lists/2025.01.04.xlsx  
  inflating: Firm lists/2025.01.09.xlsx  
  inflating: Firm lists/2025.01.05.xlsx  
  inflating: Firm lists/2025.01.08.xlsx  
  inflating: Firm lists/2025.01.06.xlsx  


In [3]:
import requests
import ddddocr
import urllib
import time
from loguru import logger
import pandas as pd
import random
from tqdm import tqdm
from PIL import Image, ImageEnhance, ImageFilter

# Configure logger to be less verbose
logger.remove()  # Remove default handler
logger.add(lambda msg: None, level="ERROR")  # Only show errors

def read_excel_from_path(path):
    """Read excel file and return list of firm names"""
    df = pd.read_excel(path, header=None)
    firm_list = df[0].astype(str).to_numpy().tolist()
    firm_list = [firm for firm in firm_list if firm and firm != 'nan']
    return firm_list

def preprocess_captcha_image(image_path):
    """Preprocess CAPTCHA image to improve OCR accuracy"""
    img = Image.open(image_path)
    img = img.convert('L')
    enhancer = ImageEnhance.Contrast(img)
    img = enhancer.enhance(2.0)
    img = img.filter(ImageFilter.SHARPEN)
    threshold = 128
    img = img.point(lambda p: 255 if p > threshold else 0)
    img.save('response_processed.jpeg')
    return 'response_processed.jpeg'

def validate_captcha_format(captcha_text):
    """Validate CAPTCHA text format"""
    if not captcha_text:
        return False
    captcha_text = captcha_text.strip()
    if len(captcha_text) < 3 or len(captcha_text) > 8:
        return False
    if not captcha_text.isalnum():
        return False
    return True

def sc(start, end, sessions, cookies, firm_list):
    """Enhanced main scraping function with superior network resilience"""

    print("Total number of firms to search:", len(firm_list))
    output = []
    firm_captured, firm_not_found, firm_errors = [], [], []

    cookie = 'JSESSIONID=' + str(sessions) + "; insert_cookie=" + str(cookies)

    # Initialize OCR once
    ocr = ddddocr.DdddOcr(show_ad=False)

    def safe_request(func_name, request_func, max_network_retries=5):
        """
        Wrapper for all network requests with exponential backoff
        Retries on network errors but not on application-level failures
        """
        for attempt in range(max_network_retries):
            try:
                result = request_func()
                return result, True
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout,
                    requests.exceptions.ChunkedEncodingError) as e:
                wait_time = min(2 ** attempt, 30)  # Exponential backoff, max 30s
                if attempt < max_network_retries - 1:
                    logger.warning(f"{func_name} network error (attempt {attempt+1}/{max_network_retries}): {type(e).__name__}. Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    logger.error(f"{func_name} failed after {max_network_retries} attempts: {e}")
                    return None, False
            except Exception as e:
                logger.error(f"{func_name} unexpected error: {e}")
                return None, False
        return None, False

    def get_image(cookie):
        """Download CAPTCHA image with network resilience"""
        def _download():
            now_time = int(round(time.time() * 1000))
            url = f"https://wzxxbg.mofcom.gov.cn/gspt/infoPub/entp/search/vCode?r={now_time}"

            headers = {
                'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
                'Accept-Language': 'zh,zh-CN;q=0.9,en;q=0.8',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive',
                'Cookie': cookie,
                'Pragma': 'no-cache',
                'Referer': 'https://wzxxbg.mofcom.gov.cn/gspt/vCode.html',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
            }

            response = requests.get(url, headers=headers, timeout=15)

            if response.status_code == 200 and len(response.content) > 0:
                with open('response.jpeg', 'wb') as photo:
                    photo.write(response.content)
                return True
            return False

        result, success = safe_request("get_image", _download, max_network_retries=3)
        return result if success else False

    def get_verify(image_code, cookie):
        """Verify CAPTCHA code with network resilience"""
        def _verify():
            url = "https://wzxxbg.mofcom.gov.cn/gspt/infoPub/entp/search/checkVCode"
            payload = f"searchWzCode={image_code}"
            headers = {
                'Accept': '*/*',
                'Accept-Language': 'zh,zh-CN;q=0.9,en;q=0.8',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Cookie': cookie,
                'Origin': 'https://wzxxbg.mofcom.gov.cn',
                'Referer': 'https://wzxxbg.mofcom.gov.cn/gspt/vCode.html',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
            }

            response = requests.post(url, headers=headers, data=payload, timeout=15)
            return response.json()

        result, success = safe_request("get_verify", _verify, max_network_retries=4)
        return result if success else {'status': 3}

    def get_search_company(company, image_code, cookie):
        """Search for company with network resilience"""
        def _search():
            url = "https://wzxxbg.mofcom.gov.cn/gspt/infoPub/entp/search/searchEntpList"
            name = urllib.parse.quote(company)
            payload = f"keyWord={name}&searchWzCode={image_code}"
            headers = {
                'Accept': '*/*',
                'Accept-Language': 'zh,zh-CN;q=0.9,en;q=0.8',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Cookie': cookie,
                'Origin': 'https://wzxxbg.mofcom.gov.cn',
                'Referer': 'https://wzxxbg.mofcom.gov.cn/gspt/vCode.html',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
            }

            response = requests.post(url, headers=headers, data=payload, timeout=20)

            if response.status_code == 200:
                res_data = response.json()
                data = res_data.get('data')
                if data:
                    wzResult = data.get('wzResult')
                    result_list = wzResult.get('result')
                    if result_list and len(result_list) > 0:
                        token = result_list[0].get('TOKEN')
                        entp_id = result_list[0].get('ENTP_MAIN_ID')
                        return [token, entp_id, company]
                    else:
                        firm_not_found.append(company)
                        logger.info(f'Firm not found: {company}')
                else:
                    firm_not_found.append(company)
                    logger.info(f'No data returned for: {company}')
            else:
                logger.error(f'Search failed for {company}: {response.status_code}')
            return []

        result, success = safe_request("get_search_company", _search, max_network_retries=5)
        if not success:
            firm_errors.append(company)
            return []
        return result if result else []

    def image_to_str():
        """Enhanced OCR with preprocessing"""
        try:
            processed_path = preprocess_captcha_image('response.jpeg')
            results = []

            # OCR on processed image
            with open(processed_path, 'rb') as f:
                img_bytes = f.read()
                res1 = ocr.classification(img_bytes)
                if validate_captcha_format(res1):
                    results.append(res1)

            # OCR on original image
            with open('response.jpeg', 'rb') as f:
                img_bytes = f.read()
                res2 = ocr.classification(img_bytes)
                if validate_captcha_format(res2):
                    results.append(res2)

            if results:
                return max(set(results), key=results.count)

            return res1 if res1 else res2

        except Exception as e:
            logger.error(f"OCR failed: {e}")
            return ""

    def get_company_detail(entp_id, token, company_name):
        """Retrieve detailed company information with network resilience"""
        def _get_detail():
            url = "https://wzxxbg.mofcom.gov.cn/gspt/infoPub/entp/search/wzEntpDetail"
            payload = f"entpId={entp_id}&token={token}"
            headers = {
                'Accept': '*/*',
                'Accept-Language': 'zh,zh-CN;q=0.9,en;q=0.8',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Cookie': cookie,
                'Origin': 'https://wzxxbg.mofcom.gov.cn',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
            }

            response = requests.post(url, headers=headers, data=payload, timeout=20)

            if response.status_code == 200:
                resp_json = response.json()
                data = resp_json.get('data')
                wzResult = data.get('wzResult')

                res_data = {}
                res_data["search_name"] = company_name
                res_data["entp_name"] = wzResult.get('ENTP_NAME', 'N/A')
                res_data["gs_status_name"] = wzResult.get('GS_STATUS_NAME', 'N/A')
                res_data["business_scope"] = wzResult.get('BUSINESS_SCOPE', 'N/A')
                res_data["entp_gs_code"] = wzResult.get("ENTP_GS_CODE", 'N/A')
                res_data["recorddatefmt"] = wzResult.get("RECORDDATEFMT", 'N/A')
                res_data["industryname"] = wzResult.get("INDUSTRYNAME", 'N/A')
                res_data["register_capital"] = wzResult.get("REGISTER_CAPITAL", 'N/A')
                res_data["unit_name"] = wzResult.get("UNITNAME", 'N/A')
                res_data["reg_addr"] = wzResult.get("REG_ADDR", 'N/A')
                res_data["right_man"] = wzResult.get("RIGHT_MAN", 'N/A')

                # Investor information
                investorResult = data.get('investorResult', [])
                investorResult = [ir for ir in investorResult if ir.get('INVESTOR_NAME') is not None]
                investors = []
                for inv in investorResult:
                    iName = inv.get('INVESTOR_NAME', 'N/A')
                    cName = inv.get('COUNTRYNAME', 'N/A')
                    amount = inv.get('CAPITAL_AMOUNT', 'N/A')
                    investors.append(f"{iName}~{cName}~{amount}")
                res_data["investor_info"] = '|'.join(investors) if investors else 'N/A'

                # Change history
                entpAlterList = data.get('entpAlterList', [])
                entpAlterList = [ir for ir in entpAlterList if ir.get('ALTITEM') is not None]
                changes = []
                for change in entpAlterList:
                    cType = change.get('ALTITEM', 'N/A')
                    cBefore = change.get('ALTBE', 'N/A')
                    cAfter = change.get('ALTAF', 'N/A')
                    cDate = change.get('ALTDATE', 'N/A')
                    changes.append(f"{cType}~{cBefore}~{cAfter}~{cDate}")
                res_data["changes_info"] = '|'.join(changes) if changes else 'N/A'

                # Annual reports
                lhnbResult = data.get('lhnbResult', [])
                if lhnbResult:
                    res_data['year_report'] = "|".join([r.get('YEAR', '') for r in lhnbResult])
                else:
                    res_data['year_report'] = 'N/A'

                return res_data
            return None

        result, success = safe_request("get_company_detail", _get_detail, max_network_retries=5)
        return result if success else None

    def try_catch_loop(cookie, start, end):
        """Main processing loop with enhanced resilience"""
        time_to_stop = False
        max_retries_for_status3 = 10  # Increased from 8
        consecutive_failures = 0
        max_consecutive_failures = 15  # Stop if 15 firms fail in a row

        for i in tqdm(range(start, end), desc="Processing firms", colour='green'):
            if time_to_stop:
                break

            # Adaptive delay based on consecutive failures
            if consecutive_failures > 5:
                delay = random.uniform(4, 7)  # Longer delay if having issues
            else:
                delay = random.uniform(2, 4)  # Normal delay
            time.sleep(delay)

            retries = 0
            captcha_success = False
            firm_processed = False

            while retries < max_retries_for_status3:
                # Download CAPTCHA
                if not get_image(cookie):
                    retries += 1
                    time.sleep(3)
                    continue

                # OCR
                image_code = image_to_str()

                if not image_code or not validate_captcha_format(image_code):
                    retries += 1
                    continue

                # Verify CAPTCHA
                verify_data = get_verify(image_code, cookie)

                if int(verify_data.get('status', 3)) == 3:
                    retries += 1
                    time.sleep(1)
                    continue

                # CAPTCHA successful
                captcha_success = True
                token_list = get_search_company(
                    company=firm_list[i],
                    image_code=image_code,
                    cookie=cookie
                )

                # Retrieve details if found
                if token_list and len(token_list) == 3:
                    entp_id = token_list[1]
                    token = token_list[0]
                    company_name = token_list[2]

                    data = get_company_detail(entp_id=entp_id, token=token, company_name=company_name)
                    if data:
                        output.append(data)
                        firm_captured.append(company_name)
                        consecutive_failures = 0  # Reset counter on success
                        firm_processed = True
                    else:
                        consecutive_failures += 1
                else:
                    # Firm not found is not a failure
                    consecutive_failures = 0
                    firm_processed = True

                break

            if not firm_processed:
                consecutive_failures += 1

            # Check for cookie expiration or persistent issues
            if retries >= max_retries_for_status3:
                logger.error(f"Max CAPTCHA retries for firm {i}: {firm_list[i]}")
                if not captcha_success:
                    time_to_stop = True
                    print("Cookie likely expired - please refresh")
                    return True

            if consecutive_failures >= max_consecutive_failures:
                logger.error(f"Too many consecutive failures ({consecutive_failures}). Network may be unstable.")
                print(f"Stopping due to {consecutive_failures} consecutive failures. Check network connection.")
                return True

        return False

    # Main execution
    start = start
    end = min(len(firm_list), end)
    max_retries = 2  # Reduced outer retries since inner retries are stronger

    for retry_attempt in range(max_retries):
        if len(firm_captured) == 0:
            s = start
        else:
            last_firm = firm_captured[-1]
            try:
                last_index = firm_list.index(last_firm)
                s = last_index + 1
            except ValueError:
                s = start

        if s >= end:
            logger.info("All firms in range have been processed")
            break

        logger.info(f"Starting/resuming from firm index {s}")

        try:
            cookie_expired = try_catch_loop(cookie, s, end)
            if cookie_expired:
                logger.warning("Cookie expired or network unstable - stopping this chunk")
                break

        except Exception as e:
            logger.error(f"Unexpected error: {e}")
            time.sleep(15)
            print(f"Captured: {len(output)}, Not found: {len(firm_not_found)}, Errors: {len(firm_errors)}")

    # Generate results
    if output:
        result_dic = {
            '搜索公司名称': [record.get('search_name', 'N/A') for record in output],
            '公司名称': [record.get('entp_name', 'N/A') for record in output],
            '统一社会信用代码/组织机构代码': [record.get('entp_gs_code', 'N/A') for record in output],
            '状况': [record.get('gs_status_name', 'N/A') for record in output],
            '成立日期': [record.get('recorddatefmt', 'N/A') for record in output],
            '投资行业': [record.get('industryname', 'N/A') for record in output],
            '注册资本': [record.get('register_capital', 'N/A') for record in output],
            '注册资本单位': [record.get('unit_name', 'N/A') for record in output],
            '经营范围': [record.get('business_scope', 'N/A') for record in output],
            '地址': [record.get('reg_addr', 'N/A') for record in output],
            '法定代表人': [record.get('right_man', 'N/A') for record in output],
            '投资者信息': [record.get('investor_info', 'N/A') for record in output],
            '变更信息': [record.get('changes_info', 'N/A') for record in output],
            '年报年度': [record.get('year_report', 'N/A') for record in output]
        }
        result = pd.DataFrame.from_dict(result_dic)
    else:
        result = pd.DataFrame()

    # Log summary
    success_rate = (len(output) / (end - start)) * 100 if (end - start) > 0 else 0
    print(f"\n{'='*50}")
    print(f"Summary:")
    print(f"Successfully captured: {len(output)}")
    print(f"Not found: {len(firm_not_found)}")
    print(f"Errors: {len(firm_errors)}")
    print(f"Success rate: {success_rate:.1f}%")
    print(f"{'='*50}\n")

    return result

def processing(start, end, session, cookie, path_to_firm_list, max_chunk_size):
    """Process firms in chunks with progress tracking"""
    firm_list = read_excel_from_path(path_to_firm_list)

    if start is None:
        start = 0
    if end is None:
        end = len(firm_list)

    print(f"Total firms to process: {end - start}")
    print(f"Processing in chunks of: {max_chunk_size}")

    total_captured = 0
    total_not_found = 0
    total_errors = 0

    for current_start in range(start, end, max_chunk_size):
        current_end = min(current_start + max_chunk_size, end)

        print(f"\n{'='*60}")
        print(f"Processing chunk: {current_start+1} to {current_end}")
        print(f"{'='*60}\n")

        result = sc(current_start, current_end, session, cookie, firm_list)

        if not result.empty:
            filename = f"./firm_info_{current_start+1}_{current_end}.xlsx"
            result.to_excel(filename, index=False)
            print(f"Saved: {filename}")
            total_captured += len(result)
        else:
            print(f"No results for chunk {current_start+1}-{current_end}")

    print("\n" + "="*60)
    print("FINAL SUMMARY:")
    print(f"Total firms captured: {total_captured}/{end - start}")
    #print(f"Number of firms tested on: {end}")
    print(f"Overall Accuracy: { (total_captured / (end - start)) * 100:.2f}%")
    print(f"Processing complete!")
    print("="*60)

In [4]:
import pandas as pd
import re

def classify_fie_probability(firm_list_path, output_path=None):
    """
    Classify firms by probability of being Foreign Invested Enterprises
    Returns firms sorted by likelihood of being in MOFCOM database
    """

    # Read firm list
    df = pd.read_excel(firm_list_path, header=None)
    df.columns = ['firm_name']
    df['firm_name'] = df['firm_name'].astype(str)
    df = df[df['firm_name'] != 'nan']

    # Initialize probability score
    df['fie_score'] = 0
    df['indicators'] = ''

    # High probability indicators (foreign company patterns)
    foreign_patterns = {
        'foreign_name': r'[A-Z][a-z]+ [A-Z][a-z]+|[A-Z]{2,}',  # English names
        'foreign_suffix': r'\(.*\)$|（.*）$',  # Company with foreign parent in parentheses
        'ltd_variants': r'(?i)(pty|pte|ltd|inc|corp|gmbh|sarl|s\.a\.|limited)\.?$',
        'location_hk': r'(?i)(香港|hk|hong kong)',
        'location_taiwan': r'(?i)(台湾|taiwan)',
        'location_foreign': r'(?i)(新加坡|singapore|日本|japan|韩国|korea|美国|usa|德国|germany)',
    }

    # Medium probability indicators
    foreign_keywords = {
        'foreign_region': r'(?i)(外商|外资)',  # "Foreign investment" in Chinese
        'intl_keywords': r'(?i)(国际|international)',
        'global_keywords': r'(?i)(环球|global|worldwide)',
    }

    # Low probability indicators (domestic-only patterns)
    domestic_patterns = {
        'province': r'^(北京|上海|天津|重庆|河北|山西|辽宁|吉林|黑龙江|江苏|浙江|安徽|福建|江西|山东|河南|湖北|湖南|广东|海南|四川|贵州|云南|陕西|甘肃|青海|台湾|内蒙古|广西|西藏|宁夏|新疆)',
        'city_start': r'^[\u4e00-\u9fa5]{2,3}市',  # Starts with Chinese city name
    }

    def calculate_score(name):
        score = 0
        indicators = []

        # Check foreign patterns (high confidence)
        for pattern_name, pattern in foreign_patterns.items():
            if re.search(pattern, name):
                if pattern_name == 'foreign_name':
                    score += 30
                    indicators.append('英文名')
                elif pattern_name == 'location_hk':
                    score += 25
                    indicators.append('香港')
                elif pattern_name == 'location_foreign':
                    score += 40
                    indicators.append('外国地名')
                else:
                    score += 20
                    indicators.append(pattern_name)

        # Check foreign keywords (medium confidence)
        for keyword_name, pattern in foreign_keywords.items():
            if re.search(pattern, name):
                score += 15
                indicators.append(keyword_name)

        # Unified credit codes starting with 9 are usually newer companies
        if name.startswith('91'):
            score += 5
            indicators.append('统一信用代码')

        # Check domestic patterns (reduce score)
        for pattern_name, pattern in domestic_patterns.items():
            if re.search(pattern, name):
                score -= 10
                indicators.append(f'domestic_{pattern_name}')

        # Pure numeric codes (might be organization codes for FIEs)
        if re.match(r'^\d{9,18}[A-Z0-9]?$', name):
            score += 10
            indicators.append('组织代码')

        return score, ', '.join(indicators) if indicators else 'none'

    # Calculate scores
    df[['fie_score', 'indicators']] = df['firm_name'].apply(
        lambda x: pd.Series(calculate_score(x))
    )

    # Classify by probability
    def classify(score):
        if score >= 40:
            return 'Very High'
        elif score >= 25:
            return 'High'
        elif score >= 10:
            return 'Medium'
        elif score >= 0:
            return 'Low'
        else:
            return 'Very Low'

    df['probability'] = df['fie_score'].apply(classify)

    # Sort by probability (highest first)
    df_sorted = df.sort_values('fie_score', ascending=False)

    # Statistics
    stats = df['probability'].value_counts()
    total = len(df)

    print("\n" + "="*60)
    print("FIE PROBABILITY ANALYSIS")
    print("="*60)
    print(f"Total firms analyzed: {total}")
    print("\nProbability Distribution:")
    for prob in ['Very High', 'High', 'Medium', 'Low', 'Very Low']:
        count = stats.get(prob, 0)
        pct = (count/total)*100
        print(f"  {prob:12s}: {count:4d} firms ({pct:5.1f}%)")

    print("\n" + "="*60)
    print("RECOMMENDATION:")
    high_prob = stats.get('Very High', 0) + stats.get('High', 0)
    expected_success = high_prob * 0.4  # Assume 40% of high-prob firms exist
    print(f"Focus on 'Very High' and 'High' probability firms: {high_prob} firms")
    print(f"Expected successful captures: ~{int(expected_success)} firms")
    print("="*60 + "\n")

    # Sample output
    print("Sample High Probability Firms:")
    print(df_sorted.head(10)[['firm_name', 'fie_score', 'probability', 'indicators']].to_string(index=False))

    print("\n\nSample Low Probability Firms:")
    print(df_sorted.tail(10)[['firm_name', 'fie_score', 'probability', 'indicators']].to_string(index=False))

    # Save if output path provided
    if output_path:
        df_sorted.to_excel(output_path, index=False)
        print(f"\n\nFull analysis saved to: {output_path}")

    return df_sorted

# Example usage:
# classified_df = classify_fie_probability('/path/to/your/firmlist.xlsx', 'classified_firms.xlsx')
#
# To scrape only high-probability firms:
# high_prob_firms = classified_df[classified_df['probability'].isin(['Very High', 'High'])]
# high_prob_firms[['firm_name']].to_excel('high_prob_only.xlsx', index=False, header=False)


In [5]:
classified_df = classify_fie_probability('/content/Firm lists/2025.01.09.xlsx', 'classified_firms.xlsx')

# To scrape only high-probability firms:
high_prob_firms = classified_df[classified_df['probability'].isin(['Very High', 'High'])]
high_prob_firms[['firm_name']].to_excel('high_prob_only.xlsx', index=False, header=False)

processing(0, 2000, 'CA1711D9EA2AD638C4265228A73BF5DE', '32151754', 'high_prob_only.xlsx', 100)


FIE PROBABILITY ANALYSIS
Total firms analyzed: 21090

Probability Distribution:
  Very High   :   58 firms (  0.3%)
  High        : 10808 firms ( 51.2%)
  Medium      : 7567 firms ( 35.9%)
  Low         : 2256 firms ( 10.7%)
  Very Low    :  401 firms (  1.9%)

RECOMMENDATION:
Focus on 'Very High' and 'High' probability firms: 10866 firms
Expected successful captures: ~4346 firms

Sample High Probability Firms:
         firm_name  fie_score probability      indicators
91440300MA5HKQYK1X         60   Very High 英文名, 香港, 统一信用代码
91440400MA55UHKB8E         60   Very High 英文名, 香港, 统一信用代码
91310115MADFHKJK6C         60   Very High 英文名, 香港, 统一信用代码
91440300MA5HKE439A         60   Very High 英文名, 香港, 统一信用代码
91331000MA7DHK4N21         60   Very High 英文名, 香港, 统一信用代码
91440300MA5HKJGQ7T         60   Very High 英文名, 香港, 统一信用代码
91440300MA5HK8344T         60   Very High 英文名, 香港, 统一信用代码
91310000MA1HK0H36A         60   Very High 英文名, 香港, 统一信用代码
91210213MA0YW2HKX0         60   Very High 英文名, 香港, 统一信用代码
9144

Processing firms: 100%|[32m██████████[0m| 100/100 [12:26<00:00,  7.46s/it]
Processing firms: 100%|[32m██████████[0m| 10/10 [00:59<00:00,  6.00s/it]



Summary:
Successfully captured: 10
Not found: 100
Errors: 0
Success rate: 10.0%

Saved: ./firm_info_1_100.xlsx

Processing chunk: 101 to 200

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [12:58<00:00,  7.79s/it]
Processing firms: 100%|[32m██████████[0m| 16/16 [01:40<00:00,  6.29s/it]



Summary:
Successfully captured: 7
Not found: 109
Errors: 0
Success rate: 7.0%

Saved: ./firm_info_101_200.xlsx

Processing chunk: 201 to 300

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [11:35<00:00,  6.95s/it]
Processing firms: 100%|[32m██████████[0m| 2/2 [00:12<00:00,  6.42s/it]



Summary:
Successfully captured: 8
Not found: 94
Errors: 0
Success rate: 8.0%

Saved: ./firm_info_201_300.xlsx

Processing chunk: 301 to 400

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [13:37<00:00,  8.17s/it]
Processing firms: 100%|[32m██████████[0m| 2/2 [00:12<00:00,  6.46s/it]



Summary:
Successfully captured: 10
Not found: 92
Errors: 0
Success rate: 10.0%

Saved: ./firm_info_301_400.xlsx

Processing chunk: 401 to 500

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [12:22<00:00,  7.43s/it]
Processing firms: 100%|[32m██████████[0m| 1/1 [00:05<00:00,  5.35s/it]



Summary:
Successfully captured: 14
Not found: 87
Errors: 0
Success rate: 14.0%

Saved: ./firm_info_401_500.xlsx

Processing chunk: 501 to 600

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [13:18<00:00,  7.98s/it]
Processing firms: 100%|[32m██████████[0m| 4/4 [00:26<00:00,  6.58s/it]



Summary:
Successfully captured: 15
Not found: 89
Errors: 0
Success rate: 15.0%

Saved: ./firm_info_501_600.xlsx

Processing chunk: 601 to 700

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [12:20<00:00,  7.40s/it]
Processing firms: 100%|[32m██████████[0m| 22/22 [02:59<00:00,  8.15s/it]



Summary:
Successfully captured: 8
Not found: 114
Errors: 0
Success rate: 8.0%

Saved: ./firm_info_601_700.xlsx

Processing chunk: 701 to 800

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [13:58<00:00,  8.38s/it]
Processing firms: 100%|[32m██████████[0m| 13/13 [01:21<00:00,  6.30s/it]



Summary:
Successfully captured: 9
Not found: 104
Errors: 0
Success rate: 9.0%

Saved: ./firm_info_701_800.xlsx

Processing chunk: 801 to 900

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [13:10<00:00,  7.90s/it]
Processing firms: 100%|[32m██████████[0m| 31/31 [03:33<00:00,  6.90s/it]



Summary:
Successfully captured: 15
Not found: 116
Errors: 0
Success rate: 15.0%

Saved: ./firm_info_801_900.xlsx

Processing chunk: 901 to 1000

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [13:17<00:00,  7.97s/it]
Processing firms: 100%|[32m██████████[0m| 4/4 [00:21<00:00,  5.34s/it]



Summary:
Successfully captured: 11
Not found: 93
Errors: 0
Success rate: 11.0%

Saved: ./firm_info_901_1000.xlsx

Processing chunk: 1001 to 1100

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [11:55<00:00,  7.16s/it]
Processing firms: 100%|[32m██████████[0m| 4/4 [00:25<00:00,  6.42s/it]



Summary:
Successfully captured: 7
Not found: 97
Errors: 0
Success rate: 7.0%

Saved: ./firm_info_1001_1100.xlsx

Processing chunk: 1101 to 1200

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [11:57<00:00,  7.17s/it]
Processing firms: 100%|[32m██████████[0m| 38/38 [04:31<00:00,  7.15s/it]



Summary:
Successfully captured: 9
Not found: 129
Errors: 0
Success rate: 9.0%

Saved: ./firm_info_1101_1200.xlsx

Processing chunk: 1201 to 1300

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [12:19<00:00,  7.40s/it]



Summary:
Successfully captured: 9
Not found: 91
Errors: 0
Success rate: 9.0%

Saved: ./firm_info_1201_1300.xlsx

Processing chunk: 1301 to 1400

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [11:54<00:00,  7.14s/it]
Processing firms: 100%|[32m██████████[0m| 16/16 [02:03<00:00,  7.73s/it]



Summary:
Successfully captured: 8
Not found: 108
Errors: 0
Success rate: 8.0%

Saved: ./firm_info_1301_1400.xlsx

Processing chunk: 1401 to 1500

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [13:06<00:00,  7.87s/it]
Processing firms: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.46s/it]



Summary:
Successfully captured: 13
Not found: 88
Errors: 0
Success rate: 13.0%

Saved: ./firm_info_1401_1500.xlsx

Processing chunk: 1501 to 1600

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [13:22<00:00,  8.02s/it]
Processing firms: 100%|[32m██████████[0m| 1/1 [00:06<00:00,  6.20s/it]



Summary:
Successfully captured: 15
Not found: 86
Errors: 0
Success rate: 15.0%

Saved: ./firm_info_1501_1600.xlsx

Processing chunk: 1601 to 1700

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [14:03<00:00,  8.43s/it]
Processing firms: 100%|[32m██████████[0m| 18/18 [02:36<00:00,  8.70s/it]



Summary:
Successfully captured: 17
Not found: 101
Errors: 0
Success rate: 17.0%

Saved: ./firm_info_1601_1700.xlsx

Processing chunk: 1701 to 1800

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [12:57<00:00,  7.78s/it]



Summary:
Successfully captured: 11
Not found: 89
Errors: 0
Success rate: 11.0%

Saved: ./firm_info_1701_1800.xlsx

Processing chunk: 1801 to 1900

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [13:28<00:00,  8.09s/it]
Processing firms: 100%|[32m██████████[0m| 8/8 [01:21<00:00, 10.22s/it]



Summary:
Successfully captured: 16
Not found: 92
Errors: 0
Success rate: 16.0%

Saved: ./firm_info_1801_1900.xlsx

Processing chunk: 1901 to 2000

Total number of firms to search: 10866


Processing firms: 100%|[32m██████████[0m| 100/100 [14:18<00:00,  8.58s/it]
Processing firms: 100%|[32m██████████[0m| 7/7 [00:44<00:00,  6.29s/it]


Summary:
Successfully captured: 7
Not found: 100
Errors: 0
Success rate: 7.0%

Saved: ./firm_info_1901_2000.xlsx

FINAL SUMMARY:
Total firms captured: 219/2000
Overall Accuracy: 10.95%
Processing complete!





In [6]:
# PREVIOUS: processing(1, 500, '95628633CBBB63E551F16A5EF4BF02E0', '58736055', 'high_prob_firms.xlsx', 100)