# Import Required Libraries

In [1]:
#从selenium导入浏览器驱动
from selenium import webdriver
#导入浏览器驱动设置选项
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
#导入网页解析库
from bs4 import BeautifulSoup
#导入时间库,利用time.time()防止爬虫访问过于频繁被禁止访问
import time
#导入pandas数据分析库,生成dataframe
import pandas as pd
import pickle
import re
from china_cities import *
import csv
import requests, lxml
import json
import pinyin.cedict

# Airtable Authentication

In [5]:
ENV = 'dev'
ENV_FILENAME = 'prod.env' if ENV == 'prod' else 'dev.env'
AIRTABLE_SHEET_NAME = 'Company Data ENTRY TABLE' if ENV == 'prod' else 'qcc_scrape'

In [3]:
env_vars = {}

with open(ENV_FILENAME) as f:
    for line in f:
        if line.startswith('#') or not line.strip():
            continue
        key, value = line.strip().split('=')
        env_vars[key] = value

AIRTABLE_TOKEN = env_vars["AIRTABLE_TOKEN"]
AIRTABLE_BASE_ID = env_vars["AIRTABLE_BASE_ID"]
AIRTABLE_URL = f"https://api.airtable.com/v0/{AIRTABLE_BASE_ID}"

# Set Up Chrome Driver

In [6]:
driver = webdriver.Chrome() #实例化一个浏览器
driver.get('https://www.qcc.com/')
print('请在打开的网页，扫码登录！')
time.sleep(3)
new_cookies = driver.get_cookies() #获取最新的cookies
pickle.dump( new_cookies, open("cookies.pkl","wb"))
driver.quit() #关闭浏览器

请在打开的网页，扫码登录！


In [7]:
'''若取消下方被注释的代码可静默运行浏览器，不会显示页面，仅在后台运行'''
chrome_options=Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome()           #实例化一个谷歌浏览器对象
driver.get("https://www.qcc.com/")   #浏览器打开企查查网站

cookies = pickle.load(open("cookies.pkl", "rb"))
for cookie in cookies:
    driver.add_cookie(cookie)
driver.get("https://www.qcc.com/")
driver.refresh()                #自动刷新页面，请检查是否已经自动登录账号

# Functions for Airtable API

In [9]:
def save_in_json(input, filename):        
    with open(filename, 'w') as jsonfile:
        jsonfile.write(json.dumps(input, indent=4))

In [11]:
def get_airtable_data(sheet_name, offset=None):
    url = f"{AIRTABLE_URL}/{sheet_name}"
    headers = {
        'Authorization': f'Bearer {AIRTABLE_TOKEN}',
        'Content-Type': 'application/json'
    }
    params = {}
    if offset:
        params["offset"] = offset
    
    response = requests.request("GET", url, headers=headers, params=params)
    return response

In [12]:
def update_airtable_data(sheet_name, version):
    url = f"{AIRTABLE_URL}/{sheet_name}"
    headers = {
        'Authorization': f'Bearer {AIRTABLE_TOKEN}',
        'Content-Type': 'application/json'
    }
    
    f = open ('patch_{}.json'.format(version), "r")
    payload = json.loads(f.read())
    response = requests.request("PATCH", url, headers=headers, data=json.dumps(payload))
    return response

# Functions that Scrape Data

In [8]:
def get_company_table_from_search(company):
    time.sleep(3.2)
    driver.find_element(By.XPATH, "//input[contains(@id,'searchKey')]").clear()
    driver.find_element(By.XPATH, "//input[contains(@id,'searchKey')]").send_keys(company)
    driver.find_element(By.XPATH, "//button[@class='btn btn-primary']").click()
    time.sleep(3)
    bs = BeautifulSoup(driver.page_source,'html.parser')  #将加载好的网页用BeautifulSoup解析成文本
    search_company_list_table = bs.find_all('table')[0].find_all('tr')
    search_company_list_data = []
    
    for table_row in search_company_list_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 2:
                    continue
                cell_content = table_cell.text.strip().replace('最近浏览','').split('\n',1)[0].split(' 存续',1)[0].strip()
                row_data.append(cell_content.split(' ',1)[0])
            except:
                continue
        search_company_list_data.append(row_data)
        
    dataFrame = pd.DataFrame(data = search_company_list_data[:], columns = search_company_list_data[0])
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
        
    company_name = str(dataFrame.loc[0:0]).strip().split('\n')[0]
    print('A:', company_name)
    print('B:', company)
    if company_name == company:
        wait = WebDriverWait(driver, 3)
        links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href,'https://www.qcc.com/firm/')]")))
        driver.execute_script("arguments[0].target='_self';", links[0])
        driver.find_element(By.XPATH, "//a[contains(@href,'https://www.qcc.com/firm/')]").click()
        time.sleep(3)
        bs = BeautifulSoup(driver.page_source,'html.parser')  #将加载好的网页用BeautifulSoup解析成文本
        # print('漂亮的汤:', bs)

        html_table = bs.find("table")
        str_table = str(html_table)
        # print('基本信息:', str_table)
        return str_table
    else:
        return False

In [4]:
def get_registered_year(str_table):
    str_table_year = re.compile(r'([1|2]\d\d\d)[-](0[1-9]|1[012])[-](0[1-9]|[12][0-9]|3[01])')
    iterator = str_table_year.findall(str_table)
    if len(list(iterator)) > 0:
         return list(iterator)[0][0]
    elif "年" in str_table:
        str_table_year = re.compile(r'([1|2]\d\d\d)[年](\d+)[月](\d+)[日]')
        iterator = str_table_year.findall(str_table)
        return list(iterator)[0][0]

In [5]:
def get_headquarter(str_table):
    headquarters = {
        '开曼群岛': 'Cayman Islands',
        '開曼群島':'Cayman Islands',
        '英属维尔京群岛': 'British Virgin Islands'
    }
    with open('chinese_cities.csv', newline='') as csvfile:
        headquarter_reader = csv.reader(csvfile, delimiter=',')
        for en_city, cn_city, en_province in headquarter_reader:
            if cn_city in ['澳门', '澳門', '北京市', '重庆市', '上海市', '天津市']:
                headquarters[cn_city] = "{}, China".format(en_city)
            elif cn_city in ['香港']:
                headquarters[cn_city] = "{} SAR, China".format(en_city)
            else:
                headquarters[cn_city] = "{}, {}, China".format(en_city, en_province)

    en_headquarter = [headquarter for cn_city, headquarter in headquarters.items() if cn_city in str_table]
    if '非香港' in str_table:
        en_headquarter.remove('Hong Kong SAR, China')
    if 'Cayman Islands' in en_headquarter:
        return 'Cayman Islands'
    if 'British Virgin Islands' in en_headquarter:
        return 'British Virgin Islands'
    if len(en_headquarter) == 0:
        return ''
    else:
        return en_headquarter[0]

# Ownership for Public and Private Companies

In [16]:
def get_public_company_ownership():
    all_elements = driver.find_elements(By.XPATH, "//span[@class='name']/a")
    top_ten_shareholders = []

    for index, element in enumerate(all_elements):
        top_ten_shareholders.append(element.get_attribute("innerHTML"))
#     print('十大股东:', top_ten_shareholders[:10])

    bs = BeautifulSoup(driver.page_source,'html.parser')
    shareholders_table = bs.find_all('table')[7].find_all('tr')
    shareholders_data = []

    for table_row in shareholders_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 4:
                    continue
                if column >= 4:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 有股权质押',1)[0]
                if column == 6 and cell_content != '持股数(股)':
                    cell_content = "{:,}".format(int(cell_content))
                if column == 9 and cell_content != '增减(股)'and cell_content != '不变':
                    cell_content = "{:,}".format(int(cell_content))
                row_data.append(cell_content)
            except:
                continue
        shareholders_data.append(row_data)

    dataFrame = pd.DataFrame(data = shareholders_data[1:], columns = shareholders_data[0])
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
    return dataFrame

In [17]:
def get_private_company_ownership():
    bs = BeautifulSoup(driver.page_source,'html.parser')
    shareholders_table = bs.find_all('table')[1].find_all('tr')
    shareholders_data = []
    for table_row in shareholders_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 2:
                    continue
                if column >= 2:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 有股权质押',1)[0].split(' 香港',1)[0]
                row_data.append(cell_content)
            except:
                continue
        shareholders_data.append(row_data)
    dataFrame = pd.DataFrame(data = shareholders_data[1:], columns = shareholders_data[0])
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
    return dataFrame

# Company Basic Information

In [19]:
def get_company_basic_information_table(basic_table):
    # get basic table column name
    basic_column = []
    for table_row in basic_table:
        column_names = []

        for column, table_cell in enumerate(table_row):
            try:
                if column in [0, 1, 4, 8]:
                    column_name = table_cell.text.strip().split('\n',1)[0]
                if column == 1:
                    column_name = column_name[0]
                if column in [2, 3, 5, 6, 7, 9, 10]:
                    continue
                column_names.append(column_name)
            except:
                continue
        basic_column.append(column_names)

    # get basic table data
    basic_data = []
    for table_row in basic_table:
        column_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column == 2:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 关联',1)[0].split('  复制',1)[0].split(' 附近企业',1)[0]
                if column in [6, 10]:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 趋势图',1)[0].split('  复制',1)[0]
                if column in [0, 1, 3, 4, 5, 7, 8, 9]:
                    continue
                column_data.append(cell_content)
            except:
                continue
        basic_data.append(column_data)

    # create dataFrame
    flat_basic_column = [item for sublist in basic_column for item in sublist]
    flat_basic_data = [item for sublist in basic_data for item in sublist]
    dataFrame = pd.DataFrame(data = [flat_basic_data], columns = flat_basic_column)
#     display(dataFrame.transpose())
    return dataFrame

In [None]:
#get_basic_information
def get_company_basic_information(company_basic_information_dataframe):
    full_english_column_name = [col for col in company_basic_information_dataframe.columns if '英文名' in col][0]
    full_chinese_column_name = "企业名称" if "企业名称" in company_basic_information_dataframe.columns else "企业中文名称"
    year_founded_column_name = "成立日期" if "成立日期" in company_basic_information_dataframe.columns else "注册日期"
    company_status_column_name = "登记状态" if "登记状态" in company_basic_information_dataframe.columns else "状态"

    company_basic_information = {
        "_full_english_name": company_basic_information_dataframe[full_english_column_name].values[0],
        "_full_chinese_name": company_basic_information_dataframe[full_chinese_column_name].values[0],
        "year_founded": company_basic_information_dataframe[year_founded_column_name].values[0][:4],
        "hq_location": company_basic_information_dataframe["注册地址"].values[0],
#         "_industry": company_basic_information_dataframe["经营范围"][0],
        "company_status": company_basic_information_dataframe[company_status_column_name][0]
    }
    if "经营范围" in company_basic_information_dataframe.columns:
        company_basic_information["_industry"] = company_basic_information_dataframe["经营范围"][0]
            
    return company_basic_information

In [2]:
# 主要人员
def get_leadership_names():
    leadership_table = bs.find_all('table')[3].find_all('tr')
    leadership_data = []
    for table_row in leadership_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 3:
                    continue
                if column >= 2:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 香港',1)[0].split(' 最终受益人',1)[0].split()[0]
                row_data.append(cell_content)
            except:
                continue
        leadership_data.append(row_data)

    dataFrame = pd.DataFrame(data = leadership_data[1:], columns = leadership_data[0])
    dataFrame.to_csv('阿里巴巴（中国）网络技术有限公司_leadership.csv')
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
    return dataFrame

In [None]:
def get_subsidiaries():
    subsidiaries_table = bs.find_all('table')[4].find_all('tr')
    subsidiaries_data = []
    for table_row in subsidiaries_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 3:
                    continue
                if column <= 5:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 有股权质押',1)[0].split(' 香港',1)[0].split(' A股',1)[0]
                if column == 6:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split()[0]
                if column >= 7:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 有股权质押',1)[0].split(' 香港',1)[0].split(' A股',1)[0]
                row_data.append(cell_content)
            except:
                continue
        subsidiaries_data.append(row_data)

    dataFrame = pd.DataFrame(data = subsidiaries_data[1:], columns = subsidiaries_data[0])
    # dataFrame.to_csv('阿里巴巴（中国）网络技术有限公司_subsidiaries.csv')
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
    return dataFrame

In [None]:
#Successfully click into shareholder's link
def get_individual_ownership(index, company_ownership_dataframe):
    wait = WebDriverWait(driver, 4)
    if "股东及出资信息" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["股东及出资信息"].values[index].strip()
    elif "发起人及出资信息" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["发起人及出资信息"].values[index].strip()
    elif "股东名称" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["股东名称"].values[index].strip()
    elif "合伙人信息" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["合伙人信息"].values[index].strip()
    
    if len(search) <= 3:
        return False
    
    print('search:', search)
    search_xpath = "//a[text()='{}']".format(search)
    print('search_xpath:', search_xpath)
    try:
        links = wait.until(EC.presence_of_all_elements_located((By.XPATH, search_xpath)))
        print('links:', links)
        driver.execute_script("arguments[0].target='_self';", links[0])
        driver.find_element(By.XPATH, search_xpath).click()
        cs = BeautifulSoup(driver.page_source,'html.parser')
        basic_table = cs.find_all('table')[0].find_all('tr')
        company_basic_information_dataframe = get_company_basic_information_table(basic_table)
        basic_information = get_company_basic_information(company_basic_information_dataframe)
    #     print(basic_information)
        driver.back()
        return basic_information
    except:
        return False

# Parse Data to Airtable Format

In [18]:
#读取Public/Private的股东信息
def set_company_ownership():
    bs = BeautifulSoup(driver.page_source,'html.parser')
    public_company_ownership_button = driver.find_element(By.XPATH, "//a[starts-with(@href,'/firm/')]")
    button_text = public_company_ownership_button.get_attribute("innerText").split(" ")[0]
    
    company_ownership = {}
    chinese_names = []
    
    if button_text == '上市信息':
        public_company_ownership_button.click()
        time.sleep(3)
        company_ownership_dataframe = get_public_company_ownership()
        company_ownership = {}
        for index in range(len(company_ownership_dataframe)):
            basic_information = get_individual_ownership(index, company_ownership_dataframe)
            company_ownership["owner_"+str(index+1)] = basic_information["english_name"]
            company_ownership["owner_"+str(index+1)+"_share_class TEST"] = company_ownership_dataframe["股份类型"].values[index]
            company_ownership["owner_"+str(index+1)+"_number_of_shares TEST"] = company_ownership_dataframe["持股数(股)"].values[index]
            company_ownership["owner_"+str(index+1)+"_percent"] = company_ownership_dataframe["持股比例"].values[index]
            company_ownership["owner_"+str(index+1)+"_ultimate_beneficiary TEST"] = company_ownership_dataframe["最终受益股份"].values[index]
            company_ownership["owner_"+str(index+1)+"_change_in_shares TEST"] = company_ownership_dataframe["增减(股)"].values[index]
            company_ownership["owner_"+str(index+1)+"_change_in_percent TEST"] = company_ownership_dataframe["变动比例"].values[index]
            chinese_names.append(basic_information['_full_chinese_name'])
    else:
        company_ownership_dataframe = get_private_company_ownership()
        company_ownership = {}
        for index in range(len(company_ownership_dataframe)):
            company_ownership["owner_"+str(index+1)] = basic_information["english_name"]
            company_ownership["owner_"+str(index+1)+"_percent"] = company_ownership_dataframe["持股比例"].values[index]
            chinese_names.append(basic_information['_full_chinese_name'])
            
    return (company_ownership, chinese_names)

Unnamed: 0,股东及出资信息,持股比例,认缴出资额(万美元),认缴出资日期,参股日期,实缴出资额(万美元),实缴出资日期,关联产品/机构
0,淘宝（中国）软件有限公司,57.5947%,617718,2040-09-08,2018-11-27,-,-,闲鱼
1,浙江天猫技术有限公司,35.7470%,383396,2040-09-08,2019-04-07,-,-,天猫精灵
2,Alibaba.com China Limited,6.6583%,71412,2040-09-08,2007-04-29,37880,最新：2013-06-27,阿里巴巴


In [None]:
def set_individual_ownership_chinese_names(patch_response, patch_owners_records_chinese_names):
    patch_records = []

    for record_id, chinese_names in patch_owners_records_chinese_names.items():
        if patch_response["records"][0]["id"] == record_id:
            for index in range(len(chinese_names)):
                owner_chinese_name = chinese_names[index]
                owner_record_id = patch_response["records"][0]["fields"]["owner_"+str(index+1)][0]
                
                patch_record = {
                    "id": owner_record_id,
                    "fields": {
                        "_full_chinese_name": owner_chinese_name
                    }
                }
                patch_records.append(patch_record)
    return patch_records

In [13]:
def format_updated_records_from_getters(all_records):
    filtered_all_records = list(filter(lambda record: '_full_chinese_name' in record['fields'], all_records))

    patch_records = []
    for record in filtered_all_records:
        if '_scrape' in record['fields'].keys():
            patch_field = {}
            
            if 'createdTime' in record.keys():
                del record['createdTime']
                
            chinese_company_name = record['fields']['_full_chinese_name']
        #         print(chinese_company_name)
            company_table = get_company_table_from_search(chinese_company_name)
            if company_table == False:
                continue
                
            patch_field["year_founded"] = get_registered_year(company_table)
            patch_field["year_founded_source"] = "Qichacha"
            patch_field["year_founded_source_url"] = driver.current_url
            
            company_headquarter = get_headquarter(company_table)
            if company_headquarter == "Cayman Islands" or company_headquarter == "British Virgin Islands":
                patch_field["_incorporated_in"] = [company_headquarter]
            else:
                patch_field["hq_location"] = [company_headquarter]
                
            big_record_id = record['id']
            (company_ownership, chinese_names) = set_company_ownership()
            patch_owners_records[big_record_id] = chinese_names
            patch_field = {**patch_field, **company_ownership}
            
            print('公司名称:', chinese_company_name, '集团总部:', company_headquarter)

            patch_record = {
                "id": record["id"],
                "fields": patch_field
            }
            patch_records.append(patch_record)
#     print(patch_records)
    return (patch_records, patch_owners_records)

# Invoke Functions

In [14]:
#Get Airtable Data
sheet_name = AIRTABLE_SHEET_NAME
airtable_response = get_airtable_data(sheet_name).json()
all_records = []
records = get_airtable_data(sheet_name)
all_records.extend(records.json()['records'])

while "offset" in records.json():
    records = get_airtable_data(sheet_name, records.json()["offset"])
    all_records.extend(records.json()['records'])

save_in_json(all_records, 'get.json')

In [15]:
#Update Airtable Data
(updated_records, patch_owners_records) = format_updated_records_from_getters(all_records)
new_owner_records = []

for count, record in enumerate(updated_records, 1):
    if count % 10 == 0:
        json_records = updated_records[count - 10:count]
        updated_requests = {'records': json_records, "typecast": True}
        version = int(count/10)
        json_filename = 'patch_{}.json'.format(version)
        save_in_json(updated_requests, json_filename)
        patch_response = update_airtable_data(sheet_name, json_filename)
        save_in_json(patch_response, 'response_{}.json'.format(version))
        patch_records = set_individual_ownership_chinese_names(patch_response, patch_owners_records)
        new_owner_records.append(patch_records)
        
    if count % 10 != 0:
        version = int(count/10 + 1)
        count = count % 10
        json_records = updated_records[-count:]
        updated_requests = {'records': json_records, "typecast": True}
        json_filename = 'patch_{}.json'.format(version)
        save_in_json(updated_requests, json_filename)
        patch_response = update_airtable_data(sheet_name, json_filename)
        save_in_json(patch_response, 'response_{}.json'.format(version))
        patch_records = set_individual_ownership_chinese_names(patch_response, patch_owners_records)
        new_owner_records.append(patch_records)

new_owner_records = [record for sublist in new_owner_records for record in sublist]

for count, record in enumerate(new_owner_records, 1):
    if count % 10 == 0:
        json_records = new_owner_records[count - 10:count]
        updated_requests = {'records': json_records, "typecast": True}
        version = int(count/10)
        json_filename = 'new_owner_{}.json'.format(version)
        save_in_json(updated_requests, json_filename)
        patch_response = update_airtable_data(sheet_name, json_filename)
#         print(patch_response)
    if count % 10 != 0:
        count = count % 10
        json_records = new_owner_records[-count:]
        updated_requests = {'records': json_records, "typecast": True}
        version = int(count/10 + 1)
        json_filename = 'new_owner_{}.json'.format(version)
        save_in_json(updated_requests, json_filename)
        patch_response = update_airtable_data(sheet_name, json_filename)
        
print(updated_records)

print("Updated {} of records.".format(len(updated_records)))

Unnamed: 0,Unnamed: 1,Unnamed: 2,已关注
0,,,已关注
1,,,股东信息
2,,,淘宝（中国）软件有限公司北京朝阳分公司
3,,,已关注
4,,,阿里巴巴（中国）网络技术有限公司
5,,,已关注
6,,,已关注
7,,,已关注
8,,,已关注
9,,,已关注


A: 已关注
B: 淘宝（中国）软件有限公司


Unnamed: 0,Unnamed: 1,Unnamed: 2,阿里巴巴（中国）网络技术有限公司
0,,,阿里巴巴（中国）网络技术有限公司
1,,,股东信息
2,,,阿里巴巴（中国）网络技术有限公司常州分公司
3,,,阿里巴巴（中国）网络技术有限公司青岛分公司
4,,,阿里巴巴（中国）网络技术有限公司佛山分公司
5,,,阿里巴巴（中国）网络技术有限公司上海分公司
6,,,阿里巴巴（中国）网络技术有限公司苏州分公司
7,,,阿里巴巴（中国）网络技术有限公司深圳分公司
8,,,阿里巴巴（中国）网络技术有限公司厦门分公司
9,,,阿里巴巴（中国）网络技术有限公司广州分公司


A: 阿里巴巴（中国）网络技术有限公司
B: 阿里巴巴（中国）网络技术有限公司
公司名称: 阿里巴巴（中国）网络技术有限公司 集团总部: Hangzhou, Zhejiang, China
[{'id': 'recMIC86o8iCbJq5u', 'fields': {'year_founded': '1999', 'year_founded_source': 'Qichacha', 'hq_location': ['Hangzhou, Zhejiang, China']}}]
[{'id': 'recMIC86o8iCbJq5u', 'fields': {'year_founded': '1999', 'year_founded_source': 'Qichacha', 'hq_location': ['Hangzhou, Zhejiang, China']}}]
Updated 1 of records.


# In progress

In [20]:
#Successfully click into shareholder's link
wait = WebDriverWait(driver, 3)
for index in range(len(company_ownership_dataframe)):
    search = company_ownership_dataframe["股东及出资信息"].values[index].strip()
    search_xpath = "//a[text()='{}']".format(search)
    links = wait.until(EC.presence_of_all_elements_located((By.XPATH, search_xpath)))
    driver.execute_script("arguments[0].target='_self';", links[0])
    driver.find_element(By.XPATH, search_xpath).click()
    cs = BeautifulSoup(driver.page_source,'html.parser')
    basic_table = cs.find_all('table')[0].find_all('tr')
    company_basic_information_dataframe = get_company_basic_information_table(basic_table)
    basic_information = get_company_basic_information(company_basic_information_dataframe)
#     print(basic_information)
    driver.back()

Unnamed: 0,0
统一社会信用代码,91330100716105852F
企业名称,阿里巴巴（中国）网络技术有限公司
法定代表人,戴珊
登记状态,存续
成立日期,1999-09-08
注册资本,1072526万美元
实缴资本,553011万美元
核准日期,2022-01-18
组织机构代码,71610585-2
工商注册号,330100400015575


In [None]:
get_leadership_names()

In [None]:
get_subsidiaries()

In [None]:
set_company_ownership()