# Import Required Libraries

In [1]:
#从selenium导入浏览器驱动
from selenium import webdriver
#导入浏览器驱动设置选项
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
#导入网页解析库
from bs4 import BeautifulSoup
#导入时间库,利用time.time()防止爬虫访问过于频繁被禁止访问
import time
#导入pandas数据分析库,生成dataframe
import pandas as pd
import pickle
import re
from china_cities import *
import csv
import requests, lxml
import json
import pinyin.cedict
from hanziconv import HanziConv
import random
from decimal import *
from google_trans_new import google_translator
translator = google_translator()

# Airtable Authentication

In [2]:
ENV = 'prod'
ENV_FILENAME = 'prod.env' if ENV == 'prod' else 'dev.env'
AIRTABLE_SHEET_NAME = 'Companies and Individuals' if ENV == 'prod' else 'qcc_scrape'
SHOULD_UPDATE_BASIC_INFORMATION = False
SHOULD_UPDATE_OWNERSHIP = True
SHOULD_UPDATE_LEADERSHIP = False
SHOULD_UPDATE_CAPITAL = False
SHOULD_UPDATE_CONTACT = False

In [3]:
env_vars = {}

with open(ENV_FILENAME) as f:
    for line in f:
        if line.startswith('#') or not line.strip():
            continue
        key, value = line.strip().split('=')
        env_vars[key] = value

AIRTABLE_TOKEN = env_vars["AIRTABLE_TOKEN"]
AIRTABLE_BASE_ID = env_vars["AIRTABLE_BASE_ID"]
AIRTABLE_URL = f"https://api.airtable.com/v0/{AIRTABLE_BASE_ID}"

# Set Up Chrome Driver

In [4]:
driver = webdriver.Chrome() #实例化一个浏览器
driver.get('https://www.qcc.com/')
print('请在打开的网页，扫码登录！')
time.sleep(5)
new_cookies = driver.get_cookies() #获取最新的cookies
pickle.dump( new_cookies, open("cookies.pkl","wb"))
driver.quit() #关闭浏览器

请在打开的网页，扫码登录！


In [5]:
'''若取消下方被注释的代码可静默运行浏览器，不会显示页面，仅在后台运行'''
chrome_options=Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(options=chrome_options)
driver = webdriver.Chrome()           #实例化一个谷歌浏览器对象
driver.get("https://www.qcc.com/")   #浏览器打开企查查网站

cookies = pickle.load(open("cookies.pkl", "rb"))
for cookie in cookies:
    driver.add_cookie(cookie)
driver.get("https://www.qcc.com/")
driver.refresh()                #自动刷新页面，请检查是否已经自动登录账号

# Functions for Airtable API

In [6]:
def save_in_json(input, filename):        
    with open(filename, 'w') as jsonfile:
        jsonfile.write(json.dumps(input, indent=4))

In [7]:
def get_airtable_data(sheet_name, offset=None):
    url = f"{AIRTABLE_URL}/{sheet_name}"
    headers = {
        'Authorization': f'Bearer {AIRTABLE_TOKEN}',
        'Content-Type': 'application/json'
    }
    params = {}
    if offset:
        params["offset"] = offset
    
    response = requests.request("GET", url, headers=headers, params=params)
    return response

In [8]:
def update_airtable_data(sheet_name, json_filename):
    url = f"{AIRTABLE_URL}/{sheet_name}"
    headers = {
        'Authorization': f'Bearer {AIRTABLE_TOKEN}',
        'Content-Type': 'application/json'
    }

    f = open (json_filename, "r")
    payload = json.loads(f.read())
    response = requests.request("PATCH", url, headers=headers, data=json.dumps(payload))
    return response.json()

# Functions that Scrape Data

In [9]:
def get_company_table_from_search(company):
    time.sleep(generate_random_number())
    driver.find_element(By.XPATH, "//input[contains(@id,'searchKey')]").clear()
    driver.find_element(By.XPATH, "//input[contains(@id,'searchKey')]").send_keys(company)
    driver.find_element(By.XPATH, "//button[@class='btn btn-primary']").click()
    time.sleep(generate_random_number())
    bs = BeautifulSoup(driver.page_source,'html.parser')  #将加载好的网页用BeautifulSoup解析成文本
    try:
        search_company_list_table = bs.find_all('table')[0].find_all('tr')
        search_company_list_data = []

        for table_row in search_company_list_table:
            row_data = []
            for column, table_cell in enumerate(table_row):
                try:
                    if column < 2:
                        continue
                    cell_content = table_cell.text.strip().replace('最近浏览','').split('\n',1)[0].split(' 存续',1)[0].strip()
                    row_data.append(cell_content.split(' ',1)[0])
                except:
                    continue
            search_company_list_data.append(row_data)

        dataFrame = pd.DataFrame(data = search_company_list_data[:], columns = search_company_list_data[0])

        company_name = str(dataFrame.loc[0:0]).strip().split('\n')[0].strip()
        print('A:', company_name)
        print('B:', company)
        if company_name == company or company_name == HanziConv.toTraditional(company) or company_name == HanziConv.toSimplified(company):
            wait = WebDriverWait(driver, 3)
            links = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href,'https://www.qcc.com/firm/')]")))
            driver.execute_script("arguments[0].target='_self';", links[0])
            driver.find_element(By.XPATH, "//a[contains(@href,'https://www.qcc.com/firm/')]").click()
            time.sleep(generate_random_number())
            bs = BeautifulSoup(driver.page_source,'html.parser')  #将加载好的网页用BeautifulSoup解析成文本

            html_table = bs.find("table")
            str_table = str(html_table)
            return str_table
        else:
            return False
    except:
        return False

In [10]:
def get_headquarter(unparsed_headquarter):
    headquarters = {
        '开曼群岛': 'Cayman Islands',
        '開曼群島':'Cayman Islands',
        '英属维尔京群岛': 'British Virgin Islands'
    }
    with open('chinese_cities.csv', newline='') as csvfile:
        headquarter_reader = csv.reader(csvfile, delimiter=',')
        for en_city, cn_city, en_province in headquarter_reader:
            if cn_city in ['澳门', '澳門', '北京市', '重庆市', '上海市', '上海','天津市']:
                headquarters[cn_city] = "{}, China".format(en_city)
            elif cn_city in ['香港']:
                headquarters[cn_city] = "{} SAR, China".format(en_city)
            else:
                headquarters[cn_city] = "{}, {}, China".format(en_city, en_province)

    en_headquarter = [headquarter for cn_city, headquarter in headquarters.items() if cn_city in unparsed_headquarter]
    if '非香港' in unparsed_headquarter:
        en_headquarter.remove('Hong Kong SAR, China')
    if 'Cayman Islands' in en_headquarter:
        return 'Cayman Islands'
    if 'British Virgin Islands' in en_headquarter:
        return 'British Virgin Islands'
    if len(en_headquarter) == 0:
        return ''
    else:
        return en_headquarter[0]

# Ownership for Public and Private Companies

In [11]:
def get_public_company_ownership():
    all_elements = driver.find_elements(By.XPATH, "//span[@class='name']/a")
    top_ten_shareholders = []

    for index, element in enumerate(all_elements):
        top_ten_shareholders.append(element.get_attribute("innerHTML"))

    bs = BeautifulSoup(driver.page_source,'html.parser')
    shareholders_table = bs.find_all('table')[7].find_all('tr')
    shareholders_data = []

    for table_row in shareholders_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 4:
                    continue
                if column >= 4:
                    if is_english(format_individual_names(table_cell.text)) == True:
                        if table_cell.text[0] == table_cell.text[2] and table_cell.text[1] != " ":
                            cell_content = table_cell.text[2:].split(' 大股东',1)[0].strip()
                        else:
                            cell_content = table_cell.text.split(' 大股东',1)[0].strip()
                    else:
                        cell_content = format_individual_names(table_cell.text)
                        cell_content = cell_content.strip().split('\n',1)[0].split(' ', 1)[0]
#                     cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 有股权质押',1)[0].split(' 私募基金',1)[0].split(' 执行事务合伙人',1)[0].split(' 私募基金管理人',1)[0].split(' A股|',1)[0].split(' 有股权出质',1)[0].split(' 企业名存在变更',1)[0].split(' 中概股|',1)[0]
                if column == 6 and cell_content != '持股数(股)':
                    cell_content = "{:,}".format(int(cell_content))
                if column == 9 and cell_content != '增减(股)'and cell_content != '不变':
                    cell_content = "{:,}".format(int(cell_content))
                row_data.append(cell_content)
            except:
                continue
        shareholders_data.append(row_data)

    dataFrame = pd.DataFrame(data = shareholders_data[1:], columns = shareholders_data[0])
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
    return dataFrame

In [12]:
def get_private_company_ownership(index = 1):
    bs = BeautifulSoup(driver.page_source,'html.parser')
    shareholders_table = bs.find_all('table')[index].find_all('tr')
    shareholders_data = []
    for table_row in shareholders_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 2:
                    continue
                if column >= 2:
                    cell_content = format_individual_names(table_cell.text)
                    cell_content = cell_content.strip().split('\n',1)[0].split(' ', 1)[0]
#                     cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 有股权质押',1)[0].split(' 香港',1)[0].split(' 私募基金',1)[0].split(' 执行事务合伙人',1)[0].split(' 私募基金管理人',1)[0].split(' A股|',1)[0].split(' 有股权出质',1)[0].split(' 企业名存在变更',1)[0].split(' 中概股|',1)[0]
                row_data.append(cell_content)
            except:
                continue
        shareholders_data.append(row_data)
    dataFrame = pd.DataFrame(data = shareholders_data[1:], columns = shareholders_data[0])
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
    return dataFrame

# Company Basic Information

In [13]:
def get_company_basic_information_table(basic_table):
    # get basic table column name
    basic_column = []
    for table_row in basic_table:
        column_names = []

        for column, table_cell in enumerate(table_row):
            try:
                if column in [0, 1, 4, 8]:
                    column_name = table_cell.text.strip().split('\n',1)[0]
                if column == 1:
                    column_name = column_name[0]
                if column in [2, 3, 5, 6, 7, 9, 10]:
                    continue
                column_names.append(column_name)
            except:
                continue
        basic_column.append(column_names)

    # get basic table data
    basic_data = []
    for table_row in basic_table:
        column_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column == 2:
                    cell_content = format_individual_names(table_cell.text.strip().split('\n',1)[0].split(' 关联',1)[0].split('  复制',1)[0].split(' 附近企业',1)[0])
                if column in [6, 10]:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 趋势图',1)[0].split('  复制',1)[0]
                if column in [0, 1, 3, 4, 5, 7, 8, 9]:
                    continue
                column_data.append(cell_content)
            except:
                continue
        basic_data.append(column_data)

    # create dataFrame
    flat_basic_column = [item for sublist in basic_column for item in sublist]
    flat_basic_data = [item for sublist in basic_data for item in sublist]
    dataFrame = pd.DataFrame(data = [flat_basic_data], columns = flat_basic_column)
#     display(dataFrame.transpose())
    return dataFrame

In [14]:
#get_basic_information
def get_company_basic_information(company_basic_information_dataframe):
    is_company_profile = all(column_name in company_basic_information_dataframe.columns for column_name in ["英文名", "企业名称", "成立日期"])
    if is_company_profile:
        full_english_column_name = [col for col in company_basic_information_dataframe.columns if '英文名' in col][0]
        full_chinese_column_name = "企业名称" if "企业名称" in company_basic_information_dataframe.columns else "企业中文名称"
        year_founded_column_name = "成立日期" if "成立日期" in company_basic_information_dataframe.columns else "注册日期"
        company_status_column_name = "登记状态" if "登记状态" in company_basic_information_dataframe.columns else "状态"
        headquarter_column_name = get_column_name_in_dataframe(["注册地址", "主要经营场所", "最新年报地址", "住所"], company_basic_information_dataframe)
        legal_representative_column_name = "法定代表人" if "法定代表人" in company_basic_information_dataframe.columns else "执行事务合伙人"
        try:
            individual = company_basic_information_dataframe[legal_representative_column_name].values[0].strip()
            if len(individual) > 4:
                individual = ""
            else:
                individual = convert_name_to_pinyin(individual)
        except:
            individual = ""
        print(individual)
        company_basic_information = {
            "full_english_name": company_basic_information_dataframe[full_english_column_name].values[0].split(' （自动翻译', 1)[0],
            "english_name": company_basic_information_dataframe[full_english_column_name].values[0].split(' Co., Ltd.', 1)[0].split(' Holding Limited', 1)[0].split(' Limited', 1)[0].split(', Inc.', 1)[0].split(' Ltd', 1)[0].split(' LIMITED', 1)[0].split(' PTE. LTD.',1)[0].split(' (Limited Partnership)',1)[0].split(' (limited Partnership)',1)[0].split(' (L.P.)',1)[0].split('(L.P.)',1)[0].split('（自动翻译）',1)[0].split(' （自动翻译）',1)[0].split('(Limited Partnership)',1)[0].split('(limited Partnership)',1)[0].split('Corporation',1)[0].split('Company',1)[0],
            "full_chinese_name": company_basic_information_dataframe[full_chinese_column_name].values[0],
            "chinese_name": company_basic_information_dataframe[full_chinese_column_name].values[0].split('投资管理有限公司', 1)[0].split('股份有限公司', 1)[0].split('有限公司', 1)[0].split('（有限合伙）', 1)[0].split('合伙企业（有限合伙）', 1)[0].split('投资有限公司', 1)[0].split('资产管理有限公司', 1)[0].split('控股有限公司', 1)[0].split('公司', 1)[0],
            "year_founded": company_basic_information_dataframe[year_founded_column_name].values[0][:4],
            "location": [get_headquarter(company_basic_information_dataframe[headquarter_column_name].values[0])],
            "registered_address": format_registered_address(translate_chinese_address_to_english(company_basic_information_dataframe[headquarter_column_name][0]).strip().replace("the", "").replace("(centralized office area)", "")),
#             "year_founded_source": "Qichacha",
#             "year_founded_source_url": driver.current_url,
            "company_type": get_company_type(),
            "legal_representative": individual,
            "_legal_representative": convert_name_to_pinyin(company_basic_information_dataframe[legal_representative_column_name].values[0].strip()),
#             "company_status": company_basic_information_dataframe[company_status_column_name][0]
        }
        company_basic_information["tag_text"] = company_basic_information["english_name"]
        company_basic_information["profile_type"] = profile_type(company_basic_information["full_chinese_name"])
        if "经营范围" in company_basic_information_dataframe.columns:
            company_basic_information["_industry_information"] = company_basic_information_dataframe["经营范围"][0]
            
#         print('company_basic_information:', company_basic_information)
        return company_basic_information
    else:
        return False

In [15]:
def get_leadership_table(index = 3):
    bs = BeautifulSoup(driver.page_source,'html.parser') 
    leadership_table = bs.find_all('table')[index].find_all('tr')
    leadership_data = []
    for table_row in leadership_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 3:
                    continue
                if column >= 2:
                    cell_content = format_individual_names(table_cell.text)
                    cell_content = cell_content.strip().split('\n',1)[0].split(' ', 1)[0]
    #                     cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 香港',1)[0].split(' 最终受益人',1)[0].split()[0]
                row_data.append(cell_content)
            except:
                continue
        leadership_data.append(row_data)
    dataFrame = pd.DataFrame(data = leadership_data[1:], columns = leadership_data[0])
#     dataFrame.to_csv('阿里巴巴（中国）网络技术有限公司_leadership.csv')
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
    return dataFrame

In [33]:
# 主要人员
def get_leadership(all_records):
    try:
        dataFrame = get_leadership_table()
        if "职务" not in dataFrame.columns:
            dataFrame = get_leadership_table(2)
        if "职务" not in dataFrame.columns:
            dataFrame = get_leadership_table(4)
        if "职务" not in dataFrame.columns:
            dataFrame = get_leadership_table(3)


        translate_leadership_title = {
            "董事长": "Chairman",
            "副董事长": "Vice Chairman",
            "首席执行官": "CEO",
            "经理": "Manager",
            "总经理": "General Manager",
            "副总经理": "Deputy General Manager",
            "董事局主席": "Chairman of the Board",
            "董事会主席": "Chairman of the Board",
            "行政总裁": "Chief Executive Officer",
            "执行董事": "Executive Director",
            "非执行董事": "Non-Executive Director",
            "首席财务官": "CFO",
            "法定代表人": "Legal Representative",
            "总裁": "President",
            "行长": "President",
            "副总裁": "Vice President",
            "副行长": "Vice President",
            "高级副总裁": "Senior Vice President",
            "董事": "Director",
            "独立董事": "Independent Director",
            "独立非执行董事": "Independent Non-Executive Director",
            "监事会主席": "Chairman of the Supervisory Board",
            "职工监事": "Employee Supervisor",
            "监事": "Statutory Auditor",
            "财务总监": "Financial Director",
            "首席营运官": "Chief Operating Officer",
            "董事会秘书": "Board Secretary",
            "首席风险官": "Chief Risk Officer",
            "公司秘书": "Company Secretary",
        }

        leadership = {}
        chinese_names = []
        english_names = []
        leadership_index = 1
    #     print("I'm HERE In Leadership")
        number_of_leaders = min(20, len(dataFrame))
        for index, row in dataFrame.iterrows():
            if leadership_index <= number_of_leaders:
                row_titles = row["职务"].split(",")
                titles = []
                for row_title in row_titles:
                    if row_title in translate_leadership_title.keys():
                        titles.append(translate_leadership_title[row_title])
    #                     print("row_title:", row_title)

                if len(titles) > 0:
                    chinese_name = row["姓名"]
                    chinese_names.append(chinese_name)
                    try: 
                        record = [record for record in all_records if 'full_chinese_name' in record['fields'].keys() and record['fields']['full_chinese_name'] == name][0]
#                         english_name = record['fields']['english_name']
                        english_name = record['fields']['full_english_name']
                        leadership["leader_"+str(leadership_index)] = english_name
                        leadership["leader_"+str(leadership_index)+"_title"] = titles
                        leadership_index = leadership_index + 1
#                         chinese_names.append(chinese_name)
                        english_names.append(english_name)
                    except:
                        leadership["leader_"+str(leadership_index)] = convert_name_to_pinyin(chinese_name)
                        leadership["leader_"+str(leadership_index)+"_title"] = titles
                        leadership_index = leadership_index + 1
                        english_names.append(convert_name_to_pinyin(chinese_name))
#                         chinese_names.append(chinese_name)

#                     leadership["leadership_source"] = "Qichacha"
#                     leadership["leadership_source_url"] = driver.current_url
        print("Leadership:", leadership)
    #     print("Chinese_name", chinese_name)
        return (leadership, chinese_names, english_names)
    except:
        leadership = {}
        chinese_names = []
        return (leadership, chinese_names, english_names)

In [17]:
def get_subsidiaries():
    subsidiaries_table = bs.find_all('table')[4].find_all('tr')
    subsidiaries_data = []
    for table_row in subsidiaries_table:
        row_data = []
        for column, table_cell in enumerate(table_row):
            try:
                if column < 3:
                    continue
                if column <= 5:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 有股权质押',1)[0].split(' 香港',1)[0].split(' A股',1)[0]
                if column == 6:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split()[0]
                if column >= 7:
                    cell_content = table_cell.text.strip().split('\n',1)[0].split(' 股',1)[0].split(' 大股东',1)[0].split(' 有股权质押',1)[0].split(' 香港',1)[0].split(' A股',1)[0]
                row_data.append(cell_content)
            except:
                continue
        subsidiaries_data.append(row_data)

    dataFrame = pd.DataFrame(data = subsidiaries_data[1:], columns = subsidiaries_data[0])
    # dataFrame.to_csv('阿里巴巴（中国）网络技术有限公司_subsidiaries.csv')
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dataFrame)
    return dataFrame

In [18]:
#Successfully click into shareholder's link
def get_individual_ownership(index, company_ownership_dataframe):
    wait = WebDriverWait(driver, 4)
    if "股东及出资信息" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["股东及出资信息"].values[index].strip()
    elif "发起人及出资信息" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["发起人及出资信息"].values[index].strip()
    elif "股东名称" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["股东名称"].values[index].strip()
    elif "合伙人信息" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["合伙人信息"].values[index].strip()
    elif "主管部门（出资人）信息" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["主管部门（出资人）信息"].values[index].strip()
    elif "投资人信息" in company_ownership_dataframe.columns:
        search = company_ownership_dataframe["投资人信息"].values[index].strip()
    else:
        print('Im HEREEEEE in else:', company_ownership_dataframe)
        return False
    
    if is_english(search) == False and len(search) <= 3:
        print('Im HEREEEEE in len(3):', company_ownership_dataframe)
        return False
    
    print('search:', search)
    search_xpath = "//a[text()='{}']".format(search)
    print('search_xpath:', search_xpath)
    try:
        links = wait.until(EC.presence_of_all_elements_located((By.XPATH, search_xpath)))
        print('links:', links)
        print('Im HEREEEEE in try:', company_ownership_dataframe)
        driver.execute_script("arguments[0].target='_self';", links[0])
        driver.find_element(By.XPATH, search_xpath).click()
        cs = BeautifulSoup(driver.page_source,'html.parser')
        basic_table = cs.find_all('table')[0].find_all('tr')
        company_basic_information_dataframe = get_company_basic_information_table(basic_table)
        basic_information = get_company_basic_information(company_basic_information_dataframe)
#         print('basic_information', basic_information)
        driver.back()
        if basic_information == False:
            if is_english(search) == False:
                return {"full_chinese_name": search}
            else:
                return {"full_english_name": search}
        return basic_information
    except:
        print('Im HEREEEEE in except:', company_ownership_dataframe)
        if is_english(search) == False:
            return {"full_chinese_name": search}
        else:
            return {"full_english_name": search}

In [19]:
def get_initial_company_basic_information():
    bs = BeautifulSoup(driver.page_source,'html.parser')
    basic_table = bs.find_all('table')[0].find_all('tr')
    company_basic_information_dataframe = get_company_basic_information_table(basic_table)
    print('get_initial_company_basic_information dataframe:', company_basic_information_dataframe)
    basic_information = get_company_basic_information(company_basic_information_dataframe)
    return (basic_information, company_basic_information_dataframe)

In [20]:
def get_company_type():
    find_company_type = driver.find_element(By.XPATH, "//div[@class='tags-wrap']")
    company_type = find_company_type.get_attribute("innerHTML")
    if '央企' in company_type and 'A股' in company_type:
        return "Publicly Traded SOE"
    elif '国有企业' in company_type and 'A股' in company_type:
        return "Publicly Traded SOE"
    elif '市管国企' in company_type and 'A股' in company_type:
        return "Publicly Traded SOE"
    elif '央企' in company_type and '港股' in company_type:
        return "Publicly Traded SOE"
    elif '国有企业' in company_type and '港股' in company_type:
        return "Publicly Traded SOE"
    elif '市管国企' in company_type and '港股' in company_type:
        return "Publicly Traded SOE"
    elif '央企' in company_type and '.HK' in company_type:
        return "Publicly Traded SOE"
    elif '国有企业' in company_type and '.HK' in company_type:
        return "Publicly Traded SOE"
    elif '市管国企' in company_type and '.HK' in company_type:
        return "Publicly Traded SOE"
    elif '央企' in company_type and '新三板' in company_type:
        return "Publicly Traded SOE"
    elif '国有企业' in company_type and '新三板' in company_type:
        return "Publicly Traded SOE"
    elif '市管国企' in company_type and '新三板' in company_type:
        return "Publicly Traded SOE"
    elif '央企' in company_type and '科创板' in company_type:
        return "Publicly Traded SOE"
    elif '国有企业' in company_type and '科创板' in company_type:
        return "Publicly Traded SOE"
    elif '市管国企' in company_type and '科创板' in company_type:
        return "Publicly Traded SOE"
    elif '央企' in company_type and '中概股' in company_type:
        return "Publicly Traded SOE"
    elif '国有企业' in company_type and '中概股' in company_type:
        return "Publicly Traded SOE"
    elif '市管国企' in company_type and '中概股' in company_type:
        return "Publicly Traded SOE"
    elif '央企' in company_type and '.NYSE' in company_type:
        return "Publicly Traded SOE"
    elif '国有企业' in company_type and '.NYSE' in company_type:
        return "Publicly Traded SOE"
    elif '市管国企' in company_type and '.NYSE' in company_type:
        return "Publicly Traded SOE"
    elif '央企' in company_type and '.NQ' in company_type:
        return "Publicly Traded SOE"
    elif '国有企业' in company_type and '.NQ' in company_type:
        return "Publicly Traded SOE"
    elif '市管国企' in company_type and '.NQ' in company_type:
        return "Publicly Traded SOE"
    elif '央企' in company_type or '国有企业' in company_type or "市管国企" in company_type:
        return "SOE"
    elif 'A股' in company_type or '港股' in company_type or '.HK' in company_type or '新三板' in company_type or '科创板' in company_type or '中概股' in company_type or '.NYSE' in company_type or '.NQ' in company_type:
        return "Publicly Traded"
    else:
        return "Private"

In [None]:
def get_contact_information():
    bs = BeautifulSoup(driver.page_source,"html.parser")
    try:
        phone_number = "Phone number: " + driver.find_elements(By.XPATH, "//span[@class='f ca' and contains(text(),'电话：')]/span/span[2]")[0].get_attribute("innerHTML")
        email = "\n"
    except:
        phone_number = ""
        email = ""
    try:
        email = email + "Email: " + driver.find_elements(By.XPATH, "//span[@class='f ca' and contains(text(),'邮箱：')]/div/span/span/a")[0].get_attribute("innerHTML")
    except:
        email = ""
    try:
        website_anchor = driver.find_elements(By.XPATH, "//span[@class='f' and contains(text(),'官网：')]/span/a")[0].get_attribute("innerHTML")
        website = "http" + website_anchor.split("http")[-1].split("\n")[0]
        if '<' in website:
            website_anchor = driver.find_elements(By.XPATH, "//span[@class='f' and contains(text(),'官网：')]/span/a")[0].get_attribute("innerHTML")
            website = "www" + website_anchor.split("www")[-1].split("\n")[0]
    except:
        website = ""
    
    return {
        "contact_info": phone_number + email,
        "website": website
    }

# Parse Data to Airtable Format

In [21]:
#读取Public/Private的股东信息
def set_company_ownership(all_records):
    bs = BeautifulSoup(driver.page_source,'html.parser')
    public_company_ownership_button = driver.find_element(By.XPATH, "//a[starts-with(@href,'/firm/')]")
    button_text = public_company_ownership_button.get_attribute("innerText").split(" ")[0]

    company_ownership = {}
    chinese_names = []
    
    if button_text == '上市信息':
        public_company_ownership_button.click()
        time.sleep(generate_random_number())
        company_ownership_dataframe = get_public_company_ownership()
        number_of_owners = min(15, len(company_ownership_dataframe))
        for index in range(number_of_owners):
            prefix = "owner_" 
            #if index < 10 else "_owner_"
            basic_information = get_individual_ownership(index, company_ownership_dataframe)
            print('basic_information:', basic_information)
            ownership_column_name = company_ownership_dataframe.columns[0]
            name = format_individual_names(company_ownership_dataframe[ownership_column_name].values[index])
            print('name:', name)
            
            if is_english(name) == True:
                company_ownership[prefix+str(index+1)] = name
            elif len(name) <= 3 and is_english(name) == False:
                print('entering if len(name):')
                company_ownership[prefix+str(index+1)] = convert_name_to_pinyin(name)
            elif basic_information == False:
                return ({}, [])
            elif "english_name" in basic_information:
                print('entering elif:')
                company_ownership[prefix+str(index+1)] = basic_information["english_name"]
            else:
                print('entering else:')
                try: 
                    record = [record for record in all_records if 'full_chinese_name' in record['fields'].keys() and record['fields']['full_chinese_name'] == name][0]
                    name = record['fields']['english_name']
                    company_ownership[prefix+str(index+1)] = name
                except:
                    company_ownership[prefix+str(index+1)] = name
                
            print(company_ownership[prefix+str(index+1)])
            
            if '股份性质' in company_ownership_dataframe.columns:
                company_ownership["owner_"+str(index+1)+"_share_class"] = company_ownership_dataframe["股份性质"].values[index]
            elif '股份类型' in company_ownership_dataframe.columns:
                company_ownership["owner_"+str(index+1)+"_share_class"] = company_ownership_dataframe["股份类型"].values[index]
#             if '持股数(股)' in company_ownership_dataframe.columns:
#                 company_ownership["owner_"+str(index+1)+"_number_of_shares TEST"] = company_ownership_dataframe["持股数(股)"].values[index]
#             elif '直接持股数量（股）' in company_ownership_dataframe.columns:
#                 company_ownership["owner_"+str(index+1)+"_number_of_shares TEST"] = company_ownership_dataframe["直接持股数量（股）"].values[index]
            if '持股比例' in company_ownership_dataframe.columns:
                company_ownership[prefix+str(index+1)+"_percent"] = company_ownership_dataframe["持股比例"].values[index]
            elif '占已发行普通股比例' in company_ownership_dataframe.columns:
                company_ownership[prefix+str(index+1)+"_percent"] = company_ownership_dataframe["占已发行普通股比例"].values[index]
            if '最终受益股份' in company_ownership_dataframe.columns:
                company_ownership["owner_"+str(index+1)+"_ultimate_beneficiary"] = company_ownership_dataframe["最终受益股份"].values[index]
#             if '增减(股)' in company_ownership_dataframe.columns:
#                 company_ownership["owner_"+str(index+1)+"_change_in_shares TEST"] = company_ownership_dataframe["增减(股)"].values[index]
#             if '变动比例' in company_ownership_dataframe.columns:
#                 company_ownership["owner_"+str(index+1)+"_change_in_percent TEST"] = company_ownership_dataframe["变动比例"].values[index]
#             elif '持股比例变动' in company_ownership_dataframe.columns:
#                 company_ownership["owner_"+str(index+1)+"_change_in_percent TEST"] = company_ownership_dataframe["持股比例变动"].values[index]
#             company_ownership["owner_"+str(index+1)+"_source"] = "Qichacha"
#             company_ownership["owner_"+str(index+1)+"_source_url"] = driver.current_url
#             company_ownership["owner_1_source_url"] = driver.current_url
            if basic_information != False and "full_chinese_name" in basic_information:
                chinese_names.append(basic_information['full_chinese_name'])
            else:
                chinese_names.append(name)
            
    else:
        print("get_private_company_ownership")
        company_ownership_dataframe = get_private_company_ownership()
        if "股东及出资信息" not in company_ownership_dataframe.columns and "发起人及出资信息" not in company_ownership_dataframe.columns and "股东名称" not in company_ownership_dataframe.columns and "合伙人信息" not in company_ownership_dataframe.columns and "主管部门（出资人）信息" not in company_ownership_dataframe.columns and "投资人信息" not in company_ownership_dataframe.columns:
            company_ownership_dataframe = get_private_company_ownership(2)

        print('company_ownership_dataframe:', company_ownership_dataframe)
        number_of_owners = min(15, len(company_ownership_dataframe))
        for index in range(number_of_owners):
            prefix = "owner_" 
            #if index < 10 else "_owner_"
            basic_information = get_individual_ownership(index, company_ownership_dataframe)
            ownership_column_name = company_ownership_dataframe.columns[0]
            name = format_individual_names(company_ownership_dataframe[ownership_column_name].values[index])
            print('name:', name)

            if len(name) <= 3:
                print('entering if len(name):')
                print('NAME:', name)
                company_ownership[prefix+str(index+1)] = convert_name_to_pinyin(name)
            elif basic_information == False:
                return ({}, [])
            elif "english_name" in basic_information:
                print('entering elif:')
                company_ownership[prefix+str(index+1)] = basic_information["english_name"]
            else:
                print('entering else:')
                try: 
                    record = [record for record in all_records if 'full_chinese_name' in record['fields'].keys() and record['fields']['full_chinese_name'] == name][0]
                    name = record['fields']['english_name']
                    company_ownership[prefix+str(index+1)] = name
                except:
                    company_ownership[prefix+str(index+1)] = name

            try:
                company_ownership[prefix+str(index+1)+"_percent"] = company_ownership_dataframe["持股比例"].values[index]
            except:
                company_ownership[prefix+str(index+1)+"_percent"] = company_ownership_dataframe["出资比例"].values[index]
#                 company_ownership["owner_"+str(index+1)+"_source"] = "Qichacha"
#                 company_ownership["owner_"+str(index+1)+"_source_url"] = driver.current_url

            if basic_information != False and "full_chinese_name" in basic_information:
                chinese_names.append(basic_information['full_chinese_name'])
            else:
                chinese_names.append(name)
#         except:
#             company_ownership = {}
#             chinese_names = []
    
    return (company_ownership, chinese_names, driver.current_url)

In [22]:
def set_individual_chinese_names(patch_response, patch_records_chinese_names, leader_or_owner = "owner_"):
    patch_records = []

    for record_id, chinese_names in patch_records_chinese_names.items():
        if patch_response["records"][0]["id"] == record_id:
            for index in range(len(chinese_names)):
#                 if leader_or_owner == "owner_" and index >= 10:
#                     leader_or_owner = "_owner_"
                chinese_name = chinese_names[index]
                record_id = patch_response["records"][0]["fields"][leader_or_owner+str(index+1)][0]
                
                patch_record = {
                    "id": record_id,
                    "fields": {
                        "full_chinese_name": chinese_name,
                        "chinese_name": chinese_name,
                        "profile_type": profile_type(chinese_name),
                    }
                }
                patch_records.append(patch_record)
                
#                 print('PATCH:', patch_records)
    return patch_records

In [None]:
def set_references_sheet(patch_response, patch_reference_urls):
    patch_records = []
    
    if len(patch_reference_urls) > 0:
        for index, old_sheet_record in enumerate(patch_response):
            if "references" in patch_response["records"][index]["fields"]:
                print(patch_response["records"][index]["fields"]["references"][0])

                new_sheet_record_id = patch_response["records"][index]["fields"]["references"][0]
                record_id = patch_response["records"][index]["id"]
                print('patch_reference_urls:', patch_reference_urls, 'record_id', record_id, "new_sheet_record_id", new_sheet_record_id)
                current_url = patch_reference_urls[record_id]

                patch_record = {
                    "id": new_sheet_record_id,
                    "fields": {
                        "year_founded_source": "Qichacha",
                        "year_founded_source_url": current_url,
                        "location_source": "Qichacha",
                        "location_source_url": current_url,
                        "registered_address_source": "Qichacha",
                        "registered_address_source_url": current_url,
                        "contact_info_source": "Qichacha",
                        "contact_info_source_url": current_url,
                        "legal_representative_source": "Qichacha",
                        "legal_representative_source_url": current_url,
                        "registered_capital_source": "Qichacha",
                        "registered_capital_source_url": current_url,
                        "ownership_source": "Qichacha",
                        "ownership_source_url": current_url,
                        "leadership_source": "Qichacha",
                        "leadership_source_url": current_url,
                    }
                }
                patch_records.append(patch_record)
                print('patch_record:', patch_record)

    return patch_records

In [23]:
def convert_name_to_pinyin(name):
    if len(name) >= 3:
        first_name = name[-2:]
        last_name = name[:-2]
        return pinyin.get(last_name).capitalize() + " " + pinyin.get(first_name).capitalize()
    else:
        first_name = name[1]
        last_name = name[0]
        return pinyin.get(last_name).capitalize() + " " + pinyin.get(first_name).capitalize()

In [24]:
def format_individual_names(name):
    if len(name) >= 3 and name[0] == name[2]:
        return name[2:].split(" ",1)[0]
    else:
        return name

In [25]:
def get_column_name_in_dataframe(column_names, dataframe):
    for column_name in column_names:
        if column_name in dataframe.columns:
            return column_name
    return ""

In [26]:
def is_english(name):
    try:
        name.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [27]:
def profile_type(full_chinese_name):
    if len(full_chinese_name) > 4:
        return "Company"
    else:
        return "Individual"

In [28]:
def generate_random_number():
    number = random.randint(4000, 9000)/1000
    return number

In [None]:
def translate_chinese_address_to_english(registered_address): 
    return translator.translate(registered_address, lang_src='zh-cn', lang_tgt='en')

In [None]:
def format_registered_address(registered_address_string):
    registered_address_list = registered_address_string.split(", ")
    dedup_address_list = list(dict.fromkeys(registered_address_list))
    
    dedup_address_string = ", ".join(dedup_address_list)
    formatted_address_list = dedup_address_string.split("Road, ", 1)
    return "Road,\n".join(formatted_address_list)

In [29]:
def format_updated_records_from_getters(all_records):
    filtered_all_records = list(filter(lambda record: 'full_chinese_name' in record['fields'], all_records))
    patch_owners_records = {}
    patch_leadership_records = {}
    patch_reference_urls = {}

    patch_records = []
    for record in filtered_all_records:
        if '_scrape' in record['fields'].keys():
            patch_field = {
#                 "References": record['fields']['english_name'] + " References"
            }
            
            if 'createdTime' in record.keys():
                del record['createdTime']
                
            chinese_company_name = record['fields']['full_chinese_name'].strip()
            company_table = get_company_table_from_search(chinese_company_name)
            if company_table == False:
                continue
                
            if SHOULD_UPDATE_BASIC_INFORMATION == True:
                (initial_company_basic_information, company_basic_information_dataframe) = get_initial_company_basic_information()
                patch_field = {**patch_field, **initial_company_basic_information}
            
            print('Bug:', patch_field)
            
            if SHOULD_UPDATE_CAPITAL == True:
                (initial_company_basic_information, company_basic_information_dataframe) = get_initial_company_basic_information()
                registered_capital = get_registered_capital(company_basic_information_dataframe)
                patch_field = {**patch_field, **registered_capital}
            
            if SHOULD_UPDATE_OWNERSHIP == True:
                big_record_id = record['id']
#                 (company_ownership, chinese_names) = set_company_ownership(all_records)
                (company_ownership, chinese_names, reference_url) = set_company_ownership(all_records)
                patch_owners_records[big_record_id] = chinese_names
                patch_reference_urls[big_record_id] = reference_url
                patch_field = {**patch_field, **company_ownership}

            print('TRY:', patch_owners_records)
            
            if SHOULD_UPDATE_LEADERSHIP == True:
                big_record_id = record['id']
                (leadership, chinese_names, english_names) = get_leadership(all_records)
                patch_leadership_records[big_record_id] = chinese_names
                patch_field = {**patch_field, **leadership}
                
            if SHOULD_UPDATE_CONTACT == True:
                contact = get_contact_information()
                patch_field = {**patch_field, **contact}
        
            patch_record = {
                "id": record["id"],
                "fields": patch_field
            }
            
            patch_records.append(patch_record)
#     print(patch_records)
    return (patch_records, patch_owners_records, patch_leadership_records, patch_reference_urls)

In [None]:
class RealTimeCurrencyConverter():
    def __init__(self, url):
        self.data = requests.get(url).json()
        self.rates = self.data['rates']
        self.date = self.data['date']
    def convert(self, amount, currency):
        if currency == 'USD':
            return amount
        conversion_rate = self.rates[currency]
        usd_amount = amount / self.rates[currency]
        return usd_amount

In [None]:
exchange_rate_url = 'https://api.exchangerate-api.com/v4/latest/USD'
converter = RealTimeCurrencyConverter(exchange_rate_url)
# get_registered_capital(company_basic_information_dataframe)

In [None]:
def convert_capital_to_number(capital):
    unit_multiplier = {
        '千': 1000,
        '万': 10000,
        '亿': 100000000,
    }
    
    currency = "USD"
    if "人民币" in capital:
        currency = "CNY"
    
    if "万" in capital:
        capital = capital.split("万")[0]
        capital = float(capital)*unit_multiplier["万"]
        return (capital, currency)

In [None]:
def convert_formatted_mc_actual_number(usd_amount):
    unit_multiplier = {
        'K': 1000,
        'M': 1000000,
        'B': 1000000000,
        'T': 1000000000000
    }
    
    oneplace = Decimal(10) ** -1
    if usd_amount >= unit_multiplier['T']:
        formatted_usd_amount = usd_amount/unit_multiplier['T']
        formatted_usd_amount_one_decimal = Decimal(formatted_usd_amount).quantize(oneplace, rounding=ROUND_UP)
        return str(formatted_usd_amount_one_decimal) + ' T'
    elif usd_amount >= unit_multiplier['B']:
        formatted_usd_amount = usd_amount/unit_multiplier['B']
        formatted_usd_amount_one_decimal = Decimal(formatted_usd_amount).quantize(oneplace, rounding=ROUND_UP)
        return str(formatted_usd_amount_one_decimal) + ' B'
    elif usd_amount >= unit_multiplier['M']:
        formatted_usd_amount = usd_amount/unit_multiplier['M']
        formatted_usd_amount_one_decimal = Decimal(formatted_usd_amount).quantize(oneplace, rounding=ROUND_UP)
        return str(formatted_usd_amount_one_decimal) + ' M'
    else:
        formatted_usd_amount = usd_amount/unit_multiplier['K']
        formatted_usd_amount_one_decimal = Decimal(formatted_usd_amount).quantize(oneplace, rounding=ROUND_UP)
        return str(formatted_usd_amount_one_decimal) + ' K'

In [None]:
def get_registered_capital(company_basic_information_dataframe):
    is_company_profile = all(column_name in company_basic_information_dataframe.columns for column_name in ["英文名", "企业名称", "成立日期"])
    if is_company_profile:
        for registered_capital_column_name in ["实缴资本", "注册资本"]:
            try:
                capital_string = company_basic_information_dataframe[registered_capital_column_name].values[0]
#                 print('A:',capital_string)
                (capital, currency) = convert_capital_to_number(capital_string)
#                 print('B:',(capital, currency))
                usd_amount = converter.convert(capital, currency)
#                 print('usd_amount:', usd_amount)
                registered_capital_information = {
                    "_registered_capital": "USD " + convert_formatted_mc_actual_number(usd_amount),
                }
                return registered_capital_information
            except:
                continue
    return {}

In [None]:
def patch_airtable_data_with_pagination(all_records, json_file_name_prefix):
    for count, record in enumerate(all_records, 1):
        if count % 10 == 0:
            json_records = all_records[count - 10:count]
            updated_requests = {'records': json_records, "typecast": True}
            version = int(count/10)
            json_filename = '{}_{}.json'.format(json_file_name_prefix, version)
            save_in_json(updated_requests, json_filename)
            patch_owner_response = update_airtable_data(sheet_name, json_filename)
        if count % 10 != 0:
            count = count % 10
            json_records = all_records[-count:]
            updated_requests = {'records': json_records, "typecast": True}
            version = int(count/10 + 1)
            json_filename = '{}_{}.json'.format(json_file_name_prefix, version)
            save_in_json(updated_requests, json_filename)
            patch_owner_response = update_airtable_data(sheet_name, json_filename)

# Invoke Functions

In [30]:
#Get Airtable Data
sheet_name = AIRTABLE_SHEET_NAME
airtable_response = get_airtable_data(sheet_name).json()
all_records = []
records = get_airtable_data(sheet_name)
all_records.extend(records.json()['records'])

while "offset" in records.json():
    records = get_airtable_data(sheet_name, records.json()["offset"])
    all_records.extend(records.json()['records'])

save_in_json(all_records, 'get.json')

In [31]:
#Update Airtable Data
(updated_records, patch_owners_records, patch_leadership_records, patch_reference_urls) = format_updated_records_from_getters(all_records)
new_owner_records = []
new_leader_records = []
new_references_records = []

for count, record in enumerate(updated_records, 1):
    if count % 10 == 0:
        json_records = updated_records[count - 10:count]
        updated_requests = {'records': json_records, "typecast": True}
        version = int(count/10)
        json_filename = 'patch_{}.json'.format(version)
        save_in_json(updated_requests, json_filename)
        patch_response = update_airtable_data(sheet_name, json_filename)
        save_in_json(patch_response, 'response_{}.json'.format(version))
        
        #gather all newly created records for future updates/patches
        patch_records = set_individual_chinese_names(patch_response, patch_owners_records)
        new_owner_records.append(patch_records)
        patch_records = set_individual_chinese_names(patch_response, patch_leadership_records, "leader_")
        new_leader_records.append(patch_records)
        patch_records = set_references_sheet(patch_response, patch_reference_urls)
        new_references_records.append(patch_records)
        
    if count % 10 != 0:
        version = int(count/10 + 1)
        count = count % 10
        json_records = updated_records[-count:]
        updated_requests = {'records': json_records, "typecast": True}
        json_filename = 'patch_{}.json'.format(version)
        save_in_json(updated_requests, json_filename)
        patch_response = update_airtable_data(sheet_name, json_filename)
        save_in_json(patch_response, 'response_{}.json'.format(version))
        
        #gather all newly created records for future updates/patches
        patch_records = set_individual_chinese_names(patch_response, patch_owners_records)
        new_owner_records.append(patch_records)
        patch_records = set_individual_chinese_names(patch_response, patch_leadership_records, "leader_")
        new_leader_records.append(patch_records)
        patch_records = set_references_sheet(patch_response, patch_reference_urls)
        new_references_records.append(patch_records)

#flatten list
new_owner_records = [record for sublist in new_owner_records for record in sublist]
new_leader_records = [record for sublist in new_leader_records for record in sublist]
new_references_records = [record for sublist in new_references_records for record in sublist]
print('new_references_records:', new_references_records)

#update newly created records
patch_airtable_data_with_pagination(new_owner_records, "new_owner")
patch_airtable_data_with_pagination(new_leader_records, "new_leader")
patch_airtable_data_with_pagination(new_references_records, "new_references")
        
print(updated_records)

print("Updated {} of records.".format(len(updated_records)))

A: 北京快手科技有限公司
B: 北京快手科技有限公司
get_initial_company_basic_information dataframe:              统一社会信用代码        企业名称 法定代表人          登记状态        成立日期        注册资本  \
0  91110108335469089C  北京快手科技有限公司    银鑫  存续（在营、开业、在册）  2015-03-19  10000万元人民币   

        实缴资本      组织机构代码            工商注册号              纳税人识别号  ...  \
0  1000万元人民币  33546908-9  110108018780037  91110108335469089C  ...   

         所属行业 所属地区           登记机关   人员规模 参保人数        核准日期  \
0  科技推广和应用服务业  北京市  北京市海淀区市场监督管理局  少于50人    3  2022-08-02   

                                    英文名 进出口企业代码                       注册地址  \
0  Beijing Yixiao Technology Co., Ltd.        -  北京市海淀区上地西路6号1幢1层101 D1-10   

                                                经营范围  
0  一般项目：技术服务、技术开发、技术咨询、技术交流、技术转让、技术推广；计算机系统服务；软件开...  

[1 rows x 23 columns]


Unnamed: 0,股东及出资信息,持股比例,认缴出资额(万元),认缴出资日期,参股日期
0,北京华艺汇龙网络科技有限公司,100%,10000,2035-03-01,2017-06-14


company_ownership_dataframe:           股东及出资信息  持股比例 认缴出资额(万元)      认缴出资日期        参股日期
0  北京华艺汇龙网络科技有限公司  100%     10000  2035-03-01  2017-06-14
search: 北京华艺汇龙网络科技有限公司
search_xpath: //a[text()='北京华艺汇龙网络科技有限公司']
links: [<selenium.webdriver.remote.webelement.WebElement (session="f0d5db5d9f8e52fc73cc0a334ad478f3", element="46d0be54-2175-4009-af33-3e71627b706b")>, <selenium.webdriver.remote.webelement.WebElement (session="f0d5db5d9f8e52fc73cc0a334ad478f3", element="9ece5836-b54f-4b20-800a-4d9910b9c998")>, <selenium.webdriver.remote.webelement.WebElement (session="f0d5db5d9f8e52fc73cc0a334ad478f3", element="aaf5ae34-b32e-45e4-bbb3-830d0bdd4cb3")>, <selenium.webdriver.remote.webelement.WebElement (session="f0d5db5d9f8e52fc73cc0a334ad478f3", element="5299d7bb-0e07-4c5b-8453-cbf2b526e802")>]
Im HEREEEEE in try:           股东及出资信息  持股比例 认缴出资额(万元)      认缴出资日期        参股日期
0  北京华艺汇龙网络科技有限公司  100%     10000  2035-03-01  2017-06-14
name: 北京华艺汇龙网络科技有限公司
entering elif:
TRY: {'recm8q0O6XuBKmptz': ['北京华艺

Unnamed: 0,Unnamed: 1,被投资企业名称,状态,法定代表人,注册资本,成立日期,投资日期,持股比例,认缴出资额,关联产品/机构
0,,北京快晴空科技有限公司,存续,银鑫,200000万元人民币,21-05-06,21-05-05,100%,200000万元人民币,快手
1,,北京快悠然科技有限公司,存续,银鑫,200000万元人民币,21-05-06,21-05-05,100%,200000万元人民币,快手
2,,北京快星空科技有限公司,存续,银鑫,200000万元人民币,21-05-06,21-05-05,100%,200000万元人民币,快手
3,,成都快购科技有限公司,存续,杨远熙,5000万元人民币,2019-10-31,2019-10-30,100%,5000万元人民币,快手电商
4,,快手智能云（乌兰察布）科技有限公司,存续,贾弘毅,5000万元人民币,20-05-29,20-05-28,100%,5000万元人民币,快手
5,,北京顺捷中恒科技有限公司,存续,银鑫,5000万元人民币,21-07-05,21-07-04,100%,5000万元人民币,快手
6,,北京赛瑞思动文化传播有限公司,存续,贾弘毅,2019.047619万元人民币,2013-09-25,20-03-09,100%,2019.047619万元人民币,赛瑞思动
7,,河北雄安快手科技有限公司,存续,杨远熙,2000万元人民币,2018-07-13,2018-07-12,100%,2000万元人民币,快手
8,,厦门诺惟启丰创业投资合伙企业（有限合伙）,存续,厦门诺惟启丰企业管理合伙企业（有限合伙）,10000万元人民币,22-02-09,-,20%,2000万元人民币,诺惟资本
9,,武汉雨霁科技有限公司,存续,贾弘毅,1000万元人民币,2019-09-11,2019-09-10,100%,1000万元人民币,快手


Unnamed: 0,Unnamed: 1,姓名,职务,持股比例,最终受益股份,个人简介
0,,银鑫,"执行董事,经理,法定代表人",-,-,详情
1,,杨远熙,监事,-,90.00%,详情


row_title: 执行董事
row_title: 经理
row_title: 法定代表人
row_title: 监事
Leadership: {'_leader_1': 'Yín Xīn', '_leader_1_title': ['Executive Director', 'Manager', 'Legal Representative'], 'leadership_source': 'Qichacha', 'leadership_source_url': 'https://www.qcc.com/firm/9c9efa4bd873bfcb46066f9dcc5d0fc4.html', '_leader_2': 'Yáng Yuǎnxī', '_leader_2_title': ['Supervisor']}
[{'id': 'recm8q0O6XuBKmptz', 'fields': {'_full_english_name': 'Beijing Yixiao Technology Co., Ltd. ', 'english_name': 'Beijing Yixiao Technology', '_full_chinese_name': '北京快手科技有限公司', 'chinese_name': '北京快手科技', 'year_founded': '2015', 'hq_location': ['Beijing, China'], '_registered_address': '北京市海淀区上地西路6号1幢1层101 D1-10', 'year_founded_source': 'Qichacha', 'year_founded_source_url': 'https://www.qcc.com/firm/9c9efa4bd873bfcb46066f9dcc5d0fc4.html', 'company_type': 'Publicly Traded', '_legal_representative': 'Yín Xīn', 'tag_text': 'Beijing Yixiao Technology', '_profile_type': 'Company', '_industry_information': '一般项目：技术服务、技术开发、技术咨询、技术交

# In progress