### 根据剩余的技术术语及其兴起年，统计各省、各公司的创新能力

In [15]:
import pandas as pd
import numpy as np
import psycopg
import yaml

# Load the database settings from the shared YAML config. The file handle is
# released before any network work starts.
with open('../../config/config.yaml', mode='r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
db_config = config['database']

# Tab-separated inputs without a header row:
#   * technical noun phrase -> emergence year
#   * technical noun phrase -> uuid of a patent that mentions it (inverse index)
tech_noun_phrase_df = pd.read_csv(r'../../output/current_tech_noun_phrases_emergence_year.txt', sep='\t', header=None)
tech_noun_phrase_df.columns = ['tech_noun_phrase', 'emergence_year']
tech_noun_phrase_inverse_index_df = pd.read_csv(r'../../output/current_tech_noun_phrases_inverse_index.txt', sep='\t', header=None)
tech_noun_phrase_inverse_index_df.columns = ['tech_noun_phrase', 'uuid']

# Distinct patent uuids referenced by the inverse index, as plain strings so
# that they are sent to the database as text values.
valid_uuid_list = [str(value) for value in tech_noun_phrase_inverse_index_df['uuid'].unique()]

# The uuid list is bound as a single array parameter instead of being spliced
# into the SQL text: quoting is handled by the driver, and an empty list simply
# matches nothing instead of producing the invalid SQL `IN ()`.
query = """
SELECT
    t1.uuid,
    t1.appl_year,
    t2.name AS applicant_name,
    t2.province,
    t2.city
FROM
    cnipa_appl AS t1
JOIN
    cnipa_appl_applicant AS t2
ON
    t1.uuid = t2.uuid
WHERE
    t1.patent_type = '发明'
    AND CAST(t1.appl_year AS INT) BETWEEN 2014 AND 2023
    AND t1.appl_id IS NOT NULL
    AND t1.uuid = ANY(%(uuids)s)
"""

# psycopg 3 closes the connection when the `with` block exits, so it is not
# leaked for the lifetime of the kernel.
# NOTE(review): pandas emits a UserWarning for raw DBAPI connections other than
# sqlite3; switching to SQLAlchemy would silence it but adds a dependency.
with psycopg.connect(
    dbname=db_config['db_name'],
    user=db_config['user'],
    password=db_config['password'],
    host=db_config['host'],
    port=db_config['port'],
) as connection:
    patent_info_df = pd.read_sql(query, connection, params={'uuids': valid_uuid_list})

  patent_info_df = pd.read_sql(query, connection)


In [24]:
# Maximum allowed gap (in years) between a patent's application year and the
# emergence year of the phrase it mentions.
# NOTE(review): an earlier comment described this window as 3 years while the
# code applies 5; the code's behaviour is kept here — confirm which is intended.
MAX_YEARS_SINCE_EMERGENCE = 5

# Attach the emergence year and the patent/applicant details to every
# (phrase, patent uuid) pair, drop rows without an emergence year, and make
# both year columns integers. Built as one chain so that no column is assigned
# on a filtered frame (which can trigger pandas' SettingWithCopyWarning).
tech_noun_phrase_patent_df = (
    tech_noun_phrase_inverse_index_df
    .merge(tech_noun_phrase_df, on='tech_noun_phrase', how='inner')
    .merge(patent_info_df, on='uuid', how='inner')
    .dropna(subset=['emergence_year'])
    .astype({'appl_year': int, 'emergence_year': int})
)

# Keep only patents filed at most MAX_YEARS_SINCE_EMERGENCE years after the
# phrase emerged. Patents filed before the emergence year also pass this test.
years_since_emergence = (
    tech_noun_phrase_patent_df['appl_year'] - tech_noun_phrase_patent_df['emergence_year']
)
within_window = years_since_emergence <= MAX_YEARS_SINCE_EMERGENCE

# Discard rows whose province is missing or is the placeholder string '0'.
has_province = (
    tech_noun_phrase_patent_df['province'].notna()
    & (tech_noun_phrase_patent_df['province'] != '0')
)

tech_noun_phrase_patent_df = tech_noun_phrase_patent_df[within_window & has_province]

In [25]:
# Ten technical noun phrases with the most rows in the merged table.
rows_per_phrase = tech_noun_phrase_patent_df.groupby('tech_noun_phrase').size()
rows_per_phrase.sort_values(ascending=False).head(10)

tech_noun_phrase
区块链        9520
联邦学习       3155
生成式对抗网络    2224
YOL        1561
智能网联        808
折叠屏         362
数字孪生        339
自监督学习       319
特征工程        316
微塑料         285
dtype: int64

#### 计算省级创新能力

专利数量

In [26]:
# Number of merged rows per province, largest first.
# NOTE(review): this counts rows, not distinct uuids, so a patent that matches
# several phrases contributes more than once — confirm this is intended.
province_patent_count = (
    tech_noun_phrase_patent_df
    .groupby('province')
    .size()
    .reset_index(name='patent_count')
    .sort_values(by='patent_count', ascending=False)
)
province_patent_count

Unnamed: 0,province,patent_count
12,广东省,6367
3,北京市,5623
19,浙江省,2092
15,江苏省,2065
0,上海市,1670
6,四川省,908
10,山东省,836
29,陕西省,755
21,湖北省,720
24,福建省,509


#### 计算企业创新能力

In [29]:
# Number of merged rows per applicant, largest first; show the top 20.
company_patent_count = (
    tech_noun_phrase_patent_df
    .groupby('applicant_name')
    .size()
    .reset_index(name='patent_count')
    .sort_values(by='patent_count', ascending=False)
)
company_patent_count.head(20)

Unnamed: 0,applicant_name,patent_count
4347,腾讯科技（深圳）有限公司,677
1387,华为技术有限公司,558
2287,平安科技（深圳）有限公司,385
3647,深圳前海微众银行股份有限公司,351
4572,西安电子科技大学,278
4154,电子科技大学,227
600,中国联合网络通信集团有限公司,211
4163,百度在线网络技术（北京）有限公司,201
3003,杭州复杂美科技有限公司,201
3672,深圳壹账通智能科技有限公司,189


In [28]:
# Number of distinct applicants (one row per applicant in company_patent_count).
company_patent_count.shape[0]

5078

In [32]:
# Number of merged rows per applicant and application year, ordered by
# applicant name and then by year; show the first 20 rows.
company_annual_patent_count = (
    tech_noun_phrase_patent_df
    .groupby(['applicant_name', 'appl_year'])
    .size()
    .reset_index(name='patent_count')
    .sort_values(by=['applicant_name', 'appl_year'], ascending=True)
)
company_annual_patent_count.head(20)

Unnamed: 0,applicant_name,appl_year,patent_count
0,OPPO广东移动通信有限公司,2017,9
1,OPPO广东移动通信有限公司,2018,50
2,OPPO广东移动通信有限公司,2019,60
3,OPPO广东移动通信有限公司,2020,2
4,OPPO广东移动通信有限公司,2021,4
5,OPPO广东移动通信有限公司,2022,3
6,OPPO（重庆）智能科技有限公司,2018,2
7,OPPO（重庆）智能科技有限公司,2019,3
8,TCL华星光电技术有限公司,2021,1
9,TCL科技集团股份有限公司,2020,1


In [34]:
# Write the per-applicant, per-year counts to CSV without the DataFrame index.
company_annual_patent_count.to_csv(
    r'../../output/company_annual_patent_count.csv',
    index=False,
)