In [1]:
import pandas as pd
import psycopg
import yaml

with open('../../config/config.yaml', mode='r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
    db_config = config['database']

    connection = psycopg.connect(
        dbname=db_config['db_name'],
        user=db_config['user'],
        password=db_config['password'],
        host=db_config['host'],
        port=db_config['port']
    )

In [3]:
df = pd.read_csv('../../output/current_tech_noun_phrases_inverse_index.txt', sep='\t', header=None)
df.columns = ['tech_noun_phrase', 'uuid']
df['tech_noun_phrase'].nunique()

76

In [4]:
df.groupby('tech_noun_phrase').count().sort_values('uuid', ascending=False).head(10)

Unnamed: 0_level_0,uuid
tech_noun_phrase,Unnamed: 1_level_1
特征工程,111
联邦学习,93
窄带物联网,81
残差神经网络,69
虚拟现实头戴,61
词嵌入,53
分布式账本,52
折叠屏,45
曲面屏,43
无人机飞行器,35


In [5]:
# query appl year from database
appl_year_df = pd.read_sql("SELECT uuid, appl_year FROM cnipa_appl WHERE patent_type='发明' and CAST(appl_year AS INT) BETWEEN 2014 AND 2023 and appl_id is not null", connection)
len(appl_year_df)

  appl_year_df = pd.read_sql("SELECT uuid, appl_year FROM cnipa_appl WHERE patent_type='发明' and CAST(appl_year AS INT) BETWEEN 2014 AND 2023 and appl_id is not null", connection)


16664095

In [6]:
df = df.merge(appl_year_df, on='uuid', how='inner')

In [7]:
result_df = df.groupby(['tech_noun_phrase', 'appl_year']).count().reset_index().sort_values(['tech_noun_phrase', 'appl_year'])
# change appl_year to int
result_df['appl_year'] = result_df['appl_year'].astype(int)

In [9]:
full_years = pd.Series(range(2014, 2023 + 1), name='appl_year')

In [10]:
all_combinations = pd.MultiIndex.from_product([result_df['tech_noun_phrase'].unique(), full_years], names=['tech_noun_phrase', 'appl_year'])

In [11]:
result_df = pd.merge(result_df, full_years, on='appl_year', how='right')

In [12]:
df_complete = pd.DataFrame(index=all_combinations).reset_index()
df_complete = pd.merge(df_complete, result_df, on=['tech_noun_phrase', 'appl_year'], how='left')
df_complete['uuid'] = df_complete['uuid'].fillna(0)

In [13]:
df_complete['cumsum'] = df_complete.groupby('tech_noun_phrase')['uuid'].cumsum()

In [14]:
df_complete[df_complete['tech_noun_phrase'] == '数字孪生']

Unnamed: 0,tech_noun_phrase,appl_year,uuid,cumsum
370,数字孪生,2014,0.0,0.0
371,数字孪生,2015,0.0,0.0
372,数字孪生,2016,0.0,0.0
373,数字孪生,2017,1.0,1.0
374,数字孪生,2018,1.0,2.0
375,数字孪生,2019,2.0,4.0
376,数字孪生,2020,3.0,7.0
377,数字孪生,2021,0.0,7.0
378,数字孪生,2022,0.0,7.0
379,数字孪生,2023,0.0,7.0


In [15]:
df_complete['increase_ratio'] = df_complete.groupby('tech_noun_phrase')['cumsum'].pct_change()

In [30]:
df_complete[df_complete['tech_noun_phrase'] == '信息年龄']

Unnamed: 0,tech_noun_phrase,appl_year,uuid,cumsum,increase_ratio
120,信息年龄,2014,0.0,0.0,
121,信息年龄,2015,0.0,0.0,
122,信息年龄,2016,0.0,0.0,
123,信息年龄,2017,0.0,0.0,
124,信息年龄,2018,0.0,0.0,
125,信息年龄,2019,4.0,4.0,inf
126,信息年龄,2020,0.0,4.0,0.0
127,信息年龄,2021,0.0,4.0,0.0
128,信息年龄,2022,0.0,4.0,0.0
129,信息年龄,2023,0.0,4.0,0.0


In [21]:
import pickle

existing_set = pickle.load(open('../../output/existing_noun_phrase_set.pkl', 'rb'))

In [29]:
len(existing_set)

13338607

In [88]:
'双极化馈电网络' in existing_set

False

In [None]:
for tech_noun_phrase, group in result_df.groupby('tech_noun_phrase'):
    year_list = group['year'].tolist()
    count_list = group['count'].tolist()