In [21]:
import pandas as pd
from pathlib import Path

# Input/output locations, all kept as plain strings.
# The yearly .xls exports live two directories up, under input_data/ch;
# the cleaned CSV is written next to the notebook.
data_folder = str(Path('..', '..', 'input_data', 'ch'))
data_path1, data_path2, data_path3 = (
    str(Path(data_folder, f'{year}.xls')) for year in (2022, 2023, 2024)
)
data_out_path = 'ch_data_en.csv'

In [22]:
# All three yearly workbooks are read the same way: the column names sit on
# the second row (header=1) and only these four German-named columns are kept.
_wanted_columns = ['Veröffentlichungs-Nummer', 'Titel', 'Zusammenfassung', 'Veröffentlichungs-Datum']
df1, df2, df3 = (
    pd.read_excel(xls_path, header=1, usecols=_wanted_columns)
    for xls_path in (data_path1, data_path2, data_path3)
)

In [23]:
# Stack the three yearly frames into one and renumber the rows 0..n-1.
yearly_frames = (df1, df2, df3)
df = pd.concat(yearly_frames, ignore_index=True)

In [24]:
# Replace the German source headers with the short English names used
# throughout the rest of the notebook.
german_to_english = {
    'Veröffentlichungs-Nummer': 'id',
    'Titel': 'title',
    'Zusammenfassung': 'abstract',
    'Veröffentlichungs-Datum': 'pub_date',
}
df = df.rename(columns=german_to_english)

In [25]:
# Discard rows that have no abstract. Only `abstract` is checked here, so rows
# with a missing title are still present after this cell.
df.dropna(subset=['abstract'], inplace=True)

In [26]:
from arrangement import arrange_txt_en

In [27]:
# Keep only the records whose title AND abstract both contain the literal
# "[EN]" marker (presumably tagging the English segment of a multilingual
# field -- confirm against arrange_txt_en).
# na=False: only `abstract` has been NaN-filtered so far, so a missing title
# must count as "no marker" rather than putting NaN into the boolean mask.
en_marker = r'\[EN\]'
has_en_title = df['title'].str.contains(en_marker, na=False)
has_en_abstract = df['abstract'].str.contains(en_marker, na=False)
mask_en = has_en_title & has_en_abstract
# .copy() gives an independent frame so later column assignments are safe.
df = df[mask_en].copy()

In [28]:
# Pass each (title, abstract) row through arrange_txt_en and write the result
# back into the same two columns.
# NOTE(review): arrange_txt_en is defined in the external `arrangement` module;
# presumably it extracts the English part of each field -- confirm.
text_columns = ['title', 'abstract']
df[text_columns] = df[text_columns].apply(arrange_txt_en, axis=1)

In [29]:
# Second pass: drop rows whose abstract is missing after arrange_txt_en
# (presumably the helper can return a missing value -- TODO confirm).
df.dropna(subset=['abstract'], inplace=True)

In [31]:
# Parse the publication dates, which are given as day.month.year strings.
source_date_format = '%d.%m.%Y'
df['pub_date'] = pd.to_datetime(df['pub_date'], format=source_date_format)

In [32]:
# De-duplicate on abstract text. Rows are sorted newest-first, and the first
# occurrence of each abstract is kept, so the surviving row is one with the
# latest pub_date for that abstract.
newest_first = df.sort_values(by='pub_date', ascending=False)
# .copy(): drop_duplicates hands back a row selection; copying it (as is done
# after the [EN] mask) makes df_filtered an independent frame, so the column
# assignments performed on it later do not raise SettingWithCopyWarning.
df_filtered = newest_first.drop_duplicates(subset='abstract', keep='first').copy()

In [33]:
# Show full cell contents when frames are displayed (no column-width limit).
pd.options.display.max_colwidth = None

In [36]:
# Reduce each publication timestamp to a monthly period ('M').
monthly_period = df_filtered['pub_date'].dt.to_period('M')
df_filtered['pub_date'] = monthly_period

In [38]:
# Restrict to the publication window [start_date, end_date], both inclusive.
# NOTE(review): in the cells visible here df_t is only inspected with .info();
# the CSV export is written from df_filtered, so this window does not affect
# the exported file -- confirm whether that is intended.
start_date = '2022-01'
end_date = '2024-04'
on_or_after_start = df_filtered['pub_date'] >= start_date
on_or_before_end = df_filtered['pub_date'] <= end_date
df_t = df_filtered[on_or_after_start & on_or_before_end]

In [41]:
# Summarise the date-windowed frame: row count, columns, non-null counts, dtypes.
df_t.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27005 entries, 20065 to 9999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype    
---  ------    --------------  -----    
 0   id        27005 non-null  object   
 1   pub_date  27005 non-null  period[M]
 2   title     27005 non-null  object   
 3   abstract  27005 non-null  object   
dtypes: object(3), period[M](1)
memory usage: 1.0+ MB


In [37]:
# Display the pub_date column of df_filtered (the frame without the
# start_date/end_date window applied).
df_filtered['pub_date']

20011    2024-05
20007    2024-05
20015    2024-05
20014    2024-05
20013    2024-05
          ...   
9927     2022-01
9926     2022-01
9925     2022-01
9924     2022-01
9999     2022-01
Name: pub_date, Length: 27059, dtype: period[M]

In [28]:
# Build the two derived export columns:
#   pub_year - the year component of pub_date
#   text     - title and abstract joined as "<title>. <abstract>"
year_of_publication = df_filtered['pub_date'].dt.year
combined_text = df_filtered['title'] + '. ' + df_filtered['abstract']
df_filtered['pub_year'] = year_of_publication
df_filtered['text'] = combined_text

In [29]:
# Preview the first two combined "title. abstract" strings.
df_filtered[['text']].head(2)

Unnamed: 0,text
20013,"METHOD FOR DYNAMIC TARGET TRACKING BY LEGGED ROBOT. Disclosed in the present invention is a method for dynamic target tracking by a legged robot. Firstly, images and videos in walking processes of pedestrians wearing masks, not wearing masks, and incorrectly wearing masks are collected to build a data set used for pedestrian mask wearing detection. Secondly, a target recognition module based on a recurrent neural network is used to determine whether a pedestrian wears a mask, and acquire face position coordinates of a pedestrian not wearing a mask in a current frame. Thirdly, a residual network is used as a reference network to extract a face semantic feature of the pedestrian not wearing a mask in the current image frame, and predict a face semantic feature of the pedestrian in a next image frame. Finally, a target tracking module based on a siamese network is designed to, by calculating a correlation coefficient between pedestrian face position semantic feature mapping in the current frame and the next frame, track the pedestrian ..."
20008,"OPERATION AND MAINTENANCE ROBOT. An operation and maintenance robot (100), comprising: an operation and maintenance robot body (10); first support members (20, 20a) each comprising a first support body (21) and a connection part (22), the connection part (22) being fixedly arranged on the first support body (21), and the first support body (21) being fixed to the operation and maintenance robot body (10); second support members (30, 30a) each comprising a second support body (31), a first sliding part (32a) and a second sliding part (32b), the second support body (31) comprising a first side surface and a second side surface, the first sliding part (32a) being fixedly arranged on the first side surface, the second sliding part (32b) being fixedly arranged on the second side surface, and the first sliding part (32a) being slidably connected to the connection part (22), such that the second support members (30, 30a) are close to or away from the operation and maintenance robot body (10) relative to the first support ..."


In [30]:
# title, abstract and pub_date have been folded into `text` / `pub_year`;
# remove them from the frame.
superseded_columns = ['title', 'abstract', 'pub_date']
df_filtered = df_filtered.drop(columns=superseded_columns)

In [32]:
# Write id, pub_year and text (in that order) to the output CSV, without the
# row index.
export_columns = ['id', 'pub_year', 'text']
df_filtered.to_csv(data_out_path, columns=export_columns, index=False)