# Convert No

`No`列に`注1`というような注記が書かれている場合に、注記部分を除去する処理。
ただし、事前に表の末尾に記載の注記内容を確認して必要な処理を行ったうえで、この処理を実行すること。

In [18]:
import os, re
import pandas as pd

# Input and output locations. The file names are derived from a fixed file id;
# the trailing comments mark which command-line argument each one stands for.
file_id = '001325486'
csv_folder = os.path.join('..', 'intermediate-files')
csv_file_name = f'{file_id}-manually2.csv'  # sys.argv[1]
converted_csv_file_name = f'{file_id}-manually3.csv'  # sys.argv[2]
csv_file_path = os.path.join(csv_folder, csv_file_name)

# Keep the frame exactly as loaded in `original_df`; all clean-up below works
# on the independent copy `df`.
original_df = pd.read_csv(csv_file_path)
df = original_df.copy()

In [19]:
# Only when the `no` column holds strings: strip note markers such as "注1"
# and convert the column to int.
if pd.api.types.is_string_dtype(df['no'].dtype):
	# `\d` (unlike `[0-9]`) also matches full-width digits, so a marker written
	# as "注１" is removed as a whole. With `[0-9]` the full-width digit would
	# be left behind and int() would silently read it as part of the number.
	note_marker = re.compile(r'注\d*')
	df['no'] = df['no'].map(
		# Non-string cells (e.g. an already numeric value in a mixed column)
		# are passed through unchanged and handled by astype(int) below.
		lambda no_value: note_marker.sub('', no_value) if isinstance(no_value, str) else no_value
	).astype(int)

In [20]:
# Order the rows by `no`, then renumber the index from 0 so that index labels
# follow the sorted order.
df = df.sort_values(by='no')
df = df.reset_index(drop=True)

In [21]:
# Clean up the extracted columns in several passes.
## Pass 1: delete everything from the "※" mark onward.
def delete_after_asterisk(df, key):
	"""Strip a trailing "※ ..." annotation from the string cells of column `key`.

	Only cells that are strings and contain "※" are rewritten: their line
	breaks are removed first (so an annotation that wraps over several lines
	is deleted as a whole), then everything from the first "※" to the end of
	the cell is cut off. Missing values, non-string values and cells without
	the marker are left untouched.

	Args:
		df: DataFrame to modify in place.
		key: Name of the column to clean.

	Returns:
		None. `df` is updated in place.
	"""
	column = df[key]
	# Build an explicit boolean mask. Selecting by the index of a
	# `str.contains` result would pick every non-missing row, not only the
	# rows that actually contain the marker.
	has_marker = column.map(lambda cell: isinstance(cell, str) and '※' in cell).astype(bool)
	if not has_marker.any():
		return

	def strip_annotation(cell):
		# Join wrapped lines first: `.` in the pattern does not match "\n".
		flattened = cell.replace('\r\n', '\n').replace('\n', '')
		return re.sub('※.*', '', flattened)

	df.loc[has_marker, key] = [strip_annotation(cell) for cell in column[has_marker]]

In [22]:
## Pass 2: when a cell contains a right arrow, keep only the right-hand side.
def use_right_side_of_arrow(df, key):
	"""Replace "old→new" cells of column `key` with the value after the last "→".

	Only cells that are strings and contain "→" are rewritten. For those
	cells, line breaks are removed, everything from the first full-width
	opening parenthesis "（" onward is dropped, and the text after the last
	"→" is kept. Missing values, non-string values and cells without an arrow
	are left untouched, so parenthesised text in ordinary cells is preserved.

	Args:
		df: DataFrame to modify in place.
		key: Name of the column to clean.

	Returns:
		None. `df` is updated in place.
	"""
	column = df[key]
	# Build an explicit boolean mask. Selecting by the index of a
	# `str.contains` result would pick every non-missing row and truncate
	# parenthesised text in cells that have no arrow at all.
	has_arrow = column.map(lambda cell: isinstance(cell, str) and '→' in cell).astype(bool)
	if not has_arrow.any():
		return

	def take_right_side(cell):
		# Join wrapped lines first: `.` in the pattern does not match "\n".
		flattened = cell.replace('\r\n', '\n').replace('\n', '')
		without_parenthesis = re.sub('（.*', '', flattened)
		# With several arrows the right-most segment wins; if the arrow was
		# inside the removed parenthesis, the remaining text is kept as is.
		return without_parenthesis.split('→')[-1]

	df.loc[has_arrow, key] = [take_right_side(cell) for cell in column[has_arrow]]

In [23]:
# Remove "※ ..." annotations from every free-text column, one column at a time.
for asterisk_target_column in (
	'age',
	'vaccinated_date',
	'onset_date',
	'lot_no',
	'vaccinated_times',
	'pre_existing_disease_names',
	'reported_desc',
	'causal_relationship',
	'possible_presence_of_other_factors',
):
	delete_after_asterisk(df, asterisk_target_column)

In [24]:
# Resolve "old→new" corrections in every free-text column, one column at a time.
for arrow_target_column in (
	'age',
	'vaccinated_date',
	'onset_date',
	'lot_no',
	'vaccinated_times',
	'pre_existing_disease_names',
	'reported_desc',
	'causal_relationship',
	'possible_presence_of_other_factors',
):
	use_right_side_of_arrow(df, arrow_target_column)

In [25]:
# Rows with no causal_relationship: if tests_used_for_determination holds
# exactly two space-separated tokens, treat the second one as the missing
# causal_relationship value.
cr_nan_df = df[df['causal_relationship'].isna()]
tufd_series = cr_nan_df['tests_used_for_determination'].dropna()
# astype(str) keeps the `.str` accessor usable even when the selection is
# empty or the column was read as a non-string dtype.
split_tufd = tufd_series.astype(str).str.split(' ')

# A column that was read as all-NaN has a float dtype; convert it to object
# before writing strings into it (same guard as used for lot_no).
if pd.api.types.is_float_dtype(df['causal_relationship'].dtype):
	df['causal_relationship'] = df['causal_relationship'].astype(object)

for index, value in split_tufd.items():
	if len(value) == 2:
		df.loc[index, 'tests_used_for_determination'] = value[0]
		df.loc[index, 'causal_relationship'] = value[1]


In [26]:
# Rows with no lot_no: the onset_date cell may hold several space-separated
# values that belong to neighbouring columns.
#   2 tokens -> onset_date, lot_no
#   3 tokens -> vaccinated_date, onset_date, lot_no
lot_no_nan_df = df[df['lot_no'].isna()]
# Rows whose onset_date is missing as well have nothing to split; dropping
# them avoids calling len() on NaN. astype(str) keeps `.str` usable on an
# empty or non-string selection.
split_onset_date = lot_no_nan_df['onset_date'].dropna().astype(str).str.split(' ')

# Columns that were read as all-NaN have a float dtype; convert them to object
# before writing strings into them.
for float_guard_column in ('lot_no', 'vaccinated_date'):
	if pd.api.types.is_float_dtype(df[float_guard_column].dtype):
		df[float_guard_column] = df[float_guard_column].astype(object)

for index, value in split_onset_date.items():
	if len(value) == 2:
		df.loc[index, 'onset_date'] = value[0]
		df.loc[index, 'lot_no'] = value[1]
	elif len(value) == 3:
		df.loc[index, 'vaccinated_date'] = value[0]
		df.loc[index, 'onset_date'] = value[1]
		df.loc[index, 'lot_no'] = value[2]

In [27]:
# Rows with no vaccinated_date: if the gender cell holds exactly two
# space-separated tokens, treat the second one as the missing vaccinated_date.
vd_nan_df = df[df['vaccinated_date'].isna()]
# Rows whose gender is missing as well have nothing to split; dropping them
# avoids calling len() on NaN. astype(str) keeps `.str` usable on an empty or
# non-string selection.
split_gender = vd_nan_df['gender'].dropna().astype(str).str.split(' ')

# A column that was read as all-NaN has a float dtype; convert it to object
# before writing strings into it (same guard as used for lot_no).
if pd.api.types.is_float_dtype(df['vaccinated_date'].dtype):
	df['vaccinated_date'] = df['vaccinated_date'].astype(object)

for index, value in split_gender.items():
	if len(value) == 2:
		df.loc[index, 'gender'] = value[0]
		df.loc[index, 'vaccinated_date'] = value[1]

In [28]:
# Rows with no onset_date: if the vaccinated_date cell holds exactly two
# space-separated tokens, treat the second one as the missing onset_date.
onset_date_nan_df = df[df['onset_date'].isna()]
# Rows whose vaccinated_date is missing as well have nothing to split;
# dropping them avoids calling len() on NaN. astype(str) keeps `.str` usable
# on an empty or non-string selection.
split_vd = onset_date_nan_df['vaccinated_date'].dropna().astype(str).str.split(' ')

# A column that was read as all-NaN has a float dtype; convert it to object
# before writing strings into it (same guard as used for lot_no).
if pd.api.types.is_float_dtype(df['onset_date'].dtype):
	df['onset_date'] = df['onset_date'].astype(object)

for index, value in split_vd.items():
	if len(value) == 2:
		df.loc[index, 'vaccinated_date'] = value[0]
		df.loc[index, 'onset_date'] = value[1]

In [29]:
# Report the rows that still have no lot number after the automatic repairs;
# the printed message asks for them to be fixed by hand.
lot_no_nan_df_series = df.loc[df['lot_no'].isna(), 'no']
if lot_no_nan_df_series.count() > 0:
	lot_no_report_lines = ['以下の項目はロット番号が空のため、手作業で修正が必要です。']
	lot_no_report_lines.extend(
		f' - Index {index}, No {no_value}'
		for index, no_value in lot_no_nan_df_series.items()
	)
	print('\n'.join(lot_no_report_lines))

In [30]:
converted_csv_file_path = os.path.join(csv_folder, converted_csv_file_name)
# Let pandas open and write the file itself. Writing the string returned by
# to_csv() through a text-mode handle translates its line terminators a second
# time on platforms where os.linesep is "\r\n", which shows up as an empty
# line after every row.
df.to_csv(converted_csv_file_path, index=False, encoding='utf-8')

In [31]:
## List and log the rows whose age may have been struck through and rewritten.
# na=False makes missing ages count as "no match"; without it the mask
# contains NaN and boolean indexing raises.
need_manually_fix_age_series = df[df['age'].str.contains('基づ', na=False)]['age']
for index, age_value in need_manually_fix_age_series.items():
	# Flatten wrapped cells so each report stays on a single line.
	age_value = age_value.replace('\r\n', '\n').replace('\n', '')
	print(f'No. {df.loc[index, "no"]} の 年齢 {age_value} は取り消し線で訂正されている可能性があります。手作業で修正が必要です。')

In [32]:
# Report the rows whose causal_relationship_by_expert is missing; the printed
# message asks for them to be fixed by hand.
crbe_nan_series = df.loc[df['causal_relationship_by_expert'].isna(), 'no']
if len(crbe_nan_series) != 0:
	crbe_report_lines = ['以下のデータは causal_relationship_by_expert がNaNです。手作業で修正が必要です。']
	crbe_report_lines.extend(
		f' - Index: {index}, No: {value}'
		for index, value in crbe_nan_series.items()
	)
	print('\n'.join(crbe_report_lines))

In [33]:
def remove_empty_lines(source_path, target_path):
    fixed_data = ''
    with open(source_path, encoding="utf-8") as f:
        for line in f:
            if line.isspace():
                continue
            if line.startswith('0,1,2,3,4,5'):
                continue
            if line.startswith(','):
                line = re.sub('^,', '', line)
            fixed_data += line

    with open(target_path, "w", encoding="utf-8") as f:
        f.write(fixed_data)

In [34]:
# Clean the converted CSV in place: the same path is used as source and target.
remove_empty_lines(
    source_path=converted_csv_file_path,
    target_path=converted_csv_file_path,
)