In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import io
from datetime import datetime

# Current local date as an 8-digit YYYYMMDD string.
timestamp = f"{datetime.now():%Y%m%d}"



### <span style="color: blue; ">Data: all records (including those with a missing abuse type)</span>

##### Load the input as "data0", apply the category ordering, and store the result as "df"
input dataは元データセットの内容で、CGC, sex, age_year&age_month→age, abuse, 歯種以外の項目を英語でmappingしたデータです。

In [2]:
# Raw input for the whole notebook. NOTE(review): absolute path on the author's
# machine; the notebook will not run elsewhere without editing it.
raw_csv_path = "/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/data/AllData_tillMar2024.csv"
data0 = pd.read_csv(raw_csv_path)

# data0.head()

In [3]:
# 1. Define the logical order for each column
abuse_order = [
    "Physical Abuse", "Neglect", "Emotional Abuse", "Sexual Abuse","Delinquency","Parenting Difficulties","Others"
]
occlusal_order = [
    "Normal Occlusion", "Crowding", "Anterior Crossbite", "Open Bite", "Maxillary Protrusion", "Crossbite", "Others"
]
need_treated_order = ["No Treatment Required", "Treatment Required"]
emergency_order = ["Urgent Treatment Required"]
gingivitis_order = ["No Gingivitis", "Gingivitis"]
oral_clean_order = ["Poor", "Fair", "Good"]
habits_order = [
    "None", "Digit Sucking", "Nail biting", "Tongue Thrusting", "Smoking", "Others"
]

# Column name -> ordered list of levels for that column
category_orders = {
    'abuse': abuse_order,
    'occlusalRelationship': occlusal_order,
    'needTOBEtreated': need_treated_order,
    'emergency': emergency_order,
    'gingivitis': gingivitis_order,
    'OralCleanStatus': oral_clean_order,
    'habits': habits_order,
}

# 2. Convert columns to Categorical with the specified order
# Work on a copy so the raw frame `data0` is left untouched.
target_df = data0.copy()

for _col, _levels in category_orders.items():
    # pd.Categorical silently replaces every value that is not listed in
    # `categories` with NaN. Report such labels (spelling / capitalisation
    # mismatches) instead of losing them without a trace.
    _unlisted = sorted(set(target_df[_col].dropna().unique()) - set(_levels), key=str)
    if _unlisted:
        print(f"WARNING: column '{_col}' contains values outside its category list; they become NaN: {_unlisted}")
    target_df[_col] = pd.Categorical(target_df[_col], categories=_levels, ordered=True)

# `df` is another name for the same object as `target_df` (no copy is made).
df = target_df

print("Columns converted to ordered categories.")

Columns converted to ordered categories.


In [4]:
# Simple counts
# len(df) is the total number of records (rows), including rows whose
# 'abuse' value is missing.
print(f"Abuse types count: {len(df)}")
print(df['abuse'].value_counts())

# NOTE(review): this header is repeated below; here the table is the overall
# distribution of abuse_num, not a per-abuse-type breakdown.
print("\nAbuse_num for each abuse type:")
print(df['abuse_num'].value_counts())

print("\nAbuse_num for each abuse type:")
# 'abuse' is a categorical column. Passing observed=False explicitly keeps the
# current output (zero-count combinations are listed) and avoids the pandas
# FutureWarning about the changing default of `observed`.
print(df.groupby('abuse', observed=False)['abuse_num'].value_counts())

Abuse types count: 2162
abuse
Physical Abuse            677
Delinquency               540
Neglect                   355
Parenting Difficulties    224
Emotional Abuse           211
Others                     87
Sexual Abuse               62
Name: count, dtype: int64

Abuse_num for each abuse type:
abuse_num
1    2071
2      85
0       6
Name: count, dtype: int64

Abuse_num for each abuse type:
abuse                   abuse_num
Physical Abuse          1            646
                        2             31
                        0              0
Neglect                 1            328
                        2             27
                        0              0
Emotional Abuse         1            201
                        2             10
                        0              0
Sexual Abuse            1             60
                        2              2
                        0              0
Delinquency             1            531
                        2            

  print(df.groupby('abuse')['abuse_num'].value_counts())


#### DMFT Calculation → Final DataFrame

In [33]:
# Permanent-tooth columns: quadrant prefixes U1, U2, L3, L4 with positions
# running 7 -> 1 in the first quadrant of each arch and 1 -> 7 in the second.
# (Presumably FDI-style two-digit tooth numbers with a jaw prefix; confirm.)
perm_cols = (
    [f'U1{n}' for n in range(7, 0, -1)]
    + [f'U2{n}' for n in range(1, 8)]
    + [f'L3{n}' for n in range(7, 0, -1)]
    + [f'L4{n}' for n in range(1, 8)]
)

# Primary (baby) tooth columns: quadrant prefixes u5, u6, l7, l8, positions 1-5.
baby_cols = (
    [f'u5{n}' for n in range(5, 0, -1)]
    + [f'u6{n}' for n in range(1, 6)]
    + [f'l7{n}' for n in range(5, 0, -1)]
    + [f'l8{n}' for n in range(1, 6)]
)

# Every tooth column, permanent first.
all_cols = perm_cols + baby_cols

# Region split by the position digit (last character of the column name):
# 1-3 = front teeth, 4 and above = back teeth.
perm_front = [col for col in perm_cols if col.endswith(('1', '2', '3'))]
perm_back = [col for col in perm_cols if col.endswith(('4', '5', '6', '7'))]
baby_front = [col for col in baby_cols if col.endswith(('1', '2', '3'))]
baby_back = [col for col in baby_cols if col.endswith(('4', '5'))]

関数

In [34]:
def calculate_comprehensive_metrics(row, perm_columns=None, baby_columns=None):
    """
    Compute caries indices (DMFT etc.) and risk indices (UTN etc.) for one record,
    together with the detailed D, M, F / d, m, f breakdown.

    Tooth status codes as interpreted by this function:
        -1 unerupted, 0 sound, 1 filled (treated), 2 C0 (tooth under observation),
        3 decayed (untreated caries), 4 missing, 6 congenitally absent,
        7 trauma, 8 residual root.

    Parameters
    ----------
    row : pandas.Series
        One record holding a status code for every tooth column.
    perm_columns : list of str, optional
        Names of the permanent-tooth columns. Defaults to the notebook-level
        ``perm_cols`` list.
    baby_columns : list of str, optional
        Names of the primary-tooth columns. Defaults to the notebook-level
        ``baby_cols`` list.

    Returns
    -------
    pandas.Series
        21 metrics keyed by name (``Perm_D`` ... ``UTN_Score``). Rates are
        percentages rounded to one decimal place. ``Care_Index`` is NaN when the
        record has no caries experience.
    """
    # Fall back to the notebook globals so existing calls such as
    # df.apply(calculate_comprehensive_metrics, axis=1) keep working unchanged.
    if perm_columns is None:
        perm_columns = perm_cols
    if baby_columns is None:
        baby_columns = baby_cols

    # Extract the codes per dentition
    p_teeth = row[perm_columns]  # permanent teeth
    b_teeth = row[baby_columns]  # primary teeth
    all_teeth = pd.concat([p_teeth, b_teeth])  # whole mouth

    # -------------------------------------------------------
    # A. Individual counts (D, M, F, d, m, f)
    # Note: D/d include both code 3 (caries) and code 8 (residual root),
    #       because both are untreated.
    # -------------------------------------------------------
    # Permanent teeth
    Perm_D = (p_teeth == 3).sum() + (p_teeth == 8).sum()  # untreated + residual root
    Perm_M = (p_teeth == 4).sum()                         # missing
    Perm_F = (p_teeth == 1).sum()                         # filled
    Perm_DMFT = Perm_D + Perm_M + Perm_F                  # DMFT: D + M + F

    Perm_C0 = (p_teeth == 2).sum()                        # permanent teeth under observation (C0)
    Perm_DMFT_C0 = Perm_D + Perm_M + Perm_F + Perm_C0     # DMFT + C0

    # Primary teeth
    Baby_d = (b_teeth == 3).sum() + (b_teeth == 8).sum()  # untreated + residual root
    Baby_m = (b_teeth == 4).sum()                         # missing
    Baby_f = (b_teeth == 1).sum()                         # filled
    Baby_DMFT = Baby_d + Baby_m + Baby_f

    Baby_C0 = (b_teeth == 2).sum()                        # primary teeth under observation (C0)
    Baby_DMFT_C0 = Baby_d + Baby_m + Baby_f + Baby_C0     # dmft + C0

    # Other whole-mouth counts
    count_C0 = (all_teeth == 2).sum()         # teeth under observation
    count_Trauma = (all_teeth == 7).sum()     # traumatised teeth
    count_RDT_total = (all_teeth == 8).sum()  # residual roots

    # -------------------------------------------------------
    # B. Summary indices
    # -------------------------------------------------------
    # DMFT index: combined burden of permanent and primary teeth
    dmft_total_score = (Perm_D + Perm_M + Perm_F) + (Baby_d + Baby_m + Baby_f)
    dmft_C0 = Perm_DMFT_C0 + Baby_DMFT_C0

    # Present teeth: exclude -1 (unerupted), 6 (congenitally absent),
    # 4 (missing) and NA. A missing tooth counts as past caries experience but
    # is not currently in the mouth, so it is excluded here.
    present_teeth = len(all_teeth) - all_teeth.isin([-1, 6, 4]).sum() - all_teeth.isna().sum()
    present_baby_teeth = len(b_teeth) - b_teeth.isin([-1, 6, 4]).sum() - b_teeth.isna().sum()
    present_perm_teeth = len(p_teeth) - p_teeth.isin([-1, 6, 4]).sum() - p_teeth.isna().sum()

    # Healthy rate: sound teeth (code 0 only) / present teeth
    count_sound = (all_teeth == 0).sum()
    if present_teeth > 0:
        healthy_rate = (count_sound / present_teeth) * 100
    else:
        # NOTE(review): a record with no present teeth gets 0 here, whereas
        # Care_Index uses NaN for its undefined case. Confirm 0 is intended.
        healthy_rate = 0

    # Care index: F / (D + M + F)
    total_filled = Perm_F + Baby_f
    if dmft_total_score > 0:
        care_index = (total_filled / dmft_total_score) * 100
    else:
        care_index = np.nan  # undefined for a caries-free record

    # -------------------------------------------------------
    # C. Neglect / risk metrics
    # -------------------------------------------------------
    # UTN (untreated caries rate): (D + d) / (D + d + F + f); M is not included
    active_decay = Perm_D + Baby_d
    total_experience_present = active_decay + total_filled

    if total_experience_present > 0:
        utn_score = (active_decay / total_experience_present) * 100
    else:
        # No decayed or filled teeth: reported as 0 rather than NaN.
        utn_score = 0

    # Return everything as one Series (one output column per key when used
    # with DataFrame.apply(axis=1)).
    return pd.Series({
        'Perm_D': Perm_D, 'Perm_M': Perm_M, 'Perm_F': Perm_F,
        'Baby_d': Baby_d, 'Baby_m': Baby_m, 'Baby_f': Baby_f,
        'Perm_DMFT': Perm_DMFT,
        'Baby_DMFT': Baby_DMFT,
        'Perm_DMFT_C0': Perm_DMFT_C0,
        'Baby_DMFT_C0': Baby_DMFT_C0,
        'DMFT_Index': dmft_total_score,  # caries-experienced teeth, whole mouth
        'DMFT_C0': dmft_C0,

        'Present_Teeth': present_teeth,
        'Present_Perm_Teeth': present_perm_teeth,
        'Present_Baby_Teeth': present_baby_teeth,

        'Healthy_Rate': round(healthy_rate, 1),
        'C0_Count': count_C0,
        'Care_Index': round(care_index, 1),

        'Trauma_Count': count_Trauma,     # number of traumatised teeth
        'RDT_Count': count_RDT_total,     # number of residual roots
        'UTN_Score': round(utn_score, 1)  # untreated rate (%)
    })

In [35]:
# ==========================================
# 3. Compute the metrics and attach them to the data frame
# ==========================================

# Apply the function row by row; each returned Series becomes one row of metrics_df
metrics_df = df.apply(calculate_comprehensive_metrics, axis=1)

# Attach the metrics side by side (axis=1).
# A later cell rebinds `df` to `df_final`, so when this cell is re-run `df`
# may already contain the metric columns. Drop any such stale columns first,
# otherwise the concat would produce duplicate column names. On a fresh run
# nothing is dropped.
df_final = pd.concat(
    [df.drop(columns=metrics_df.columns, errors='ignore'), metrics_df],
    axis=1,
)

# ==========================================
# Verification output
# ==========================================
print("--- 追加された指標のカラム ---")
print(metrics_df.columns.tolist())

--- 追加された指標のカラム ---
['Perm_D', 'Perm_M', 'Perm_F', 'Baby_d', 'Baby_m', 'Baby_f', 'Perm_DMFT', 'Baby_DMFT', 'Perm_DMFT_C0', 'Baby_DMFT_C0', 'DMFT_Index', 'DMFT_C0', 'Present_Teeth', 'Present_Perm_Teeth', 'Present_Baby_Teeth', 'Healthy_Rate', 'C0_Count', 'Care_Index', 'Trauma_Count', 'RDT_Count', 'UTN_Score']


### Grade Setting

In [36]:
# `df` becomes another name for `df_final` (no copy), so the new column below
# is added to `df_final` as well.
df = df_final
# Create life_stage column
# life stage: 0-5 Early Childhood
# life stage: 6-11 Middle Childhood
# life stage: 12-17 Adolescence
# With right=False the bins are [0, 6), [6, 12), [12, 18); an age_year outside
# 0-17 (e.g. 18) falls in no bin and gets a missing life_stage.
df['life_stage'] = pd.cut(
    df['age_year'],
    bins=[0, 6, 12, 18],
    labels=['Early Childhood', 'Middle Childhood', 'Adolescence'],
    right=False,
    include_lowest=True
)

# Detailed verification
# `life_stage` is categorical: observed=False is passed explicitly to keep the
# current behaviour (all three stages listed) and to avoid the pandas
# FutureWarning about the changing default of `observed`.
print("=== Life Stage Summary ===")
print(df.groupby('life_stage', observed=False).agg({
    'age_year': ['min', 'max', 'count', 'mean']
}).round(2))

print("\n=== Sample Records ===")
print(df[['age_year', 'life_stage', 'abuse']].head(15))

# Check for any missing values
print("\n=== Missing Values ===")
print(f"Total records: {len(df)}")
print(f"Life stage assigned: {df['life_stage'].notna().sum()}")
print(f"Life stage missing: {df['life_stage'].isna().sum()}")

=== Life Stage Summary ===
                 age_year                 
                      min max count   mean
life_stage                                
Early Childhood         1   5   318   3.69
Middle Childhood        6  11   736   8.90
Adolescence            12  17  1108  14.01

=== Sample Records ===
    age_year   life_stage                   abuse
0         16  Adolescence  Parenting Difficulties
1         14  Adolescence             Delinquency
2         15  Adolescence          Physical Abuse
3         14  Adolescence         Emotional Abuse
4         15  Adolescence                 Neglect
5         16  Adolescence             Delinquency
6         16  Adolescence                  Others
7         15  Adolescence          Physical Abuse
8         17  Adolescence          Physical Abuse
9         16  Adolescence             Delinquency
10        13  Adolescence                  Others
11        16  Adolescence         Emotional Abuse
12        13  Adolescence             Del

  print(df.groupby('life_stage').agg({


In [37]:
# Write the full frame to CSV; the row count is embedded in the file name.
final_csv_path = f"/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/data/df_final_dmft_Grade_N{len(df)}.csv"
df.to_csv(final_csv_path)

### <span style="color: blue; ">Data</span> → 単一の入所理由 → 虐待のみ

In [42]:
# Reload the CSV written above. The file name is built from the row count of
# the `df` currently in memory, so this cell needs the preceding cells to have
# been run in the same kernel.
# NOTE(review): a CSV round trip returns text columns as plain strings, so the
# ordered-categorical dtypes set earlier are presumably lost here; confirm
# that downstream code does not rely on them.
reload_csv_path = f"/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/data/df_final_dmft_Grade_N{len(df)}.csv"
df = pd.read_csv(reload_csv_path, index_col=0)
df.head()

Unnamed: 0,No_All,CGC,date,sex,age_year,age_month,age,abuse_1,abuse,abuse_num,...,Present_Teeth,Present_Perm_Teeth,Present_Baby_Teeth,Healthy_Rate,C0_Count,Care_Index,Trauma_Count,RDT_Count,UTN_Score,life_stage
0,1,tokyo,2016-10-12,Female,16,3,16.25,,Parenting Difficulties,1,...,28.0,28.0,0.0,67.9,2.0,28.6,0.0,0.0,71.4,Adolescence
1,2,tokyo,2016-10-12,Female,14,6,14.5,,Delinquency,1,...,26.0,26.0,0.0,84.6,0.0,50.0,0.0,0.0,50.0,Adolescence
2,3,tokyo,2016-10-12,Female,15,1,15.083333,,Physical Abuse,1,...,28.0,28.0,0.0,82.1,2.0,100.0,0.0,0.0,0.0,Adolescence
3,4,tokyo,2016-10-12,Female,14,5,14.416667,,Emotional Abuse,1,...,28.0,28.0,0.0,100.0,0.0,,0.0,0.0,0.0,Adolescence
4,5,tokyo,2016-10-12,Female,15,10,15.833333,,Neglect,1,...,28.0,28.0,0.0,60.7,3.0,62.5,0.0,0.0,37.5,Adolescence


In [43]:
# Keep only the records whose abuse_num equals 1.
has_single_reason = df['abuse_num'].eq(1)
df_Only1Abuse = df[has_single_reason]

# Simple counts
n_single_reason = len(df_Only1Abuse)
print(f"Abuse types count: N={n_single_reason}")
print(df_Only1Abuse['abuse'].value_counts())

print("\nAbuse_num for each abuse type:")
print(df_Only1Abuse['abuse_num'].value_counts())

# Save the subset; the row count is embedded in the file name.
only1abuse_csv_path = f"/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/data/data_noNA_Only1Abuse_N{n_single_reason}.csv"
df_Only1Abuse.to_csv(only1abuse_csv_path)

Abuse types count: N=2071
abuse
Physical Abuse            646
Delinquency               531
Neglect                   328
Parenting Difficulties    218
Emotional Abuse           201
Others                     87
Sexual Abuse               60
Name: count, dtype: int64

Abuse_num for each abuse type:
abuse_num
1    2071
Name: count, dtype: int64


### Only Abuse

In [44]:
# The four categories kept in the abuse-only subset.
target_abuse_types = ["Physical Abuse", "Neglect", "Emotional Abuse", "Sexual Abuse"]

# Boolean mask: True where the record's 'abuse' value is one of the targets.
is_target_abuse = df_Only1Abuse['abuse'].isin(target_abuse_types)
df_OnlyAbuse = df_Only1Abuse[is_target_abuse]

n_only_abuse = len(df_OnlyAbuse)
print(f"Abuse types count: N={n_only_abuse}")
print(df_OnlyAbuse['abuse'].value_counts())

# Save the subset; the row count is embedded in the file name.
df_OnlyAbuse.to_csv(
    f"/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/data/data_OnlyAbuse_N{n_only_abuse}.csv"
)

Abuse types count: N=1235
abuse
Physical Abuse     646
Neglect            328
Emotional Abuse    201
Sexual Abuse        60
Name: count, dtype: int64


In [45]:
# Show the path the abuse-only subset is saved under.
only_abuse_csv_path = f"/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/data/data_OnlyAbuse_N{len(df_OnlyAbuse)}.csv"
print(only_abuse_csv_path)

/Users/ayo/Desktop/_GSAIS_/Research/OralHealth_tokyo/paper_analysis/data/data_OnlyAbuse_N1235.csv
