<a href="https://colab.research.google.com/github/ykitaguchi77/statistics_for_articles/blob/main/Orbital_tumor_statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Orbital tumor**

In [1]:
# prompt: mount gdrive
# Mounts Google Drive at /content/drive so the CSV read later in this notebook
# can be addressed by a path under that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install japanize_matplotlib

Collecting japanize_matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: japanize_matplotlib
  Building wheel for japanize_matplotlib (setup.py) ... [?25l[?25hdone
  Created wheel for japanize_matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120257 sha256=26e991511994c5cfab86ce896aa50a1c315bbb3387de02c262869054d61c685f
  Stored in directory: /root/.cache/pip/wheels/61/7a/6b/df1f79be9c59862525070e157e62b08eab8ece27c1b68fbb94
Successfully built japanize_matplotlib
Installing collected packages: japanize_matplotlib
Successfully installed japanize_matplotlib-1.1.3


In [3]:
import pandas as pd

# Read the merged patient table from the mounted Drive folder.
# The file is decoded as Shift-JIS.
file_path = (
    "/content/drive/MyDrive/発表/2024近畿神経眼科セミナー/"
    "merged_patient_data.csv"
)
data = pd.read_csv(file_path, encoding='shift-jis')

# Disease categories and the diagnosis keywords that select them.
# Keywords are matched case-insensitively as substrings of the '概要' text.
# Spelling variants (e.g. 'Lymphatic malfomation') are deliberate runtime
# strings and must be left exactly as they are.
categories = {
    '炎症性疾患': [
        'Inflammation', 'myositis', 'GPA', 'Sarcoidosis'
    ],
    'リンパ性疾患': [
        'IgG4-ROD', 'MALT lymphoma', 'Diffuse large B-cell lymphoma',
        'Follicular lymphoma', 'Indolent lymphoma', 'Reactive lymphoid hyperplasia',
        'NKT lymphoma', 'Malignant lymphoma'
    ],
    '嚢胞': [
        'Dermoid', 'Dacriops', 'Cyst', 'Epidermal cyst'
    ],
    '上皮性腫瘍': [
        'Pleomorphic adenoma', 'Adenoid cystic carcinoma', 'Squamous cell carcinoma',
        'Sebaceous gland carcinoma', 'Carcinoma ex pleomorphic adenoma', 'Sebaceous gland carcinomas'
    ],
    '非上皮性良性腫瘍': [
        'Cavenous hemangioma', 'Schwannoma', 'Lipoma', 'Neurofibroma'
    ],
    '非上皮性悪性腫瘍': [
        'Metastatic', 'Malignant melanoma', 'Sarcoma', 'Ewing', 'Clear cell sarcoma',
        'well-differentiated liposarcoma', 'Sebaceous gland carcinoma'
    ],
    '血管性病変': [
        'Lymphatic malfomation', 'Venous malformation', 'Infantile hemangioma',
        'Lymphatic malformation'
    ],
    '眼窩外からの進展': [
        'Osteoma', 'Paranasal sinus carcinoma', 'Paranasal sinus cyst',
        'Paranasal sinus SCC', 'Paranasal sinus adenoid cystic carcinoma',
        'Paranasal sinus melanoma', 'Fibrous dysplasia', 'Parasinus sinus carcinoma'
    ],
    'その他': [
        'Orbital abscess', 'Foreign body granuloma', 'Amyloidosis', 'Multiple myeloma',
        'Optic meningioma', 'Meningioma', 'Swollen MiraGel', 'Xanthogranuloma', 'CCF',
        'Jugular venous reflux syndrome', 'Venous thrombosis', 'Hystiocytosis', 'Hematoma',
        'Optic glioma', 'Fibrous lesion'
    ]
}

# (lower-cased keyword, category) pairs, longest keyword first.
# Trying the most specific keyword first stops a short keyword from claiming a
# longer diagnosis that merely contains it: with plain dict-order matching,
# 'Cyst' captured "Adenoid cystic carcinoma" and "Paranasal sinus cyst" as
# '嚢胞' before their own entries were ever reached.
# sorted() is stable, so keywords of equal length keep the dict order above.
# The table is built once from the dict defined in this cell, so later cells
# that rebind `categories` do not affect categorize_row.
_KEYWORD_LOOKUP = sorted(
    (
        (keyword.lower(), category)
        for category, keywords in categories.items()
        for keyword in keywords
    ),
    key=lambda pair: -len(pair[0]),
)


def categorize_row(row):
    """Return the disease category for one '概要' (summary) value.

    Args:
        row: The summary text of a single patient row. Despite the name this
            is a single cell value, not a DataFrame row.

    Returns:
        The category whose longest keyword occurs (case-insensitively) in
        ``row``, or '分類なし' when no keyword matches or when ``row`` is not
        a string (e.g. NaN for a missing cell).
    """
    # Missing cells arrive as float NaN; calling .lower() on them would raise.
    if not isinstance(row, str):
        return '分類なし'

    text = row.lower()
    for keyword, category in _KEYWORD_LOOKUP:
        if keyword in text:
            return category
    return '分類なし'

# Label every patient row with the category derived from its '概要' text.
data['分類'] = data['概要'].map(categorize_row)

# Collect the rows that no keyword matched so they can be checked.
uncategorized = data.loc[data['分類'] == '分類なし']


In [4]:
# Fill the 'diagnosis' column with one of three labels, chosen from the
# '病理診断' flag and from whether '診断_x' holds a value.
# Rows that satisfy none of the three conditions are left without a label.
no_pathology = data['病理診断'] == 'なし'
has_pathology = data['病理診断'] == 'あり'
missing_dx = data['診断_x'].isna()

# Case 1: 病理診断 == 'なし'  ->  no pathological examination.
data.loc[no_pathology, 'diagnosis'] = '病理検査なし'

# Case 2: 病理診断 == 'あり' and 診断_x is NaN  ->  biopsied by another department.
data.loc[has_pathology & missing_dx, 'diagnosis'] = '他科で生検'

# Case 3: 病理診断 == 'あり' and 診断_x is present  ->  biopsied in ophthalmology.
data.loc[has_pathology & ~missing_dx, 'diagnosis'] = '眼科で生検'



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Imported for its side effect of configuring a Japanese-capable matplotlib
# font: the legend entries below are Japanese, and on a fresh kernel this cell
# runs before the later cell that imports the package.
import japanize_matplotlib  # noqa: F401

# Calendar year taken from '病名開始日_1'. The column is named 'Decade' but
# holds the plain year, not a decade.
data['Decade'] = pd.to_datetime(data['病名開始日_1']).dt.year

# Count rows per (year, diagnosis route): years become rows, routes columns.
diagnosis_counts = data.groupby(['Decade', 'diagnosis']).size().unstack(fill_value=0)

# Fix the stacking order. reindex (instead of plain column selection) keeps the
# cell working when one of the routes does not occur in the data: the missing
# column is created with zeros rather than raising KeyError.
order = ["眼科で生検", "他科で生検", "病理検査なし"]
diagnosis_counts = diagnosis_counts.reindex(columns=order, fill_value=0)

# Stacked bar chart, one bar per year.
ax = diagnosis_counts.plot(kind='bar', stacked=True, figsize=(12, 6))

# Write the yearly total above each stacked bar.
yearly_totals = diagnosis_counts.sum(axis=1)
for i, total in enumerate(yearly_totals):
    ax.text(i, total, str(total), ha='center', va='bottom')

# Titles, axis labels and legend (legend placed outside the axes, upper right).
ax.set_title('New patients by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of New Patients')
ax.legend(title='Diagnosis', bbox_to_anchor=(1.05, 1), loc='upper left')

# Rotate the year labels.
plt.xticks(rotation=45, ha='right')

# Add 10% headroom so the totals above the tallest bar are not clipped.
ax.set_ylim(0, ax.get_ylim()[1] * 1.1)

# Lay out last so the rotated labels and the outside legend are accounted for.
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import japanize_matplotlib


# Parse '病名開始日_1' as a date and keep only its calendar year.
data['年度'] = pd.to_datetime(data['病名開始日_1']).dt.year

# Number of cases per year, in chronological order.
yearly_counts = data['年度'].value_counts().sort_index()

# Draw the bar chart on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
yearly_counts.plot(kind='bar', ax=ax)

# Title and axis labels.
ax.set_title('年度別症例数')
ax.set_xlabel('年度')
ax.set_ylabel('症例数')
plt.xticks(rotation=45)

# Dashed horizontal grid lines.
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Print each count above its bar.
for bar_position, case_count in enumerate(yearly_counts):
    ax.text(bar_position, case_count, str(case_count), ha='center', va='bottom')

# Show the figure.
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Calendar year taken from '病名開始日_1'. Values that cannot be parsed become
# NaT, which leaves NaN in the year column.
data['年度'] = pd.to_datetime(data['病名開始日_1'], errors='coerce').dt.year

# Cases per year in chronological order; value_counts drops the NaN years.
yearly_counts = data['年度'].value_counts().sort_index()

# When any date failed to parse, the year column (and therefore this index) is
# float, and the tick labels would read "2015.0". The NaN entries are already
# gone, so casting to int is safe and gives clean year labels.
yearly_counts.index = yearly_counts.index.astype(int)

# Two-column frame for seaborn; columns are renamed by position.
yearly_counts_df = yearly_counts.reset_index()
yearly_counts_df.columns = ['年度', '症例数']

# Global style defaults (these persist for the cells that follow).
plt.rcParams.update({'font.size': 14, 'figure.figsize': (12, 8)})

# Bar chart of cases per year.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='年度', y='症例数', data=yearly_counts_df, ax=ax)

# Print each count above its bar.
for i, v in enumerate(yearly_counts_df['症例数']):
    ax.text(i, v, str(v), ha='center', va='bottom')

plt.xticks(rotation=45, ha='right')
ax.set_title('年度別の症例数')
ax.set_xlabel('年度')
ax.set_ylabel('症例数')
fig.tight_layout()

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Category -> keywords. In this cell a keyword matches when it occurs in the
# '概要' text with exactly this capitalisation (case-sensitive substring test).
categories = {
    '炎症性疾患': [
        'Inflammation', 'myositis', 'GPA', 'Sarcoidosis', 'Xanthogranuloma',
    ],
    'リンパ性疾患': [
        'IgG4-ROD', 'MALT lymphoma', 'Diffuse large B-cell lymphoma',
        'Follicular lymphoma', 'Indolent lymphoma', 'Reactive lymphoid hyperplasia',
        'NKT lymphoma', 'Malignant lymphoma',
    ],
    '嚢胞': [
        'Dermoid', 'Dacriops', 'Cyst', 'Epidermal cyst',
    ],
    '上皮性良性腫瘍': [
        'Pleomorphic adenoma',
    ],
    '上皮性悪性腫瘍': [
        'Adenoid cystic carcinoma', 'Squamous cell carcinoma',
        'Sebaceous gland carcinoma', 'Carcinoma ex pleomorphic adenoma',
        'Sebaceous gland carcinomas',
    ],
    '非上皮性良性腫瘍': [
        'Cavenous hemangioma', 'Schwannoma', 'Lipoma', 'Neurofibroma',
        'Optic meningioma', 'Meningioma', 'Optic glioma',
    ],
    '非上皮性悪性腫瘍': [
        'Metastatic', 'Malignant melanoma', 'Sarcoma', 'Ewing', 'Clear cell sarcoma',
        'well-differentiated liposarcoma', 'Sebaceous gland carcinoma',
    ],
    '血管性病変': [
        'Lymphatic malfomation', 'Venous malformation', 'Infantile hemangioma',
        'Lymphatic malformation',
    ],
    '眼窩外からの進展': [
        'Osteoma', 'Paranasal sinus carcinoma', 'Paranasal sinus cyst',
        'Paranasal sinus SCC', 'Paranasal sinus adenoid cystic carcinoma',
        'Paranasal sinus melanoma', 'Fibrous dysplasia', 'Parasinus sinus carcinoma',
    ],
    'その他': [
        'Orbital abscess', 'Foreign body granuloma', 'Amyloidosis', 'Multiple myeloma',
        'Swollen MiraGel', 'CCF', 'Jugular venous reflux syndrome', 'Venous thrombosis',
        'Hystiocytosis', 'Hematoma', 'Fibrous lesion',
    ],
}

# One counter per category, all starting at zero.
category_counts = dict.fromkeys(categories, 0)

# Each non-missing summary is credited to the first category (in dict order)
# that has a keyword contained in it; summaries matching nothing go to 'その他'.
for summary_text in data['概要'].dropna():
    matched_category = next(
        (
            name
            for name, terms in categories.items()
            if any(term in summary_text for term in terms)
        ),
        'その他',
    )
    category_counts[matched_category] += 1

# Tabulate the counters as (Category, Count) rows.
category_counts_df = pd.DataFrame(list(category_counts.items()), columns=['Category', 'Count'])

# Largest categories first, with 'その他' always placed last.
is_other = category_counts_df['Category'] == 'その他'
category_counts_df_sorted = pd.concat([
    category_counts_df[~is_other].sort_values(by='Count', ascending=False),
    category_counts_df[is_other],
])

# Global style defaults (these persist for the cells that follow).
plt.rcParams.update({'font.size': 14, 'figure.figsize': (12, 8)})

# Bar chart of cases per category; no title is set on this figure.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Category', y='Count', data=category_counts_df_sorted, ax=ax)
plt.xticks(rotation=45, ha='right')
ax.set_xlabel('カテゴリー')
ax.set_ylabel('症例数')
fig.tight_layout()

# Show the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Category -> keywords. In this cell a keyword matches when it occurs in the
# '概要' text with exactly this capitalisation (case-sensitive substring test).
categories = {
    '炎症性疾患': [
        'Inflammation', 'myositis', 'GPA', 'Sarcoidosis', 'Xanthogranuloma',
    ],
    'リンパ性疾患': [
        'IgG4-ROD', 'MALT lymphoma', 'Diffuse large B-cell lymphoma',
        'Follicular lymphoma', 'Indolent lymphoma', 'Reactive lymphoid hyperplasia',
        'NKT lymphoma', 'Malignant lymphoma',
    ],
    '嚢胞': [
        'Dermoid', 'Dacriops', 'Cyst', 'Epidermal cyst',
    ],
    '上皮性良性腫瘍': [
        'Pleomorphic adenoma',
    ],
    '上皮性悪性腫瘍': [
        'Adenoid cystic carcinoma', 'Squamous cell carcinoma',
        'Sebaceous gland carcinoma', 'Carcinoma ex pleomorphic adenoma',
        'Sebaceous gland carcinomas',
    ],
    '非上皮性良性腫瘍': [
        'Cavenous hemangioma', 'Schwannoma', 'Lipoma', 'Neurofibroma',
        'Optic meningioma', 'Meningioma', 'Optic glioma',
    ],
    '非上皮性悪性腫瘍': [
        'Metastatic', 'Malignant melanoma', 'Sarcoma', 'Ewing', 'Clear cell sarcoma',
        'well-differentiated liposarcoma', 'Sebaceous gland carcinoma',
    ],
    '血管性病変': [
        'Lymphatic malfomation', 'Venous malformation', 'Infantile hemangioma',
        'Lymphatic malformation',
    ],
    '眼窩外からの進展': [
        'Osteoma', 'Paranasal sinus carcinoma', 'Paranasal sinus cyst',
        'Paranasal sinus SCC', 'Paranasal sinus adenoid cystic carcinoma',
        'Paranasal sinus melanoma', 'Fibrous dysplasia', 'Parasinus sinus carcinoma',
    ],
    'その他': [
        'Orbital abscess', 'Foreign body granuloma', 'Amyloidosis', 'Multiple myeloma',
        'Swollen MiraGel', 'CCF', 'Jugular venous reflux syndrome', 'Venous thrombosis',
        'Hystiocytosis', 'Hematoma', 'Fibrous lesion',
    ],
}

# Two-level pathology flag: 'あり' stays 'あり'; every other value of '病理診断'
# (missing values included) becomes 'なし'.
data['病理診断あり'] = ['あり' if flag == 'あり' else 'なし' for flag in data['病理診断']]

# Per-category counters, split by the pathology flag.
category_counts_detailed = {name: {'あり': 0, 'なし': 0} for name in categories}

# Each non-missing summary is credited to the first category (in dict order)
# that has a keyword contained in it; summaries matching nothing go to 'その他'.
for summary_text, pathology_flag in zip(data['概要'], data['病理診断あり']):
    if pd.isna(summary_text):
        continue
    matched_category = next(
        (
            name
            for name, terms in categories.items()
            if any(term in summary_text for term in terms)
        ),
        'その他',
    )
    category_counts_detailed[matched_category][pathology_flag] += 1

# One row per category; the two flag columns are renamed by position
# ('あり' first, 'なし' second, following the counter dict's key order).
category_counts_detailed_df = pd.DataFrame(category_counts_detailed).T.reset_index()
category_counts_detailed_df.columns = ['Category', '病理診断あり', '病理診断なし']

# Row total, used only for ordering the bars.
category_counts_detailed_df['Total'] = (
    category_counts_detailed_df[['病理診断あり', '病理診断なし']].sum(axis=1)
)

# Largest categories first, with 'その他' always placed last.
is_other = category_counts_detailed_df['Category'] == 'その他'
other_df = category_counts_detailed_df[is_other]
sorted_df = pd.concat([
    category_counts_detailed_df[~is_other].sort_values(by='Total', ascending=False),
    other_df,
])

# Global style defaults; the figure below takes its size from them.
plt.rcParams.update({'font.size': 14, 'figure.figsize': (12, 8)})

# Stacked bars: cases with pathology at the bottom, cases without on top.
fig, ax = plt.subplots()
bottom_bar = ax.bar(
    sorted_df['Category'],
    sorted_df['病理診断あり'],
    color='blue',
    label='病理診断あり',
)
top_bar = ax.bar(
    sorted_df['Category'],
    sorted_df['病理診断なし'],
    bottom=sorted_df['病理診断あり'],
    color='orange',
    label='病理診断なし',
)

# Axis cosmetics; no title is set on this figure.
plt.xticks(rotation=45, ha='right')
ax.set_xlabel('カテゴリー')
ax.set_ylabel('症例数')
ax.legend()
fig.tight_layout()

# Show the plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib

# Order the '分類' categories by ascending median 'Age' for the x axis.
median_ages = data.groupby('分類')['Age'].median().sort_values()
sorted_categories = list(median_ages.index)

# Larger default font (this persists for any later plots).
plt.rcParams.update({'font.size': 20})

# Box plot of Age per category, drawn in the median-age order computed above.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(x='分類', y='Age', data=data, order=sorted_categories, ax=ax)
ax.set_title('年齢別の統計値', fontsize=18)
ax.set_xlabel('分類', fontsize=20)
ax.set_ylabel('年齢', fontsize=20)
plt.xticks(rotation=45, fontsize=20)
plt.yticks(fontsize=20)
plt.show()