<a href="https://colab.research.google.com/github/t8101349/group-project-202503/blob/main/gradio_web_0318.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas
!pip install numpy
!pip install gradio
!pip install rdkit
!pip install scikit-learn
!pip install xgboost
!pip install lightgbm

Collecting gradio
  Downloading gradio-5.21.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

In [9]:
import gradio as gr
import pandas as pd
import traceback
from gradio.themes import Base
from process import predict_process

class AppState:
    """Mutable container for the data shared between the UI callbacks.

    Attributes:
        df: The uploaded table, or None before any upload is stored.
        result_df: The prediction table, or None before any prediction.
        file_uploaded: Flag recording that an upload has been stored.
        prediction_done: Flag recording that a prediction has been stored.
    """

    def __init__(self):
        # Start with nothing uploaded and nothing predicted.
        self.df = self.result_df = None
        self.file_uploaded = self.prediction_done = False

state = AppState()

import pandas as pd
import numpy as np
import joblib
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from huggingface_hub import hf_hub_download


def smiles_to_morgan_fingerprint(smiles, n_bits=2048):
    """Encode a SMILES string as a Morgan fingerprint vector.

    Args:
        smiles: SMILES text passed to RDKit's ``Chem.MolFromSmiles``.
        n_bits: Fingerprint length (RDKit's ``fpSize``); defaults to 2048.

    Returns:
        A 1-D integer NumPy array of length ``n_bits``. If RDKit cannot
        parse ``smiles`` (``MolFromSmiles`` returns None) the array is all
        zeros.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        # Morgan generator with radius 2 and the requested vector length.
        fp_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)
        bit_vector = fp_generator.GetFingerprint(molecule)
        return np.array(bit_vector, dtype=int)
    return np.zeros(n_bits, dtype=int)


def predict_process(df):
    """Predict binding for every row of a molecule table.

    The input frame is left untouched: fingerprints and one-hot columns are
    built in separate objects, so the same frame can be passed in again.

    Args:
        df: DataFrame with a ``molecule_smiles`` column (SMILES strings) and
            a ``protein_name`` column whose values are ``BRD4``, ``HSA`` or
            ``sEH``.

    Returns:
        DataFrame indexed like ``df`` with columns ``id`` (1..len(df)),
        ``molecule_smiles`` (the original SMILES strings) and ``binds``
        (1 when the second ``predict_proba`` column is >= 0.5, else 0).

    Raises:
        ValueError: If ``df`` has no rows or ``protein_name`` contains a
            value outside the supported set.
    """
    # Same order pd.get_dummies produces when all three proteins are present.
    supported_proteins = ["BRD4", "HSA", "sEH"]

    smiles = df["molecule_smiles"]
    proteins = df["protein_name"]

    if len(df) == 0:
        raise ValueError("沒有可預測的資料")

    unknown = sorted(str(name) for name in set(proteins.unique()) - set(supported_proteins))
    if unknown:
        raise ValueError(
            f"protein_name 含有不支援的蛋白質：{', '.join(unknown)}（僅支援 BRD4, HSA, sEH）"
        )

    # Load the model.
    # NOTE(review): joblib.load unpickles the downloaded file, which can run
    # arbitrary code; only point this at a repository you trust.
    model_path = hf_hub_download(repo_id="sinanju/model_voting", filename="voting_model.bin")
    model = joblib.load(model_path)

    # One fingerprint row per molecule, kept separate from the input frame so
    # the SMILES strings survive for the result table.
    fingerprints_df = pd.DataFrame([smiles_to_morgan_fingerprint(s) for s in smiles])

    # Always emit all three protein columns, even when the upload only
    # contains a subset, so the feature layout does not depend on the data.
    protein_onehot = (
        pd.get_dummies(proteins, prefix="protein")
        .reindex(columns=[f"protein_{name}" for name in supported_proteins], fill_value=0)
        .astype(int)
        .reset_index(drop=True)
    )
    X_test = pd.concat([fingerprints_df, protein_onehot], axis=1)
    X_test.columns = X_test.columns.astype(str)  # use string column names throughout

    # Turn the predicted probability into a binary label.
    probabilities = model.predict_proba(X_test)[:, 1]
    threshold = 0.5
    predictions = (probabilities >= threshold).astype(int)

    # Build the result table with fresh 1-based ids.
    result_df = pd.DataFrame(
        {
            'id': np.arange(1, len(df) + 1),
            'molecule_smiles': smiles.to_numpy(),
            'binds': predictions,
        },
        index=df.index,
    )

    return result_df


def confirm_file(file):
    """Validate an uploaded dataset and store it in the shared ``state``.

    Args:
        file: Value delivered by the file component. A path string (or
            ``os.PathLike``) is used directly; any other object is expected
            to expose the path as ``.name``. ``None`` means nothing was
            uploaded.

    Returns:
        ``(message, ok)``: a user-facing status string and a bool that is
        True only when the file was read, validated and stored.
    """
    import os

    if file is None:
        return "請上傳分子數據集！", False
    try:
        # The file input is declared with type="filepath", so the callback
        # normally receives a plain path string, which has no ``.name``.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = getattr(file, "name", None)
        if not path:
            return "不支援的檔案格式：未知檔案！僅支援 .csv 和 .parquet", False

        original_filename = os.path.basename(path)
        # Compare the extension case-insensitively (.CSV is still a CSV).
        extension = os.path.splitext(path)[1].lower()
        if extension == ".csv":
            df = pd.read_csv(path)
        elif extension == ".parquet":
            df = pd.read_parquet(path)
        else:
            return f"不支援的檔案格式：{original_filename}！僅支援 .csv 和 .parquet", False

        required_columns = ["molecule_smiles", "protein_name"]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            return f"檔案缺少必要的欄位：{', '.join(missing_columns)}", False
        if len(df) == 0:
            # Nothing to predict; reject here instead of failing in the model.
            return "❌ 檔案沒有任何資料！", False
        if len(df) > 500000:
            return "資料筆數超過 50 萬筆，請減少資料量！", False

        state.df = df
        state.file_uploaded = True
        # A new upload invalidates any earlier prediction.
        state.prediction_done = False
        return f"✅ 已成功上傳檔案：{original_filename}，共 {len(df)} 筆資料", True
    except Exception as e:
        error_details = traceback.format_exc()
        print(f"檔案處理錯誤：{str(e)}\n{error_details}")
        return f"❌ 檔案處理錯誤：{str(e)}", False

def run_prediction():
    """Run the model on the confirmed upload and store the result in ``state``.

    Returns:
        ``(message, ok)``: a user-facing status string and a bool that is
        True only when a prediction table was produced and stored.
    """
    if not state.file_uploaded or state.df is None:
        return "❌ 請先上傳並確認檔案！", False

    # Refuse to predict on data with missing values and report which columns
    # contain them.
    if state.df.isnull().values.any():
        missing_info = state.df.isnull().sum()
        missing_summary = missing_info[missing_info > 0].to_dict()
        return f"❌ 資料包含缺失值，請處理後再預測！缺失欄位: {missing_summary}", False

    try:
        # Pass a copy so the prediction step can never alter the stored
        # upload; pressing the button again then starts from the same data.
        result_df = predict_process(state.df.copy())
        state.result_df = result_df
        state.prediction_done = True
        return f"✅ 預測完成！共處理 {len(result_df)} 筆資料", True
    except Exception as e:
        error_details = traceback.format_exc()
        print(f"預測錯誤：{str(e)}\n{error_details}")
        return f"❌ 預測錯誤：{str(e)}", False

def generate_file(format_choice):
    """Write the stored prediction table to the temp directory.

    Args:
        format_choice: ``"CSV"`` writes ``prediction.csv``; any other value
            writes ``prediction.parquet``.

    Returns:
        ``(filepath, message)``. ``filepath`` is None when no prediction is
        available or when writing fails.
    """
    import os
    import tempfile

    if state.result_df is None or not state.prediction_done:
        return None, "❌ 請先執行預測！"
    try:
        as_csv = format_choice == "CSV"
        filename = "prediction.csv" if as_csv else "prediction.parquet"
        filepath = os.path.join(tempfile.gettempdir(), filename)
        if as_csv:
            state.result_df.to_csv(filepath, index=False)
        else:
            state.result_df.to_parquet(filepath, index=False)
        return filepath, f"✅ 已生成 {filename}，點擊下方按鈕即可下載"
    except Exception as e:
        # Log the full traceback, show only the short message to the user.
        print(f"生成檔案錯誤：{str(e)}\n{traceback.format_exc()}")
        return None, f"❌ 生成檔案錯誤：{str(e)}"

def update_button_status(status_text, button_value):
    """Build the update for the predict button from a status message.

    Args:
        status_text: Status string; the button is enabled only when it
            contains the "✅" marker.
        button_value: Not used by this function.

    Returns:
        A ``gr.update`` that sets ``interactive`` and restores the label.
    """
    succeeded = "✅" in status_text
    return gr.update(interactive=succeeded, value="執行預測")

# Theme for the app: Gradio's Base theme with cyan / teal / gray hues.
custom_theme = Base(primary_hue="cyan", secondary_hue="teal", neutral_hue="gray")

# Build the UI: a single column that walks through upload -> confirm ->
# predict -> generate file -> download. Custom CSS is passed inline.
with gr.Blocks(
    theme=custom_theme,
    title="新藥預測工具",
    css="""
    .gradio-container {
        width: 800px !important;
        margin: auto !important;
    }
    h2 {
        text-align: center;
        font-size: 20px;
    }
    .gradient-title h1 {
        background: linear-gradient(45deg, #13A9E6, #3DD69E);
        -webkit-background-clip: text;
        background-clip: text;
        color: transparent;
        text-align: center;
        font-size: 52px;
        font-weight: bold;
    }
    .gr-radio input[type="radio"] {
        accent-color: #13A9E6;
    }
    footer {
        display: none !important;
    }
    .file-status {
        font-weight: bold;
    }
    """
) as demo:
    # Page title, styled by the .gradient-title CSS rule above.
    gr.Markdown("# 新藥預測工具", elem_classes=["gradient-title"])

    # Collapsed help panel with usage instructions and required columns.
    with gr.Accordion("點此查看工具詳細說明", open=False):
        gr.Markdown("""
            **詳細說明：**
            此工具可將SMILES形式的分子資料集根據一個預測分子與三種蛋白質標靶(sEH, BRD4, HSA)
            是否結合的機器學習模型，來快速篩選出可能的藥物分子資料集。
            **操作說明：**
            1. 上傳分子數據集 (支援 CSV 與 Parquet 格式, 檔案大小上限50MB, 資料筆數上限50萬筆)
            2. 確認上傳檔案
            3. 執行預測
            4. 選擇下載格式
            5. 產生並下載預測檔案
            **必要欄位說明：**
            - molecule_smiles: 分子的SMILES表示法
            - protein_name: 蛋白質名稱 (必須為 sEH, BRD4, HSA 其中之一)
            """)

    with gr.Column():
        gr.Markdown("## 上傳分子數據集", elem_classes=["sub_title"])

        # Step 1: file upload, restricted to .csv / .parquet.
        file_input = gr.File(
            label="拖曳檔案至此或點擊上傳，上限50MB",
            file_types=[".csv", ".parquet"],
            type="filepath"
        )

        # Step 2: confirm the upload; the status box shows the outcome.
        confirm_btn = gr.Button("確認檔案", variant="primary")
        file_status = gr.Textbox(
            label="檔案狀態",
            elem_classes=["file-status"],
            interactive=False
        )

        # Step 3: prediction button, disabled until a file is confirmed.
        predict_btn = gr.Button("執行預測", variant="primary", interactive=False)
        predict_status = gr.Textbox(
            label="預測狀態",
            elem_classes=["file-status"],
            interactive=False
        )

        # Step 4: output format selector (CSV by default).
        with gr.Row():
            download_format = gr.Radio(
                choices=["CSV", "Parquet"],
                label="選擇下載格式",
                value="CSV"
            )

        # Step 5: generate the result file, disabled until a prediction succeeds.
        generate_btn = gr.Button("產生下載檔案", variant="primary", interactive=False)
        generate_status = gr.Textbox(
            label="檔案生成狀態",
            elem_classes=["file-status"],
            interactive=False
        )

        # Download button, disabled until a file has been generated.
        download_btn = gr.DownloadButton(
            label="下載預測結果",
            variant="primary",
            interactive=False,
            visible=True
        )

    # confirm_file's two return values go to the status box and predict_btn;
    # the chained step then sets predict_btn's interactive flag and label
    # from the status text.
    confirm_btn.click(
        fn=confirm_file,
        inputs=file_input,
        outputs=[file_status, predict_btn]
    ).then(
        fn=update_button_status,
        inputs=[file_status, predict_btn],
        outputs=predict_btn
    )

    # Same two-step pattern for prediction: the chained lambda enables
    # generate_btn only when the status text contains the "✅" marker.
    predict_btn.click(
        fn=run_prediction,
        inputs=None,
        outputs=[predict_status, generate_btn]
    ).then(
        fn=lambda status_text, btn_value: gr.update(
            interactive="✅" in status_text,
            value="產生下載檔案"
        ),
        inputs=[predict_status, generate_btn],
        outputs=generate_btn
    )

    # generate_file's path goes to download_btn; the chained lambda enables
    # the button on success and clears/disables it otherwise.
    generate_btn.click(
        fn=generate_file,
        inputs=download_format,
        outputs=[download_btn, generate_status]
    ).then(
        fn=lambda filepath, status: (
            gr.update(
                value=filepath,
                interactive=True,
                visible=True
            ) if "✅" in status else gr.update(
                value=None,
                interactive=False,
                visible=True
            )
        ),
        inputs=[download_btn, generate_status],
        outputs=download_btn
    )

if __name__ == "__main__":
    # Limit uploads to 50 MB.
    upload_limit_bytes = 50 * 1024 * 1024
    demo.launch(max_file_size=upload_limit_bytes)

ModuleNotFoundError: No module named 'gradio'

In [1]:
!pip install huggingface_hub




In [2]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
!git clone https://huggingface.co/spaces/weber8101349/my-medicine-predict-model
%cd my-medicine-predict-model



fatal: destination path 'my-medicine-predict-model' already exists and is not an empty directory.
/content/my-medicine-predict-model


In [5]:
%%writefile requirements.txt
gradio
transformers
torch
pandas
numpy
rdkit
scikit-learn
xgboost
lightgbm
# Imported directly by app.py
joblib
huggingface_hub
# Parquet engine for pandas read_parquet / to_parquet
pyarrow


Writing requirements.txt


In [10]:
%%writefile app.py

import gradio as gr
import pandas as pd
import traceback
from gradio.themes import Base
from process import predict_process

class AppState:
    """Mutable container for the data shared between the UI callbacks.

    Attributes:
        df: The uploaded table, or None before any upload is stored.
        result_df: The prediction table, or None before any prediction.
        file_uploaded: Flag recording that an upload has been stored.
        prediction_done: Flag recording that a prediction has been stored.
    """

    def __init__(self):
        # Start with nothing uploaded and nothing predicted.
        self.df = self.result_df = None
        self.file_uploaded = self.prediction_done = False

state = AppState()

import pandas as pd
import numpy as np
import joblib
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from huggingface_hub import hf_hub_download


def smiles_to_morgan_fingerprint(smiles, n_bits=2048):
    """Encode a SMILES string as a Morgan fingerprint vector.

    Args:
        smiles: SMILES text passed to RDKit's ``Chem.MolFromSmiles``.
        n_bits: Fingerprint length (RDKit's ``fpSize``); defaults to 2048.

    Returns:
        A 1-D integer NumPy array of length ``n_bits``. If RDKit cannot
        parse ``smiles`` (``MolFromSmiles`` returns None) the array is all
        zeros.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        # Morgan generator with radius 2 and the requested vector length.
        fp_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)
        bit_vector = fp_generator.GetFingerprint(molecule)
        return np.array(bit_vector, dtype=int)
    return np.zeros(n_bits, dtype=int)


def predict_process(df):
    """Predict binding for every row of a molecule table.

    The input frame is left untouched: fingerprints and one-hot columns are
    built in separate objects, so the same frame can be passed in again.

    Args:
        df: DataFrame with a ``molecule_smiles`` column (SMILES strings) and
            a ``protein_name`` column whose values are ``BRD4``, ``HSA`` or
            ``sEH``.

    Returns:
        DataFrame indexed like ``df`` with columns ``id`` (1..len(df)),
        ``molecule_smiles`` (the original SMILES strings) and ``binds``
        (1 when the second ``predict_proba`` column is >= 0.5, else 0).

    Raises:
        ValueError: If ``df`` has no rows or ``protein_name`` contains a
            value outside the supported set.
    """
    # Same order pd.get_dummies produces when all three proteins are present.
    supported_proteins = ["BRD4", "HSA", "sEH"]

    smiles = df["molecule_smiles"]
    proteins = df["protein_name"]

    if len(df) == 0:
        raise ValueError("沒有可預測的資料")

    unknown = sorted(str(name) for name in set(proteins.unique()) - set(supported_proteins))
    if unknown:
        raise ValueError(
            f"protein_name 含有不支援的蛋白質：{', '.join(unknown)}（僅支援 BRD4, HSA, sEH）"
        )

    # Load the model.
    # NOTE(review): joblib.load unpickles the downloaded file, which can run
    # arbitrary code; only point this at a repository you trust.
    model_path = hf_hub_download(repo_id="sinanju/model_voting", filename="voting_model.bin")
    model = joblib.load(model_path)

    # One fingerprint row per molecule, kept separate from the input frame so
    # the SMILES strings survive for the result table.
    fingerprints_df = pd.DataFrame([smiles_to_morgan_fingerprint(s) for s in smiles])

    # Always emit all three protein columns, even when the upload only
    # contains a subset, so the feature layout does not depend on the data.
    protein_onehot = (
        pd.get_dummies(proteins, prefix="protein")
        .reindex(columns=[f"protein_{name}" for name in supported_proteins], fill_value=0)
        .astype(int)
        .reset_index(drop=True)
    )
    X_test = pd.concat([fingerprints_df, protein_onehot], axis=1)
    X_test.columns = X_test.columns.astype(str)  # use string column names throughout

    # Turn the predicted probability into a binary label.
    probabilities = model.predict_proba(X_test)[:, 1]
    threshold = 0.5
    predictions = (probabilities >= threshold).astype(int)

    # Build the result table with fresh 1-based ids.
    result_df = pd.DataFrame(
        {
            'id': np.arange(1, len(df) + 1),
            'molecule_smiles': smiles.to_numpy(),
            'binds': predictions,
        },
        index=df.index,
    )

    return result_df


def confirm_file(file):
    """Validate an uploaded dataset and store it in the shared ``state``.

    Args:
        file: Value delivered by the file component. A path string (or
            ``os.PathLike``) is used directly; any other object is expected
            to expose the path as ``.name``. ``None`` means nothing was
            uploaded.

    Returns:
        ``(message, ok)``: a user-facing status string and a bool that is
        True only when the file was read, validated and stored.
    """
    import os

    if file is None:
        return "請上傳分子數據集！", False
    try:
        # The file input is declared with type="filepath", so the callback
        # normally receives a plain path string, which has no ``.name``.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = getattr(file, "name", None)
        if not path:
            return "不支援的檔案格式：未知檔案！僅支援 .csv 和 .parquet", False

        original_filename = os.path.basename(path)
        # Compare the extension case-insensitively (.CSV is still a CSV).
        extension = os.path.splitext(path)[1].lower()
        if extension == ".csv":
            df = pd.read_csv(path)
        elif extension == ".parquet":
            df = pd.read_parquet(path)
        else:
            return f"不支援的檔案格式：{original_filename}！僅支援 .csv 和 .parquet", False

        required_columns = ["molecule_smiles", "protein_name"]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            return f"檔案缺少必要的欄位：{', '.join(missing_columns)}", False
        if len(df) == 0:
            # Nothing to predict; reject here instead of failing in the model.
            return "❌ 檔案沒有任何資料！", False
        if len(df) > 500000:
            return "資料筆數超過 50 萬筆，請減少資料量！", False

        state.df = df
        state.file_uploaded = True
        # A new upload invalidates any earlier prediction.
        state.prediction_done = False
        return f"✅ 已成功上傳檔案：{original_filename}，共 {len(df)} 筆資料", True
    except Exception as e:
        error_details = traceback.format_exc()
        print(f"檔案處理錯誤：{str(e)}\n{error_details}")
        return f"❌ 檔案處理錯誤：{str(e)}", False

def run_prediction():
    """Run the model on the confirmed upload and store the result in ``state``.

    Returns:
        ``(message, ok)``: a user-facing status string and a bool that is
        True only when a prediction table was produced and stored.
    """
    if not state.file_uploaded or state.df is None:
        return "❌ 請先上傳並確認檔案！", False

    # Refuse to predict on data with missing values and report which columns
    # contain them.
    if state.df.isnull().values.any():
        missing_info = state.df.isnull().sum()
        missing_summary = missing_info[missing_info > 0].to_dict()
        return f"❌ 資料包含缺失值，請處理後再預測！缺失欄位: {missing_summary}", False

    try:
        # Pass a copy so the prediction step can never alter the stored
        # upload; pressing the button again then starts from the same data.
        result_df = predict_process(state.df.copy())
        state.result_df = result_df
        state.prediction_done = True
        return f"✅ 預測完成！共處理 {len(result_df)} 筆資料", True
    except Exception as e:
        error_details = traceback.format_exc()
        print(f"預測錯誤：{str(e)}\n{error_details}")
        return f"❌ 預測錯誤：{str(e)}", False

def generate_file(format_choice):
    """Write the stored prediction table to the temp directory.

    Args:
        format_choice: ``"CSV"`` writes ``prediction.csv``; any other value
            writes ``prediction.parquet``.

    Returns:
        ``(filepath, message)``. ``filepath`` is None when no prediction is
        available or when writing fails.
    """
    import os
    import tempfile

    if state.result_df is None or not state.prediction_done:
        return None, "❌ 請先執行預測！"
    try:
        as_csv = format_choice == "CSV"
        filename = "prediction.csv" if as_csv else "prediction.parquet"
        filepath = os.path.join(tempfile.gettempdir(), filename)
        if as_csv:
            state.result_df.to_csv(filepath, index=False)
        else:
            state.result_df.to_parquet(filepath, index=False)
        return filepath, f"✅ 已生成 {filename}，點擊下方按鈕即可下載"
    except Exception as e:
        # Log the full traceback, show only the short message to the user.
        print(f"生成檔案錯誤：{str(e)}\n{traceback.format_exc()}")
        return None, f"❌ 生成檔案錯誤：{str(e)}"

def update_button_status(status_text, button_value):
    """Build the update for the predict button from a status message.

    Args:
        status_text: Status string; the button is enabled only when it
            contains the "✅" marker.
        button_value: Not used by this function.

    Returns:
        A ``gr.update`` that sets ``interactive`` and restores the label.
    """
    succeeded = "✅" in status_text
    return gr.update(interactive=succeeded, value="執行預測")

# Theme for the app: Gradio's Base theme with cyan / teal / gray hues.
custom_theme = Base(primary_hue="cyan", secondary_hue="teal", neutral_hue="gray")

# Build the UI: a single column that walks through upload -> confirm ->
# predict -> generate file -> download. Custom CSS is passed inline.
with gr.Blocks(
    theme=custom_theme,
    title="新藥預測工具",
    css="""
    .gradio-container {
        width: 800px !important;
        margin: auto !important;
    }
    h2 {
        text-align: center;
        font-size: 20px;
    }
    .gradient-title h1 {
        background: linear-gradient(45deg, #13A9E6, #3DD69E);
        -webkit-background-clip: text;
        background-clip: text;
        color: transparent;
        text-align: center;
        font-size: 52px;
        font-weight: bold;
    }
    .gr-radio input[type="radio"] {
        accent-color: #13A9E6;
    }
    footer {
        display: none !important;
    }
    .file-status {
        font-weight: bold;
    }
    """
) as demo:
    # Page title, styled by the .gradient-title CSS rule above.
    gr.Markdown("# 新藥預測工具", elem_classes=["gradient-title"])

    # Collapsed help panel with usage instructions and required columns.
    with gr.Accordion("點此查看工具詳細說明", open=False):
        gr.Markdown("""
            **詳細說明：**
            此工具可將SMILES形式的分子資料集根據一個預測分子與三種蛋白質標靶(sEH, BRD4, HSA)
            是否結合的機器學習模型，來快速篩選出可能的藥物分子資料集。
            **操作說明：**
            1. 上傳分子數據集 (支援 CSV 與 Parquet 格式, 檔案大小上限50MB, 資料筆數上限50萬筆)
            2. 確認上傳檔案
            3. 執行預測
            4. 選擇下載格式
            5. 產生並下載預測檔案
            **必要欄位說明：**
            - molecule_smiles: 分子的SMILES表示法
            - protein_name: 蛋白質名稱 (必須為 sEH, BRD4, HSA 其中之一)
            """)

    with gr.Column():
        gr.Markdown("## 上傳分子數據集", elem_classes=["sub_title"])

        # Step 1: file upload, restricted to .csv / .parquet.
        file_input = gr.File(
            label="拖曳檔案至此或點擊上傳，上限50MB",
            file_types=[".csv", ".parquet"],
            type="filepath"
        )

        # Step 2: confirm the upload; the status box shows the outcome.
        confirm_btn = gr.Button("確認檔案", variant="primary")
        file_status = gr.Textbox(
            label="檔案狀態",
            elem_classes=["file-status"],
            interactive=False
        )

        # Step 3: prediction button, disabled until a file is confirmed.
        predict_btn = gr.Button("執行預測", variant="primary", interactive=False)
        predict_status = gr.Textbox(
            label="預測狀態",
            elem_classes=["file-status"],
            interactive=False
        )

        # Step 4: output format selector (CSV by default).
        with gr.Row():
            download_format = gr.Radio(
                choices=["CSV", "Parquet"],
                label="選擇下載格式",
                value="CSV"
            )

        # Step 5: generate the result file, disabled until a prediction succeeds.
        generate_btn = gr.Button("產生下載檔案", variant="primary", interactive=False)
        generate_status = gr.Textbox(
            label="檔案生成狀態",
            elem_classes=["file-status"],
            interactive=False
        )

        # Download button, disabled until a file has been generated.
        download_btn = gr.DownloadButton(
            label="下載預測結果",
            variant="primary",
            interactive=False,
            visible=True
        )

    # confirm_file's two return values go to the status box and predict_btn;
    # the chained step then sets predict_btn's interactive flag and label
    # from the status text.
    confirm_btn.click(
        fn=confirm_file,
        inputs=file_input,
        outputs=[file_status, predict_btn]
    ).then(
        fn=update_button_status,
        inputs=[file_status, predict_btn],
        outputs=predict_btn
    )

    # Same two-step pattern for prediction: the chained lambda enables
    # generate_btn only when the status text contains the "✅" marker.
    predict_btn.click(
        fn=run_prediction,
        inputs=None,
        outputs=[predict_status, generate_btn]
    ).then(
        fn=lambda status_text, btn_value: gr.update(
            interactive="✅" in status_text,
            value="產生下載檔案"
        ),
        inputs=[predict_status, generate_btn],
        outputs=generate_btn
    )

    # generate_file's path goes to download_btn; the chained lambda enables
    # the button on success and clears/disables it otherwise.
    generate_btn.click(
        fn=generate_file,
        inputs=download_format,
        outputs=[download_btn, generate_status]
    ).then(
        fn=lambda filepath, status: (
            gr.update(
                value=filepath,
                interactive=True,
                visible=True
            ) if "✅" in status else gr.update(
                value=None,
                interactive=False,
                visible=True
            )
        ),
        inputs=[download_btn, generate_status],
        outputs=download_btn
    )

if __name__ == "__main__":
    # Limit uploads to 50 MB.
    upload_limit_bytes = 50 * 1024 * 1024
    demo.launch(max_file_size=upload_limit_bytes)

Overwriting app.py


In [11]:
!git status


On branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mnew file:   app.py[m
	[32mnew file:   requirements.txt[m

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   app.py[m



In [12]:
!git add .
!git commit -m "First commit from Colab"
!git push


Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@1bad210cf151.(none)')
Everything up-to-date


In [None]:
!git add .
!git commit -m 'Add application file'
!git push
