In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
from joblib import load
import os

In [2]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths used by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the data
import os
import pandas as pd
!pip install openpyxl

folder_path = "/content/drive/MyDrive/antibiotic/TimeOut_2024"
output_file = "/content/drive/MyDrive/antibiotic/TimeOut2024_combined.xlsx"
combined_data = pd.DataFrame()

# loop through all files in the folder
for month in range (1,13):
  file_name = f"TimeOut_{month}2024.xlsx"
  file_path = os.path.join(folder_path, file_name)

  if os.path.exists(file_path):
    df = pd.read_excel(file_path)
    df['Month'] = month
    combined_data = pd.concat([combined_data, df], ignore_index=True)
  else:
    print(f"File not found: {file_path}")

# save the combined data to a new excel file
combined_data.to_excel(output_file, index=False, engine='openpyxl')

print(f"Combined data saved to {output_file}")

Combined data saved to /content/drive/MyDrive/antibiotic/TimeOut2024_combined.xlsx


In [4]:
def modify_antibiotic_column(data):
    """Normalise the text in ``SENSITIVITY_ANTIBIOTIC`` (modifies ``data`` in place).

    For every value: keep only the text before the first '(' and trim it,
    replace each '+', '/' or tab separator (with surrounding whitespace) by
    ';', and lower-case the result.

    Returns the same DataFrame object that was passed in.
    """
    column = 'SENSITIVITY_ANTIBIOTIC'
    before_parenthesis = data[column].str.split('(').str[0].str.strip()
    unified_separator = before_parenthesis.str.replace(r"\s*[+/\t]\s*", ";", regex=True)
    data[column] = unified_separator.str.lower()
    return data

def age_columns(data, reference_date=None):
    """Add ``date_of_birth`` and ``AGE`` (completed years) columns to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``DATE_OF_BIRTH`` column. Values that cannot be parsed
        become ``NaT`` and yield a missing (NaN) age.
    reference_date : date-like, optional
        Date the age is measured at. Defaults to today, which is what the
        function always used before; pass a fixed date to get reproducible ages.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two columns added.
    """
    reference = pd.Timestamp(datetime.today() if reference_date is None else reference_date)

    birth_dates = pd.to_datetime(data['DATE_OF_BIRTH'], errors='coerce')
    data['date_of_birth'] = birth_dates

    # One year is subtracted while this year's birthday is still ahead of the
    # reference date (same rule as the previous tuple comparison, vectorised).
    birth_month = birth_dates.dt.month
    birth_day = birth_dates.dt.day
    birthday_pending = (birth_month > reference.month) | (
        (birth_month == reference.month) & (birth_day > reference.day)
    )
    data['AGE'] = reference.year - birth_dates.dt.year - birthday_pending.astype(int)
    return data

def select_columns_1(data):
    """Return a frame holding only the raw columns used by the later cleaning steps."""
    wanted_columns = [
        'VISIBLE_PATIENT_ID',
        'SEX_RCD',
        'AGE',
        'ORDER_OWNER',
        'LOCATION',
        'DEPARTMENT_ORDERED',
        'PROBLEM',
        'SENSITIVITY_ANTIBIOTIC',
        'SENSITIVITY_INTERPRETION',
    ]
    return data[wanted_columns]

def filter_sensitivity(data):
    """Keep only the rows whose ``SENSITIVITY_INTERPRETION`` is 'R', 'S' or 'I'."""
    accepted_codes = ['R', 'S', 'I']
    has_accepted_code = data['SENSITIVITY_INTERPRETION'].isin(accepted_codes)
    return data[has_accepted_code]

def replace_sensitivity(data):
    """Recode ``SENSITIVITY_INTERPRETION`` to a binary label (modifies ``data`` in place).

    'S' and 'I' both become 1, 'R' becomes 0; any other value is left as it is.
    Returns the same DataFrame object that was passed in.
    """
    label_codes = {'S': 1, 'I': 1, 'R': 0}
    column = 'SENSITIVITY_INTERPRETION'
    data[column] = data[column].replace(label_codes)
    return data

def remove_duplicates(data):
    """Return ``data`` without fully duplicated rows, keeping the first occurrence of each."""
    is_repeat = data.duplicated(keep='first')
    return data[~is_repeat]

def merge_unique_prb (data):
    """Left-join the problem lookup workbook onto ``data``.

    The lookup is read from ``unique_problems.xlsx`` on Drive on every call and
    matched on ``data['PROBLEM']`` == lookup ``unique_problem``. Rows of ``data``
    without a match are kept, with missing values in the lookup columns.
    """
    lookup_path = '/content/drive/MyDrive/antibiotic/unique_problems.xlsx'
    problem_lookup = pd.read_excel(lookup_path)
    merged = pd.merge(
        data,
        problem_lookup,
        how='left',
        left_on='PROBLEM',
        right_on='unique_problem',
    )
    return merged

def select_columns_2(data):
    """Return the modelling columns: like ``select_columns_1`` but with ``general_problem`` in place of ``PROBLEM``."""
    wanted_columns = [
        'VISIBLE_PATIENT_ID',
        'SEX_RCD',
        'AGE',
        'ORDER_OWNER',
        'LOCATION',
        'DEPARTMENT_ORDERED',
        'general_problem',
        'SENSITIVITY_ANTIBIOTIC',
        'SENSITIVITY_INTERPRETION',
    ]
    return data[wanted_columns]

def encode_columns(data):
    """Label-encode the categorical columns in place and save each fitted encoder.

    Every listed column is cast to ``str`` (so missing values are encoded as
    their string form), replaced by its integer codes, and its ``LabelEncoder``
    is written to ``encoders/<column>_encoder.pkl`` relative to the working
    directory. Returns the same DataFrame object that was passed in.
    """
    os.makedirs('encoders', exist_ok=True)
    columns_to_encode = (
        'SEX_RCD',
        'general_problem',
        'SENSITIVITY_ANTIBIOTIC',
        'ORDER_OWNER',
        'LOCATION',
        'DEPARTMENT_ORDERED',
    )
    # Fit one encoder per column and persist it for the prediction cells
    for col in columns_to_encode:
        as_text = data[col].astype(str)  # make sure the values are strings
        label_encoder = LabelEncoder()
        data[col] = label_encoder.fit_transform(as_text)
        joblib.dump(label_encoder, f'encoders/{col}_encoder.pkl')
    return data

In [5]:
# 🧩 Preprocessing pipeline
def preprocess_pipeline(df):
    """Turn the combined raw export into the encoded training table.

    Steps, in order: normalise antibiotic names, compute age, keep the raw
    columns of interest, keep rows with an R/S/I interpretation, recode the
    interpretation to 0/1, drop duplicate rows, attach the problem lookup, keep
    the modelling columns, label-encode the categorical columns.

    The input frame is copied first, so the caller's DataFrame is left
    unchanged and the cell can be re-run from the same raw data.
    """
    # The first steps assign columns on the frame they receive; without this
    # copy they would silently alter the frame passed in by the caller.
    df = df.copy()

    steps = (
        modify_antibiotic_column,
        age_columns,
        select_columns_1,
        filter_sensitivity,
        replace_sensitivity,
        remove_duplicates,
        merge_unique_prb,
        select_columns_2,
        encode_columns,
    )
    for step in steps:
        df = step(df)
    return df

In [6]:
# Run the training preprocessing on the combined monthly data and preview the first rows.
result1 = preprocess_pipeline(combined_data)
result1.head()

Unnamed: 0,VISIBLE_PATIENT_ID,PRESCRIPTION_REFERENCE_NUMBER,SEX_RCD,AGE,ORDER_OWNER,LOCATION,DEPARTMENT_ORDERED,general_problem,SENSITIVITY_ANTIBIOTIC,SENSITIVITY_INTERPRETION
0,800046403,10028530,1,99,37,5,26,34,3,0
1,800046403,10028530,1,99,37,5,26,34,1,0
2,800046403,10028530,1,99,37,5,26,34,35,0
3,800046403,10028530,1,99,37,5,26,34,9,0
4,800046403,10028530,1,99,37,5,26,34,11,0


In [7]:
def xgb_model(data, model_path='/content/drive/MyDrive/antibiotic/xgb_model.pkl'):
    """Train an XGBoost classifier on the encoded table, save it, and report hold-out metrics.

    Parameters
    ----------
    data : pandas.DataFrame
        Encoded table containing the eight feature columns listed below and the
        binary ``SENSITIVITY_INTERPRETION`` target.
    model_path : str, optional
        Where the fitted model is written with ``joblib.dump``. Defaults to the
        Drive location the notebook has always used.

    Returns
    -------
    tuple
        ``(fitted XGBClassifier, results)`` where ``results`` holds the test
        features plus ``Actual`` and ``Predicted`` columns, re-indexed from 0.
    """
    feature_columns = [
        'VISIBLE_PATIENT_ID', 'SEX_RCD', 'AGE',
        'ORDER_OWNER', 'LOCATION', 'DEPARTMENT_ORDERED',
        'general_problem', 'SENSITIVITY_ANTIBIOTIC',
    ]
    X = data[feature_columns]
    # Use a 1-D Series as target: a one-column DataFrame makes the metrics and
    # the 'Actual' assignment below work on a 2-D object for no benefit.
    y = data["SENSITIVITY_INTERPRETION"]

    # Build the model with the chosen hyper-parameters
    # (presumably from an earlier tuning run that is not part of this notebook).
    # NOTE(review): recent xgboost releases deprecate `use_label_encoder` —
    # confirm against the installed version before removing it.
    best_xgb = XGBClassifier(
        learning_rate=0.2,
        max_depth=7,
        n_estimators=200,
        use_label_encoder=False,
        eval_metric='logloss',
    )

    # Train on a fixed 80/20 split.
    # NOTE(review): the split is row-wise and VISIBLE_PATIENT_ID is a feature,
    # so rows of one patient can land in both train and test; the reported
    # accuracy may be optimistic — consider a grouped split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    best_xgb.fit(X_train, y_train)

    # Save the fitted model to file
    joblib.dump(best_xgb, model_path)

    # Predict on the held-out rows and print the evaluation
    y_pred = best_xgb.predict(X_test)
    print("✅ Accuracy:", accuracy_score(y_test, y_pred))
    print("\n📊 Classification Report:\n", classification_report(y_test, y_pred))
    print("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Combine the predictions with the test data; both sides are re-indexed
    # from 0 so the columns line up row by row.
    results = X_test.reset_index(drop=True)
    results['Actual'] = y_test.reset_index(drop=True)
    results['Predicted'] = y_pred

    return best_xgb, results

In [8]:
# Train and evaluate the model, then print the first 20 rows of the test-set predictions.
model, prediction_df = xgb_model(result1)
print(prediction_df.head(20))

✅ Accuracy: 0.8206622516556291

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.79      0.79      1573
           1       0.85      0.84      0.85      2202

    accuracy                           0.82      3775
   macro avg       0.82      0.82      0.82      3775
weighted avg       0.82      0.82      0.82      3775


📉 Confusion Matrix:
 [[1247  326]
 [ 351 1851]]
    VISIBLE_PATIENT_ID  PRESCRIPTION_REFERENCE_NUMBER  SEX_RCD  AGE  \
0            800788803                       11375947        1   55   
1            800773008                       10329843        1   67   
2            800009119                       10549087        1   88   
3            800784996                       11119411        0   55   
4            800169398                       10596961        1   40   
5            800765895                       11443757        1   55   
6            800780376                       10820406        1   55  

# Dự đoán khả năng nhạy/kháng với tất cả kháng sinh cho 1 bệnh nhân mới

In [9]:
# Enter the new patient's information (one value per field)
new_patient_data = {
    'VISIBLE_PATIENT_ID': [123456789],
    'SEX_RCD': ['M'],  # NOTE(review): the original note read "or 'M'"; presumably the other sex code was meant — confirm against the training data
    'DATE_OF_BIRTH': ['1960-05-07'],
    'ORDER_OWNER': ['Dr. Duong Bich Thuy'],
    'LOCATION': ['ICU'],
    'DEPARTMENT_ORDERED': ['ICU Department'],
    'PROBLEM': ['Sigmoid colonic perforation  with diffuse peritonitis']
}
# Build the DataFrame
new_patient = pd.DataFrame(new_patient_data)

In [10]:
def age_columns_new_pt(data, reference_date=None):
    """Add ``date_of_birth`` and ``AGE`` (completed years) to the new-patient frame in place.

    The age is computed for every row from its own ``DATE_OF_BIRTH``. The
    earlier version read only the row labelled 0 and copied that age to all
    rows, which failed when label 0 was absent.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``DATE_OF_BIRTH`` column; unparseable values give NaN ages.
    reference_date : date-like, optional
        Date the age is measured at; defaults to today (the previous behaviour).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two columns added.
    """
    reference = pd.Timestamp(datetime.today() if reference_date is None else reference_date)

    birth_dates = pd.to_datetime(data['DATE_OF_BIRTH'], errors='coerce')
    data['date_of_birth'] = birth_dates

    # Subtract one year while this year's birthday is still ahead of the reference date.
    birth_month = birth_dates.dt.month
    birth_day = birth_dates.dt.day
    birthday_pending = (birth_month > reference.month) | (
        (birth_month == reference.month) & (birth_day > reference.day)
    )
    data['AGE'] = reference.year - birth_dates.dt.year - birthday_pending.astype(int)
    return data

def select_columns(data):
    """Return the patient-level feature columns (everything the model needs except the antibiotic)."""
    patient_features = [
        'VISIBLE_PATIENT_ID',
        'SEX_RCD',
        'AGE',
        'ORDER_OWNER',
        'LOCATION',
        'DEPARTMENT_ORDERED',
        'general_problem',
    ]
    return data[patient_features]

def add_antibiotic_column(data, all_antibiotics):
    """Expand ``data`` so that every row is paired with every antibiotic.

    Each input row is repeated ``len(all_antibiotics)`` times (rows stay grouped
    in their original order) and a ``SENSITIVITY_ANTIBIOTIC`` column holding the
    antibiotic names is added. Works for any number of input rows and any index;
    the earlier version only worked for a single row with a unique index label.

    Parameters
    ----------
    data : pandas.DataFrame
        Patient rows to expand. Not modified.
    all_antibiotics : sequence of str
        Antibiotic names to pair with each row.

    Returns
    -------
    pandas.DataFrame
        New frame with ``len(data) * len(all_antibiotics)`` rows and a fresh 0..n-1 index.
    """
    antibiotics = list(all_antibiotics)
    n_rows = len(data)

    # Repeat by position (not by label) so duplicate index labels cannot multiply rows.
    row_positions = np.repeat(np.arange(n_rows), len(antibiotics))
    expanded = data.iloc[row_positions].reset_index(drop=True)

    # The full antibiotic list is cycled once per original row.
    expanded['SENSITIVITY_ANTIBIOTIC'] = antibiotics * n_rows
    return expanded

def load_and_encode(data):
    """Encode the categorical columns of ``data`` with the encoders saved under ``encoders/``.

    Values are cast to ``str`` before encoding, matching how the encoders were
    fitted in ``encode_columns``. All columns are validated and encoded before
    anything is written back, so a failure leaves ``data`` untouched instead of
    half-encoded.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns replaced by integer codes.

    Raises
    ------
    FileNotFoundError
        If the encoder file of a column does not exist.
    ValueError
        If a column contains a value its encoder was not fitted on; the message
        names the column and the offending values.
    """
    columns_to_encode = ['SEX_RCD', 'general_problem', 'SENSITIVITY_ANTIBIOTIC',
                         'ORDER_OWNER', 'LOCATION', 'DEPARTMENT_ORDERED']

    encoded_columns = {}
    for col in columns_to_encode:
        encoder_path = f'encoders/{col}_encoder.pkl'
        if not os.path.exists(encoder_path):
            raise FileNotFoundError(f"Encoder for column {col} not found at {encoder_path}")

        label_encoder = joblib.load(encoder_path)
        values = data[col].astype(str)

        # The encoder itself also rejects unseen labels, but without saying
        # which column they came from; check first to give an actionable error.
        unseen = sorted(set(values.unique()) - set(label_encoder.classes_))
        if unseen:
            raise ValueError(
                f"Column {col} contains values not seen when the encoder was fitted: {unseen}"
            )

        encoded_columns[col] = label_encoder.transform(values)

    # Write back only once every column has been encoded successfully.
    for col, codes in encoded_columns.items():
        data[col] = codes

    return data

In [11]:
# Not included in the default list below: 'ceftazidime;avibactam', 'ampicillin;sulbactam'
def preprocess_pipeline_new_pt(data, all_antibiotics=None):
    """Prepare a new patient's record for prediction against a panel of antibiotics.

    Steps, in order: compute age, attach the problem lookup, keep the
    patient-level feature columns, pair the patient with every antibiotic,
    encode the categorical columns with the saved encoders.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw patient record (see the input cell above for the expected fields).
        It is copied first, so the caller's frame is left unchanged.
    all_antibiotics : sequence of str, optional
        Antibiotic names to score. Defaults to the built-in panel below, which
        is what the function always used.

    Returns
    -------
    pandas.DataFrame
        One encoded row per antibiotic, ready for ``model.predict``.
    """
    if all_antibiotics is None:
        all_antibiotics = ['ampicillin', 'amoxicillin sodium;clavulanic acid',
           'piperacillin;tazobactam', 'cefazolin', 'cefotaxime', 'cefepime',
           'ertapenem', 'imipenem', 'meropenem', 'amikacin sulfate',
           'gentamicin', 'tobramycin', 'ciprofloxacin', 'nitrofurantoin',
           'trimethoprim;sulfamethoxazole', 'fosfomycin', 'ceftazidime', 'fluconazole',
           'voriconazole', 'caspofungin', 'micafungin', 'amphotericin b',
           'benzylpenicillin', 'oxacillin', 'erythromycin', 'clindamycin',
           'quinupristin;dalfopristin', 'linezolid', 'vancomycin',
           'tetracycline', 'tigecycline', 'rifampicin', 'azithromycin',
           'aztreonam', 'ceftriaxone', 'streptomycin', 'levofloxacin',
           'moxifloxacin', 'chloramphenicol', 'cefoxitin', 'ticarcillin',
           'ticarcillin;clavulanic acid', 'piperacillin', 'colistin',
           'ceftolozane;tazobactam']

    # age_columns_new_pt adds columns to the frame it is given; copy so the
    # caller's raw record is not altered.
    data = data.copy()
    data = age_columns_new_pt(data)
    data = merge_unique_prb(data)
    data = select_columns(data)
    data = add_antibiotic_column(data, all_antibiotics)
    data = load_and_encode(data)
    return data

In [12]:
# Build the encoded prediction table for the new patient.
new_patient_encoded = preprocess_pipeline_new_pt(new_patient)

In [13]:
# Preview the encoded rows before prediction.
new_patient_encoded.head()

Unnamed: 0,VISIBLE_PATIENT_ID,PRESCRIPTION_REFERENCE_NUMBER,SEX_RCD,AGE,ORDER_OWNER,LOCATION,DEPARTMENT_ORDERED,general_problem,SENSITIVITY_ANTIBIOTIC
0,123456789,987654321,1,65,7,2,19,69,3
1,123456789,987654321,1,65,7,2,19,69,1
2,123456789,987654321,1,65,7,2,19,69,35
3,123456789,987654321,1,65,7,2,19,69,9
4,123456789,987654321,1,65,7,2,19,69,11


In [14]:
# Reload the classifier from the path xgb_model saved it to.
# joblib.load unpickles the file, so only load model files you trust.
model = joblib.load('/content/drive/MyDrive/antibiotic/xgb_model.pkl')

In [15]:
# Predict the class and the class probabilities for every antibiotic row.
# NOTE: run this before the next cell; once the prediction columns are added to
# new_patient_encoded, the frame no longer matches the training feature columns.
y_pred = model.predict(new_patient_encoded)
probs = model.predict_proba(new_patient_encoded)

In [16]:
# Attach the predicted label and the rounded class probabilities to the table.
new_patient_encoded['SENSITIVITY_INTERPRETION'] = y_pred
new_patient_encoded['PROB_SENSITIVE'] = np.round(probs[:, 1],2)  # probability of sensitive (class 1)
new_patient_encoded['PROB_RESISTANT'] = np.round(probs[:, 0],2)  # probability of resistant (class 0)

In [17]:
# Preview the rows with the prediction columns added (categorical columns are still encoded).
new_patient_encoded.head()

Unnamed: 0,VISIBLE_PATIENT_ID,PRESCRIPTION_REFERENCE_NUMBER,SEX_RCD,AGE,ORDER_OWNER,LOCATION,DEPARTMENT_ORDERED,general_problem,SENSITIVITY_ANTIBIOTIC,SENSITIVITY_INTERPRETION,PROB_SENSITIVE,PROB_RESISTANT
0,123456789,987654321,1,65,7,2,19,69,3,1,0.71,0.29
1,123456789,987654321,1,65,7,2,19,69,1,1,0.9,0.1
2,123456789,987654321,1,65,7,2,19,69,35,1,0.99,0.01
3,123456789,987654321,1,65,7,2,19,69,9,1,0.71,0.29
4,123456789,987654321,1,65,7,2,19,69,11,1,0.98,0.02


In [18]:
def decode_and_overwrite(data):
    """Replace the integer codes of the categorical columns by their original labels, in place.

    For each column the encoder stored at ``encoders/<column>_encoder.pkl`` is
    loaded and applied in reverse. A column whose encoder file is missing is
    left as it is and a warning is printed. Returns the same DataFrame object
    that was passed in.
    """
    columns_to_decode = (
        'SEX_RCD',
        'general_problem',
        'SENSITIVITY_ANTIBIOTIC',
        'ORDER_OWNER',
        'LOCATION',
        'DEPARTMENT_ORDERED',
    )

    for col in columns_to_decode:
        encoder_path = f'encoders/{col}_encoder.pkl'
        if not os.path.exists(encoder_path):
            # Best effort: report the missing encoder (message is in Vietnamese) and move on.
            print(f"⚠️ Không tìm thấy encoder cho {col}")
            continue
        label_encoder = joblib.load(encoder_path)
        data[col] = label_encoder.inverse_transform(data[col])

    return data

In [19]:
# Turn the encoded categorical columns back into readable labels.
# NOTE(review): the result is stored under the same name, so after this cell
# `new_patient_encoded` holds decoded text; re-running this cell or the
# prediction cells on it is not expected to work — re-run from the encoding cell instead.
new_patient_encoded = decode_and_overwrite(new_patient_encoded)

In [20]:
# Preview the decoded predictions for the new patient.
new_patient_encoded.head()

Unnamed: 0,VISIBLE_PATIENT_ID,PRESCRIPTION_REFERENCE_NUMBER,SEX_RCD,AGE,ORDER_OWNER,LOCATION,DEPARTMENT_ORDERED,general_problem,SENSITIVITY_ANTIBIOTIC,SENSITIVITY_INTERPRETION,PROB_SENSITIVE,PROB_RESISTANT
0,123456789,987654321,M,65,Dr. Duong Bich Thuy,ICU,ICU Department,sigmoid colonic perforation with diffuse peri...,ampicillin,1,0.71,0.29
1,123456789,987654321,M,65,Dr. Duong Bich Thuy,ICU,ICU Department,sigmoid colonic perforation with diffuse peri...,amoxicillin sodium;clavulanic acid,1,0.9,0.1
2,123456789,987654321,M,65,Dr. Duong Bich Thuy,ICU,ICU Department,sigmoid colonic perforation with diffuse peri...,piperacillin;tazobactam,1,0.99,0.01
3,123456789,987654321,M,65,Dr. Duong Bich Thuy,ICU,ICU Department,sigmoid colonic perforation with diffuse peri...,cefazolin,1,0.71,0.29
4,123456789,987654321,M,65,Dr. Duong Bich Thuy,ICU,ICU Department,sigmoid colonic perforation with diffuse peri...,cefotaxime,1,0.98,0.02
