# In the notebook, a model is trained for each medicine

data source: ./simplified_data/simplified_data2.csv

the last few blocks were commmented since I have no clue what they were for.




# 0. Global Variables

In [1]:
# Remove the entire column related to a specific medicine if its occurrence is below the defined threshold.
# In the given dataset, setting this threshold to 250 would retain columns for only the top 10 most frequently used medicines.
DeleteMedThreshold = 100

# Determine the number of medicines to be trained as output
# The total number of medicine is 102
NumMedTrain = 102

# Decide whether to enhance accuracy by utilizing class weights
UseClassWeight = False

# Decide learning_rate
LearningRate = 0.0015

# 1. Import module


In [2]:

# Importing necessary libraries
import numpy as np
import pandas as pd
import statistics
from tabulate import tabulate
import matplotlib.pyplot as plt

# Importing TensorFlow for deep learning
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras.callbacks import EarlyStopping, LambdaCallback
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy

# Importing scikit-learn for data preprocessing and utilities
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_sample_weight

# Importing custom utility functions
from utility_file import my_utilities as myutil
from utility_file import load_data

# 2. Read Data
Load the data using the custom module "load_data"

In [3]:
# Load data for training a model with one specific medicine, including debugging information
(X_np, X_val_np, train_y, val_y, 
 num_col_x, num_1_valy, num_0_valy) = load_data.load_data_for_1_med_with_debug(del_med_thres=DeleteMedThreshold, random_seed=3, n=NumMedTrain)

# Ensure the correct data types for loaded variables
assert isinstance(X_np, np.ndarray)
assert isinstance(X_val_np, np.ndarray)
assert isinstance(train_y, pd.DataFrame)
assert isinstance(val_y, pd.DataFrame)
assert isinstance(num_col_x, int)


--------------------------------------------------------------------------------
ReadData:
Type of data: <class 'pandas.core.frame.DataFrame'>
Shape of data = (797 rows, 215 cols).
End of ReadData
--------------------------------------------------------------------------------
SplitXY:
Shape of X = (796 rows, 111 cols).
Shape of y = (796 rows, 102 cols).
End of SplitXY
--------------------------------------------------------------------------------
In load_data_for_1_med_with_debug of load_data.py, random_seed= 3
After SplitXY, total number of 0, 1 in y:
Number of 0s: 72318
Number of 1s: 8874
--------------------------------------------------------------------------------
DeleteMedicine: shape of y is (796, 31).
Train_X.shape:  (637, 111)
Train_y.shape:  (637, 31)

Split Training Validation
Number of 0s in train_y: 14345
Number of 1s train_y: 5402
Number of 0s in val_y: 3617
Number of 1s val_y: 1312
--------------------------------------------------------------------------------


# 3. Data Type Checking

In [4]:
# Uncomment the line below to display the DataFrame content and structure
# myutil.print_df(val_y, "---- y ----")

# Uncomment the line below to print the DataFrame directly
# print(val_y)

# Checking:

# Counting NA values in y
na_count = val_y.isna().sum().sum()

# Counting str values in y
str_count = val_y[val_y.map(type) == str].count().sum()

# Counting int values in y
int_count = val_y[val_y.map(type) == int].count().sum()

# Counting float values in y
float_count = val_y[val_y.map(type) == float].count().sum()

# Display the results
print(f"Number of NA values in y: {na_count}")
print(f"Number of str values in y: {str_count}")
print(f"Number of int values in y: {int_count}")
print(f"Number of float values in y: {float_count}")


Number of NA values in y: 0
Number of str values in y: 0
Number of int values in y: 4929
Number of float values in y: 0


# 4. Compute Class Weight

In [5]:
# Convert the 'train_y' DataFrame to a NumPy array
train_y_np = np.array(train_y)

# Determine the number of labels (columns) in the array
num_labels = train_y_np.shape[1]

# Initialize an empty dictionary to store class weights for each label
class_weight_dic = {}

# Iterate over each label column
for i in range(num_labels):
    # Count the occurrences of each class (0 and 1) in the current label column
    unique_values, counts = np.unique(train_y_np[:, i], return_counts=True)
    
    # Create a dictionary mapping class values to their frequencies
    value_frequency_dict = dict(zip(unique_values, counts))
    
    # Calculate the total number of occurrences for normalization
    total = value_frequency_dict.get(0, 0) + value_frequency_dict.get(1, 0)
    
    # Calculate class weights and store them in the dictionary
    class_weight_dic[i] = {0: (value_frequency_dict.get(1, 0) / total), 1: (value_frequency_dict.get(0, 0) / total)}

# Display the computed class weights
print(class_weight_dic)

{0: {0: 0.152276295133438, 1: 0.847723704866562}, 1: {0: 0.4301412872841444, 1: 0.5698587127158555}, 2: {0: 0.31711145996860285, 1: 0.6828885400313972}, 3: {0: 0.2794348508634223, 1: 0.7205651491365777}, 4: {0: 0.3359497645211931, 1: 0.6640502354788069}, 5: {0: 0.13657770800627944, 1: 0.8634222919937206}, 6: {0: 0.16326530612244897, 1: 0.8367346938775511}, 7: {0: 0.20094191522762953, 1: 0.7990580847723705}, 8: {0: 0.40816326530612246, 1: 0.5918367346938775}, 9: {0: 0.14442700156985872, 1: 0.8555729984301413}, 10: {0: 0.16797488226059654, 1: 0.8320251177394035}, 11: {0: 0.3390894819466248, 1: 0.6609105180533752}, 12: {0: 0.38304552590266877, 1: 0.6169544740973313}, 13: {0: 0.22762951334379905, 1: 0.7723704866562009}, 14: {0: 0.4379905808477237, 1: 0.5620094191522763}, 15: {0: 0.13971742543171115, 1: 0.8602825745682888}, 16: {0: 0.13971742543171115, 1: 0.8602825745682888}, 17: {0: 0.14442700156985872, 1: 0.8555729984301413}, 18: {0: 0.15070643642072212, 1: 0.8492935635792779}, 19: {0: 0.

# 5. Build Model

In [6]:
# Define a Sequential model
model = Sequential([
    Dense(units=32, input_shape=(num_col_x,), activation='sigmoid'),
    # Additional layers (commented out for simplicity)
    # Dense(units=64, activation='relu'), 
    # Dense(units=128, activation='relu'), 
    # Dense(units=32, activation='relu'),
    # Dense(units=16, activation='sigmoid'), 
    Dense(units=2, activation='softmax')
])

# Display a summary of the model architecture
model.summary()

# Compile the model with specified optimizer, loss function, and metrics
model.compile(optimizer=Adam(learning_rate=LearningRate),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                3584      
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 3,650
Trainable params: 3,650
Non-trainable params: 0
_________________________________________________________________


# 6. Train Model
There will be a model for each medicine

In [7]:

# Initialize dictionaries to store results and training history
result_df_dict = {}        # Dictionary of DataFrames
accuracy_dict = {}         # Dictionary of accuracy for each medicine
prediction_train_dict = {}  # Dictionary of raw predictions for the training set
prediction_val_dict = {}    # Dictionary of raw predictions for the validation set

# Iterate over each medicine
for i in range(train_y.shape[1]):
    chosen_col = train_y.iloc[:, i].copy()
    
    # Ensure that the chosen column is a pandas Series
    assert(isinstance(chosen_col, pd.Series))
    assert(len(chosen_col) == len(train_y))
    
    print(f"Processing medicine {i + 1} of {train_y.shape[1]}: {chosen_col.name}")

    # Convert the chosen column to NumPy array
    chosen_y_np = chosen_col.values.astype('float64')

    # Copy the corresponding validation set column
    y_val_chosen_col = val_y.iloc[:, i].copy()
    
    # Ensure that the validation set column is a pandas Series
    assert(isinstance(y_val_chosen_col, pd.Series))
    assert(len(y_val_chosen_col) == len(val_y))

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='loss', patience=30, restore_best_weights=True)

    # Fit the model for the current medicine
    Model = model.fit(
        x=X_np,
        y=chosen_y_np,
        class_weight=class_weight_dic[i] if UseClassWeight else None,
        epochs=5000,
        shuffle=True,
        verbose=0,
        callbacks=[early_stopping]
    )
    
    # Print when training stopped
    print(f"Training stopped at epoch {Model.epoch[-1]}")
    
    # Predict against the training set for diagnosing overfitting or underfitting
    predictions_train_set = model.predict(X_np)
    
    # Save raw result numpy array of training set to the dictionary
    prediction_train_dict[chosen_col.name] = predictions_train_set
    
    # Make predictions for the validation set
    predictions_val_set = model.predict(X_val_np)
    
    # Save raw result numpy array of validation set to the dictionary
    prediction_val_dict[chosen_col.name] = predictions_val_set
    
    # Plotting loss vs. epoch
    # plt.plot(Model.history['loss'], label='Training Loss')
    # plt.title('Loss vs. Epoch')
    # plt.xlabel('Epoch')
    # plt.ylabel('Loss')
    # plt.legend()
    # plt.show()

print("Training done.")


Processing medicine 1 of 31: 麻黃


Training stopped at epoch 1496
Processing medicine 2 of 31: 桂枝
Training stopped at epoch 1076
Processing medicine 3 of 31: 細辛
Training stopped at epoch 805
Processing medicine 4 of 31: 生薑
Training stopped at epoch 886
Processing medicine 5 of 31: 柴胡
Training stopped at epoch 922
Processing medicine 6 of 31: 石膏
Training stopped at epoch 1080
Processing medicine 7 of 31: 知母
Training stopped at epoch 803
Processing medicine 8 of 31: 黃連
Training stopped at epoch 1001
Processing medicine 9 of 31: 黃芩
Training stopped at epoch 855
Processing medicine 10 of 31: 龍膽草
Training stopped at epoch 570
Processing medicine 11 of 31: 大黃
Training stopped at epoch 865
Processing medicine 12 of 31: 茯苓
Training stopped at epoch 1180
Processing medicine 13 of 31: 澤瀉
Training stopped at epoch 1261
Processing medicine 14 of 31: 乾薑
Training stopped at epoch 1358
Processing medicine 15 of 31: 附子
Training stopped at epoch 1541
Processing medicine 16 of 31: 枳實
Training stopped at epoch 1441
Processing medicine 17 

# 7. Handle result

### 7.1 Calculate the f1 score of the training dataset and store the values in TrainMedicineDictioanry

In [8]:
# Calculate True Positives (TP), False Positives (FP), True Negatives (TN), and False Negatives (FN) for the training set
total_tp_train = 0
total_fp_train = 0
total_tn_train = 0
total_fn_train = 0

# Create a dictioanry to store all values of a medicine
TrainMedicineDictioanry = {}

# Iterate through each medicine's raw prediction array
for key, arr in prediction_train_dict.items():
    
    # Create a DataFrame from the raw prediction array
    df_tmp = pd.DataFrame(arr, columns=["predicted as 0", "predicted as 1"])

    # Determine the predicted value based on probabilities
    df_tmp["predicted value"] = np.where(df_tmp["predicted as 0"] > df_tmp["predicted as 1"], 0, 1)
    
    # Get the column number of the current medicine in the training labels
    col_num = train_y.columns.get_loc(key)
    
    # Add ground truth values to the DataFrame
    df_tmp["ground truth"] = train_y.iloc[:, col_num].copy().values
    
    TP = ((df_tmp['ground truth'] == 1) & (df_tmp['predicted value'] == 1)).sum()
    FP = ((df_tmp['ground truth'] == 0) & (df_tmp['predicted value'] == 1)).sum()
    FN = ((df_tmp['ground truth'] == 1) & (df_tmp['predicted value'] == 0)).sum()
    TN = ((df_tmp['ground truth'] == 0) & (df_tmp['predicted value'] == 0)).sum()
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    # Calculate TP, FP, FN, TN for the current medicine
    total_tp_train += TP
    total_fp_train += FP
    total_fn_train += FN
    total_tn_train += TN
    
    TrainMedicineDictioanry[key] = {
        "TP" : TP,
        "FP" : FP,
        "FN" : FN,
        "TN" : TN,
        "precision" : precision,
        "recall" : recall,
        "f1-score" : f1score
    }

precision = total_tp_train / (total_tp_train + total_fp_train)
recall = total_tp_train / (total_tp_train + total_fn_train)

TrainMedicineDictioanry["overall"] = {
        "TP" : total_tp_train,
        "FP" : total_fp_train,
        "FN" : total_fn_train,
        "TN" : total_tn_train,
        "precision" : precision,
        "recall" : recall,
        "f1-score" : 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
}

print("TrainMedicineDictioanry:")
for key in TrainMedicineDictioanry:
    print(key, TrainMedicineDictioanry[key])

TrainMedicineDictioanry:
麻黃 {'TP': 88, 'FP': 2, 'FN': 9, 'TN': 538, 'precision': 0.9777777777777777, 'recall': 0.9072164948453608, 'f1-score': 0.9411764705882353}
桂枝 {'TP': 265, 'FP': 3, 'FN': 9, 'TN': 360, 'precision': 0.9888059701492538, 'recall': 0.9671532846715328, 'f1-score': 0.977859778597786}
細辛 {'TP': 194, 'FP': 1, 'FN': 8, 'TN': 434, 'precision': 0.9948717948717949, 'recall': 0.9603960396039604, 'f1-score': 0.9773299748110831}
生薑 {'TP': 171, 'FP': 6, 'FN': 7, 'TN': 453, 'precision': 0.9661016949152542, 'recall': 0.9606741573033708, 'f1-score': 0.9633802816901408}
柴胡 {'TP': 207, 'FP': 1, 'FN': 7, 'TN': 422, 'precision': 0.9951923076923077, 'recall': 0.9672897196261683, 'f1-score': 0.981042654028436}
石膏 {'TP': 80, 'FP': 1, 'FN': 7, 'TN': 549, 'precision': 0.9876543209876543, 'recall': 0.9195402298850575, 'f1-score': 0.9523809523809523}
知母 {'TP': 99, 'FP': 0, 'FN': 5, 'TN': 533, 'precision': 1.0, 'recall': 0.9519230769230769, 'f1-score': 0.9753694581280787}
黃連 {'TP': 126, 'FP': 0

### 7.2 Calculate the f1 score of the validation dataset and store the values in ValMedicineDictioanry

In [9]:
# Calculate True Positives (TP), False Positives (FP), True Negatives (TN), and False Negatives (FN) for the training set
total_tp_train = 0
total_fp_train = 0
total_tn_train = 0
total_fn_train = 0

# Create a dictioanry to store all values of a medicine
ValMedicineDictioanry = {}

# Iterate through each medicine's raw prediction array
for key, arr in prediction_val_dict.items():
    
    # Create a DataFrame from the raw prediction array
    df_tmp = pd.DataFrame(arr, columns=["predicted as 0", "predicted as 1"])

    # Determine the predicted value based on probabilities
    df_tmp["predicted value"] = np.where(df_tmp["predicted as 0"] > df_tmp["predicted as 1"], 0, 1)
    
    # Get the column number of the current medicine in the training labels
    col_num = val_y.columns.get_loc(key)
    
    # Add ground truth values to the DataFrame
    df_tmp["ground truth"] = val_y.iloc[:, col_num].copy().values
    
    TP = ((df_tmp['ground truth'] == 1) & (df_tmp['predicted value'] == 1)).sum()
    FP = ((df_tmp['ground truth'] == 0) & (df_tmp['predicted value'] == 1)).sum()
    FN = ((df_tmp['ground truth'] == 1) & (df_tmp['predicted value'] == 0)).sum()
    TN = ((df_tmp['ground truth'] == 0) & (df_tmp['predicted value'] == 0)).sum()
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f1score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    # Calculate TP, FP, FN, TN for the current medicine
    total_tp_train += TP
    total_fp_train += FP
    total_fn_train += FN
    total_tn_train += TN
    
    ValMedicineDictioanry[key] = {
        "TP" : TP,
        "FP" : FP,
        "FN" : FN,
        "TN" : TN,
        "precision" : precision,
        "recall" : recall,
        "f1-score" : f1score
    }

precision = total_tp_train / (total_tp_train + total_fp_train)
recall = total_tp_train / (total_tp_train + total_fn_train)

ValMedicineDictioanry["overall"] = {
        "TP" : total_tp_train,
        "FP" : total_fp_train,
        "FN" : total_fn_train,
        "TN" : total_tn_train,
        "precision" : precision,
        "recall" : recall,
        "f1-score" : 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
}

print("ValMedicineDictioanry:")
for key in ValMedicineDictioanry:
    print(key, ValMedicineDictioanry[key])

ValMedicineDictioanry:
麻黃 {'TP': 5, 'FP': 12, 'FN': 30, 'TN': 112, 'precision': 0.29411764705882354, 'recall': 0.14285714285714285, 'f1-score': 0.1923076923076923}
桂枝 {'TP': 21, 'FP': 32, 'FN': 42, 'TN': 64, 'precision': 0.39622641509433965, 'recall': 0.3333333333333333, 'f1-score': 0.3620689655172413}
細辛 {'TP': 21, 'FP': 29, 'FN': 26, 'TN': 83, 'precision': 0.42, 'recall': 0.44680851063829785, 'f1-score': 0.43298969072164945}
生薑 {'TP': 9, 'FP': 27, 'FN': 28, 'TN': 95, 'precision': 0.25, 'recall': 0.24324324324324326, 'f1-score': 0.24657534246575344}
柴胡 {'TP': 14, 'FP': 25, 'FN': 33, 'TN': 87, 'precision': 0.358974358974359, 'recall': 0.2978723404255319, 'f1-score': 0.3255813953488372}
石膏 {'TP': 2, 'FP': 7, 'FN': 17, 'TN': 133, 'precision': 0.2222222222222222, 'recall': 0.10526315789473684, 'f1-score': 0.14285714285714285}
知母 {'TP': 5, 'FP': 8, 'FN': 18, 'TN': 128, 'precision': 0.38461538461538464, 'recall': 0.21739130434782608, 'f1-score': 0.27777777777777773}
黃連 {'TP': 7, 'FP': 15, '

the following were all commented since i have no idead what this is

In [10]:

# file_path_suffix = "each_med_csv"   # type the dir for you to remember where u save the result
# for key, df in result_df_dict.items():
#     print(f"DataFrame for {key}:")
#     #myutil.print_df(df)
#     myutil.df_to_csv(df, save_path=("./result/"+file_path_suffix), file_prefix=key)
   



In [11]:
# # make df for all f1_score of each medicine
# all_f1_df = pd.DataFrame([(key, val['f1_score'], (val['TP']+val['FN'])) for key, val in acc_each_med.items()], columns=['medicine', 'f1_score', 'TP+FN'])
# myutil.df_to_csv(all_f1_df, save_path=("./result/"+file_path_suffix), file_prefix='all_f1_score')

In [12]:
# # save f1_score and TP/FP/TN/TN/FN

# spec_str = "model layer:  32-64-128-64-32 units, activation: relu, optimizer: Adam, learning rate: 0.001, epochs: 1000, batch_size: 32, num_med: all. del_med_under_thres: 0"
# # need to type this spec str each time to record the result

# file_path="./result/1_med_accuracy"
# myutil.dict_to_txt(acc_each_med, save_path=file_path, 
#                    file_prefix="accuracy_each_med",
#                    textbox=spec_str)


# myutil.dict_to_txt(train_set_acc, save_path=file_path, 
#                    file_prefix="accuracy_train_set",
#                    textbox="train set"+spec_str)
