# In the notebook, a model is trained for each medicine

data source: ./simplified_data/simplified_data2.csv

the last few blocks were commmented since I have no clue what they were for.




# 0. Global Variables

In [25]:
# Remove the entire column related to a specific medicine if its occurrence is below the defined threshold.
# In the given dataset, setting this threshold to 250 would retain columns for only the top 10 most frequently used medicines.
DeleteMedThreshold = 250

# Determine the number of medicines to be trained as output
# The total number of medicine is 102
NumMedTrain = 102

# Decide whether to enhance accuracy by utilizing class weights
UseClassWeight = True

# Decide learning_rate
LearningRate = 0.005

# 1. Import module


In [26]:

# Importing necessary libraries
import numpy as np
import pandas as pd
import statistics
import csv
from tabulate import tabulate
import matplotlib.pyplot as plt

# Importing TensorFlow for deep learning
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout
from keras.callbacks import EarlyStopping, LambdaCallback
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy

# Importing scikit-learn for data preprocessing and utilities
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_sample_weight

# Importing custom utility functions
from utility_file import my_utilities as myutil
from utility_file import load_data

# 2. Read Data
Load the data using the custom module "load_data"

In [27]:
# Load data for training a model with one specific medicine, including debugging information
(X_np, X_val_np, train_y, val_y, 
 num_col_x, num_1_valy, num_0_valy) = load_data.load_data_for_1_med_with_debug(del_med_thres=DeleteMedThreshold, random_seed=1, n=NumMedTrain)

# Ensure the correct data types for loaded variables
assert isinstance(X_np, np.ndarray)
assert isinstance(X_val_np, np.ndarray)
assert isinstance(train_y, pd.DataFrame)
assert isinstance(val_y, pd.DataFrame)
assert isinstance(num_col_x, int)


--------------------------------------------------------------------------------
ReadData:
Type of data: <class 'pandas.core.frame.DataFrame'>
Shape of data = (797 rows, 215 cols).
End of ReadData
--------------------------------------------------------------------------------
SplitXY:
Shape of X = (796 rows, 111 cols).
Shape of y = (796 rows, 102 cols).
End of SplitXY
--------------------------------------------------------------------------------
In load_data_for_1_med_with_debug of load_data.py, random_seed= 1
After SplitXY, total number of 0, 1 in y:
Number of 0s: 72318
Number of 1s: 8874
--------------------------------------------------------------------------------
DeleteMedicine: shape of y is (796, 10).
save med num done
Train_X.shape:  (637, 111)
Train_y.shape:  (637, 10)

Split Training Validation
Number of 0s in train_y: 3660
Number of 1s train_y: 2710
Number of 0s in val_y: 860
Number of 1s val_y: 730
------------------------------------------------------------------------

# 3. Data Type Checking

In [28]:
# Uncomment the line below to display the DataFrame content and structure
# myutil.print_df(val_y, "---- y ----")

# Uncomment the line below to print the DataFrame directly
# print(val_y)

# Checking:

# Counting NA values in y
na_count = val_y.isna().sum().sum()

# Counting str values in y
str_count = val_y[val_y.map(type) == str].count().sum()

# Counting int values in y
int_count = val_y[val_y.map(type) == int].count().sum()

# Counting float values in y
float_count = val_y[val_y.map(type) == float].count().sum()

# Display the results
print(f"Number of NA values in y: {na_count}")
print(f"Number of str values in y: {str_count}")
print(f"Number of int values in y: {int_count}")
print(f"Number of float values in y: {float_count}")


Number of NA values in y: 0
Number of str values in y: 0
Number of int values in y: 1590
Number of float values in y: 0


# 4. Compute Class Weight

In [29]:
# Convert the 'train_y' DataFrame to a NumPy array
train_y_np = np.array(train_y)

# Determine the number of labels (columns) in the array
num_labels = train_y_np.shape[1]

# Initialize an empty dictionary to store class weights for each label
class_weight_dic = {}

# Iterate over each label column
for i in range(num_labels):
    # Count the occurrences of each class (0 and 1) in the current label column
    unique_values, counts = np.unique(train_y_np[:, i], return_counts=True)
    
    # Create a dictionary mapping class values to their frequencies
    value_frequency_dict = dict(zip(unique_values, counts))
    
    # Calculate the total number of occurrences for normalization
    total = value_frequency_dict.get(0, 0) + value_frequency_dict.get(1, 0)
    
    # Calculate class weights and store them in the dictionary
    class_weight_dic[i] = {0: (value_frequency_dict.get(1, 0) / total), 1: (value_frequency_dict.get(0, 0) / total)}

# Display the computed class weights
print(class_weight_dic)

{0: {0: 0.42543171114599687, 1: 0.5745682888540031}, 1: {0: 0.3218210361067504, 1: 0.6781789638932496}, 2: {0: 0.39717425431711145, 1: 0.6028257456828885}, 3: {0: 0.32653061224489793, 1: 0.673469387755102}, 4: {0: 0.35478806907378335, 1: 0.6452119309262166}, 5: {0: 0.4207221350078493, 1: 0.5792778649921507}, 6: {0: 0.5667189952904239, 1: 0.43328100470957615}, 7: {0: 0.33124018838304553, 1: 0.6687598116169545}, 8: {0: 0.5117739403453689, 1: 0.48822605965463106}, 9: {0: 0.598116169544741, 1: 0.40188383045525905}}


# 5. Build Model

In [30]:
# Define a Sequential model
model = Sequential([
    Dense(units=64, input_shape=(num_col_x,), activation='sigmoid'),
    Dense(units=16, activation='sigmoid'), 
    Dense(units=2, activation='sigmoid')
])

# Display a summary of the model architecture
model.summary()

# Compile the model with specified optimizer, loss function, and metrics
model.compile(optimizer=Adam(learning_rate=LearningRate),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 64)                7168      
                                                                 
 dense_7 (Dense)             (None, 16)                1040      
                                                                 
 dense_8 (Dense)             (None, 2)                 34        
                                                                 
Total params: 8,242
Trainable params: 8,242
Non-trainable params: 0
_________________________________________________________________


# 6. Train Model
There will be a model for each medicine

In [31]:

# Initialize dictionaries to store results and training history
result_df_dict = {}        # Dictionary of DataFrames of each medicine in training set 
accuracy_dict = {}         # Dictionary of accuracy for each medicine
prediction_train_dict = {}  # Dictionary of raw predictions for the training set
prediction_val_dict = {}    # Dictionary of raw predictions for the validation set

# Iterate over each medicine
for i in range(train_y.shape[1]):
    chosen_col = train_y.iloc[:, i].copy()
    
    # Ensure that the chosen column is a pandas Series
    assert(isinstance(chosen_col, pd.Series))
    assert(len(chosen_col) == len(train_y))
    
    print(f"Processing medicine {i + 1} of {train_y.shape[1]}: {chosen_col.name}")

    # Convert the chosen column to NumPy array
    chosen_y_np = chosen_col.values.astype('float64')

    # Copy the corresponding validation set column
    y_val_chosen_col = val_y.iloc[:, i].copy()
    
    # Ensure that the validation set column is a pandas Series
    assert(isinstance(y_val_chosen_col, pd.Series))
    assert(len(y_val_chosen_col) == len(val_y))

    # Early stopping callback
    early_stopping = EarlyStopping(monitor='loss', patience=30, restore_best_weights=True)

    # Fit the model for the current medicine
    Model = model.fit(
        x=X_np,
        y=chosen_y_np,
        class_weight=class_weight_dic[i] if UseClassWeight else None,
        epochs=2000,
        shuffle=True,
        verbose=0,
        callbacks=[early_stopping]
    )
    
    # Print when training stopped
    print(f"Training stopped at epoch {Model.epoch[-1]}")
    
    # Predict against the training set for diagnosing overfitting or underfitting
    predictions_train_set = model.predict(X_np)
    
    # Save raw result numpy array of training set to the dictionary
    prediction_train_dict[chosen_col.name] = predictions_train_set
    
    # Make predictions for the validation set
    predictions_val_set = model.predict(X_val_np)
    
    # Save raw result numpy array of validation set to the dictionary
    prediction_val_dict[chosen_col.name] = predictions_val_set
    
    # Plotting loss vs. epoch
    # plt.plot(Model.history['loss'], label='Training Loss')
    # plt.title('Loss vs. Epoch')
    # plt.xlabel('Epoch')
    # plt.ylabel('Loss')
    # plt.legend()
    # plt.show()

print("Training done.")


Processing medicine 1 of 10: 桂枝


Training stopped at epoch 348
Processing medicine 2 of 10: 柴胡
Training stopped at epoch 282
Processing medicine 3 of 10: 黃芩
Training stopped at epoch 224
Processing medicine 4 of 10: 茯苓
Training stopped at epoch 312
Processing medicine 5 of 10: 澤瀉
Training stopped at epoch 291
Processing medicine 6 of 10: 附子
Training stopped at epoch 206
Processing medicine 7 of 10: 甘草
Training stopped at epoch 342
Processing medicine 8 of 10: 當歸
Training stopped at epoch 297
Processing medicine 9 of 10: 白芍
Training stopped at epoch 289
Processing medicine 10 of 10: 炙甘草
Training stopped at epoch 205
Training done.


# 7. Handle result

### 7.1 Calculate the f1 score of the training dataset and store the values in TrainMedicineDictioanry

In [32]:
# Calculate True Positives (TP), False Positives (FP), True Negatives (TN), and False Negatives (FN) for the training set
total_tp_train = 0
total_fp_train = 0
total_tn_train = 0
total_fn_train = 0

# Create a dictioanry to store all values of a medicine
TrainMedicineDictioanry = {}

# Iterate through each medicine's raw prediction array
for key, arr in prediction_train_dict.items():
    
    # Create a DataFrame from the raw prediction array
    df_tmp = pd.DataFrame(arr, columns=["predicted as 0", "predicted as 1"])

    # Determine the predicted value based on probabilities
    df_tmp["predicted value"] = np.where(df_tmp["predicted as 0"] > df_tmp["predicted as 1"], 0, 1)
    
    # Get the column number of the current medicine in the training labels
    col_num = train_y.columns.get_loc(key)
    
    # Add ground truth values to the DataFrame
    df_tmp["ground truth"] = train_y.iloc[:, col_num].copy().values
    
    
    TP = ((df_tmp['ground truth'] == 1) & (df_tmp['predicted value'] == 1)).sum()
    FP = ((df_tmp['ground truth'] == 0) & (df_tmp['predicted value'] == 1)).sum()
    FN = ((df_tmp['ground truth'] == 1) & (df_tmp['predicted value'] == 0)).sum()
    TN = ((df_tmp['ground truth'] == 0) & (df_tmp['predicted value'] == 0)).sum()
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    # Calculate TP, FP, FN, TN for the current medicine
    total_tp_train += TP
    total_fp_train += FP
    total_fn_train += FN
    total_tn_train += TN
    
    TrainMedicineDictioanry[key] = {
        "TP" : TP,
        "FP" : FP,
        "FN" : FN,
        "TN" : TN,
        "precision" : precision,
        "recall" : recall,
        "f1-score" : f1score
    }

precision = total_tp_train / (total_tp_train + total_fp_train) if (total_tp_train + total_fp_train) > 0 else 0
recall = total_tp_train / (total_tp_train + total_fn_train) if (total_tp_train + total_fn_train) > 0 else 0

TrainMedicineDictioanry["overall"] = {
        "TP" : total_tp_train,
        "FP" : total_fp_train,
        "FN" : total_fn_train,
        "TN" : total_tn_train,
        "precision" : precision,
        "recall" : recall,
        "f1-score" : 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
}

print("TrainMedicineDictioanry:")
for key in TrainMedicineDictioanry:
    print(key, TrainMedicineDictioanry[key])
# print("overall", TrainMedicineDictioanry["overall"])

TrainMedicineDictioanry:
桂枝 {'TP': 258, 'FP': 2, 'FN': 13, 'TN': 364, 'precision': 0.9923076923076923, 'recall': 0.9520295202952029, 'f1-score': 0.9717514124293786}
柴胡 {'TP': 201, 'FP': 4, 'FN': 4, 'TN': 428, 'precision': 0.9804878048780488, 'recall': 0.9804878048780488, 'f1-score': 0.9804878048780488}
黃芩 {'TP': 250, 'FP': 2, 'FN': 3, 'TN': 382, 'precision': 0.9920634920634921, 'recall': 0.9881422924901185, 'f1-score': 0.9900990099009901}
茯苓 {'TP': 200, 'FP': 4, 'FN': 8, 'TN': 425, 'precision': 0.9803921568627451, 'recall': 0.9615384615384616, 'f1-score': 0.970873786407767}
澤瀉 {'TP': 218, 'FP': 2, 'FN': 8, 'TN': 409, 'precision': 0.990909090909091, 'recall': 0.9646017699115044, 'f1-score': 0.9775784753363229}
附子 {'TP': 258, 'FP': 2, 'FN': 10, 'TN': 367, 'precision': 0.9923076923076923, 'recall': 0.9626865671641791, 'f1-score': 0.9772727272727273}
甘草 {'TP': 340, 'FP': 1, 'FN': 21, 'TN': 275, 'precision': 0.9970674486803519, 'recall': 0.9418282548476454, 'f1-score': 0.9686609686609687}
當

### 7.2 Calculate the f1 score of the validation dataset and store the values in ValMedicineDictioanry

In [33]:
# Calculate True Positives (TP), False Positives (FP), True Negatives (TN), and False Negatives (FN) for the training set
total_tp_train = 0
total_fp_train = 0
total_tn_train = 0
total_fn_train = 0

# Create a dictioanry to store all values of a medicine
ValMedicineDictioanry = {}

# Iterate through each medicine's raw prediction array
for key, arr in prediction_val_dict.items():
    
    # Create a DataFrame from the raw prediction array
    df_tmp = pd.DataFrame(arr, columns=["predicted as 0", "predicted as 1"])

    # Determine the predicted value based on probabilities
    df_tmp["predicted value"] = np.where(df_tmp["predicted as 0"] > df_tmp["predicted as 1"], 0, 1)
    
    # Get the column number of the current medicine in the training labels
    col_num = val_y.columns.get_loc(key)
    
    # Add ground truth values to the DataFrame
    df_tmp["ground truth"] = val_y.iloc[:, col_num].copy().values
    result_df_dict[key] = df_tmp
    
    TP = ((df_tmp['ground truth'] == 1) & (df_tmp['predicted value'] == 1)).sum()
    FP = ((df_tmp['ground truth'] == 0) & (df_tmp['predicted value'] == 1)).sum()
    FN = ((df_tmp['ground truth'] == 1) & (df_tmp['predicted value'] == 0)).sum()
    TN = ((df_tmp['ground truth'] == 0) & (df_tmp['predicted value'] == 0)).sum()
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    # Calculate TP, FP, FN, TN for the current medicine
    total_tp_train += TP
    total_fp_train += FP
    total_fn_train += FN
    total_tn_train += TN
    
    ValMedicineDictioanry[key] = {
        "TP" : TP,
        "FP" : FP,
        "FN" : FN,
        "TN" : TN,
        "precision" : precision,
        "recall" : recall,
        "f1-score" : f1score
    }

precision = total_tp_train / (total_tp_train + total_fp_train) if (total_tp_train + total_fp_train) > 0 else 0
recall = total_tp_train / (total_tp_train + total_fn_train) if (total_tp_train + total_fn_train) > 0 else 0

ValMedicineDictioanry["overall"] = {
        "TP" : total_tp_train,
        "FP" : total_fp_train,
        "FN" : total_fn_train,
        "TN" : total_tn_train,
        "precision" : precision,
        "recall" : recall,
        "f1-score" : 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
}

print("ValMedicineDictioanry:")
for key in ValMedicineDictioanry:
    print(key, ValMedicineDictioanry[key])
# print("overall", ValMedicineDictioanry["overall"])

ValMedicineDictioanry:
桂枝 {'TP': 32, 'FP': 27, 'FN': 34, 'TN': 66, 'precision': 0.5423728813559322, 'recall': 0.48484848484848486, 'f1-score': 0.5120000000000001}
柴胡 {'TP': 25, 'FP': 21, 'FN': 31, 'TN': 82, 'precision': 0.5434782608695652, 'recall': 0.44642857142857145, 'f1-score': 0.4901960784313726}
黃芩 {'TP': 31, 'FP': 28, 'FN': 36, 'TN': 64, 'precision': 0.5254237288135594, 'recall': 0.4626865671641791, 'f1-score': 0.4920634920634921}
茯苓 {'TP': 21, 'FP': 26, 'FN': 40, 'TN': 72, 'precision': 0.44680851063829785, 'recall': 0.3442622950819672, 'f1-score': 0.38888888888888884}
澤瀉 {'TP': 29, 'FP': 24, 'FN': 43, 'TN': 63, 'precision': 0.5471698113207547, 'recall': 0.4027777777777778, 'f1-score': 0.46399999999999997}
附子 {'TP': 42, 'FP': 31, 'FN': 36, 'TN': 50, 'precision': 0.5753424657534246, 'recall': 0.5384615384615384, 'f1-score': 0.5562913907284769}
甘草 {'TP': 46, 'FP': 32, 'FN': 43, 'TN': 38, 'precision': 0.5897435897435898, 'recall': 0.5168539325842697, 'f1-score': 0.5508982035928144}

# 8. Result Saving

Saving resulting DataFrames to csv and dictionaries to txt for later usage

In [34]:
# Path identifier for saving results in the directory
file_path = "./result/one_medicine_result"

# Exporting the result of each medicine in validation set to csv file
for key, df in result_df_dict.items():

    # Uncomment if you want to print the DataFrames on console
    # print(f"DataFrame for {key}:")
    # myutil.print_df(df)
    myutil.df_to_csv(df, save_path=file_path, file_prefix=key)
   



桂枝 saved to ./result/one_medicine_result/桂枝_25.csv
柴胡 saved to ./result/one_medicine_result/柴胡_25.csv
黃芩 saved to ./result/one_medicine_result/黃芩_25.csv
茯苓 saved to ./result/one_medicine_result/茯苓_25.csv
澤瀉 saved to ./result/one_medicine_result/澤瀉_25.csv
附子 saved to ./result/one_medicine_result/附子_25.csv
甘草 saved to ./result/one_medicine_result/甘草_25.csv
當歸 saved to ./result/one_medicine_result/當歸_25.csv
白芍 saved to ./result/one_medicine_result/白芍_25.csv
炙甘草 saved to ./result/one_medicine_result/炙甘草_25.csv


In [35]:
# Create a DataFrame to record f1 score, TP/FP/TN/FN of each medicine
all_f1_df = pd.DataFrame([(key, val['f1-score'], val['precision'], val['recall'], 
                           val['TP'], val['FP'], val['TN'], val['FN']) for key, val in ValMedicineDictioanry.items()], 
                         columns=['medicine', 'f1-score','precision', 'recall', 'TP', 'FP', 'TN', 'FN']
                        )

file_path = "./result/result_all_medicine_val"

# Exporting the DataFrame to csv file
myutil.df_to_csv(all_f1_df, save_path=file_path, file_prefix='f1_score_all_medicine_val')

f1_score_all_medicine_val saved to ./result/result_all_medicine_val/f1_score_all_medicine_val_25.csv


In [36]:

# Specification string to be printed onto the resulting txt files
# This is for recording the training specifications that produced these results
training_specification  = "model layer:  32-64-128-64-32 units, activation: relu, optimizer: Adam, learning rate: 0.001, epochs: 1000, batch_size: 32, num_med: all. del_med_under_thres: 0"

file_path="./result/one_med_ValMedicineDict"

# Exporting f1-score, TP/FP/TN/FN of the training and validation sets to text files
myutil.dict_to_txt(ValMedicineDictioanry, save_path=file_path, 
                   file_prefix="ValMedicineDict",
                   textbox=training_specification )

file_path="./result/one_med_TrainMedicineDict"
myutil.dict_to_txt(TrainMedicineDictioanry, save_path=file_path, 
                   file_prefix="TrainMedicineDict",
                   textbox="train set, "+training_specification )


ValMedicineDict saved to ./result/one_med_ValMedicineDict/ValMedicineDict_25.txt
TrainMedicineDict saved to ./result/one_med_TrainMedicineDict/TrainMedicineDict_25.txt
