In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split

import torch  
import torch.nn as nn  
import torch.optim as optim  
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

from pytorch_tabnet import tab_network
from pytorch_tabnet.utils import create_group_matrix
from rtdl_revisiting_models import FTTransformer
from tab_transformer_pytorch import TabTransformer
data = pd.read_csv('FGDD.csv')
data = data[~data['Disease_id'].isnull()]
data = data.reset_index(drop=True)

# ------------------ pre-processing ------------------

In [2]:
data['Variant_Gene_id_1'].fillna(0, inplace=True)
data['Variant_Gene_id_2'].fillna(0, inplace=True)

data['Variant_Gene_id_1'] = data['Variant_Gene_id_1'].astype(int)
data['Variant_Gene_id_1'] = data['Variant_Gene_id_1'].astype(str)
data['Variant_Gene_id_2'] = data['Variant_Gene_id_2'].astype(int)
data['Variant_Gene_id_2'] = data['Variant_Gene_id_2'].astype(str)
data['Disease_id'] = data['Disease_id'].astype(int)
data['Disease_id'] = data['Disease_id'].astype(str)
# fill missing age
data['age'].fillna(data['age'].mean(), inplace=True)

# fill missing race
data['race'].fillna('unknown', inplace=True)

# fill missing region
data['region'].fillna('unknown', inplace=True)

# fill missing gender
data['gender'].fillna('unknown', inplace=True)

In [3]:
# select feature columns
patient_information = ['gender', 'age', 'region', 'race']
variant_information = ['Variant_Gene_id_1', 'Variant_Gene_id_2', 'Variant_Gene_chromosome_name_1', 'Variant_Gene_chromosome_name_2', 'Variant_Gene_chromosome_location_1', 'Variant_Gene_chromosome_location_2', 
'Variant_Gene_exon_count_1', 'Variant_Gene_exon_count_2']
phenotype_information = [col for col in data.columns if col.startswith('HP')]

features = []
features = features + patient_information 
features = features + variant_information
features = features + phenotype_information

In [4]:
le = LabelEncoder()
scaler = StandardScaler()  
cont_cols = ['age', 'Variant_Gene_exon_count_1', 'Variant_Gene_exon_count_2']
cat_cols = [col for col in features if col not in cont_cols]

# X_cont, X_cat are used in TabeTransformer and FTTransfromer, as these models require separation of continuous variables from categorical variables
X_cont = data[cont_cols]
X_cont = scaler.fit_transform(X_cont)
X_cat = pd.get_dummies(data[cat_cols], prefix=None, prefix_sep='_').values 

X = pd.get_dummies(data[features], prefix=None, prefix_sep='_')
X_scaled = scaler.fit_transform(X)

y = data['Disease_id']
y = le.fit_transform(y)

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)  
y_tensor = torch.tensor(y, dtype=torch.long)
X_cont_tensor = torch.tensor(X_cont, dtype=torch.float32)
X_cat_tensor = torch.tensor(X_cat, dtype=torch.long)


# split
indices = np.arange(len(X))
train_idx, test_idx = train_test_split(indices, test_size=0.3, random_state=5)
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_tensor, y_tensor, test_size=0.3, random_state=5) 
X_cont_train, X_cont_test = X_cont_tensor[train_idx], X_cont_tensor[test_idx]
X_cat_train, X_cat_test = X_cat_tensor[train_idx], X_cat_tensor[test_idx]

print("Continuous variables shape: ", X_cont_tensor.shape)
print("Categorical variables: ", X_cat_tensor.shape)
print("All variables shape: ", X_tensor.shape)
print(X_cont_train.shape, X_cat_train.shape, X_train_nn.shape, y_train_nn.shape)
print(X_cont_test.shape, X_cat_test.shape, X_test_nn.shape, y_test_nn.shape)

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

Continuous variables shape:  torch.Size([689, 3])
Categorical variables:  torch.Size([689, 999])
All variables shape:  torch.Size([689, 1002])
torch.Size([482, 3]) torch.Size([482, 999]) torch.Size([482, 1002]) torch.Size([482])
torch.Size([207, 3]) torch.Size([207, 999]) torch.Size([207, 1002]) torch.Size([207])
cuda


# ------------------ DL methods ------------------

In [5]:
# ============== MLP ==============
class TableDataClassifier(nn.Module):  
    def __init__(self, input_dim, output_dim):  
        super(TableDataClassifier, self).__init__()  
        self.fc1 = nn.Linear(input_dim, 512)  
        self.relu = nn.ReLU()  
        self.dropout = nn.Dropout(0.5)  
        self.fc2 = nn.Linear(512, 256)  
        self.fc3 = nn.Linear(256, output_dim)  
          
    def forward(self, x):  
        x = self.fc1(x)  
        x = self.relu(x)  
        x = self.dropout(x)  
        x = self.fc2(x)  
        x = self.relu(x)  
        x = self.dropout(x)  
        x = self.fc3(x)  
        return x  


# model
MLP = TableDataClassifier(X_train_nn.shape[1], 211)
criterion = nn.CrossEntropyLoss()  
optimizer = optim.Adam(MLP.parameters(), lr=0.001)  

num_epochs = 300
batch_size = 32  
train_dataset = TensorDataset(X_train_nn, y_train_nn)  
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# train
MLP.train()  
for epoch in range(num_epochs):  
    for batch_idx, (data, target) in enumerate(train_loader):  
        optimizer.zero_grad()  
        output = MLP(data)  
        loss = criterion(output, target)  
        loss.backward()  
        optimizer.step()  
        
    if (epoch+1) % 10 == 0:  
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')  

        # eval
        MLP.eval()  
        with torch.no_grad():  
            X_test_nn = X_test_nn
            y_pred_nn = MLP(X_test_nn).argmax(dim=1)  
            f1_macro_nn = f1_score(y_test_nn, y_pred_nn, average='macro')
            print(f'MLP F1 score macro: {f1_macro_nn:.4f}') 
            print('MLP Test Accuracy:', accuracy_score(y_test_nn, y_pred_nn))
    
# ============== MLP ==============

Epoch [10/300], Loss: 0.5892
MLP F1 score macro: 0.5152
MLP Test Accuracy: 0.7681159420289855
Epoch [20/300], Loss: 0.0023
MLP F1 score macro: 0.5309
MLP Test Accuracy: 0.7874396135265701
Epoch [30/300], Loss: 0.0029
MLP F1 score macro: 0.5347
MLP Test Accuracy: 0.7874396135265701
Epoch [40/300], Loss: 0.0009
MLP F1 score macro: 0.5338
MLP Test Accuracy: 0.7874396135265701
Epoch [50/300], Loss: 0.0001
MLP F1 score macro: 0.5338
MLP Test Accuracy: 0.7874396135265701
Epoch [60/300], Loss: 0.0001
MLP F1 score macro: 0.5338
MLP Test Accuracy: 0.7874396135265701
Epoch [70/300], Loss: 0.0003
MLP F1 score macro: 0.5338
MLP Test Accuracy: 0.7874396135265701
Epoch [80/300], Loss: 0.0001
MLP F1 score macro: 0.5338
MLP Test Accuracy: 0.7874396135265701
Epoch [90/300], Loss: 0.0006
MLP F1 score macro: 0.5346
MLP Test Accuracy: 0.7874396135265701
Epoch [100/300], Loss: 0.0000
MLP F1 score macro: 0.5346
MLP Test Accuracy: 0.7874396135265701
Epoch [110/300], Loss: 0.0001
MLP F1 score macro: 0.5346
ML

In [7]:
# ============== TabNet ==============
# TabNet : Attentive Interpretable Tabular Learning
input_dim = X_train_nn.shape[1]
output_dim = 211
group_matrix = create_group_matrix([], input_dim)
model = tab_network.TabNet(input_dim=input_dim, output_dim=output_dim, group_attention_matrix=group_matrix)

criterion = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=4e-2)  
num_epochs = 800
batch_size = 256
lambda_sparse = 0.01
train_dataset = TensorDataset(X_train_nn, y_train_nn)  
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


# train
for epoch in range(num_epochs):  
    model.train()  
    for batch_idx, (data, target) in enumerate(train_loader):  
        optimizer.zero_grad()  
        output, M_loss = model(data)  
        loss = criterion(output, target)
        loss = loss - lambda_sparse * M_loss  
        loss.backward()  
        optimizer.step()  
        
    if (epoch+1) % 10 == 0:  
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')  

        # eval
        model.eval()  
        with torch.no_grad():  
            output, M_loss = model(X_test_nn)
            y_pred_nn = output.argmax(dim=1)
            f1_macro_nn = f1_score(y_test_nn, y_pred_nn, average='macro')
            print(f'TabNet F1 score macro: {f1_macro_nn:.4f}') 
            print('TabNet Test Accuracy:', accuracy_score(y_test_nn, y_pred_nn))

# ============== TabNet ==============

Epoch [10/800], Loss: 5.1148
TabNet F1 score macro: 0.0003
TabNet Test Accuracy: 0.014492753623188406
Epoch [20/800], Loss: 4.5467
TabNet F1 score macro: 0.0020
TabNet Test Accuracy: 0.05314009661835749
Epoch [30/800], Loss: 4.0606
TabNet F1 score macro: 0.0314
TabNet Test Accuracy: 0.10144927536231885
Epoch [40/800], Loss: 3.2806
TabNet F1 score macro: 0.0760
TabNet Test Accuracy: 0.24154589371980675
Epoch [50/800], Loss: 2.4693
TabNet F1 score macro: 0.0869
TabNet Test Accuracy: 0.24154589371980675
Epoch [60/800], Loss: 2.0482
TabNet F1 score macro: 0.1353
TabNet Test Accuracy: 0.34299516908212563
Epoch [70/800], Loss: 1.7213
TabNet F1 score macro: 0.1528
TabNet Test Accuracy: 0.391304347826087
Epoch [80/800], Loss: 1.6021
TabNet F1 score macro: 0.1934
TabNet Test Accuracy: 0.43478260869565216
Epoch [90/800], Loss: 1.1583
TabNet F1 score macro: 0.2133
TabNet Test Accuracy: 0.42995169082125606
Epoch [100/800], Loss: 0.9745
TabNet F1 score macro: 0.2009
TabNet Test Accuracy: 0.43961352

In [10]:
# ============== FTTransformer ==============
# Revisiting Deep Learning Models for Tabular Data
output_dim = 211
cat_cardinalities = [len(torch.unique(X_cat_tensor[:, i])) for i in range(X_cat_tensor.shape[1])]

model = FTTransformer(
    n_cont_features = len(cont_cols),
    cat_cardinalities = cat_cardinalities,
    d_out = output_dim,
    **FTTransformer.get_default_kwargs(),
).to(device)

criterion = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=1e-4)  
num_epochs = 100
batch_size = 64

train_dataset = TensorDataset(X_cat_train, X_cont_train, y_train_nn)
test_dataset = TensorDataset(X_cat_test, X_cont_test, y_test_nn)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# train
for epoch in range(num_epochs):  
    model.train()  
    optimizer.zero_grad()
    for batch_idx, (X_cat, X_cont, y) in enumerate(train_loader):  
        X_cat = X_cat.to(device)
        X_cont = X_cont.to(device)
        y = y.to(device)

        output = model(X_cont, X_cat)  
        loss = criterion(output, y)
        loss.backward()  
        optimizer.step()  
        
    if (epoch+1) % 1 == 0:  
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')  

        # eval
        model.eval()
        all_preds = []  
        with torch.no_grad():  
            for X_cat, X_cont, y in test_loader:
                X_cat = X_cat.to(device)
                X_cont = X_cont.to(device)
                y = y.to(device)

                output = model(X_cont, X_cat)
                preds = output.argmax(dim=1)
                all_preds.append(preds)
        y_pred_nn = torch.cat(all_preds, dim=0).cpu()
        f1_macro_nn = f1_score(y_test_nn, y_pred_nn, average='macro')
        print(f'FTTransformer F1 score macro: {f1_macro_nn:.4f}') 
        print('FTTransformer Test Accuracy:', accuracy_score(y_test_nn, y_pred_nn))
# ============== FT Transformer ==============

Epoch [1/100], Loss: 5.0539
FTTransformer F1 score macro: 0.0001
FTTransformer Test Accuracy: 0.004830917874396135
Epoch [2/100], Loss: 5.1177
FTTransformer F1 score macro: 0.0001
FTTransformer Test Accuracy: 0.004830917874396135
Epoch [3/100], Loss: 4.8917
FTTransformer F1 score macro: 0.0001
FTTransformer Test Accuracy: 0.004830917874396135
Epoch [4/100], Loss: 5.1783
FTTransformer F1 score macro: 0.0004
FTTransformer Test Accuracy: 0.01932367149758454
Epoch [5/100], Loss: 5.0824
FTTransformer F1 score macro: 0.0004
FTTransformer Test Accuracy: 0.01932367149758454
Epoch [6/100], Loss: 4.8439
FTTransformer F1 score macro: 0.0004
FTTransformer Test Accuracy: 0.01932367149758454
Epoch [7/100], Loss: 5.0624
FTTransformer F1 score macro: 0.0004
FTTransformer Test Accuracy: 0.01932367149758454
Epoch [8/100], Loss: 4.7181
FTTransformer F1 score macro: 0.0004
FTTransformer Test Accuracy: 0.01932367149758454
Epoch [9/100], Loss: 4.8031
FTTransformer F1 score macro: 0.0004
FTTransformer Test A

In [11]:
# ============== TabTransformer ==============
# TabTransformer: Tabular Data Modeling Using Contextual Embeddings
def print_gpu_memory():
    if torch.cuda.is_available():
        device = torch.device("cuda")
        allocated = torch.cuda.memory_allocated(device) / 1024**3  # 转换为GB
        total_memory = torch.cuda.get_device_properties(device).total_memory
        print(f"Allocated={allocated:.2f} GB, Total={total_memory / 1024**3:.2f} GB")
    else:
        print("GPU unavaliable")



cat_cardinalities = [len(torch.unique(X_cat_tensor[:, i])) for i in range(X_cat_tensor.shape[1])]
output_dim = 211
model = TabTransformer(
    categories = tuple(cat_cardinalities), 
    num_continuous = len(cont_cols),               
    dim = 16,                            
    dim_out = output_dim,                        
    depth = 4,                           
    heads = 4,                          
    attn_dropout = 0.1,                  
    ff_dropout = 0.1,                    
    mlp_hidden_mults = (2, 1),           
    mlp_act = nn.ReLU(),                 
    continuous_mean_std = None,  
).to(device)

criterion = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=1e-5)  
num_epochs = 300
batch_size = 32

train_dataset = TensorDataset(X_cat_train, X_cont_train, y_train_nn)
test_dataset = TensorDataset(X_cat_test, X_cont_test, y_test_nn)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# train
for epoch in range(num_epochs):  
    model.train()  
    optimizer.zero_grad()
    for batch_idx, (X_cat, X_cont, y) in enumerate(train_loader):  
        X_cat = X_cat.to(device)
        X_cont = X_cont.to(device)
        y = y.to(device)
        
        output = model(X_cat, X_cont)  
        loss = criterion(output, y)
        loss.backward()  
        optimizer.step()  
        
    if (epoch+1) % 1 == 0:  
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')  

        # eval
        model.eval()
        all_preds = []  
        with torch.no_grad():  
            for X_cat, X_cont, y in test_loader:
                X_cat = X_cat.to(device)
                X_cont = X_cont.to(device)
                y = y.to(device)
                
                output = model(X_cat, X_cont)  
                preds = output.argmax(dim=1)
                all_preds.append(preds)
        y_pred_nn = torch.cat(all_preds, dim=0).cpu()
        f1_macro_nn = f1_score(y_test_nn, y_pred_nn, average='macro')
        print(f'TabTransformer F1 score macro: {f1_macro_nn:.4f}') 
        print('TabTransformer Test Accuracy:', accuracy_score(y_test_nn, y_pred_nn))
# ============== TabTransformer ==============

Epoch [1/300], Loss: 6.4211
TabTransformer F1 score macro: 0.0133
TabTransformer Test Accuracy: 0.04830917874396135
Epoch [2/300], Loss: 9.6264
TabTransformer F1 score macro: 0.0004
TabTransformer Test Accuracy: 0.01932367149758454
Epoch [3/300], Loss: 5.6955
TabTransformer F1 score macro: 0.0005
TabTransformer Test Accuracy: 0.028985507246376812
Epoch [4/300], Loss: 6.5320
TabTransformer F1 score macro: 0.0003
TabTransformer Test Accuracy: 0.014492753623188406
Epoch [5/300], Loss: 6.7467
TabTransformer F1 score macro: 0.0004
TabTransformer Test Accuracy: 0.01932367149758454
Epoch [6/300], Loss: 5.7163
TabTransformer F1 score macro: 0.0007
TabTransformer Test Accuracy: 0.03864734299516908
Epoch [7/300], Loss: 3.1973
TabTransformer F1 score macro: 0.0068
TabTransformer Test Accuracy: 0.033816425120772944
Epoch [8/300], Loss: 4.5461
TabTransformer F1 score macro: 0.0131
TabTransformer Test Accuracy: 0.04830917874396135
Epoch [9/300], Loss: 6.4393
TabTransformer F1 score macro: 0.0141
Tab

In [20]:
# ============== NODE ==============
# Neural Oblivious Decision Ensembles for Deep Learning on Tabular Data
import lib

num_classes = 211
model = nn.Sequential(
    lib.DenseBlock(X_train_nn.shape[1], layer_dim=128, num_layers=8, tree_dim=num_classes+1, flatten_output=False, depth=6, choice_function=lib.entmax15, bin_function=lib.entmoid15),
    lib.Lambda(lambda x: x[..., :num_classes].mean(dim=-2)),
).to(device)

criterion = nn.CrossEntropyLoss()  
optimizer = optim.Adam(model.parameters(), lr=1e-4)  
num_epochs = 200
batch_size = 64

train_dataset = TensorDataset(X_train_nn, y_train_nn)
test_dataset = TensorDataset(X_test_nn, y_test_nn)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# train
for epoch in range(num_epochs):  
    model.train()  
    optimizer.zero_grad()
    for batch_idx, (X, y) in enumerate(train_loader):  
        X = X.to(device)
        y = y.to(device)
        
        output = model(X)  
        loss = criterion(output, y)
        loss.backward()  
        optimizer.step()  
        
    if (epoch+1) % 1 == 0:  
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')  

        # eval
        model.eval()
        all_preds = []  
        with torch.no_grad():  
            for X, y in test_loader:
                X = X.to(device)
                y = y.to(device)
                
                output = model(X)  
                preds = output.argmax(dim=1)
                all_preds.append(preds)
        y_pred_nn = torch.cat(all_preds, dim=0).cpu()
        f1_macro_nn = f1_score(y_test_nn, y_pred_nn, average='macro')
        print(f'Node F1 score macro: {f1_macro_nn:.4f}') 
        print('Node Test Accuracy:', accuracy_score(y_test_nn, y_pred_nn))
# ============== NODE ==============

  warn("Data-aware initialization is performed on less than 1000 data points. This may cause instability."


Epoch [1/200], Loss: 5.3299
Node F1 score macro: 0.0095
Node Test Accuracy: 0.043478260869565216
Epoch [2/200], Loss: 5.2955
Node F1 score macro: 0.0004
Node Test Accuracy: 0.01932367149758454
Epoch [3/200], Loss: 5.2489
Node F1 score macro: 0.0004
Node Test Accuracy: 0.01932367149758454
Epoch [4/200], Loss: 5.2489
Node F1 score macro: 0.0004
Node Test Accuracy: 0.01932367149758454
Epoch [5/200], Loss: 5.1917
Node F1 score macro: 0.0004
Node Test Accuracy: 0.01932367149758454
Epoch [6/200], Loss: 5.2619
Node F1 score macro: 0.0004
Node Test Accuracy: 0.01932367149758454
Epoch [7/200], Loss: 5.2510
Node F1 score macro: 0.0004
Node Test Accuracy: 0.01932367149758454
Epoch [8/200], Loss: 5.2444
Node F1 score macro: 0.0059
Node Test Accuracy: 0.03864734299516908
Epoch [9/200], Loss: 5.1847
Node F1 score macro: 0.0055
Node Test Accuracy: 0.03864734299516908
Epoch [10/200], Loss: 5.2297
Node F1 score macro: 0.0037
Node Test Accuracy: 0.03864734299516908
Epoch [11/200], Loss: 5.2140
Node F1 s

# ------------------ ML methods ------------------

In [31]:
X = pd.get_dummies(data[features], prefix=None, prefix_sep='_')
y = data['Disease_id']
y = le.fit_transform(y)
label_mappings = dict(zip(le.transform(le.classes_), le.classes_))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [33]:
# ============= XGBoost =============
params = {
    'objective': 'multi:softmax', 
    'eval_metric': 'mlogloss', 
    'num_class': 211,
    'tree_method': 'hist',
    'device': 'cpu',
    "random_state": 5,
    }
dtrain = xgb.DMatrix(X_train, label=y_train, missing=-1)
dtest = xgb.DMatrix(X_test, label=y_test, missing=-1)
model = xgb.train(params, dtrain, evals=[(dtest, 'test')], num_boost_round=200, verbose_eval=False)
y_pred_xgb = model.predict(dtest)
f1_macro_xgb = f1_score(y_test, y_pred_xgb, average='macro')

print(f'XGBoost F1 score macro: {f1_macro_xgb:.2f}')
print('XGBoost Test Accuracy:', accuracy_score(y_test, y_pred_xgb))
# ============= XGBoost =============


# ============= Catboost =============
cat = CatBoostClassifier(iterations=300, learning_rate=0.1, random_seed=5)
cat.fit(X_train, y_train, verbose=0)
y_pred_cat = cat.predict(X_test)
f1_macro_cat = f1_score(y_test, y_pred_cat, average='macro')
print(f'Catboost F1 score macro: {f1_macro_cat:.2f}')
print('catboost Test Accuracy:', accuracy_score(y_test, y_pred_cat))
# ============= Catboost =============

XGBoost F1 score macro: 0.36
XGBoost Test Accuracy: 0.6618357487922706
Catboost F1 score macro: 0.41
catboost Test Accuracy: 0.6570048309178744


In [34]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

# ============== SVM ==============
svm = SVC(kernel='rbf', C=50, gamma='scale')
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
f1_macro_svm = f1_score(y_test, y_pred_svm, average='macro')
print(f'SVM F1 score macro: {f1_macro_svm:.4f}')
print('SVM Test Accuracy:', accuracy_score(y_test, y_pred_svm))
# ============== SVM ==============



# ============== Decision Tree ==============
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
f1_macro_dt = f1_score(y_test, y_pred_dt, average='macro')
print(f'Decision Tree F1 score macro: {f1_macro_dt:.4f}') 
print('Decision Tree Test Accuracy:', accuracy_score(y_test, y_pred_dt))
# ============== Decision Tree ==============



# ============== Logistic Regression ==============
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
f1_macro_lr = f1_score(y_test, y_pred_lr, average='macro')
print(f'Logistic Regression F1 score macro: {f1_macro_lr:.4f}')
print('Logistic Regression Test Accuracy:', accuracy_score(y_test, y_pred_lr))
# ========== Logistic Regression ==========



# ============== Random Forest ==============
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
f1_macro = f1_score(y_test, y_pred, average='macro')
print(f'rf F1 score macro: {f1_macro:.4f}')
print('rf Test Accuracy:', accuracy_score(y_test, y_pred))
# ============== Random Forest ==============

SVM F1 score macro: 0.3352
SVM Test Accuracy: 0.642512077294686
Decision Tree F1 score macro: 0.5137
Decision Tree Test Accuracy: 0.7439613526570048


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression F1 score macro: 0.4142
Logistic Regression Test Accuracy: 0.7053140096618358
rf F1 score macro: 0.5225
rf Test Accuracy: 0.7777777777777778
