In [9]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [10]:
# Lift pandas' display truncation: a limit of None makes frames and series
# render every column and every row instead of an abbreviated view.
for _opt in ("display.max_columns", "display.max_rows"):
    pd.set_option(_opt, None)

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the modelling table from a CSV in the working directory (relative path).
df = pd.read_csv("encoded_clean_clinical_wav2vec2.csv")

In [5]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,tb_prior,tb_prior_Pul,hemoptysis,heart_rate,temperature,weight_loss,smoke_lweek,fever,night_sweats,tb_status
0,-0.026476,0.066998,-0.02633,-0.012994,-0.14013,-0.121104,0.082295,-0.000749,-0.034801,-0.41785,...,0,0,0,68,37.5,1,0,1,1,1
1,-0.115683,0.074489,0.014545,-0.00577,-0.184433,-0.109778,0.070073,-0.004235,-0.051047,-0.439018,...,0,0,0,68,37.5,1,0,1,1,1
2,-0.036963,0.027261,-0.097449,-0.025534,-0.055059,-0.135565,0.002773,-0.035429,-0.03201,-0.351857,...,0,0,0,76,37.7,1,1,1,1,1
3,-0.066899,0.018125,-0.133178,-0.038802,-0.032812,-0.119408,-0.008754,-0.031919,-0.104976,-0.353431,...,0,0,0,76,37.7,1,1,1,1,1
4,-0.03268,0.024876,-0.140349,-0.030479,0.017331,-0.111734,-0.022145,-0.028799,-0.051605,-0.321307,...,0,0,0,76,37.7,1,1,1,1,1


In [6]:
# Class balance of the target column; the gap between the two counts is what
# the augmentation later in the notebook is meant to close.
df['tb_status'].value_counts()

tb_status
1    4900
0     289
Name: count, dtype: int64

In [13]:
# (rows, columns) of the loaded frame.
df.shape

(5189, 783)

In [14]:
# Column labels of the loaded frame; the target `tb_status` is the last one.
df.columns

Index(['feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6',
       'feat_7', 'feat_8', 'feat_9',
       ...
       'tb_prior', 'tb_prior_Pul', 'hemoptysis', 'heart_rate', 'temperature',
       'weight_loss', 'smoke_lweek', 'fever', 'night_sweats', 'tb_status'],
      dtype='object', length=783)

In [15]:
# Read the label column straight from `df`: `y` is only assigned in a later
# cell, so referring to it here fails with NameError on Restart & Run All.
print(df["tb_status"].value_counts())

tb_status
1    4900
0     289
Name: count, dtype: int64


In [17]:
# Per-column dtypes, to confirm every feature is already numeric.
df.dtypes

feat_0                float64
feat_1                float64
feat_2                float64
feat_3                float64
feat_4                float64
feat_5                float64
feat_6                float64
feat_7                float64
feat_8                float64
feat_9                float64
feat_10               float64
feat_11               float64
feat_12               float64
feat_13               float64
feat_14               float64
feat_15               float64
feat_16               float64
feat_17               float64
feat_18               float64
feat_19               float64
feat_20               float64
feat_21               float64
feat_22               float64
feat_23               float64
feat_24               float64
feat_25               float64
feat_26               float64
feat_27               float64
feat_28               float64
feat_29               float64
feat_30               float64
feat_31               float64
feat_32               float64
feat_33   

In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Pin the global NumPy and PyTorch RNG seeds so repeated runs draw the same
# random numbers (batch indices, weight initialisation, latent noise).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x1da935984d0>

In [19]:
# Separate the target column from the features.
y = df["tb_status"]
X = df.drop(columns="tb_status")

In [20]:
# Keep only the TB-negative rows (label 0): the minority class to augment.
minority_data = X.loc[y == 0]

print("Minority class shape:", minority_data.shape)  # Should be (289, 782)

# Standardise each feature using statistics of the minority rows only; the
# fitted scaler is kept so generated samples can be mapped back later.
scaler = StandardScaler()
scaler.fit(minority_data)
minority_scaled = scaler.transform(minority_data)

# Number of features per row, i.e. the width the GAN must produce/consume.
input_dim = minority_scaled.shape[-1]  # 782
print("Input dimension:", input_dim)

Minority class shape: (289, 782)
Input dimension: 782


In [21]:
# Generator
class Generator(nn.Module):
    """Fully connected network mapping a latent noise vector to a feature row.

    Layout: noise_dim -> 256 -> 128 -> input_dim, with ReLU after each hidden
    layer and no activation on the output layer.

    Args:
        input_dim: Width of the generated output vector.
        noise_dim: Width of the latent input vector (default 64).
    """

    def __init__(self, input_dim, noise_dim=64):
        super().__init__()
        widths = [noise_dim, 256, 128]
        layers = []
        # Hidden stack: one Linear + ReLU pair per consecutive width pair.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Linear output head so generated values are unbounded.
        layers.append(nn.Linear(widths[-1], input_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, z):
        """Return generated rows of shape (batch, input_dim) for noise `z`."""
        out = self.net(z)
        return out

# Discriminator
class Discriminator(nn.Module):
    """Fully connected binary classifier over feature rows.

    Layout: input_dim -> 128 -> 64 -> 1, with ReLU after each hidden layer and
    a final Sigmoid, so every output lies in (0, 1).

    Args:
        input_dim: Width of each input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        blocks = []
        prev_width = input_dim
        # Hidden stack: Linear + ReLU for each hidden width.
        for width in (128, 64):
            blocks += [nn.Linear(prev_width, width), nn.ReLU()]
            prev_width = width
        # Single-unit head squashed to (0, 1).
        blocks += [nn.Linear(prev_width, 1), nn.Sigmoid()]
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        """Return a (batch, 1) tensor of scores in (0, 1) for rows `x`."""
        score = self.net(x)
        return score


In [22]:
# Width of the latent vector fed to the generator.
noise_dim = 64

# Build the generator first, then the discriminator: parameter initialisation
# draws from the torch RNG, so construction order affects the seeded weights.
generator = Generator(input_dim, noise_dim)
discriminator = Discriminator(input_dim)

# Binary cross-entropy on the discriminator's sigmoid output.
criterion = nn.BCELoss()

# One Adam optimiser per network, both with the same learning rate.
d_optimizer = optim.Adam(discriminator.parameters(), lr=2e-4)
g_optimizer = optim.Adam(generator.parameters(), lr=2e-4)


In [23]:
# Convert scaled data to tensor
minority_tensor = torch.tensor(minority_scaled, dtype=torch.float)

# NOTE: each "epoch" below is a single optimisation step on one random batch,
# not a full pass over the minority rows.
epochs = 3000
batch_size = 64

for epoch in range(epochs):
    # === Train Discriminator ===
    # Draw batch_size row indices at random (with replacement) from the real
    # minority rows; these are labelled 1.
    idx = np.random.randint(0, minority_tensor.shape[0], batch_size)
    real_data = minority_tensor[idx]
    real_labels = torch.ones(batch_size, 1)

    # Generate a fake batch labelled 0. detach() cuts the graph so this step's
    # backward pass does not reach the generator's parameters.
    z = torch.randn(batch_size, noise_dim)
    fake_data = generator(z).detach()
    fake_labels = torch.zeros(batch_size, 1)

    d_real = discriminator(real_data)
    d_fake = discriminator(fake_data)

    # Discriminator loss = BCE on the real batch + BCE on the fake batch.
    d_loss_real = criterion(d_real, real_labels)
    d_loss_fake = criterion(d_fake, fake_labels)
    d_loss = d_loss_real + d_loss_fake

    # Clear stale gradients (including those left by the previous generator
    # step, which backpropagates through the discriminator) before updating.
    discriminator.zero_grad()
    d_loss.backward()
    d_optimizer.step()

    # === Train Generator ===
    # Fresh noise; the generator is scored against target 1, i.e. it is
    # rewarded when the discriminator rates its samples as real.
    z = torch.randn(batch_size, noise_dim)
    generated = generator(z)
    g_loss = criterion(discriminator(generated), torch.ones(batch_size, 1))

    # Only the generator's optimiser steps here, so the discriminator's
    # weights are unchanged by this update.
    generator.zero_grad()
    g_loss.backward()
    g_optimizer.step()

    # Print every 500 epochs
    # NOTE(review): in the logged run the D loss falls towards 0 while the
    # G loss keeps rising, which suggests the discriminator is overpowering
    # the generator - check the quality of generated rows before using them.
    if epoch % 500 == 0:
        print(f"Epoch {epoch}: D Loss={d_loss.item():.4f}, G Loss={g_loss.item():.4f}")


Epoch 0: D Loss=1.3577, G Loss=0.6576
Epoch 500: D Loss=0.1475, G Loss=2.2799
Epoch 1000: D Loss=0.0518, G Loss=4.5132
Epoch 1500: D Loss=0.0113, G Loss=5.5092
Epoch 2000: D Loss=0.0022, G Loss=7.3381
Epoch 2500: D Loss=0.0049, G Loss=8.5562


In [24]:
# Number of synthetic minority rows needed for both classes to be the same
# size. Derived from the label counts rather than hard-coded, so it stays
# correct if the input file changes (4900 - 289 => 4611 for the current data).
class_counts = df["tb_status"].value_counts()
num_to_generate = int(class_counts.max() - class_counts.min())

In [25]:
# Number of synthetic samples needed to balance the dataset.
# Derived from the label counts rather than hard-coded, so it stays correct
# if the input file changes (4900 - 289 = 4611 for the current data).
class_counts = y.value_counts()
num_to_generate = int(class_counts.max() - class_counts.min())

# Generate synthetic data using trained Generator.
# no_grad() skips building an autograd graph for this inference-only pass.
with torch.no_grad():
    z = torch.randn(num_to_generate, noise_dim)
    generated_data = generator(z).numpy()

# Inverse transform back to original scale
generated_original = scaler.inverse_transform(generated_data)

# Convert to DataFrame
synthetic_df = pd.DataFrame(generated_original, columns=X.columns)

# The generator emits continuous values for every column, so integer-coded
# features (e.g. 0/1 clinical flags) would otherwise hold values such as 0.37.
# Snap those columns back to valid values: round to the nearest integer, clip
# to the range observed in the real minority rows, and restore the dtype.
real_minority = X[y == 0]
integer_cols = [
    col for col, dtype in X.dtypes.items()
    if pd.api.types.is_integer_dtype(dtype)
]
for col in integer_cols:
    lower, upper = real_minority[col].min(), real_minority[col].max()
    synthetic_df[col] = (
        synthetic_df[col].round().clip(lower, upper).astype(X[col].dtype)
    )

synthetic_df['tb_status'] = 0  # Assign minority label

print("Synthetic data shape:", synthetic_df.shape)

Synthetic data shape: (4611, 783)


In [26]:
# Split the original rows by label: tb_status 1 is the majority class,
# tb_status 0 the minority class.
majority_df = df.loc[df['tb_status'].eq(1)]
minority_df = df.loc[df['tb_status'].eq(0)]

# Stack real majority, real minority and synthetic minority rows, then shuffle
# with a fixed seed and renumber the index.
balanced_df = (
    pd.concat([majority_df, minority_df, synthetic_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("Balanced dataset shape:", balanced_df.shape)
print("Balanced class distribution:\n", balanced_df['tb_status'].value_counts())


Balanced dataset shape: (9800, 783)
Balanced class distribution:
 tb_status
1    4900
0    4900
Name: count, dtype: int64


In [27]:
# Save to CSV in the working directory; index=False omits the row index so it
# is not written out as an extra column.
balanced_df.to_csv("balanced_tb_data.csv", index=False)
print("Balanced dataset saved as 'balanced_tb_data.csv'")

Balanced dataset saved as 'balanced_tb_data.csv'


In [None]:
# Load the balanced dataset file from the working directory.
df1 = pd.read_csv("balanced_tb_data.csv")

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the test-set CSV from the working directory.
# NOTE(review): the preview of this frame shows "Yes"/"No" strings for the
# clinical flags, whereas the training frame preview shows them as 0/1 -
# presumably they need the same encoding before use with a model; confirm.
df3 = pd.read_csv("solicited_test_ini.csv")

In [6]:
# Preview the first rows of the test frame.
df3.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,reported_cough_dur,tb_prior,tb_prior_Pul,hemoptysis,heart_rate,temperature,weight_loss,smoke_lweek,fever,night_sweats
0,-0.034994,0.025785,-0.126753,-0.032978,-0.104685,-0.141457,0.08081,-0.00843,0.025698,-0.357474,...,30,No,No,No,102,36.3,No,No,No,No
1,-0.017907,0.014444,-0.149634,-0.028727,-0.092394,-0.137203,0.039822,-0.006732,0.037899,-0.338116,...,30,No,No,No,102,36.3,No,No,No,No
2,0.019554,0.01716,-0.110186,-0.037742,-0.016508,-0.139233,0.051114,-0.017611,0.010689,-0.309711,...,30,No,No,No,102,36.3,No,No,No,No
3,-0.065054,0.040636,-0.17513,-0.02217,-0.118889,-0.120363,0.030802,-0.013428,-0.00153,-0.35213,...,30,No,No,No,102,36.3,No,No,No,No
4,-0.042754,0.042994,-0.087392,-0.029497,-0.106971,-0.116307,0.063567,-0.000677,-0.016347,-0.395569,...,14,No,No,No,94,36.7,No,No,Yes,Yes
