In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Display every column and row when a frame is rendered in the notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Single seed shared by every stochastic step in this cell, so that
# Restart Kernel -> Run All reproduces the same balanced frame and split.
SEED = 42

df = pd.read_csv('dataset/diabetes_binary_health_indicators_BRFSS2021.csv')

# Partition the rows by the value of the binary target column.
class_0 = df[df.Diabetes_binary == 0]
class_1 = df[df.Diabetes_binary == 1]

# Draw as many class-0 rows as there are class-1 rows (without replacement).
# The draw is seeded: an unseeded sample() picks different rows on every run,
# which makes every downstream result irreproducible.
# NOTE(review): this assumes class 0 is the larger class; sample() raises
# ValueError if it has fewer rows than class 1.
class_0_downsampled = class_0.sample(n=len(class_1), random_state=SEED)

# Stack the downsampled class-0 rows on top of all class-1 rows.
balanced_df = pd.concat([class_0_downsampled, class_1], axis=0)
assert len(balanced_df) == 2 * len(class_1), "balanced frame has an unexpected size"

# Feature matrix and target vector.
X = balanced_df.drop("Diabetes_binary", axis=1)
y = balanced_df["Diabetes_binary"]

# Split BEFORE scaling so that no statistic of the test rows reaches the scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Fit the scaler on the training partition only, then apply it everywhere.
# X_train / X_test stay unscaled DataFrames; use the *_scaled arrays for
# models that are sensitive to feature scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# All rows of X scaled with the training-set statistics.
X_scaled = scaler.transform(X)

balanced_df.head()


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
184765,0.0,0,0.0,1,27.0,0.0,0.0,1.0,0,0,0,0,1,0.0,1.0,0.0,0.0,0.0,0,12,5.0,6.0
86281,0.0,0,0.0,1,21.0,0.0,0.0,0.0,1,1,1,1,1,0.0,1.0,0.0,0.0,0.0,0,1,6.0,7.0
102346,0.0,0,0.0,1,29.0,1.0,0.0,0.0,1,0,1,0,1,0.0,2.0,5.0,0.0,0.0,1,5,5.0,7.0
180424,0.0,1,1.0,1,38.0,0.0,1.0,0.0,1,1,1,0,1,0.0,2.0,0.0,0.0,0.0,1,11,6.0,9.0
111050,0.0,0,0.0,1,19.0,0.0,0.0,0.0,1,1,0,0,1,0.0,1.0,0.0,0.0,0.0,0,9,6.0,11.0


In [26]:
from gan.synthesizers import CTGAN

# Columns handed to CTGAN as discrete.
# NOTE(review): BMI, MentHlth and PhysHlth look numeric in the displayed rows;
# confirm that treating them as discrete is intended.
discrete_columns = [
    'Diabetes_binary',
    'HighBP',
    'HighChol',
    'CholCheck',
    'BMI',
    'Smoker',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'Fruits',
    'Veggies',
    'HvyAlcoholConsump',
    'AnyHealthcare',
    'NoDocbcCost',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'DiffWalk',
    'Sex',
    'Age',
    'Education',
    'Income',
]

# The class-balanced frame is the training table for the synthesizer.
real_data = balanced_df

# Train for 10 epochs on the balanced table.
ctgan = CTGAN(epochs=10)
ctgan.fit(real_data, discrete_columns)

# Draw 1000 synthetic rows from the fitted model.
synthetic_data = ctgan.sample(1000)

In [27]:
# Preview the first 100 synthetic rows (positional slice).
synthetic_data.iloc[:100]

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0,0.0,1,19.0,0.0,0.0,0.0,1,1,1,0,1,0.0,5.0,1.0,0.0,0.0,1,10,6.0,9.0
1,0.0,1,1.0,1,30.0,1.0,0.0,0.0,1,0,1,0,1,0.0,2.0,5.0,0.0,0.0,1,7,4.0,6.0
2,1.0,1,1.0,1,26.0,1.0,0.0,0.0,0,0,1,0,1,0.0,5.0,0.0,2.0,0.0,0,9,5.0,3.0
3,0.0,0,0.0,1,25.0,0.0,0.0,0.0,1,1,1,0,1,0.0,2.0,0.0,3.0,1.0,0,9,5.0,7.0
4,0.0,1,0.0,1,33.0,1.0,0.0,0.0,1,0,1,0,1,0.0,2.0,0.0,4.0,1.0,1,8,4.0,6.0
5,1.0,1,1.0,1,38.0,0.0,0.0,1.0,0,0,0,0,0,1.0,4.0,30.0,15.0,1.0,1,5,3.0,3.0
6,1.0,1,0.0,1,27.0,1.0,0.0,0.0,1,0,1,1,0,0.0,4.0,0.0,30.0,0.0,0,9,6.0,9.0
7,0.0,0,0.0,1,39.0,1.0,0.0,0.0,0,0,1,0,1,0.0,1.0,7.0,0.0,0.0,0,4,1.0,9.0
8,0.0,1,1.0,1,22.0,1.0,0.0,0.0,1,1,1,0,1,1.0,2.0,0.0,14.0,0.0,0,8,4.0,8.0
9,1.0,1,0.0,1,48.0,0.0,0.0,1.0,1,0,1,0,1,0.0,4.0,0.0,0.0,1.0,1,13,1.0,6.0
