In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the full NHIS extract. low_memory=False parses each column in one pass
# so that a column does not end up with mixed inferred types.
nhis_data_all = pd.read_csv('./data/nhis_data_all.csv', low_memory=False)

# Replace oral_exam and dental_caries values with 0 or 1.
# Both flags use the same codes, so they share one lookup table. A column that
# mixes digit and letter codes may be parsed as strings, which is why '0'/'1'
# are listed alongside the integers 0/1 for BOTH columns (previously only
# oral_exam had them, so '0'/'1' in dental_caries would have become NaN).
# Any value not listed here is mapped to NaN by Series.map.
binary_flag_map = {0: 0, 1: 1, '0': 0, '1': 1, 'N': 0, 'Y': 1}
nhis_corrected_dict = {
    'oral_exam': dict(binary_flag_map),
    'dental_caries': dict(binary_flag_map),
}
for flag_column, flag_map in nhis_corrected_dict.items():
    nhis_data_all[flag_column] = nhis_data_all[flag_column].map(flag_map)

# Waist-to-weight ratio, kept as its own column and also used below to
# screen out implausible waist measurements.
waist_ratio = nhis_data_all['waist_circum'] / nhis_data_all['weight']
nhis_data_all['waist_over_weight'] = waist_ratio

# Blank out waist_circum when the ratio is below 0.5 or above 3.0, or when the
# recorded waist is exactly 999 (presumably a missing-value code -- confirm
# against the data dictionary).
ratio_too_low = waist_ratio < 0.5
ratio_too_high = waist_ratio > 3.0
waist_is_999 = nhis_data_all['waist_circum'] == 999
nhis_data_all.loc[ratio_too_low | ratio_too_high | waist_is_999, 'waist_circum'] = np.nan

# The ratio is only kept when both of its inputs are present, which also
# clears it for the rows whose waist_circum was just blanked out.
ratio_input_missing = nhis_data_all['waist_circum'].isna() | nhis_data_all['weight'].isna()
nhis_data_all.loc[ratio_input_missing, 'waist_over_weight'] = np.nan

# Recode eyesight readings of exactly 9.9 to 3.0 for both eyes.
for eye_column in ('left_eye', 'right_eye'):
    nhis_data_all.loc[nhis_data_all[eye_column] == 9.9, eye_column] = 3.0

# Treat hearing values of exactly 3.0 as missing for both ears.
for ear_column in ('left_ear', 'right_ear'):
    nhis_data_all.loc[nhis_data_all[ear_column] == 3.0, ear_column] = np.nan

In [4]:
# Assuming the dataset is already loaded into a DataFrame named 'nhis_data_all'
# Define the target variable
# Create a new column 'diabetes_status' based on fasting glucose levels
def diabetes_status(fasting_glucose):
    """Classify a single fasting-glucose reading into a diabetes category.

    Parameters
    ----------
    fasting_glucose : float or int
        One fasting glucose measurement (presumably mg/dL -- confirm against
        the data dictionary). May be NaN/None when the value is missing.

    Returns
    -------
    int or float
        2 for diabetes (reading >= 126), 1 for pre-diabetes
        (100 <= reading < 126), 0 for normal (reading < 100), and NaN when
        the reading is missing.

    Notes
    -----
    A missing reading is returned as NaN rather than 0 so that rows with an
    unknown target are not silently labelled "Normal"; callers can drop them
    with ``dropna``. When NaN is present the resulting column is float-typed.
    The pre-diabetes band covers everything from 100 up to (but excluding)
    126, so fractional readings such as 125.5 are no longer classified as
    normal.
    """
    if pd.isna(fasting_glucose):
        return np.nan  # Unknown target: do not guess a class
    if fasting_glucose >= 126:
        return 2  # Diabetes
    if fasting_glucose >= 100:
        return 1  # Pre-diabetes
    return 0  # Normal

# Derive the 3-class target by applying diabetes_status to every
# fasting_glucose reading.
fasting_glucose_values = nhis_data_all['fasting_glucose']
nhis_data_all['diabetes_status'] = fasting_glucose_values.apply(diabetes_status)

# Feature Selection
# Candidate predictors; each one is evaluated on its own further down.
features = [
    'age_code',
    'total_cholesterol',
    'triglycerides',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'systolic_bp',
    'diastolic_bp',
    'serum_creatinine',
    'weight',
    'waist_circum',
]

# Train and evaluate three classifiers on each feature in isolation.
# results[feature][model name] holds the classification_report dict for that
# single-feature model on the held-out 20% test split.
results = {}
for feature in features:
    # Keep only the rows where both the feature and the target are present
    nhis_data_all_feature = nhis_data_all[[feature, 'diabetes_status']].dropna()

    # Split the data into features (X) and target (y)
    X = nhis_data_all_feature[[feature]]
    y = nhis_data_all_feature['diabetes_status']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize the features; the scaler is fitted on the training split
    # only and then applied to the test split.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Logistic Regression
    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(X_train, y_train)
    y_pred_log_reg = log_reg.predict(X_test)

    # Decision Tree (seeded so reruns give the same tree)
    dec_tree = DecisionTreeClassifier(random_state=42)
    dec_tree.fit(X_train, y_train)
    y_pred_dec_tree = dec_tree.predict(X_test)

    # Random Forest (seeded so the bootstrap samples are reproducible)
    rand_forest = RandomForestClassifier(n_estimators=100, random_state=42)
    rand_forest.fit(X_train, y_train)
    y_pred_rand_forest = rand_forest.predict(X_test)

    # Predictions keyed by the display name used in results and in the printout
    predictions = {
        'Logistic Regression': y_pred_log_reg,
        'Decision Tree': y_pred_dec_tree,
        'Random Forest': y_pred_rand_forest,
    }

    # Store results for each feature
    results[feature] = {
        model_name: classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        for model_name, y_pred in predictions.items()
    }

    # Print the results for each feature
    print(f"Results for feature: {feature}")
    for model_name, y_pred in predictions.items():
        print(f"{model_name} Results:")
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred, zero_division=0))
    print("-" * 80)

# Optionally, visualize the results or save them for further analysis


Results for feature: age_code
Logistic Regression Results:
[[1212145   43570       0]
 [ 559852   33044       0]
 [ 138934   12455       0]]
              precision    recall  f1-score   support

           0       0.63      0.97      0.77   1255715
           1       0.37      0.06      0.10    592896
           2       0.00      0.00      0.00    151389

    accuracy                           0.62   2000000
   macro avg       0.34      0.34      0.29   2000000
weighted avg       0.51      0.62      0.51   2000000

Decision Tree Results:
[[1255715       0       0]
 [ 592896       0       0]
 [ 151389       0       0]]
              precision    recall  f1-score   support

           0       0.63      1.00      0.77   1255715
           1       0.00      0.00      0.00    592896
           2       0.00      0.00      0.00    151389

    accuracy                           0.63   2000000
   macro avg       0.21      0.33      0.26   2000000
weighted avg       0.39      0.63      0.48   2000000

In [5]:
# Create a representative sample of the data with 5% of the data
nhis_data_sample = nhis_data_all.sample(frac=0.05, random_state=42)

# Save the sample data to a CSV file
nhis_data_sample.to_csv('./data/nhis_data_sample.csv', index=False)