In [18]:
# Load basic packages
import numpy as np
import pandas as pd
# Install autograd:
#!conda install -c conda-forge autograd

import autograd.numpy as numpy
import autograd.numpy.random as npr

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn import metrics
from sklearn import model_selection

# Useful for saving our models.
import pickle
print("Imported all!")

Imported all!


In [24]:
df = pd.read_csv('diabetes.csv')
df = df.dropna()
diabetes = df.values
print(diabetes.shape)

(253680, 22)


In [20]:
diabetes

array([[ 0,  1,  1, ...,  4,  3, 10],
       [ 0,  0,  0, ...,  6,  1, 11],
       [ 0,  1,  1, ...,  4,  8,  2],
       ...,
       [ 0,  0,  0, ...,  5,  2,  5],
       [ 0,  1,  0, ...,  5,  1,  6],
       [ 1,  1,  1, ...,  6,  2,  6]])

In [22]:
# Separate the outcome (y) from the features (X)
y = diabetes[:, 0]  # the outcome is in the first column
X = diabetes[:, 1:]  # all the other columns are my predictors
print(X.shape, y.shape)

(253680, 21) (253680,)


In [25]:
# Split the data.  DO NOT TOUCH THE TEST DATA FROM HERE ON!!
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y, test_size = 0.2) # 0.2 is 20% test data.

In [27]:
# Next we implement gradient boosting, in particular the Adaboost algorithm.
# Remember, gradient boosting algorithms involve iteratively improving the decision trees
# and hence involve a learning rate similar to logistic regressions.
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier

bdt = AdaBoostClassifier(
    # n_estimators is the number of trees
    tree.DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=2000, learning_rate=1
)
bdt.fit(X_train, y_train)

In [28]:
preds = bdt.predict(X_test)
print(np.sum(preds == y_test)/len(preds))

0.867135761589404


In [29]:
# Pickle the AdaBoostClassifier object
with open('adaboost_classifier.pkl', 'wb') as f:
    pickle.dump(bdt, f)

### Handling Categorical Variables:

1. For "Biological sex" and "Zodiac sign," you can use one-hot encoding. This will create binary columns for each category, where 1 indicates the presence of that category and 0 indicates its absence.
For "Age bracket," "Education bracket," and "Income bracket," since they are ordinal categorical variables, you might consider ordinal encoding, where each category is mapped to an integer.
Handling Continuous Variables:

2. For continuous variables like "Body Mass Index" and "General health," you can use standardization or normalization to scale them appropriately.
Standardization (also known as Z-score normalization) transforms the data to have a mean of 0 and a standard deviation of 1. This is suitable when the data follows a Gaussian distribution.
Normalization scales the data to a range between 0 and 1. This is useful when the distribution of the data is not Gaussian and when you have outliers.
Handling Binary Variables:

3. Binary variables like "Diabetes status" and "High blood pressure" are already in a suitable format for many machine learning algorithms.

In [34]:
import numpy as np

# Sample data
data = diabetes

# Define columns
categorical_cols = [17, 21]  # Indices of categorical columns (sex + zodiac sign)
ordinal_cols = [18, 19, 20]  # Indices of ordinal columns (age, education, income)
continuous_cols = [3, 13, 14, 15]  # Indices of continuous columns (BMI, general health, mental health, physical health)

# One-Hot Encoding for categorical variables
one_hot_encoded_data = []
for col_idx in categorical_cols:
    unique_values = np.unique(data[:, col_idx])
    for value in unique_values[:-1]:  # Exclude the last category to avoid the dummy variable trap
        one_hot_encoded_data.append((data[:, col_idx] == value).astype(int))

one_hot_encoded_data = np.array(one_hot_encoded_data).T

# Ordinal Encoding for ordinal variables (not needed here since the data is already ordinal)

# Normalization for continuous variables (using Min-Max scaling to scale between 0 and 1)
for col_idx in continuous_cols:
    min_val = np.min(data[:, col_idx])
    max_val = np.max(data[:, col_idx])
    data[:, col_idx] = (data[:, col_idx] - min_val) / (max_val - min_val)

# Now, combine all the columns back together
processed_data = np.concatenate((one_hot_encoded_data, data[:, ordinal_cols], data[:, continuous_cols]), axis=1)

# Now processed_data contains your fully preprocessed data
processed_data

array([[1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 1, 1, 1],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [35]:
processed_data.shape

(253680, 19)