In [2]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

df = pd.read_csv("./diabetes.csv")

# Keep only the four model inputs plus the target; every other column is dropped.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df = df.drop(columns=df.columns.difference(['Glucose', 'Insulin', 'BMI', 'Age', 'Outcome']))

# Work on a separate deep copy so the loaded frame `df` stays untouched.
df_copy = df.copy(deep=True)

# Every column except the target. Selected by name rather than by position so
# the target's zeros are never blanked out if the column order ever changes.
cols = df_copy.columns.difference(['Outcome'], sort=False)
# In these columns a 0 is treated as a missing value and replaced by NaN.
# (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.)
for c in cols:
    df_copy[c] = df_copy[c].replace(0, np.nan)

# Impute missing values: mean for Glucose, median for Insulin and BMI.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: under pandas Copy-on-Write the chained in-place form operates on
# a temporary object and leaves df_copy unchanged.
df_copy['Glucose'] = df_copy['Glucose'].fillna(df_copy['Glucose'].mean())
df_copy['Insulin'] = df_copy['Insulin'].fillna(df_copy['Insulin'].median())
df_copy['BMI'] = df_copy['BMI'].fillna(df_copy['BMI'].median())
# Final clean-up: drop any row that still has a NaN (Age is not imputed above).
df_copy = df_copy.dropna()

df_copy.describe()

Unnamed: 0,Glucose,Insulin,BMI,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0
mean,121.686763,140.671875,32.455208,33.240885,0.348958
std,30.435949,86.38306,6.875177,11.760232,0.476951
min,44.0,14.0,18.2,21.0,0.0
25%,99.75,121.5,27.5,24.0,0.0
50%,117.0,125.0,32.3,29.0,0.0
75%,140.25,127.25,36.6,41.0,1.0
max,199.0,846.0,67.1,81.0,1.0


In [3]:
def remove_outliers(df, out_cols, T=1.5, verbose=True, lower_q=.25, upper_q=.85):
    """Return a copy of ``df`` without rows that fall outside quantile-based fences.

    The columns in ``out_cols`` are processed one after another. For each
    column the fences are::

        [q_low - T * (q_high - q_low),  q_high + T * (q_high - q_low)]

    where ``q_low`` / ``q_high`` are the ``lower_q`` / ``upper_q`` quantiles of
    that column, computed on the rows that survived the previous columns.
    Rows whose value is NaN in a processed column fail both comparisons, so
    they are removed as well (and counted in the verbose output).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    out_cols : iterable of str
        Column names to filter on, in order.
    T : float, default 1.5
        Multiplier applied to the inter-quantile spread to build the fences.
    verbose : bool, default True
        If true, print the number of rows removed per column and in total.
    lower_q, upper_q : float, default .25 and .85
        Quantiles used as ``q_low`` and ``q_high``. Note that the default
        upper quantile is .85, which is wider than the conventional
        inter-quartile range; pass ``upper_q=.75`` for the textbook IQR rule.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index labels.
    """
    new_df = df.copy()

    for c in out_cols:
        q_low = new_df[c].quantile(lower_q)
        q_high = new_df[c].quantile(upper_q)
        spread = q_high - q_low
        col_max = q_high + T * spread
        col_min = q_low - T * spread

        # Keep rows inside the fences (inclusive). NaN compares False on both
        # sides, so rows with NaN in this column are dropped too.
        filtered_df = new_df[(new_df[c] <= col_max) & (new_df[c] >= col_min)]
        if verbose:
            n_out = new_df.shape[0] - filtered_df.shape[0]
            print(f" Columns {c} had {n_out} outliers removed")
        new_df = filtered_df

    if verbose:
        # Report the overall shrinkage; guard the percentage against an empty
        # input frame, which would otherwise divide by zero.
        total_rows = df.shape[0]
        lines_red = total_rows - new_df.shape[0]
        pct_red = lines_red / total_rows * 100 if total_rows else 0.0
        print(f"Data reduced by {lines_red} lines, or {pct_red:.2f} %")

    return new_df

# Filter each feature column in turn; the target column is not filtered.
# Note: this rebinds df_copy, so re-running the cell filters the already
# filtered frame again.
df_copy = remove_outliers(
    df=df_copy,
    out_cols=["Glucose", "Insulin", "BMI", "Age"],
)
df_copy.describe()

 Columns Glucose had 0 outliers removed
 Columns Insulin had 346 outliers removed
 Columns BMI had 7 outliers removed
 Columns Age had 5 outliers removed
Data reduced by 358 lines, or 46.61 %


Unnamed: 0,Glucose,Insulin,BMI,Age,Outcome
count,410.0,410.0,410.0,410.0,410.0
mean,121.060359,124.936585,31.653659,34.958537,0.368293
std,29.939237,2.312056,6.224005,11.992081,0.482931
min,44.0,114.0,18.2,21.0,0.0
25%,101.0,125.0,27.225,25.0,0.0
50%,115.0,125.0,31.6,32.0,0.0
75%,138.75,125.0,35.5,42.0,1.0
max,199.0,135.0,48.3,67.0,1.0


In [None]:
# Standardise the features (zero mean, unit variance).
std_sc = StandardScaler()
# Select the features by dropping the target by name, so the column labels
# always match the scaled data regardless of where "Outcome" sits in the frame.
feature_frame = df_copy.drop(columns=["Outcome"])
features = feature_frame.columns.values
# features: keep df_copy's index so X and y stay aligned by label; the outlier
# filter leaves gaps in the index, and a fresh RangeIndex would not match y.
X = pd.DataFrame(std_sc.fit_transform(feature_frame), columns=features, index=df_copy.index)
# target
y = df_copy.Outcome

# gamma and degree are kept as in the original call; per the scikit-learn docs
# they apply to the rbf/poly/sigmoid kernels and are not used by kernel='linear'.
svc = SVC(random_state=100, kernel='linear', gamma=1, degree=3,
          C=0.5, class_weight='balanced', probability=True)
svc.fit(X, y)

# NOTE(review): only the classifier is persisted. The fitted scaler `std_sc`
# is not saved here, so whoever loads this pickle must scale inputs the same
# way - confirm the scaler is stored elsewhere or pickle both together.
file = 'diabetes_prediction_svc.pickle'
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(file, 'wb') as fh:
    pickle.dump(svc, fh)