In [1]:
# data manipulation
import pandas as pd
import numpy as np

# data plotting
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib import rcParams
import seaborn as sns


# styling
plt.style.use("ggplot")
rcParams['figure.figsize'] = (12,6)

In [2]:
# Import diabetes dataset
dataset = pd.read_csv("CECS456-FinalProject/diabetes_data_upload.csv")
display(dataset)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative


In [3]:
# Convert binary categories used in dataset into Int (1,0) referenced from members in part 2
category_mappings = {'Female':1, 'Male':0, 'Yes':1, 'No':0, 'Positive':1, 'Negative':0}
df_toInt = dataset.iloc[:, 1:17]
dataset = dataset.iloc[:, 0:1]
for column in df_toInt:
    df_toInt[column] = df_toInt[column].map(category_mappings)
dataset = dataset.join(df_toInt)
display(dataset)

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,0,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,1,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,48,1,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,58,1,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,32,1,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Selecting the class feature as it is meant to be 
X = dataset.drop('class', axis=1)
y = dataset['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [5]:
# Using feature selection via Recursive Feature Elimination(RFE) in order to 
# determine the most prominent feature in the set 
from sklearn.feature_selection import RFE 

# Using Logisitic regression as the learning model for this example
from sklearn.linear_model import LogisticRegression

# n_features_to_select is min feature that is common, step is num of features to remove at each iteration
rf_selctor = RFE(estimator=LogisticRegression(), n_features_to_select = 1, step = 1)
rf_selctor.fit(X,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Display the most prominent feature based on RFE model after pruning which in this case rank 1 feature
# is polydipsia
print(rf_selctor.ranking_)

[16  3  2  1  8 11  9  6 13  5  4 10  7 12 14 15]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=25016314-3530-47b7-95b2-7fd0d302b441' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>