## Installing packages

In [4]:
!pip install git+https://github.com/cleanlab/cleanlab.git
!pip install cleanlab
!pip install 'cleanlab[DataLab]'
!pip install skorch

Collecting git+https://github.com/cleanlab/cleanlab.git
  Cloning https://github.com/cleanlab/cleanlab.git to /tmp/pip-req-build-latm45l8
  Running command git clone --filter=blob:none --quiet https://github.com/cleanlab/cleanlab.git /tmp/pip-req-build-latm45l8
  Resolved https://github.com/cleanlab/cleanlab.git to commit 21b82a325e7828d25292c25c121b908075ab7204
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: cleanlab
  Building wheel for cleanlab (pyproject.toml) ... [?25l[?25hdone
  Created wheel for cleanlab: filename=cleanlab-2.4.1-py3-none-any.whl size=283245 sha256=fc2c7d0bf48360b71bcd1cc3f8ce417dbf4c0511f0a6720668b2252affc6af53
  Stored in directory: /tmp/pip-ephem-wheel-cache-d3fmjy7a/wheels/3d/53/8f/09fde0499e2135d2566a50fe7afd0398373244187206c7d441
Successfully built cleanlab
Installing collected packages: cleanla

## Stroke Prediction

In [52]:
import numpy as np
import pandas as pd
import math
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read dataset
df_train = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

# Removing irrelevant columns

df_train.drop(columns = 'id', axis = 1, inplace = True)

# Handling null values
mean_bmi = df_train.groupby(['gender', 'age']).mean()['bmi']
mean_bmi = np.around(mean_bmi, decimals = 3)

def fill_bmi(df_train, mean_bmi):
    if math.isnan(df_train['bmi']):
        return mean_bmi[df_train['gender']][df_train['age']]
    else:
        return df_train['bmi']

df_train['bmi'] = df_train.apply(fill_bmi, axis = 1, args = (mean_bmi, ))
df_train['bmi'].iloc[2030] = mean_bmi['Female'][0.48]
df_train.drop(index = 3116, inplace = True)

# One hot encoding
df_objects = df_train.select_dtypes(include = 'object')
df_train = pd.get_dummies(df_train, columns= df_objects.columns, dtype = 'int')

# Model
X = df_train.drop('stroke', axis = 1)
y = df_train['stroke']

# Oversampling
smk = SMOTETomek(random_state = 2, sampling_strategy = 'minority')
x_res, y_res = smk.fit_resample(X, y)

# train test split
X_train, X_test, y_train, y_test = train_test_split(x_res, y_res, test_size = 0.2, random_state = 3)

#Logistic Regression

logistic = LogisticRegression(max_iter=10000)
logistic.fit(X_train, y_train)
pred = logistic.predict(X_test)

#Calculating accuracy
score = np.around(accuracy_score(y_test, pred), decimals = 3) *100


  mean_bmi = df_train.groupby(['gender', 'age']).mean()['bmi']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['bmi'].iloc[2030] = mean_bmi['Female'][0.48]


In [53]:
print('Accuracy: ',score)

Accuracy:  96.1


In [54]:
from cleanlab.classification import CleanLearning
lg = LogisticRegression(max_iter=10000)
cl = CleanLearning(lg)
_ = cl.fit(X_train, y_train)
label_issues = cl.get_label_issues()
preds = cl.predict(X_test)

In [55]:
label_issues[label_issues['is_label_issue'] == True].head()

Unnamed: 0,is_label_issue,label_quality,given_label,predicted_label,sample_weight
462,True,0.013292,1,0,0.0
813,True,0.057914,1,0,0.0
926,True,0.07901,1,0,0.0
942,True,0.04048,1,0,0.0
970,True,0.063936,1,0,0.0


In [56]:
acc_cl = accuracy_score(y_test, preds)
score = np.around(acc_cl, decimals = 3) *100

In [57]:
print(f"Test accuracy of cleanlab-trained model: {score}")

Test accuracy of cleanlab-trained model: 96.3


## Heart Disease

In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reading dataset
df_train = pd.read_csv('/content/heart.csv')

# Rename columns for better readability
df_train.columns = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol', 'fasting_blood_pressure',
                    'rest_ecg', 'max_heart_rate_achieved', 'exercise_induced_angina', 'st_depression', 'slope',
                    'num_major_vessels', 'thal', 'target']

# One hot encoding
df_train = pd.get_dummies(data = df_train, columns = ['sex', 'chest_pain_type', 'fasting_blood_pressure', 'rest_ecg',
                                           'exercise_induced_angina', 'slope', 'num_major_vessels', 'thal'], dtype= 'int')

# Training model
X = df_train.drop('target', axis = 1)
y = df_train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)


# Logistic Regression
logistic = LogisticRegression(max_iter=10000)
logistic.fit(X_train, y_train)
pred = logistic.predict(X_test)

#Calculating accuracy
score = np.around(accuracy_score(y_test, pred), decimals = 3) *100


In [9]:
print('Accuracy: ',score)

Accuracy:  82.0


In [10]:
from cleanlab.classification import CleanLearning
lg = LogisticRegression(max_iter=10000)
cl = CleanLearning(lg)
_ = cl.fit(X_train, y_train)
label_issues = cl.get_label_issues()
preds = cl.predict(X_test)

In [11]:
label_issues[label_issues['is_label_issue'] == True].head()

Unnamed: 0,is_label_issue,label_quality,given_label,predicted_label,sample_weight
0,True,0.168915,0,1,0.0
6,True,0.238579,0,1,0.0
21,True,0.116191,0,1,0.0
31,True,0.109083,0,1,0.0
36,True,0.112261,0,1,0.0


In [13]:
acc_cl = accuracy_score(y_test, preds)
score = np.around(acc_cl, decimals = 3) *100

In [16]:
print(f"Test accuracy of cleanlab-trained model: {score}")

Test accuracy of cleanlab-trained model: 85.2
