# Predict Blood Donation for Future Expectancy

### Loading dataset

In [1]:
import os

import numpy as np
import pandas as pd

# Location of the transfusion CSV. Set the TRANSFUSION_CSV environment variable
# to point at a local copy; when it is unset, the original author's location
# is used so existing setups keep working.
filepath = os.environ.get(
    "TRANSFUSION_CSV",
    "C:/Users/uttam/anaconda3/Technocolabs/MajorProj/transfusion.csv",
)
df = pd.read_csv(filepath)

# Preview the first rows
df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),whether he/she donated blood in March 2007
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


### Inspecting transfusion DataFrame

In [2]:
# (rows, columns) of the loaded frame
df.shape

(748, 5)

In [3]:
# Summary of the columns: dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 5 columns):
 #   Column                                      Non-Null Count  Dtype
---  ------                                      --------------  -----
 0   Recency (months)                            748 non-null    int64
 1   Frequency (times)                           748 non-null    int64
 2   Monetary (c.c. blood)                       748 non-null    int64
 3   Time (months)                               748 non-null    int64
 4   whether he/she donated blood in March 2007  748 non-null    int64
dtypes: int64(5)
memory usage: 29.3 KB


### Creating target column


In [4]:
# Give the label column a short name. The result is reassigned instead of
# using inplace=True, so the frame is not mutated behind the back of cells
# that already displayed it.
df = df.rename(
    columns={'whether he/she donated blood in March 2007': 'target'})

df.head()

Unnamed: 0,Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),target
0,2,50,12500,98,1
1,0,13,3250,28,1
2,1,16,4000,35,1
3,2,20,5000,45,1
4,1,24,6000,77,0


### Checking target incidence


In [5]:
# Share of rows in each target class (normalize=True gives proportions, not counts)
df.target.value_counts(normalize=True)

0    0.762032
1    0.237968
Name: target, dtype: float64

### Splitting transfusion into train and test datasets


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# same class ratio in both the train and the test split.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df.target,
    stratify=df.target,
    test_size=0.2,
    random_state=69)

# Save the training features to CSV
X_train.to_csv("Splitted_X_Training_Dataset.csv")

print(X_train.shape)

# A bare expression is only displayed when it is the last statement of the
# cell, so the preview goes at the end (re-run the cell to refresh its output).
X_train.head()

(598, 4)


In [7]:
# Save the held-out test features to CSV
X_test.to_csv("Splitted_X_Test_Dataset.csv")
# (rows, columns) of the test split
X_test.shape

(150, 4)

### Selecting a model using TPOT


In [8]:
!pip install tpot



In [9]:
from sklearn.metrics import roc_auc_score
from tpot import TPOTClassifier

# Short TPOT search (5 generations, population of 20) with the 'TPOT light'
# configuration, ranking candidate pipelines by ROC AUC.
tpot = TPOTClassifier(
    config_dict='TPOT light',
    scoring='roc_auc',
    generations=5,
    population_size=20,
    random_state=42,
    verbosity=2,
    disable_update_check=True,
)
tpot.fit(X_train, y_train)

# Test-set AUC, computed from the second predict_proba column
tpot_test_proba = tpot.predict_proba(X_test)[:, 1]
tpot_auc_score = roc_auc_score(y_test, tpot_test_proba)
print(f'\nAUC score is : {tpot_auc_score:.4f}')

# List the steps of the best fitted pipeline, numbered from 1
print('\nBest pipeline steps:')
for step_no, (step_name, step) in enumerate(tpot.fitted_pipeline_.steps, start=1):
    print(f'{step_no}. {step}')



HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=120.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.7258180858550316
Generation 2 - Current best internal CV score: 0.726832521729716
Generation 3 - Current best internal CV score: 0.7271987730624483
Generation 4 - Current best internal CV score: 0.7298995566973187
Generation 5 - Current best internal CV score: 0.7298995566973187
Best pipeline: LogisticRegression(MultinomialNB(input_matrix, alpha=0.01, fit_prior=True), C=0.5, dual=False, penalty=l2)

AUC score is : 0.8336

Best pipeline steps:
1. StackingEstimator(estimator=MultinomialNB(alpha=0.01, class_prior=None,
                                          fit_prior=True))
2. LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


### Checking the variance


In [10]:
# Per-feature variance of the training data, rounded to 3 decimals
X_train.var().round(3)

Recency (months)              69.290
Frequency (times)             33.467
Monetary (c.c. blood)    2091697.338
Time (months)                598.232
dtype: float64

### Log normalization

In [11]:
# Column to log-transform
col_norm = 'Monetary (c.c. blood)'


def _with_log_column(frame):
    """Return a new frame where `col_norm` is replaced by its natural log.

    The transformed values are stored in a trailing `monetary_log` column and
    the original column is dropped; `frame` itself is left untouched.
    """
    return (
        frame
        .assign(monetary_log=np.log(frame[col_norm]))
        .drop(columns=col_norm)
    )


# Apply the same transformation to both splits
X_train_norm = _with_log_column(X_train)
X_test_norm = _with_log_column(X_test)

# Re-check the variances after the transformation
X_train_norm.var()

Recency (months)      69.289953
Frequency (times)     33.467157
Time (months)        598.232201
monetary_log           0.838856
dtype: float64

### Training the logistic regression model


In [14]:
from sklearn import linear_model

# Build and train the logistic regression on the log-normalized training
# features; fit() returns the estimator, so both steps chain into one expression.
logreg = linear_model.LogisticRegression(
    random_state=69,
    solver='liblinear',
).fit(X_train_norm, y_train)

# Test-set AUC of the logistic regression model, computed from the second
# predict_proba column
logreg_test_proba = logreg.predict_proba(X_test_norm)[:, 1]
logreg_auc_score = roc_auc_score(y_test, logreg_test_proba)
print(f'\nAUC score: {logreg_auc_score:.4f}')


AUC score: 0.8480


In [15]:
# Side-by-side comparison of the two test-set AUC scores. The built-in round()
# is used instead of the .round() method so the cell also works when a score
# is a plain Python float, which has no .round attribute.
print("Tpot AUC Score: ", round(tpot_auc_score, 3))
print("Logistic Regression AUC Score: ", round(logreg_auc_score, 3))

Tpot AUC Score:  0.834
Logistic Regression AUC Score:  0.848


## ---------------------------------------------- END OF PROJECT --------------------------------------------------------