In [17]:
from pathlib import Path
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import pickle
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Read salary data
file_path = Path("Resources/data.csv")
df = pd.read_csv(file_path)

# Display sample data
df.head(10)

Unnamed: 0,Age,SystolicBP,DiastolicBP,BloodSugar,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk
5,23,140,80,7.01,98.0,70,high risk
6,23,130,70,7.01,98.0,78,mid risk
7,35,85,60,11.0,102.0,86,high risk
8,32,120,90,6.9,98.0,70,mid risk
9,42,130,80,18.0,98.0,70,high risk


In [3]:
# printing dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1014 non-null   int64  
 1   SystolicBP   1014 non-null   int64  
 2   DiastolicBP  1014 non-null   int64  
 3   BloodSugar   1014 non-null   float64
 4   BodyTemp     1014 non-null   float64
 5   HeartRate    1014 non-null   int64  
 6   RiskLevel    1014 non-null   object 
dtypes: float64(2), int64(4), object(1)
memory usage: 55.6+ KB


In [4]:
# value counts for our taarget
df['RiskLevel'].value_counts()

low risk     406
mid risk     336
high risk    272
Name: RiskLevel, dtype: int64

In [6]:
#  transforming categorical values into numerical
df['RiskLevel'][df['RiskLevel'] == "high risk"] = 0
df['RiskLevel'][df['RiskLevel'] == "mid risk"] = 1
df['RiskLevel'][df['RiskLevel'] == "low risk"] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RiskLevel'][df['RiskLevel'] == "high risk"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RiskLevel'][df['RiskLevel'] == "mid risk"] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RiskLevel'][df['RiskLevel'] == "low risk"] = 2


In [7]:
# printing ddataframe head
df.head(10)

Unnamed: 0,Age,SystolicBP,DiastolicBP,BloodSugar,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,0
1,35,140,90,13.0,98.0,70,0
2,29,90,70,8.0,100.0,80,0
3,30,140,85,7.0,98.0,70,0
4,35,120,60,6.1,98.0,76,2
5,23,140,80,7.01,98.0,70,0
6,23,130,70,7.01,98.0,78,1
7,35,85,60,11.0,102.0,86,0
8,32,120,90,6.9,98.0,70,1
9,42,130,80,18.0,98.0,70,0


In [8]:
# specifying target variable
y = df['RiskLevel']
y[:5]
y=y.astype('int')

In [9]:
y.value_counts()

2    406
1    336
0    272
Name: RiskLevel, dtype: int64

In [10]:
# specifying features variables

X = df.drop('RiskLevel', axis=1)
X[:5]

Unnamed: 0,Age,SystolicBP,DiastolicBP,BloodSugar,BodyTemp,HeartRate
0,25,130,80,15.0,98.0,86
1,35,140,90,13.0,98.0,70
2,29,90,70,8.0,100.0,80
3,30,140,85,7.0,98.0,70
4,35,120,60,6.1,98.0,76


In [11]:
# checking the shape of features and target
print(X.shape, y.shape)

(1014, 6) (1014,)


In [12]:
# checking the nmber of outcomes class in our target 
print(Counter(y))

Counter({2: 406, 1: 336, 0: 272})


In [13]:
# splitting our data to training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Logistic Regression Multinomial

In [14]:
scaler = StandardScaler()

In [15]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [16]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1500)
model

LogisticRegression(max_iter=1500, multi_class='multinomial')

In [17]:
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)


In [18]:
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)


In [20]:
# Fit (train) or model using the training data
model.fit(X_train,y_train)

LogisticRegression(max_iter=1500, multi_class='multinomial')

In [21]:
# Score the model using the test data
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

Training Data Score: 0.6118421052631579
Testing Data Score: 0.6377952755905512


In [22]:
#  make predictions
predictions = model.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,1
1,2,2
2,2,2
3,0,0
4,2,2
5,0,0
6,2,1
7,1,0
8,1,2
9,1,2


In [23]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.6377952755905512

In [24]:
# input data
row = [17,110,75,12,101,76]

In [25]:
# predicting on our given random data
prediction = model.predict([row])



In [26]:
# checking the result
print('Predicted class:', prediction[0])


Predicted class: 0


In [27]:
# defining the filename to save our model
file_name  = 'saved_model.sav'

In [28]:
# saving the model
pickle.dump(model, open(file_name, 'wb'))

# Fitting the Random forest model
Once the data is scaled, create a random forest instance and train it with the training data (`X_train_scaled` and `y_train`), define `n_estimators=500` and `random_state=78`.

In [27]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [28]:
# Fit the model and use .ravel()on the "y_train" data. 
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

## Making Predictions Using the Random Forest Model

Validate the trained model by predicting loan defaults using the testing data (`X_test_scaled`).

In [29]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array([2, 2, 0, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 1, 1, 0, 2, 0, 0, 2, 0, 1,
       1, 2, 1, 2, 1, 1, 1, 0, 0, 1, 0, 0, 0, 2, 1, 2, 1, 2, 2, 1, 1, 1,
       0, 0, 2, 2, 0, 0, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 0, 2, 1, 2, 1,
       0, 2, 0, 1, 2, 0, 2, 1, 2, 2, 1, 0, 1, 0, 1, 2, 1, 1, 0, 2, 1, 1,
       0, 1, 1, 2, 2, 0, 0, 1, 0, 2, 1, 2, 1, 2, 2, 0, 2, 2, 1, 2, 0, 2,
       0, 0, 1, 1, 0, 1, 1, 0, 2, 1, 2, 2, 0, 1, 0, 1, 1, 1, 2, 2, 2, 2,
       1, 1, 1, 2, 0, 1, 0, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 0, 0,
       1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 0, 1, 2, 2, 0, 2, 1, 0,
       0, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2, 0, 2, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 2, 1, 1, 1, 0, 0, 2, 2, 1, 1, 0, 1, 1, 1, 1, 2, 2, 0,
       2, 1, 2, 1, 2, 1, 1, 0, 1, 2, 0, 1, 0, 0, 0, 1, 0, 1, 2, 2, 0, 2,
       2, 0, 0, 0, 2, 2, 2, 0, 0, 1, 2, 1])

## Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the confusion matrix, the accuracy score and to generate the classification report.

In [30]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.7992125984251969

## Feature Importance

Important features

In [31]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:6]

[(0.3598393721435654, 'BloodSugar'),
 (0.17929864775473064, 'SystolicBP'),
 (0.1593403580826192, 'Age'),
 (0.1279408352095977, 'DiastolicBP'),
 (0.10452824664718804, 'HeartRate'),
 (0.06905254016229906, 'BodyTemp')]

In [32]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.91      0.90      0.90        68
           1       0.74      0.79      0.76        94
           2       0.78      0.74      0.76        92

    accuracy                           0.80       254
   macro avg       0.81      0.81      0.81       254
weighted avg       0.80      0.80      0.80       254



In [35]:
# input data to test age, SystolicBP, DiastolicBP, Blood Sugar, Body Temp, Heart Rate
actual_test = [38,155,90,12,101,83]

In [40]:
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

In [43]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform([actual_test])



In [44]:
# Making predictions using the testing data
predictions = rf_model.predict([actual_test])
predictions

array([0])

In [46]:
# defining the file name for saving our model
file_name  = 'saved_model_rf.sav'

In [48]:
# saving our model
pickle.dump(rf_model, open(file_name, 'wb'))