In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [86]:
data_df = pd.read_csv('dataset.csv')
data_df.head()

Unnamed: 0,age,cholesterol level,blood pressure
0,59,197,168/68
1,24,125,197/73
2,28,129,115/108
3,28,131,148/105
4,40,172,153/62


In [87]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                1000 non-null   int64 
 1   cholesterol level  1000 non-null   int64 
 2   blood pressure     1000 non-null   object
dtypes: int64(2), object(1)
memory usage: 23.6+ KB


In [88]:
data_df.describe()

Unnamed: 0,age,cholesterol level
count,1000.0,1000.0
mean,39.831,199.679
std,11.638466,45.360462
min,20.0,120.0
25%,30.0,161.0
50%,40.0,201.0
75%,50.0,240.0
max,60.0,280.0


In [89]:
data_df.columns = data_df.columns.str.replace(' ', '_')

In [90]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                1000 non-null   int64 
 1   cholesterol_level  1000 non-null   int64 
 2   blood_pressure     1000 non-null   object
dtypes: int64(2), object(1)
memory usage: 23.6+ KB


In [91]:
data_df['blood_pressure']

0       168/68
1       197/73
2      115/108
3      148/105
4       153/62
        ...   
995    198/100
996     176/83
997    101/103
998     150/99
999     145/78
Name: blood_pressure, Length: 1000, dtype: object

In [92]:
data_df.duplicated().sum()

0

In [93]:
data_df.isnull().sum()

age                  0
cholesterol_level    0
blood_pressure       0
dtype: int64

In [94]:
data_df['systolic_pressure'] = data_df['blood_pressure'].str.split('/').str[0]
data_df['diastolic_pressure'] = data_df['blood_pressure'].str.split('/').str[1]

data_df.drop('blood_pressure', axis=1, inplace=True)

In [95]:
data_df.head()

Unnamed: 0,age,cholesterol_level,systolic_pressure,diastolic_pressure
0,59,197,168,68
1,24,125,197,73
2,28,129,115,108
3,28,131,148,105
4,40,172,153,62


In [96]:
#task 1 save the data into sqlite database
import sqlite3


conn = sqlite3.connect('database.db')
c = conn.cursor()

data_df.to_sql('healthcare', conn, if_exists='replace', index = False)
conn.close()

In [97]:
#task 2 retrieve data from sqlite database

In [98]:
conn = sqlite3.connect('database.db')
new_df = pd.read_sql('SELECT * FROM healthcare', conn)
conn.close()

new_df

Unnamed: 0,age,cholesterol_level,systolic_pressure,diastolic_pressure
0,59,197,168,68
1,24,125,197,73
2,28,129,115,108
3,28,131,148,105
4,40,172,153,62
...,...,...,...,...
995,24,207,198,100
996,48,228,176,83
997,45,197,101,103
998,30,202,150,99


In [99]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   age                 1000 non-null   int64 
 1   cholesterol_level   1000 non-null   int64 
 2   systolic_pressure   1000 non-null   object
 3   diastolic_pressure  1000 non-null   object
dtypes: int64(2), object(2)
memory usage: 31.4+ KB


In [100]:
# convert to numeric
new_df['systolic_pressure'] = pd.to_numeric(new_df['systolic_pressure'])
new_df['diastolic_pressure'] = pd.to_numeric(new_df['diastolic_pressure'])

In [101]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   age                 1000 non-null   int64
 1   cholesterol_level   1000 non-null   int64
 2   systolic_pressure   1000 non-null   int64
 3   diastolic_pressure  1000 non-null   int64
dtypes: int64(4)
memory usage: 31.4 KB


In [102]:
#task 3... organize data into vectors(features) and lables

#goal is to predict both systolic and diastolic pressure
#features are the other columns

X = new_df[['age', 'cholesterol_level']]
y_systolic = new_df['systolic_pressure']
y_diastolic = new_df['diastolic_pressure']

In [103]:
#task 4: split data

from sklearn.model_selection import train_test_split

X_train, X_test, y_systolic_train, y_systolic_test, y_diastolic_train, y_diastolic_test = train_test_split(
    X, y_systolic, y_diastolic, test_size=0.2, random_state=42
)

In [104]:
#task 5 .. feature scaling

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [105]:
X_test

Unnamed: 0,age,cholesterol_level
521,32,234
737,53,165
740,41,165
660,53,195
411,47,236
...,...,...
408,36,151
332,32,187
208,55,257
613,48,170


In [192]:
#task 6 model selection

from sklearn.linear_model import LinearRegression

systolic_model = LinearRegression()
diastolic_model = LinearRegression()

In [193]:
#task 7 train model

systolic_model.fit(X_train_scaled, y_systolic_train)
diastolic_model.fit(X_train_scaled, y_diastolic_train)

LinearRegression()

In [194]:
#model evaluation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

mse_sys = mean_squared_error(y_systolic_test, systolic_model.predict(X_test_scaled)) 
mse_dia = mean_squared_error(y_diastolic_test, diastolic_model.predict(X_test_scaled))

print('RMSE for systolic pressure: ', np.sqrt(mse_sys))
print('RMSE for diastolic pressure: ', np.sqrt(mse_dia))

RMSE for systolic pressure:  28.601582868309265
RMSE for diastolic pressure:  20.664785624281393


In [195]:
#cross validation
scores_sys = cross_val_score(systolic_model, X_train_scaled, y_systolic_train, scoring='neg_mean_squared_error', cv=5)
scores_dia = cross_val_score(diastolic_model, X_train_scaled, y_diastolic_train, scoring='neg_mean_squared_error', cv=5)


def display_scores(scores):
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('Standard deviation: ', scores.std())

print('Systolic scores')
display_scores(np.sqrt(-scores_sys))
print('\n\nDiastolic scores')
display_scores(np.sqrt(-scores_dia))

Systolic scores
Scores:  [28.9884847  29.78496118 28.64756123 29.18753086 29.66312118]
Mean:  29.254331832445274
Standard deviation:  0.4223704216448313


Diastolic scores
Scores:  [20.85498242 21.29653294 19.43083438 20.34559434 21.23653032]
Mean:  20.632894878784413
Standard deviation:  0.6901601463835166


In [213]:
#task 8 make predictions

def predict(age, cholesterol_level):
    new_data_scaled = scaler.transform([(age, cholesterol_level)])
    predicted_systolic = systolic_model.predict(new_data_scaled)
    predicted_diastolic = diastolic_model.predict(new_data_scaled)

    return predicted_systolic, predicted_diastolic

In [214]:
a, b = predict(24, 125)

print(str(a[0]) + "/"+ str(b[0]))

148.4930509769824/96.8908266020697


  "X does not have valid feature names, but"


In [198]:
#save the models
import joblib

joblib.dump(systolic_model, 'systolic_model.pkl')
joblib.dump(diastolic_model, 'diastolic_model.pkl')

['diastolic_model.pkl']