In [83]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
import joblib

In [84]:
from urllib.request import urlretrieve

# Download the weight/height CSV from GitHub into the working directory.
# The call stays as the cell's last expression so its return value
# (local filename plus the response headers object) is displayed.
urlretrieve(
    url='https://raw.githubusercontent.com/itsahyarr/aimastery-doc/main/datasets/weight-height.csv',
    filename='weight-height.csv',
)

('weight-height.csv', <http.client.HTTPMessage at 0x23e4cf903d0>)

In [85]:
# Read the downloaded CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="weight-height.csv")

In [86]:
# Preview the first rows of the raw data to check the column layout.
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [87]:
# Swap the 'Height' and 'Weight' labels and order the columns as
# Weight, Height, Gender. The rename produces a new frame rather than
# mutating with inplace=True, which has no performance benefit and
# prevents chaining.
# NOTE(review): after the swap, 'Weight' holds the values the CSV labelled
# 'Height' (e.g. 73.85) and 'Height' holds those it labelled 'Weight'
# (e.g. 241.89) — confirm the source labels really are reversed.
df = df.rename(columns={'Height': 'Weight', 'Weight': 'Height'})[['Weight', 'Height', 'Gender']]
df

Unnamed: 0,Weight,Height,Gender
0,73.847017,241.893563,Male
1,68.781904,162.310473,Male
2,74.110105,212.740856,Male
3,71.730978,220.042470,Male
4,69.881796,206.349801,Male
...,...,...,...
9995,66.172652,136.777454,Female
9996,67.067155,170.867906,Female
9997,63.867992,128.475319,Female
9998,69.034243,163.852461,Female


In [88]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Weight  10000 non-null  float64
 1   Height  10000 non-null  float64
 2   Gender  10000 non-null  object 
dtypes: float64(2), object(1)
memory usage: 234.5+ KB


In [89]:
# Encode the Gender strings as integer labels, fitting and transforming in
# a single step (in this data: Female -> 0, Male -> 1). `le` keeps the
# fitted encoder.
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

In [90]:
# Display the frame again to confirm Gender is now numeric.
df

Unnamed: 0,Weight,Height,Gender
0,73.847017,241.893563,1
1,68.781904,162.310473,1
2,74.110105,212.740856,1
3,71.730978,220.042470,1
4,69.881796,206.349801,1
...,...,...,...
9995,66.172652,136.777454,0
9996,67.067155,170.867906,0
9997,63.867992,128.475319,0
9998,69.034243,163.852461,0


In [91]:
# Compare the per-gender means of the two numeric columns.
df.groupby('Gender').mean()

Unnamed: 0_level_0,Weight,Height
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
0,63.708774,135.860093
1,69.026346,187.020621


In [92]:
# Select the Gender and Height columns and display them.
# NOTE: X is reassigned further down when the training arrays are built.
X = df.loc[:, ["Gender", "Height"]]
X

Unnamed: 0,Gender,Height
0,1,241.893563
1,1,162.310473
2,1,212.740856
3,1,220.042470
4,1,206.349801
...,...,...
9995,0,136.777454
9996,0,170.867906
9997,0,128.475319
9998,0,163.852461


In [93]:
# Preview Weight and Height rounded to whole numbers, with the frame cast to
# integers. The result is only displayed, not assigned, so df is unchanged.
df.round(decimals={'Weight': 0, 'Height': 0}).astype(dtype=int)

Unnamed: 0,Weight,Height,Gender
0,74,242,1
1,69,162,1
2,74,213,1
3,72,220,1
4,70,206,1
...,...,...,...
9995,66,137,0
9996,67,171,0
9997,64,128,0
9998,69,164,0


In [94]:
# Name of the target column the model is trained to predict.
predict = "Weight"

In [95]:
# Build the NumPy training arrays: the target column becomes y, and every
# remaining column (Height and Gender, in that order) becomes a feature in X.
y = np.array(df[predict])
X = np.array(df.drop(columns=predict))

In [96]:
# Hold out 10% of the rows for evaluation. random_state pins the shuffle so
# the split, the fitted model and its score are identical on every
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
# fit() returns the estimator itself, so `model` and `lr` are the same object.
lr = LinearRegression()
model = lr.fit(X_train, y_train)

In [97]:
# Score the fitted model on the held-out split. `accuracy` is not defined in
# any visible cell, so printing it raised NameError on a fresh kernel.
# NOTE: the value is the R^2 coefficient of determination, not a
# classification accuracy.
accuracy = r2_score(y_test, model.predict(X_test))
print(f"Akurasi Model: {accuracy * 100}%")

Akurasi Model: 85.64612157889509%


In [99]:
# Persist the fitted regression model to disk.
joblib.dump(value=model, filename='weight-predict.joblib')

['weight-predict.joblib']

In [100]:
# Testing model: reload the saved file and print predicted vs. actual
# values side by side for up to the first 100 held-out rows.
mod = joblib.load('weight-predict.joblib')
prediction = mod.predict(X_test)
# Slicing (rather than indexing range(100)) avoids an IndexError when the
# test split has fewer than 100 rows.
for predicted, actual in zip(prediction[:100], y_test[:100]):
    print(predicted, '||', actual)

70.82567941694545 || 70.4189573442233
71.12379005216424 || 73.1879491114706
66.88905216870624 || 69.4285574073058
68.63164651998378 || 68.4457467280668
62.71548537592761 || 61.7645093762568
63.02542659413401 || 64.4309328307492
62.6990789562762 || 62.4269612065783
71.4035828870734 || 71.3270010663305
69.69654573706819 || 69.576571845724
64.48835006693776 || 64.42841794434
63.054524434846996 || 63.132339355837
69.7582250424844 || 71.1879114486
65.4122566209552 || 66.5294933495484
61.99344473250439 || 59.8601233557773
68.82356054357442 || 65.1928622148426
69.07812727116732 || 67.6971036455845
63.76995566253121 || 63.7454886542419
74.63839150792774 || 73.166638105566
67.73371452146486 || 66.915285627757
60.86331206939955 || 60.9946387314652
73.96591947239492 || 75.8501122218929
63.56947695348637 || 60.8929255610217
68.95414192580128 || 68.4857245400115
65.75935097867342 || 66.6445262797493
67.99819537292436 || 67.4793517580411
64.38696818245454 || 65.3060946626047
60.852759338004866 || 58

In [101]:
# Save the model together with a StandardScaler as a single tuple.
# NOTE(review): the scaler is freshly constructed and never fitted in this
# cell, and the model above was trained on unscaled features — confirm that
# consumers of this file do not apply the scaler before predicting.
std_scaler = StandardScaler()

joblib.dump(value=(model, std_scaler), filename="Weight-Predictions-using-linear-regression.pkl")

['Weight-Predictions-using-linear-regression.pkl']