<a href="https://colab.research.google.com/github/thekaszsz/ML_book/blob/main/bigdata_2nd_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [7]:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns = diabetes.feature_names)
y = pd.DataFrame(diabetes.target)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 2023)

x_test = pd.DataFrame(x_test.reset_index())
x_train = pd.DataFrame(x_train.reset_index())
y_train = pd.DataFrame(y_train.reset_index())

x_test.rename(columns = {'index':'cust_id'}, inplace = True)
x_train.rename(columns = {'index':'cust_id'}, inplace = True)
y_train.columns = ['cust_id', 'target']

In [8]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [9]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(353, 11)
(89, 11)
(353, 2)


In [10]:
print(x_train.head(3))
print(x_test.head(3))
print(y_train.head(3))

   cust_id       age       sex       bmi        bp        s1        s2  \
0        4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596   
1      318  0.088931 -0.044642  0.006728  0.025315  0.030078  0.008707   
2      301 -0.001882  0.050680 -0.024529  0.052858  0.027326  0.030001   

         s3        s4        s5        s6  
0  0.008142 -0.002592 -0.031988 -0.046641  
1  0.063367 -0.039493  0.009434  0.032059  
2  0.030232 -0.002592 -0.021395  0.036201  
   cust_id       age       sex       bmi        bp        s1        s2  \
0      280  0.009016  0.050680  0.018584  0.039087  0.017694  0.010586   
1      412  0.074401 -0.044642  0.085408  0.063187  0.014942  0.013091   
2       68  0.038076  0.050680 -0.029918 -0.040099 -0.033216 -0.024174   

         s3        s4        s5        s6  
0  0.019187 -0.002592  0.016307 -0.017646  
1  0.015505 -0.002592  0.006207  0.085907  
2 -0.010266 -0.002592 -0.012909  0.003064  
   cust_id  target
0        4   135.0
1      318   109

In [12]:
print(x_train.describe().T)
print(x_test.describe().T)
print(y_train.describe().T)

         count        mean         std       min         25%         50%  \
cust_id  353.0  212.634561  126.668903  0.000000  105.000000  210.000000   
age      353.0    0.000804    0.047617 -0.107226   -0.038207    0.005383   
sex      353.0    0.000724    0.047673 -0.044642   -0.044642   -0.044642   
bmi      353.0    0.000640    0.048141 -0.084886   -0.035307   -0.006206   
bp       353.0   -0.000326    0.046585 -0.112399   -0.033213   -0.005670   
s1       353.0    0.001179    0.047891 -0.126781   -0.033216   -0.002945   
s2       353.0    0.001110    0.048248 -0.115613   -0.029184   -0.001314   
s3       353.0   -0.000452    0.048600 -0.102307   -0.039719   -0.006584   
s4       353.0    0.000901    0.048045 -0.076395   -0.039493   -0.002592   
s5       353.0    0.001446    0.047160 -0.126097   -0.033246    0.000272   
s6       353.0    0.000589    0.048122 -0.137767   -0.034215    0.003064   

                75%         max  
cust_id  322.000000  441.000000  
age        0.038076

In [13]:
print(x_train.info())
print(x_test.info())
print(y_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353 entries, 0 to 352
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  353 non-null    int64  
 1   age      353 non-null    float64
 2   sex      353 non-null    float64
 3   bmi      353 non-null    float64
 4   bp       353 non-null    float64
 5   s1       353 non-null    float64
 6   s2       353 non-null    float64
 7   s3       353 non-null    float64
 8   s4       353 non-null    float64
 9   s5       353 non-null    float64
 10  s6       353 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 30.5 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  89 non-null     int64  
 1   age      89 non-null     float64
 2   sex      89 non-null     float64
 3   bmi      89 non-null     float64
 4   bp      

In [14]:
print(y_train.head())

   cust_id  target
0        4   135.0
1      318   109.0
2      301    65.0
3      189    79.0
4      288    80.0


In [16]:
print(y_train.describe().T)

         count        mean         std   min    25%    50%    75%    max
cust_id  353.0  212.634561  126.668903   0.0  105.0  210.0  322.0  441.0
target   353.0  152.943343   75.324692  37.0   90.0  141.0  208.0  346.0


In [17]:
print(x_train.isnull().sum())
print(x_test.isnull().sum())
print(y_train.isnull().sum())

cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64
cust_id    0
age        0
sex        0
bmi        0
bp         0
s1         0
s2         0
s3         0
s4         0
s5         0
s6         0
dtype: int64
cust_id    0
target     0
dtype: int64


In [18]:
cust_id = x_test['cust_id'].copy()

In [19]:
x_train = x_train.drop(columns = ['cust_id'])
x_test = x_test.drop(columns = ['cust_id'])

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train['target'], test_size = 0.2, random_state = 2023)

print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)

(282, 10)
(71, 10)
(282,)
(71,)


In [21]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(x_train, y_train)

In [22]:
y_pred = model.predict(x_val)

In [23]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_val, y_pred)
r2 = r2_score (y_val, y_pred)

In [24]:
print(mse)

3683.0463760563375


In [25]:
print(r2)

0.2756114371832499


In [26]:
rmse = mse**0.5
print(rmse)

60.68810736920651


In [27]:
#pd.DataFrame({'cust_id': cust_id, 'target': y_result}).to_csv('00300000)

In [28]:
y_result = model.predict(x_test)
result = pd.DataFrame({'cust id':cust_id, 'target': y_result})
print(result[:5])

   cust id  target
0      280  173.05
1      412  247.12
2       68   83.08
3      324  175.36
4      101   85.70
