In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error


In [52]:
X, y = load_diabetes(return_X_y=True, as_frame=True)
y

0      151.0
1       75.0
2      141.0
3      206.0
4      135.0
       ...  
437    178.0
438    104.0
439    132.0
440    220.0
441     57.0
Name: target, Length: 442, dtype: float64

In [53]:
df = pd.DataFrame(X)
df['labels'] = y
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,labels
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [54]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,labels
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0


In [65]:
df.groupby(['age'])['labels'].mean()

age
-0.107226    159.000000
-0.103593     87.000000
-0.099961    102.500000
-0.096328    148.250000
-0.092695    144.500000
-0.089063    163.333333
-0.085430     95.400000
-0.081798     94.000000
-0.078165    154.250000
-0.074533    103.625000
-0.070900    148.666667
-0.067268    143.666667
-0.063635    193.250000
-0.060003    146.285714
-0.056370    114.500000
-0.052738    120.857143
-0.049105    153.714286
-0.045472    159.222222
-0.041840    141.000000
-0.038207    128.666667
-0.034575    110.666667
-0.030942    147.200000
-0.027310    139.666667
-0.023677    139.800000
-0.020045    135.428571
-0.016412    171.142857
-0.012780    127.500000
-0.009147    153.818182
-0.005515    139.916667
-0.001882    126.642857
 0.001751    126.416667
 0.005383    148.384615
 0.009016    166.812500
 0.012648    170.142857
 0.016281    149.157895
 0.019913    167.545455
 0.023546    171.500000
 0.027178    124.111111
 0.030811    185.500000
 0.034443    210.111111
 0.038076    149.000000
 0.041708   

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [56]:
y_test.shape

(89,)

In [57]:
linear_reg = LinearRegression()

In [58]:
linear_reg.fit(X_train, y_train)

In [59]:
preds = linear_reg.predict(X_test)

In [66]:
score = r2_score(y_test, preds) * 100

In [63]:
score

0.040576876952214125