### Import Libraries

In [1]:
import sklearn 

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

### Load Data

In [2]:
# NOTE(review): this import sits outside the top import cell; consider moving it there.
from azureml import Workspace

# Workspace() is called with no arguments -- presumably the workspace id/token
# come from the hosting environment's configuration; TODO confirm.
ws = Workspace()
# Look up the dataset stored under the name 'houseprice_data'.
ds = ws.datasets['houseprice_data']

# Load the dataset into a DataFrame (it is indexed by column name and passed to
# .drop(...) in the following cells).
houseprice_df = ds.to_dataframe()

In [3]:
# Target: the 'price' column converted to float64 and reshaped to one column,
# i.e. shape (n_samples, 1).
Y = np.reshape(np.array(houseprice_df['price'], dtype='float64'), (-1, 1))

In [4]:
# Feature matrix: every column except the target 'price', converted to a NumPy
# array in one step so that X is never bound to a DataFrame.
X = np.array(houseprice_df.drop('price', axis=1))

In [5]:
# Display both shapes to confirm that features and target have the same number of rows.
X.shape, Y.shape

((21611, 15), (21611, 1))

### Kfold Cross Validation
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html    
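Provides train/test indices to split data into train/test sets. Splits the dataset into k consecutive folds (without shuffling by default); each fold is used once as the test set while the remaining k - 1 folds form the training set.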

In [6]:
# Three folds; every element shown below is a (train_indices, test_indices) pair.
kf = KFold(n_splits=3)
[fold for fold in kf.split(X)]

[(array([ 7204,  7205,  7206, ..., 21608, 21609, 21610]),
  array([   0,    1,    2, ..., 7201, 7202, 7203])),
 (array([    0,     1,     2, ..., 21608, 21609, 21610]),
  array([ 7204,  7205,  7206, ..., 14405, 14406, 14407])),
 (array([    0,     1,     2, ..., 14405, 14406, 14407]),
  array([14408, 14409, 14410, ..., 21608, 21609, 21610]))]

In [7]:
# Print the train/test index arrays of every fold, numbered from 1.
# enumerate() replaces the hand-maintained counter, so no counter state has to
# be reset before the loop or incremented inside it.
# NOTE: `train` and `test` keep the indices of the LAST fold after the loop;
# the following cell reads them.
for fold_no, (train, test) in enumerate(kf.split(X), start=1):
    print("\nTrain" + str(fold_no), train)
    print("Test" + str(fold_no), test)


Train1 [ 7204  7205  7206 ..., 21608 21609 21610]
Test1 [   0    1    2 ..., 7201 7202 7203]

Train2 [    0     1     2 ..., 21608 21609 21610]
Test2 [ 7204  7205  7206 ..., 14405 14406 14407]

Train3 [    0     1     2 ..., 14405 14406 14407]
Test3 [14408 14409 14410 ..., 21608 21609 21610]


#### Each test fold holds about one third of the data (100% / 3 ≈ 33%). With 21611 rows, the first two test folds have 7204 rows and the last one has 7203.

In [8]:
# `train` and `test` still hold the index arrays of the last fold from the loop above.
len(X), len(train), len(test)

(21611, 14408, 7203)

In [9]:
# Repeat the split with four folds.
kf = KFold(n_splits=4)

# Print the train/test index arrays of every fold, numbered from 1.
# enumerate() replaces the hand-maintained counter.
# NOTE: `train` and `test` keep the indices of the LAST fold after the loop;
# the following cell reads them.
for fold_no, (train, test) in enumerate(kf.split(X), start=1):
    print("\nTrain" + str(fold_no), train)
    print("Test" + str(fold_no), test)


Train1 [ 5403  5404  5405 ..., 21608 21609 21610]
Test1 [   0    1    2 ..., 5400 5401 5402]

Train2 [    0     1     2 ..., 21608 21609 21610]
Test2 [ 5403  5404  5405 ..., 10803 10804 10805]

Train3 [    0     1     2 ..., 21608 21609 21610]
Test3 [10806 10807 10808 ..., 16206 16207 16208]

Train4 [    0     1     2 ..., 16206 16207 16208]
Test4 [16209 16210 16211 ..., 21608 21609 21610]


In [10]:
# `train` and `test` still hold the index arrays of the last of the four folds.
len(X), len(train), len(test)

(21611, 16209, 5402)

In [11]:
# Repeat the split with fifteen folds.
kf = KFold(n_splits=15)

# Print the train/test index arrays of every fold, numbered from 1.
# enumerate() replaces the hand-maintained counter.
# NOTE: `train` and `test` keep the indices of the LAST fold after the loop;
# the following cell reads them.
for fold_no, (train, test) in enumerate(kf.split(X), start=1):
    print("\nTrain" + str(fold_no), train)
    print("Test" + str(fold_no), test)


Train1 [ 1441  1442  1443 ..., 21608 21609 21610]
Test1 [   0    1    2 ..., 1438 1439 1440]

Train2 [    0     1     2 ..., 21608 21609 21610]
Test2 [1441 1442 1443 ..., 2879 2880 2881]

Train3 [    0     1     2 ..., 21608 21609 21610]
Test3 [2882 2883 2884 ..., 4320 4321 4322]

Train4 [    0     1     2 ..., 21608 21609 21610]
Test4 [4323 4324 4325 ..., 5761 5762 5763]

Train5 [    0     1     2 ..., 21608 21609 21610]
Test5 [5764 5765 5766 ..., 7202 7203 7204]

Train6 [    0     1     2 ..., 21608 21609 21610]
Test6 [7205 7206 7207 ..., 8643 8644 8645]

Train7 [    0     1     2 ..., 21608 21609 21610]
Test7 [ 8646  8647  8648 ..., 10084 10085 10086]

Train8 [    0     1     2 ..., 21608 21609 21610]
Test8 [10087 10088 10089 ..., 11525 11526 11527]

Train9 [    0     1     2 ..., 21608 21609 21610]
Test9 [11528 11529 11530 ..., 12966 12967 12968]

Train10 [    0     1     2 ..., 21608 21609 21610]
Test10 [12969 12970 12971 ..., 14407 14408 14409]

Train11 [    0     1     2 ..., 2

In [12]:
# `train` and `test` still hold the index arrays of the last of the fifteen folds.
len(X), len(train), len(test)

(21611, 20171, 1440)

In [13]:
# Two folds: materialise the (train_indices, test_indices) pairs in a list so
# that individual folds can be picked out by position in the cells below.
kf = KFold(n_splits=2)
indices_list = [fold for fold in kf.split(X)]
indices_list

[(array([10806, 10807, 10808, ..., 21608, 21609, 21610]),
  array([    0,     1,     2, ..., 10803, 10804, 10805])),
 (array([    0,     1,     2, ..., 10803, 10804, 10805]),
  array([10806, 10807, 10808, ..., 21608, 21609, 21610]))]

In [14]:
# First fold as a (train_indices, test_indices) pair.
indices_list[0]

(array([10806, 10807, 10808, ..., 21608, 21609, 21610]),
 array([    0,     1,     2, ..., 10803, 10804, 10805]))

#### Use the fold indices to store the training and testing data of each split in variables.

In [15]:
# Each entry of indices_list is a (train_indices, test_indices) pair; indexing
# X and Y with both members of the pair yields the train and test portions.
x_train1, x_test1 = (X[idx] for idx in indices_list[0])
y_train1, y_test1 = (Y[idx] for idx in indices_list[0])

# Same for the second split, where the roles of the two halves are swapped.
x_train2, x_test2 = (X[idx] for idx in indices_list[1])
y_train2, y_test2 = (Y[idx] for idx in indices_list[1])

In [16]:
# Shapes of the training features and targets of the first split.
x_train1.shape, y_train1.shape

((10805, 15), (10805, 1))

In [17]:
# Shapes of the test features and targets of the first split.
x_test1.shape, y_test1.shape

((10806, 15), (10806, 1))

In [18]:
# Shapes of the training features and targets of the second split.
x_train2.shape, y_train2.shape

((10806, 15), (10806, 1))

In [19]:
# Shapes of the test features and targets of the second split.
x_test2.shape, y_test2.shape

((10805, 15), (10805, 1))

#### Now we fit a linear regression on both splits and check whether the scores match those returned by `cross_val_score`.

In [20]:
# Fit on the training half of the first split, then report the score
# (R^2 for a regressor) on its held-out half.
reg = LinearRegression()
reg.fit(x_train1, y_train1)
reg.score(x_test1, y_test1)

0.64555076379427345

In [21]:
# Fresh estimator for the second split: fit on its training half and score on
# its held-out half. `reg` is rebound here and reused by the next cell.
reg = LinearRegression()
reg.fit(x_train2, y_train2)
reg.score(x_test2, y_test2)

0.64425185694420173

In [22]:
# With an integer cv and a regressor, scikit-learn splits with an unshuffled
# KFold, so the two scores can be compared directly with the manual fits above.
cross_val_score(estimator=reg, X=X, y=Y, cv=2)

array([ 0.64555076,  0.64425186])