### Import Libraries

In [1]:
import sklearn 

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RepeatedKFold, cross_val_score

### Load Data

In [2]:
from azureml import Workspace

# Name of the dataset to pull from the workspace.
DATASET_NAME = 'houseprice_data'

# Workspace() is built with no explicit arguments.
# NOTE(review): presumably credentials come from the hosting environment — confirm.
ws = Workspace()
ds = ws.datasets[DATASET_NAME]

# Materialise the dataset as a DataFrame for the rest of the notebook.
houseprice_df = ds.to_dataframe()

In [3]:
# Target: the 'price' column as float64, reshaped into a single column
# (one row per sample).
price_values = np.array(houseprice_df['price'], dtype='float64')
Y = price_values.reshape(-1, 1)

In [4]:
# Features: every column except the target 'price', converted to a NumPy
# array in one step so X never holds a DataFrame.
X = np.array(houseprice_df.drop('price', axis=1))

### Repeated KFold Cross Validation
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedKFold.html

In [5]:
# 2 folds repeated 2 times -> 4 (train_indices, test_indices) pairs.
rkf = RepeatedKFold(
    n_splits=2,
    n_repeats=2,
    random_state=0,  # fixed seed so re-running gives the same folds
)

# Materialise every split so the index arrays are displayed.
[fold for fold in rkf.split(X)]

[(array([    4,     6,    10, ..., 21602, 21604, 21607]),
  array([    0,     1,     2, ..., 21608, 21609, 21610])),
 (array([    0,     1,     2, ..., 21608, 21609, 21610]),
  array([    4,     6,    10, ..., 21602, 21604, 21607])),
 (array([    3,     4,     5, ..., 21606, 21607, 21608]),
  array([    0,     1,     2, ..., 21604, 21609, 21610])),
 (array([    0,     1,     2, ..., 21604, 21609, 21610]),
  array([    3,     4,     5, ..., 21606, 21607, 21608]))]

In [6]:
# Print the train/test indices of every fold with a 1-based fold number.
# enumerate() replaces the hand-maintained counter. `train` and `test` keep
# the values of the last fold after the loop ends.
for i, (train, test) in enumerate(rkf.split(X), start=1):
    print("\nTrain" + str(i), train)
    print("Test" + str(i), test)


Train1 [    4     6    10 ..., 21602 21604 21607]
Test1 [    0     1     2 ..., 21608 21609 21610]

Train2 [    0     1     2 ..., 21608 21609 21610]
Test2 [    4     6    10 ..., 21602 21604 21607]

Train3 [    3     4     5 ..., 21606 21607 21608]
Test3 [    0     1     2 ..., 21604 21609 21610]

Train4 [    0     1     2 ..., 21604 21609 21610]
Test4 [    3     4     5 ..., 21606 21607 21608]


In [7]:
# Sizes of the full data set and of the last train/test fold left over
# from the loop.
tuple(map(len, (X, train, test)))

(21611, 10806, 10805)

Next, only `n_repeats` is changed while `n_splits` stays at 2, so the sizes of the training and test sets remain the same.

In [8]:
# Same 2-fold split, now repeated 3 times -> 6 train/test pairs.
rkf = RepeatedKFold(n_splits=2, n_repeats=3, random_state=0)

# Print the train/test indices of every fold with a 1-based fold number.
# enumerate() replaces the hand-maintained counter. `train` and `test` keep
# the values of the last fold after the loop ends.
for i, (train, test) in enumerate(rkf.split(X), start=1):
    print("\nTrain" + str(i), train)
    print("Test" + str(i), test)


Train1 [    4     6    10 ..., 21602 21604 21607]
Test1 [    0     1     2 ..., 21608 21609 21610]

Train2 [    0     1     2 ..., 21608 21609 21610]
Test2 [    4     6    10 ..., 21602 21604 21607]

Train3 [    3     4     5 ..., 21606 21607 21608]
Test3 [    0     1     2 ..., 21604 21609 21610]

Train4 [    0     1     2 ..., 21604 21609 21610]
Test4 [    3     4     5 ..., 21606 21607 21608]

Train5 [    0     4     5 ..., 21604 21606 21610]
Test5 [    1     2     3 ..., 21607 21608 21609]

Train6 [    1     2     3 ..., 21607 21608 21609]
Test6 [    0     4     5 ..., 21604 21606 21610]


In [9]:
# Sizes of the full data set and of the last train/test fold left over
# from the loop.
tuple(map(len, (X, train, test)))

(21611, 10806, 10805)

In [10]:
# 3 folds repeated 3 times -> 9 train/test pairs.
rkf = RepeatedKFold(n_splits=3, n_repeats=3, random_state=0)

# Print the train/test indices of every fold with a 1-based fold number.
# enumerate() replaces the hand-maintained counter. `train` and `test` keep
# the values of the last fold after the loop ends.
for i, (train, test) in enumerate(rkf.split(X), start=1):
    print("\nTrain" + str(i), train)
    print("Test" + str(i), test)


Train1 [    0     2     3 ..., 21607 21609 21610]
Test1 [    1     7     8 ..., 21600 21601 21608]

Train2 [    1     4     6 ..., 21604 21607 21608]
Test2 [    0     2     3 ..., 21606 21609 21610]

Train3 [    0     1     2 ..., 21608 21609 21610]
Test3 [    4     6    10 ..., 21594 21604 21607]

Train4 [    3     4     5 ..., 21607 21608 21609]
Test4 [    0     1     2 ..., 21602 21604 21610]

Train5 [    0     1     2 ..., 21607 21608 21610]
Test5 [    4     8     9 ..., 21600 21601 21609]

Train6 [    0     1     2 ..., 21604 21609 21610]
Test6 [    3     5     7 ..., 21606 21607 21608]

Train7 [    0     2     4 ..., 21606 21609 21610]
Test7 [    1     3     9 ..., 21603 21607 21608]

Train8 [    0     1     3 ..., 21606 21607 21608]
Test8 [    2     4     8 ..., 21605 21609 21610]

Train9 [    1     2     3 ..., 21608 21609 21610]
Test9 [    0     5     6 ..., 21598 21599 21606]


In [11]:
# Sizes of the full data set and of the last train/test fold left over
# from the loop.
tuple(map(len, (X, train, test)))

(21611, 14408, 7203)

In [12]:
# A single repetition of 2-fold CV: the splitter yields two
# (train_indices, test_indices) pairs.
rkf = RepeatedKFold(
    n_splits=2,
    n_repeats=1,
    random_state=0,
)

# Keep the pairs in a list so individual folds can be indexed later.
indices_list = [fold for fold in rkf.split(X)]
indices_list

[(array([    4,     6,    10, ..., 21602, 21604, 21607]),
  array([    0,     1,     2, ..., 21608, 21609, 21610])),
 (array([    0,     1,     2, ..., 21608, 21609, 21610]),
  array([    4,     6,    10, ..., 21602, 21604, 21607]))]

In [13]:
# First fold: a (train_indices, test_indices) tuple.
first_fold = indices_list[0]
first_fold

(array([    4,     6,    10, ..., 21602, 21604, 21607]),
 array([    0,     1,     2, ..., 21608, 21609, 21610]))

In [14]:
# Unpack the (train_indices, test_indices) pair of each fold once, then use
# the index arrays to slice the features and the target consistently.
train_idx1, test_idx1 = indices_list[0]
train_idx2, test_idx2 = indices_list[1]

# Fold 1
x_train1, x_test1 = X[train_idx1], X[test_idx1]
y_train1, y_test1 = Y[train_idx1], Y[test_idx1]

# Fold 2
x_train2, x_test2 = X[train_idx2], X[test_idx2]
y_train2, y_test2 = Y[train_idx2], Y[test_idx2]

In [15]:
# Fit a linear regression on the training part of fold 1 ...
reg = LinearRegression()
reg.fit(x_train1, y_train1)

# ... and score it on the held-out part of the same fold.
reg.score(x_test1, y_test1)

0.64511443953010517

In [16]:
# Fit a fresh linear regression on the training part of fold 2 ...
reg = LinearRegression()
reg.fit(x_train2, y_train2)

# ... and score it on the held-out part of the same fold.
reg.score(x_test2, y_test2)

0.64992314765685921

In [17]:
# The same per-fold evaluation in a single call. `rkf` is the splitter most
# recently assigned (n_splits=2, n_repeats=1), so one score per fold is
# returned and they match the two manual scores computed earlier.
cross_val_score(estimator=reg, X=X, y=Y, cv=rkf)

array([ 0.64511444,  0.64992315])