In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# Load the iris dataset with scikit-learn's loader; the cells below read its
# .data (features) and .target (labels) attributes.
iris = load_iris()


In [49]:

from sklearn.model_selection import train_test_split

# Assemble features and labels in a single frame. No column names are passed,
# so the feature columns get pandas' default integer labels; the class label
# is stored under 'target'.
df = pd.DataFrame(iris.data)
df['target'] = iris.target

# Split the frame back into the label Series (y) and the feature matrix (X).
y = df['target']
X = df.drop(columns='target')

# train_test_split is imported but no single hold-out split is made here;
# the K-fold cells that follow generate the train/test partitions instead.

In [62]:
from sklearn.model_selection import KFold

# Two-fold splitter with the default settings written out explicitly:
# no shuffling, hence no random state.
# NOTE(review): the labels printed in the next cell are ordered by class, so
# with shuffle=False each fold is a contiguous block of rows and the folds do
# not share the same class mix. Consider shuffle=True with a fixed
# random_state, or StratifiedKFold, before fitting any model on these folds.
kf = KFold(n_splits=2, shuffle=False, random_state=None)
kf

KFold(n_splits=2, random_state=None, shuffle=False)

In [51]:
# Print the label Series to inspect its length, dtype and the order of the labels.
print(y)

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int64


In [66]:
# For every fold, KFold.split yields two integer arrays (train first, then
# test) holding row *positions* in X, not index labels.
for train_index, test_index in kf.split(X):
    print(np.shape(train_index))
    print(np.shape(test_index))
    print(f'The Train {train_index},The test {test_index}')
    # Consume the positions with .iloc. Label-based access (.loc, or plain
    # y[...]) returns the same rows only while the index is the default
    # 0..n-1 range, and breaks as soon as the data is filtered, shuffled or
    # re-indexed.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print("X_train:", X_train.shape, " X_test:", X_test.shape, " y_train:", y_train.shape, " y_test:", y_test.shape)

(75,)
(75,)
The Train [ 75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92
  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110
 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
 147 148 149],The test [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74]
X_train: (75, 4)  X_test: (75, 4)  y_train: (75,)  y_test: (75,)
(75,)
(75,)
The Train [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74],The test [ 75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92
  93  94  95  96  97  98  99 100 1

# Why do I need .loc, and why can't I select rows directly with X[train_index], when X[0:3], for example, returns a DataFrame with 3 rows and all of their columns?
The reason X[train_index] does not behave like X[0:3] is that the DataFrame [] operator does different things depending on what it is given: a slice is applied to the rows, whereas a list or array of (non-boolean) values is treated as a selection of columns.

When you use X[0:3], the slice is applied to the rows of X. It returns the first three rows (positions 0, 1 and 2; the stop value 3 is excluded) together with all columns. pandas offers row slicing through [] as a convenience because it is such a common operation.

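However, when you use X[train_index], you are passing an array of integers (train_index) rather than a slice. pandas interprets that array as a list of column labels, not row positions, so it looks for columns with those labels and raises a KeyError, because X only has the columns 0 to 3.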

In pandas, indexing a DataFrame with an array of integers through plain [] (without .iloc or .loc) is supported, but it selects columns by label rather than rows. Selecting rows with an array is done explicitly, with label-based indexing (.loc) or position-based indexing (.iloc).

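Therefore, to select the rows of X listed in train_index, you should use .iloc[train_index] or .loc[train_index]. These accessors explicitly tell pandas to select rows by integer position (.iloc) or by index label (.loc). KFold.split returns integer positions, so .iloc is the accessor that matches it in general; .loc returns the same rows here only because X has the default 0, 1, 2, ... index, where labels and positions coincide.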

In summary, X[train_index] does not select rows because plain [] with an array of integers is interpreted as a column selection, whereas X[0:3] works because a slice inside [] is applied to the rows. To select the rows listed in train_index, use .iloc[train_index] (or .loc[train_index] when the index labels are identical to the row positions).