In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_iris

#### Load Data

In [2]:
data_bunch = load_iris(as_frame=True)

In [3]:
df = data_bunch["data"]

In [4]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
df["target"] = data_bunch["target"]

In [6]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


#### Statistics of Data

In [7]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [8]:
# Check what are the different classes in target column
df["target"].unique()

array([0, 1, 2])

In [9]:
# Check number of different classes in target column
df["target"].nunique()

3

In [10]:
# Count how many instances we have each class
df["target"].value_counts()

target
0    50
1    50
2    50
Name: count, dtype: int64

#### Split data using numpy and pandas

In [11]:
df.shape

(150, 5)

In [12]:
n_train = round(df.shape[0] * 0.7)
n_test = round(df.shape[0] * 0.15)
n_val = round(df.shape[0] * 0.15)

In [13]:
print(n_train, n_test, n_val)

105 22 22


In [14]:
# What if we split dataset without shuffling
df_train = df.iloc[:n_train, :]
df_val = df.iloc[n_train: n_train + n_val, :]
df_test = df.iloc[n_train + n_val:, :]

In [15]:
# Let's count instances for each class in training data
df_train["target"].value_counts()

target
0    50
1    50
2     5
Name: count, dtype: int64

In [16]:
# Shuffle the dataset
df_shuffled = df.sample(frac=1, random_state=1234).reset_index(drop=True)

In [17]:
df_shuffled.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,6.1,3.0,4.6,1.4,1
1,6.1,2.9,4.7,1.4,1
2,6.3,2.9,5.6,1.8,2
3,4.6,3.4,1.4,0.3,0
4,5.2,2.7,3.9,1.4,1


In [18]:
# Ensure value counts are equal
df_shuffled["target"].value_counts()

target
1    50
2    50
0    50
Name: count, dtype: int64

In [19]:
# Split the shuffled dataset
df_train = df_shuffled.iloc[:n_train, :]
df_val = df_shuffled.iloc[n_train: n_train + n_val, :]
df_test = df_shuffled.iloc[n_train + n_val:, :]

In [20]:
# Now, again check the counts in train dataset, we will have a balanced distribution for each class
df_train["target"].value_counts()

target
1    38
0    35
2    32
Name: count, dtype: int64

#### Split dataset using sklearn

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [23]:
features = df.columns.drop("target").tolist()
label = "target"

In [24]:
# First split data to train (70%) and test (15%)
X_train, X_test, y_train, y_test = train_test_split(df[features], df[label], train_size=0.85, test_size=0.15, random_state=1234)

In [25]:
# Then split train data to train (85%) and valid (15%)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.85, test_size=0.15, random_state=1234)

In [26]:
X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
121,5.6,2.8,4.9,2.0
1,4.9,3.0,1.4,0.2
84,5.4,3.0,4.5,1.5
31,5.4,3.4,1.5,0.4
148,6.2,3.4,5.4,2.3
...,...,...,...,...
45,4.8,3.0,1.4,0.3
15,5.7,4.4,1.5,0.4
130,7.4,2.8,6.1,1.9
142,5.8,2.7,5.1,1.9


In [27]:
y_train.value_counts()

target
2    40
0    34
1    33
Name: count, dtype: int64

In [28]:
X_train.shape[0] + X_val.shape[0] + X_test.shape[0]

150