# Sklearn Datasets

In [1]:
# Miscellaneous operating system interfaces
import os

# The fundamental package for scientific computing with Python.
import numpy as np

# Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
import pandas as pd

# Machine learning in Python
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Resolve the project folders from the current working directory.
# abspath('') expands to the working directory; the folders below are joined
# onto its parent directory (presumably the project root -- confirm the
# notebook is launched from a direct subfolder of it).
BASE_PATH = os.path.abspath('')
dirs = os.path.dirname(BASE_PATH)

(
    CONFIG_PATH,
    DATASET_PATH,
    MIDDLEWARE_PATH,
    MODEL_PATH,
    ROUTE_PATH,
    TEMPLATE_PATH,
) = (
    os.path.join(dirs, folder)
    for folder in ('configs', 'datasets', 'middlewares', 'models', 'routes', 'templates')
)

## Load Iris

In [2]:
# Load the Iris dataset into a DataFrame with the class label appended as `target`.
dataset = datasets.load_iris()

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target

# Stratified 75/25 split with a fixed seed so the exported files are reproducible.
# `df` already carries the `target` column, so X_train / X_test are the complete
# rows that get exported; y_train / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    df, dataset.target, test_size=0.25, stratify=dataset.target, random_state=2021
)

filename = 'iris'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = os.path.join(DATASET_PATH, 'sklearn')
os.makedirs(output_dir, exist_ok=True)

train_path = os.path.join(output_dir, '{}_train.csv'.format(filename))
test_path = os.path.join(output_dir, '{}_test.csv'.format(filename))
X_train.to_csv(train_path, index=False)
X_test.to_csv(test_path, index=False)

# Read the training file back as a quick check of what was written.
df = pd.read_csv(train_path)
df.head(1)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.3,1.7,0.5,0


## Load Wine

In [3]:
# Load the Wine dataset into a DataFrame with the class label appended as `target`.
dataset = datasets.load_wine()

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target

# Stratified 75/25 split with a fixed seed so the exported files are reproducible.
# `df` already carries the `target` column, so X_train / X_test are the complete
# rows that get exported; y_train / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    df, dataset.target, test_size=0.25, stratify=dataset.target, random_state=2021
)

filename = 'wine'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = os.path.join(DATASET_PATH, 'sklearn')
os.makedirs(output_dir, exist_ok=True)

train_path = os.path.join(output_dir, '{}_train.csv'.format(filename))
test_path = os.path.join(output_dir, '{}_test.csv'.format(filename))
X_train.to_csv(train_path, index=False)
X_test.to_csv(test_path, index=False)

# Read the training file back as a quick check of what was written.
df = pd.read_csv(train_path)
df.head(1)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,13.4,3.91,2.48,23.0,102.0,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750.0,2


## Load Digits

In [4]:
# Load the Digits dataset into a DataFrame (one column per pixel) with the
# class label appended as `target`.
dataset = datasets.load_digits()

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target

# Stratified 75/25 split with a fixed seed so the exported files are reproducible.
# `df` already carries the `target` column, so X_train / X_test are the complete
# rows that get exported; y_train / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    df, dataset.target, test_size=0.25, stratify=dataset.target, random_state=2021
)

filename = 'digits'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = os.path.join(DATASET_PATH, 'sklearn')
os.makedirs(output_dir, exist_ok=True)

train_path = os.path.join(output_dir, '{}_train.csv'.format(filename))
test_path = os.path.join(output_dir, '{}_test.csv'.format(filename))
X_train.to_csv(train_path, index=False)
X_test.to_csv(test_path, index=False)

# Read the training file back as a quick check of what was written.
df = pd.read_csv(train_path)
df.head(1)

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,6.0,16.0,8.0,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,4.0,16.0,16.0,15.0,16.0,16.0,2


## Load Breast Cancer

In [5]:
# Load the Breast Cancer dataset into a DataFrame with the class label
# appended as `target`.
dataset = datasets.load_breast_cancer()

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target

# Stratified 75/25 split with a fixed seed so the exported files are reproducible.
# `df` already carries the `target` column, so X_train / X_test are the complete
# rows that get exported; y_train / y_test are not used in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    df, dataset.target, test_size=0.25, stratify=dataset.target, random_state=2021
)

filename = 'breast_cancer'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = os.path.join(DATASET_PATH, 'sklearn')
os.makedirs(output_dir, exist_ok=True)

train_path = os.path.join(output_dir, '{}_train.csv'.format(filename))
test_path = os.path.join(output_dir, '{}_test.csv'.format(filename))
X_train.to_csv(train_path, index=False)
X_test.to_csv(test_path, index=False)

# Read the training file back as a quick check of what was written.
df = pd.read_csv(train_path)
df.head(1)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,25.73,17.46,174.2,2010.0,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,...,23.58,229.3,3234.0,0.153,0.5937,0.6451,0.2756,0.369,0.08815,0


## Load Diabetes

In [6]:
# Load the Diabetes dataset into a DataFrame with the regression target
# appended as `target`.
dataset = datasets.load_diabetes()

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target

# No `stratify=dataset.target` here: the target is a numeric measurement
# (e.g. 214.0), not a class label, and stratifying on it fails with
# "The least populated class in y has only 1 member, which is too few.
# The minimum number of groups for any class cannot be less than 2."
# A plain seeded 75/25 split is used instead.
X_train, X_test, y_train, y_test = train_test_split(
    df, dataset.target, test_size=0.25, random_state=2021
)

filename = 'diabetes'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = os.path.join(DATASET_PATH, 'sklearn')
os.makedirs(output_dir, exist_ok=True)

train_path = os.path.join(output_dir, '{}_train.csv'.format(filename))
test_path = os.path.join(output_dir, '{}_test.csv'.format(filename))
X_train.to_csv(train_path, index=False)
X_test.to_csv(test_path, index=False)

# Read the training file back as a quick check of what was written.
df = pd.read_csv(train_path)
df.head(1)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,-0.063635,-0.044642,-0.033151,-0.033214,0.001183,0.024051,-0.024993,-0.002592,-0.022512,-0.059067,214.0


## Load Boston

In [7]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# looking it up raises ImportError. On such versions, rebuild the same arrays
# from the original StatLib file, following the recipe given in scikit-learn's
# removal notice (this fallback needs network access).
try:
    dataset = datasets.load_boston()
except (ImportError, AttributeError):
    from sklearn.utils import Bunch

    raw_df = pd.read_csv(
        'http://lib.stat.cmu.edu/datasets/boston', sep=r'\s+', skiprows=22, header=None
    )
    # Per the upstream recipe, each record is spread over two consecutive rows
    # of the raw file: even rows hold the first 11 features, odd rows hold the
    # remaining 2 features followed by the target value.
    dataset = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=[
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ],
    )

df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = dataset.target

# No `stratify=dataset.target` here: the target is a numeric value
# (e.g. 50.0), not a class label, and stratifying on it fails with
# "The least populated class in y has only 1 member, which is too few.
# The minimum number of groups for any class cannot be less than 2."
# A plain seeded 75/25 split is used instead.
X_train, X_test, y_train, y_test = train_test_split(
    df, dataset.target, test_size=0.25, random_state=2021
)

filename = 'boston'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = os.path.join(DATASET_PATH, 'sklearn')
os.makedirs(output_dir, exist_ok=True)

train_path = os.path.join(output_dir, '{}_train.csv'.format(filename))
test_path = os.path.join(output_dir, '{}_test.csv'.format(filename))
X_train.to_csv(train_path, index=False)
X_test.to_csv(test_path, index=False)

# Read the training file back as a quick check of what was written.
df = pd.read_csv(train_path)
df.head(1)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,6.53876,0.0,18.1,1.0,0.631,7.016,97.5,1.2024,24.0,666.0,20.2,392.05,2.96,50.0
