In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
import pandas as pd

import random
# Seed Python's global RNG so the random.sample call inside train_test_split
# picks the same rows on every run.
random.seed(0)

def train_test_split(df, test_size, random_state=None):
    """Randomly split ``df`` into a training frame and a test frame.

    Rows are selected by position rather than by index label, so the split
    stays correct when ``df`` has a non-unique index. For a unique index the
    sampled rows are the same ones label-based sampling would pick under the
    same random state.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    test_size : float or int
        A float is the fraction of rows to hold out (the resulting count is
        truncated towards zero); an int is the exact number of test rows.
    random_state : int, optional
        Seed for a private generator used only for this call. When omitted,
        the module-level ``random`` state is used, so a prior
        ``random.seed(...)`` call controls the split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``. ``test_df`` holds the sampled rows in sampled
        order; ``train_df`` holds the remaining rows in their original order.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when the requested test size is
        negative or larger than the number of rows.
    """
    n_rows = len(df)
    if isinstance(test_size, float):
        test_size = int(test_size * n_rows)

    # Fall back to the shared module-level generator unless a seed was given.
    rng = random if random_state is None else random.Random(random_state)

    # Sample positions, not labels: label-based .loc/.drop would select or
    # remove every row sharing a duplicated label.
    test_positions = rng.sample(population=range(n_rows), k=test_size)
    held_out = set(test_positions)
    train_positions = [pos for pos in range(n_rows) if pos not in held_out]

    test_df = df.iloc[test_positions]
    train_df = df.iloc[train_positions]
    return train_df, test_df

# Read the iris table from the CSV that sits next to the notebook.
df = pd.read_csv('./iris.csv')

# Keep every column name except the last, which is renamed to 'label'.
cols = [*df.columns[:-1], 'label']
df.columns = cols
print(df.head())

   sepal_length  sepal_width  petal_length  petal_width   label
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [11]:
# Summarise the loaded frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
label           150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [12]:
# List the distinct values in the label column before encoding them.
df['label'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [13]:
# Integer class id per species name: setosa -> 0, versicolor -> 1, virginica -> 2.
str2num = {name: code for code, name in enumerate(('setosa', 'versicolor', 'virginica'))}
# Swap each species string in the label column for its integer id.
df['label'] = df['label'].replace(to_replace=str2num)

In [14]:
# Re-check the distinct label values after the str2num replacement.
df['label'].unique()

array([0, 1, 2], dtype=int64)

In [15]:
# Re-inspect dtypes now that the label column has been replaced with integer codes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal_length    150 non-null float64
sepal_width     150 non-null float64
petal_length    150 non-null float64
petal_width     150 non-null float64
label           150 non-null int64
dtypes: float64(4), int64(1)
memory usage: 5.9 KB


In [16]:
# Hold out 20% of the rows (a float test_size is treated as a fraction) as the test set.
train_df, test_df = train_test_split(df, test_size=0.2)

In [17]:
class Logits():
    """Helpers for converting between integer class indices and one-hot rows."""

    @staticmethod
    def index_to_logit(index, length=None):
        """One-hot encode a 1-D collection of integer class indices.

        The input is converted to a NumPy integer array first, so elements are
        always read by position. This matters for a pandas Series, where
        ``series[i]`` is a label lookup and fails (or returns the wrong row)
        once the index is no longer 0..n-1, e.g. after a train/test split.

        Parameters
        ----------
        index : sequence, numpy.ndarray or pandas.Series of int
            Class index for each sample; values are cast to integers.
        length : int, optional
            Number of columns (classes) in the output. Defaults to
            ``max(index) + 1``, or 0 when ``index`` is empty.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(len(index), length)`` with a single 1 in
            each row at the column given by that row's class index.

        Raises
        ------
        IndexError
            If a class index does not fit inside ``length`` columns.
        """
        labels = np.asarray(index, dtype=int)
        size = len(labels)
        if length is None:
            # max() of an empty array is undefined; an empty input has no classes.
            length = int(labels.max()) + 1 if size else 0
        logits = np.zeros([size, length])
        # Pair each row number with its class column and set that cell in one step.
        logits[np.arange(size), labels] = 1
        return logits

    @staticmethod
    def logit_to_index(logits):
        """Return, for each row of ``logits``, the column index of its largest value."""
        return np.argmax(logits, axis=1)

In [18]:
# Demo: one-hot encode every label in df; with no length given, the column
# count is inferred as max label + 1. This displays the full array.
Logits.index_to_logit(df.label)

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0