In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [37]:
data = pd.read_csv("gender_classification_v7.csv")
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [38]:
print(x)

[[ 1.  11.8  6.1 ...  0.   1.   1. ]
 [ 0.  14.   5.4 ...  0.   1.   0. ]
 [ 0.  11.8  6.3 ...  1.   1.   1. ]
 ...
 [ 1.  12.9  5.7 ...  0.   0.   0. ]
 [ 1.  13.2  6.2 ...  0.   0.   0. ]
 [ 1.  15.4  5.4 ...  1.   1.   1. ]]


In [39]:
print(y)


['Male' 'Female' 'Male' ... 'Female' 'Female' 'Male']


Menghilangkan Missing Value

In [40]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

In [41]:
print(x)

[[ 1.  11.8  6.1 ...  0.   1.   1. ]
 [ 0.  14.   5.4 ...  0.   1.   0. ]
 [ 0.  11.8  6.3 ...  1.   1.   1. ]
 ...
 [ 1.  12.9  5.7 ...  0.   0.   0. ]
 [ 1.  13.2  6.2 ...  0.   0.   0. ]
 [ 1.  15.4  5.4 ...  1.   1.   1. ]]


Encoding Atribut

In [42]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

In [43]:
print(x)

[[ 0.   1.  11.8 ...  0.   1.   1. ]
 [ 1.   0.  14.  ...  0.   1.   0. ]
 [ 1.   0.  11.8 ...  1.   1.   1. ]
 ...
 [ 0.   1.  12.9 ...  0.   0.   0. ]
 [ 0.   1.  13.2 ...  0.   0.   0. ]
 [ 0.   1.  15.4 ...  1.   1.   1. ]]


Encoding class / label

In [46]:
le = LabelEncoder()
y = le.fit_transform(y)

In [47]:
print(y)

[1 0 1 ... 0 0 1]


Membagi dataset ke dalam training set dan test set

In [49]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [50]:
print(x_train)

[[ 0.   1.  12.3 ...  1.   1.   1. ]
 [ 1.   0.  13.4 ...  0.   0.   0. ]
 [ 0.   1.  11.8 ...  1.   1.   1. ]
 ...
 [ 0.   1.  13.4 ...  0.   0.   0. ]
 [ 0.   1.  13.7 ...  1.   1.   1. ]
 [ 0.   1.  15.1 ...  1.   1.   1. ]]


In [51]:
print(x_test)

[[ 0.   1.  12.8 ...  0.   1.   0. ]
 [ 0.   1.  12.3 ...  1.   1.   1. ]
 [ 0.   1.  11.5 ...  0.   0.   0. ]
 ...
 [ 0.   1.  12.9 ...  1.   1.   1. ]
 [ 0.   1.  12.7 ...  1.   1.   1. ]
 [ 0.   1.  13.6 ...  0.   0.   0. ]]


In [52]:
print(y_train)

[1 0 1 ... 0 1 1]


In [53]:
print(y_test)

[0 1 0 ... 1 1 0]


Feature Scalling

In [57]:
sc = StandardScaler()
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [58]:
print(x_train)

[[ 0.          1.         12.3        ...  0.97676975  1.01969389
   1.00050013]
 [ 1.          0.         13.4        ... -1.02378273 -0.98068647
  -0.99950012]
 [ 0.          1.         11.8        ...  0.97676975  1.01969389
   1.00050013]
 ...
 [ 0.          1.         13.4        ... -1.02378273 -0.98068647
  -0.99950012]
 [ 0.          1.         13.7        ...  0.97676975  1.01969389
   1.00050013]
 [ 0.          1.         15.1        ...  0.97676975  1.01969389
   1.00050013]]


In [59]:
print(x_test)

[[ 0.          1.         12.8        ... -1.02378273  1.01969389
  -0.99950012]
 [ 0.          1.         12.3        ...  0.97676975  1.01969389
   1.00050013]
 [ 0.          1.         11.5        ... -1.02378273 -0.98068647
  -0.99950012]
 ...
 [ 0.          1.         12.9        ...  0.97676975  1.01969389
   1.00050013]
 [ 0.          1.         12.7        ...  0.97676975  1.01969389
   1.00050013]
 [ 0.          1.         13.6        ... -1.02378273 -0.98068647
  -0.99950012]]
