In [6]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
# SimpleImputer walkthrough: learn per-column means from a small training
# matrix, then use those means to fill the NaN entries of a second matrix.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean').fit(
    [[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]
)
# Every NaN below is replaced by the mean learned for its column; NaNs in the
# training matrix are skipped when that mean is computed (column 1 -> 3.5).
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
print(imp_mean.transform(X))

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   3.5  9. ]]


In [8]:
# Report which scikit-learn release is installed in this kernel.
version_template = "Sklearn version is {}"
print(version_template.format(sklearn.__version__))

Sklearn version is 1.3.2


In [11]:
# OneHotEncoder walkthrough on a toy table with two categorical columns.
X = [['Male', 1], ['Female', 3], ['Female', 2]]
# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros for that column instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore').fit(X)
# Categories learned for each input column.
print(enc.categories_)
# The value 4 in the second row was never seen, so its block is all zeros.
print(enc.transform([['Female', 1], ['Male', 4]]).toarray())
# Decoding: an all-zero block for a column maps back to None.
print(enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]]))
# Output column names, built from the supplied input feature names.
feature_names = enc.get_feature_names_out(['gender', 'group'])
print(feature_names)

[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
[[1. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]]
[['Male' 1]
 [None 2]]
['gender_Female' 'gender_Male' 'group_1' 'group_2' 'group_3']


In [12]:
# Load the dataset from CSV and separate the feature columns from the target.
dataset = pd.read_csv('../datasets/Data.csv')
X = dataset.iloc[:, :-1].values  # features: every column except the last one
# Select the target by position -1 rather than a hard-coded index, so X and Y
# always partition the columns regardless of how many columns the CSV has.
Y = dataset.iloc[:, -1].values   # target: the last column

In [13]:
# Replace missing values (NaN) in columns 1 and 2 of X with the column mean.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
# fit_transform learns the column means and applies them in a single call;
# `imputer` is left fitted afterwards.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [14]:
# Encoding categorical data
# One-hot encode column 0 of X; every other column is passed through as-is.
# - The transformer is registered under an explicit name ("onehot") so it can
#   be addressed via named_transformers_ / set_params and produces readable
#   feature-name prefixes.
# - sparse_threshold=0 forces a dense result. With the default threshold the
#   output may be a sparse matrix when the one-hot block is large, which the
#   StandardScaler() used later in this notebook cannot centre.
ct = ColumnTransformer(
    [("onehot", OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = ct.fit_transform(X)
# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

In [15]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Feature scaling: learn the scaling statistics from the training split only,
# then apply that same fitted scaler to both splits.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)