In [2]:
import pandas as pd
import numpy as np
import sklearn

In [11]:
data = {
    'size': ['XL', 'L', 'M', 'L', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red'],
    'gender': ['female', 'male', 'male', 'female', 'female'],
    'price': [199.0, 89.0, 99.0, 129.0, 79.0],
    'weight': [500, 450, 300, 380, 410],
    'bought': ['yes', 'no', 'yes', 'no', 'yes']
}

df_raw = pd.DataFrame(data=data)
df_raw

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [12]:
df = df_raw.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    5 non-null      object 
 1   color   5 non-null      object 
 2   gender  5 non-null      object 
 3   price   5 non-null      float64
 4   weight  5 non-null      int64  
 5   bought  5 non-null      object 
dtypes: float64(1), int64(1), object(4)
memory usage: 368.0+ bytes


In [13]:
for col in ['size', 'color', 'gender', 'bought']:
    df[col] = df[col].astype('category')

df['weight'] = df['weight'].astype('float')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   size    5 non-null      category
 1   color   5 non-null      category
 2   gender  5 non-null      category
 3   price   5 non-null      float64 
 4   weight  5 non-null      float64 
 5   bought  5 non-null      category
dtypes: category(4), float64(2)
memory usage: 740.0 bytes


In [14]:
df.describe()

Unnamed: 0,price,weight
count,5.0,5.0
mean,119.0,408.0
std,48.476799,75.299402
min,79.0,300.0
25%,89.0,380.0
50%,99.0,410.0
75%,129.0,450.0
max,199.0,500.0


In [17]:
df.describe(include=['category'])

Unnamed: 0,size,color,gender,bought
count,5,5,5,5
unique,3,3,2,2
top,L,green,female,yes
freq,2,2,3,3


In [18]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,male,99.0,300.0,yes
3,L,green,female,129.0,380.0,no
4,M,red,female,79.0,410.0,yes


### pd.get_dummies()

In [21]:
pd.get_dummies(data=df, columns=['size'], dtype='int')

Unnamed: 0,color,gender,price,weight,bought,size_L,size_M,size_XL
0,red,female,199.0,500.0,yes,0,0,1
1,green,male,89.0,450.0,no,1,0,0
2,blue,male,99.0,300.0,yes,0,1,0
3,green,female,129.0,380.0,no,1,0,0
4,red,female,79.0,410.0,yes,0,1,0


### LabelEncoder

In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
le = LabelEncoder()
le.fit(df['bought'])
le.transform(df['bought'])


array([1, 0, 1, 0, 1])

In [25]:
le.classes_

array(['no', 'yes'], dtype=object)

In [26]:
df['bought'] = le.fit_transform(df['bought'])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,1
1,L,green,male,89.0,450.0,0
2,M,blue,male,99.0,300.0,1
3,L,green,female,129.0,380.0,0
4,M,red,female,79.0,410.0,1


In [27]:
le.inverse_transform(df['bought'])

array(['yes', 'no', 'yes', 'no', 'yes'], dtype=object)

In [29]:
from sklearn.preprocessing import OrdinalEncoder

In [34]:
# Jak zakodowac 2,1,0
oe = LabelEncoder()
oe.fit(df['size'])
el = oe.transform(df['size'])
el.ravel().astype(int)

array([2, 0, 1, 0, 1])

### OneHotEncoder

In [36]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
encoder.fit(df[['size']])

In [37]:
encoder.transform(df[['size']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [39]:
encoder.categories_

[array(['L', 'M', 'XL'], dtype=object)]

### Standaryzacja

- Średnia: Po zastosowaniu StandardScaler, średnia wszystkich cech wynosi 0.
- Odchylenie standardowe: Odchylenie standardowe wynosi 1.
- Zakres wartości: Wartości mogą przyjmować zarówno dodatnie, jak i ujemne liczby, co oznacza, że nie ma ustalonego ograniczenia dla minimalnej i maksymalnej wartości. W praktyce wartości te mogą wahać się od kilku jednostek ujemnych do kilku jednostek dodatnich, w zależności od oryginalnego rozkładu danych1.

In [41]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,1
1,L,green,male,89.0,450.0,0
2,M,blue,male,99.0,300.0,1
3,L,green,female,129.0,380.0,0
4,M,red,female,79.0,410.0,1


In [40]:
print(f"{df['price']}\n")
print(f"Średnia: {df['price'].mean()}")
print(f"Odchylenie standardowe: {df['price'].std():.4f}")

0    199.0
1     89.0
2     99.0
3    129.0
4     79.0
Name: price, dtype: float64

Średnia: 119.0
Odchylenie standardowe: 48.4768


In [42]:
from sklearn.preprocessing import StandardScaler

In [44]:
scaler = StandardScaler()
scaler.fit_transform(df[['price']])

array([[ 1.84506242],
       [-0.69189841],
       [-0.4612656 ],
       [ 0.2306328 ],
       [-0.92253121]])

In [46]:
scaler = StandardScaler()
scaler.fit_transform(df[['weight']])

array([[ 1.3660019 ],
       [ 0.62360956],
       [-1.60356745],
       [-0.41573971],
       [ 0.02969569]])

### MinMaxScaler

In [47]:
from sklearn.preprocessing import MinMaxScaler