In [1]:
import pandas as pd
import numpy as np
import seaborn  as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [2]:
# 1. Binning
# Purpose:
# - Turn a quantitative variable into a categorical one by splitting its range into intervals
# - Reduce noise in the data
# - Smooth the data
# - Helps in many machine-learning problems, e.g. decision trees

# Demo frame: one list of 12 values per column
df = pd.DataFrame.from_dict({
    'points': [4, 4, 7, 8, 12, 13, 15, 18, 22, 23, 23, 25],
    'assists': [2, 5, 4, 7, 7, 8, 5, 4, 5, 11, 13, 8],
    'rebounds': [7, 7, 4, 6, 3, 8, 9, 9, 12, 11, 8, 9],
})

In [3]:
# Cut 'points' into 3 quantile-based bins and store the interval each row falls in
df['points_bin'] = df['points'].pipe(pd.qcut, q=3)


# Show the frame including the new 'points_bin' column
print(df)

    points  assists  rebounds        points_bin
0        4        2         7   (3.999, 10.667]
1        4        5         7   (3.999, 10.667]
2        7        4         4   (3.999, 10.667]
3        8        7         6   (3.999, 10.667]
4       12        7         3  (10.667, 19.333]
5       13        8         8  (10.667, 19.333]
6       15        5         9  (10.667, 19.333]
7       18        4         9  (10.667, 19.333]
8       22        5        12    (19.333, 25.0]
9       23       11        11    (19.333, 25.0]
10      23       13         8    (19.333, 25.0]
11      25        8         9    (19.333, 25.0]


In [4]:
# Number of rows that landed in each bin
df.loc[:, 'points_bin'].value_counts()

points_bin
(3.999, 10.667]     4
(10.667, 19.333]    4
(19.333, 25.0]      4
Name: count, dtype: int64

In [6]:
# Bin 'points' at explicit quantile edges (five 20% slices) and name the bins A-E.
# This overwrites the 'points_bin' column created earlier.
quintile_edges = [0, .2, .4, .6, .8, 1]
quintile_labels = ['A', 'B', 'C', 'D', 'E']
df['points_bin'] = pd.qcut(df['points'], q=quintile_edges, labels=quintile_labels)


# Show the frame with the relabelled bins
print(df)

    points  assists  rebounds points_bin
0        4        2         7          A
1        4        5         7          A
2        7        4         4          A
3        8        7         6          B
4       12        7         3          B
5       13        8         8          C
6       15        5         9          C
7       18        4         9          D
8       22        5        12          D
9       23       11        11          E
10      23       13         8          E
11      25        8         9          E


In [7]:
# Same quantile edges as the labelled version, but without labels so the
# interval bounds of each bin are displayed instead of a letter
df['points_bin'] = df['points'].pipe(pd.qcut, q=[0, .2, .4, .6, .8, 1])
print(df)

    points  assists  rebounds    points_bin
0        4        2         7  (3.999, 7.2]
1        4        5         7  (3.999, 7.2]
2        7        4         4  (3.999, 7.2]
3        8        7         6   (7.2, 12.4]
4       12        7         3   (7.2, 12.4]
5       13        8         8  (12.4, 16.8]
6       15        5         9  (12.4, 16.8]
7       18        4         9  (16.8, 22.8]
8       22        5        12  (16.8, 22.8]
9       23       11        11  (22.8, 25.0]
10      23       13         8  (22.8, 25.0]
11      25        8         9  (22.8, 25.0]


In [8]:
import pandas as pd


# Demo frame with one categorical column ('team') and one numeric column ('points'),
# built from (team, points) records
df = pd.DataFrame(
    [('A', 12), ('B', 15), ('C', 19), ('D', 22),
     ('E', 24), ('F', 25), ('G', 26), ('H', 30)],
    columns=['team', 'points'],
)


# Show the frame
print(df)

  team  points
0    A      12
1    B      15
2    C      19
3    D      22
4    E      24
5    F      25
6    G      26
7    H      30


In [9]:
# Derive a categorical 'status' from the numeric 'points' column.
# Bins are right-closed: (0, 15] -> Bad, (15, 25] -> OK, (25, inf] -> Good
status_edges = [0, 15, 25, float('Inf')]
status_names = ['Bad', 'OK', 'Good']
df['status'] = pd.cut(df['points'], bins=status_edges, labels=status_names)


# Show the frame with the new 'status' column
print(df)

  team  points status
0    A      12    Bad
1    B      15    Bad
2    C      19     OK
3    D      22     OK
4    E      24     OK
5    F      25     OK
6    G      26   Good
7    H      30   Good


In [11]:
import pandas as pd


# One id per country. Both lists must have the same length: pairing them with
# zip() would silently drop the surplus entries of the longer list.
ids = [11, 22, 33, 44, 55]
countries = ['Spain', 'France', 'Spain', 'Germany', 'France']


# Building from a dict of columns raises ValueError if the lengths ever diverge,
# instead of quietly truncating the data.
df = pd.DataFrame({'Ids': ids, 'Countries': countries})
df

Unnamed: 0,Ids,Countries
0,11,Spain
1,22,France
2,33,Spain
3,44,Germany
4,55,France


In [12]:
# One-hot encode the country names: one boolean 'Country_<name>' column per distinct value
dfOneHotEncoding = pd.get_dummies(df['Countries'], prefix='Country')
print(dfOneHotEncoding.head(5))

   Country_France  Country_Germany  Country_Spain
0           False            False           True
1            True            False          False
2           False            False           True
3           False             True          False
4            True            False          False


In [13]:
# Append the one-hot columns to df. Any dummy columns already present are
# dropped first, so re-running this cell does not add duplicate columns.
df = pd.concat(
    [df.drop(columns=dfOneHotEncoding.columns, errors='ignore'), dfOneHotEncoding],
    axis=1,
)
df


Unnamed: 0,Ids,Countries,Country_France,Country_Germany,Country_Spain
0,11,Spain,False,False,True
1,22,France,True,False,False
2,33,Spain,False,False,True
3,44,Germany,False,True,False
4,55,France,True,False,False


In [18]:
# Vectorise the country labels: each row becomes a 0/1 indicator vector
# with one position per distinct country
from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()
y = label_binarizer.fit_transform(df['Countries'])
y


array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]])

In [17]:
from sklearn.preprocessing import OneHotEncoder
# NOTE: fitting on the whole frame one-hot encodes EVERY column (Ids, Countries
# and the dummy columns already appended to df), which is why the result has
# 14 columns. Pass only the column(s) to encode, e.g. df[['Countries']].
full_frame_encoder = OneHotEncoder()
y = full_frame_encoder.fit_transform(df).toarray()
print(y)

[[1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1.]
 [0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]
 [0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0.]]


In [19]:
# One-hot encode only the 'Countries' column. The double brackets keep the
# selection as a one-column DataFrame, so a single column is passed to the encoder.
from sklearn.preprocessing import OneHotEncoder
country_encoder = OneHotEncoder()
y = country_encoder.fit_transform(df[['Countries']]).toarray()
print(y)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [21]:
import pandas as pd


# Demo frame for the feature-scaling examples: three numeric columns
# with noticeably different value ranges
df = pd.DataFrame(
    {
        'Age': [10, 35, 34, 23, 70, 55, 89],
        'Height_inch': [130, 178, 155, 133, 195, 150, 205],
        'Weight_lb': [80, 200, 220, 150, 140, 95, 180],
    }
)
df

Unnamed: 0,Age,Height_inch,Weight_lb
0,10,130,80
1,35,178,200
2,34,155,220
3,23,133,150
4,70,195,140
5,55,150,95
6,89,205,180


In [23]:
# 1. Maximum absolute scaling
# x_scaled = x / max(|x|)
def absolute_maximum_scale(series):
    """Scale a numeric Series by its largest absolute value.

    Args:
        series: numeric pandas Series.

    Returns:
        A Series of ``series / max(|series|)``, so every value lies in
        [-1, 1]. If the largest absolute value is 0 (all-zero column) the
        values are returned as zeros instead of the NaN that 0 / 0 would
        produce.
    """
    max_abs = series.abs().max()
    # An all-zero column has nothing to scale; dividing by 1 keeps it at 0.0
    # rather than turning every entry into NaN.
    if max_abs == 0:
        max_abs = 1
    return series / max_abs


# Rescale every column in turn. Note that this overwrites df itself, so the
# cells that follow operate on the scaled values, not the raw ones.
for column_name in list(df):
    df[column_name] = absolute_maximum_scale(df[column_name])


print(df)

        Age  Height_inch  Weight_lb
0  0.112360     0.634146   0.363636
1  0.393258     0.868293   0.909091
2  0.382022     0.756098   1.000000
3  0.258427     0.648780   0.681818
4  0.786517     0.951220   0.636364
5  0.617978     0.731707   0.431818
6  1.000000     1.000000   0.818182


In [24]:
# Maximum absolute scaling with scikit-learn's MaxAbsScaler
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
# Fit and transform in a single call
scaled = scaler.fit_transform(df)
# The transformer output carries no labels, so reattach the column names
scaled_df = pd.DataFrame(scaled, columns=df.columns)


print(scaled_df)


        Age  Height_inch  Weight_lb
0  0.112360     0.634146   0.363636
1  0.393258     0.868293   0.909091
2  0.382022     0.756098   1.000000
3  0.258427     0.648780   0.681818
4  0.786517     0.951220   0.636364
5  0.617978     0.731707   0.431818
6  1.000000     1.000000   0.818182


In [26]:
# 2. Min-max scaling
# x_scaled = (x - min(x)) / (max(x) - min(x))
# Normalise every column with scikit-learn's MinMaxScaler
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
# fit_transform() learns min/max and applies the scaling in one step;
# a separate fit() call beforehand is redundant.
scaled = scaler.fit_transform(df)
# Reattach the column labels and the original index to the scaled values
scaled_df = pd.DataFrame(scaled, columns=df.columns, index=df.index)


print(scaled_df)

        Age  Height_inch  Weight_lb
0  0.000000     0.000000   0.000000
1  0.316456     0.640000   0.857143
2  0.303797     0.333333   1.000000
3  0.164557     0.040000   0.500000
4  0.759494     0.866667   0.428571
5  0.569620     0.266667   0.107143
6  1.000000     1.000000   0.714286


In [29]:
# 3. Z-score (standardization)
# x_scaled = (x - mean) / std, where std is the standard deviation
# Used to bring features measured in different, non-comparable units onto a
# common frame of reference so they can be compared.
# Example (from the original notes): with a budget of 2 billion, comparing a
# house in Cu Chi with one in the city by distance to the centre.
def z_score_standardization(series, ddof=1):
    """Standardize a numeric Series to zero mean and unit standard deviation.

    Args:
        series: numeric pandas Series.
        ddof: delta degrees of freedom passed to ``Series.std``. The default
            of 1 (sample standard deviation) is pandas' own default; use
            ``ddof=0`` for the population standard deviation, which is what
            scikit-learn's StandardScaler uses.

    Returns:
        A Series of ``(series - mean) / std``. If the standard deviation is 0
        (constant column) the centred values, all 0.0, are returned instead
        of the NaN that 0 / 0 would produce.
    """
    centered = series - series.mean()
    spread = series.std(ddof=ddof)
    # A constant column has no spread; dividing by 1 keeps the zeros.
    if spread == 0:
        spread = 1
    return centered / spread


# Standardize every column in turn. Note that this overwrites df itself, so the
# cells that follow operate on the standardized values.
for column_name in list(df):
    df[column_name] = z_score_standardization(df[column_name])


print(df)

        Age  Height_inch  Weight_lb
0 -1.270474    -1.141772  -1.384428
1 -0.366682     0.483802   0.918383
2 -0.402833    -0.295119   1.302185
3 -0.800502    -1.040174  -0.041122
4  0.898628     1.059526  -0.233023
5  0.356352    -0.464450  -1.096577
6  1.585510     1.398187   0.534581


In [30]:
# Z-score standardization with scikit-learn's StandardScaler.
# StandardScaler divides by the population standard deviation (ddof=0), while
# pandas' Series.std defaults to ddof=1, so these values differ slightly from
# a pandas-based z-score of the same data.
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
# fit_transform() learns mean/std and applies the scaling in one step;
# a separate fit() call beforehand is redundant.
scaled = scaler.fit_transform(df)


# Reattach the column labels and the original index to the scaled values
scaled_df = pd.DataFrame(scaled, columns=df.columns, index=df.index)


print(scaled_df)

        Age  Height_inch  Weight_lb
0 -1.372269    -1.233255  -1.495353
1 -0.396061     0.522566   0.991967
2 -0.435110    -0.318765   1.406520
3 -0.864641    -1.123516  -0.044416
4  0.970629     1.144419  -0.251693
5  0.384905    -0.501663  -1.184438
6  1.712547     1.510215   0.577414
