<a href="https://colab.research.google.com/github/zuhayerror3i8/AI-ML-Expert-With-Phitron-Batch-01/blob/main/001%20Machine%20Learning/003_Module_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Module 03: Scaling, Encoding, and Distances


## Standardization (Z-Score Scaling)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Small two-column sample frame (columns 'h' and 'w') for the z-score demo.
df = pd.DataFrame(
    dict(
        h=[150, 160, 170, 185, 190],
        w=[50, 58, 70, 80, 90],
    )
)
df

In [None]:
# Column-wise z-score: subtract each column's mean, then divide by its
# standard deviation. Note that DataFrame.std() defaults to the sample
# formula (ddof=1).
m = df.mean()
s = df.std()
print(s)
z = df.sub(m).div(s)
z.round(2)

In [None]:
# Grouped bar chart of the standardized columns, one group per row index.
ax = z.plot.bar()
ax.set_title("Z scaling Features")
ax.set_xlabel("Index")
ax.set_ylabel('z value')
plt.show()

## Min–Max Scaling (Rescaling to [0,1])

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Sample frame for the min-max demo.
# NOTE(review): the fourth 'w' value (180) is far above the others — confirm
# it is an intentional outlier and not a typo for 80.
df = pd.DataFrame(
    dict(
        h=[150, 175, 170, 180, 190],
        w=[50, 60, 70, 180, 90],
    )
)
df

In [None]:
# Step 1: per-column minimum, maximum, and range (max - min)
mn, mx = df.min(), df.max()
rg = mx.sub(mn)
(mn, mx, rg)

In [None]:
# Step 2: shift each column so that its minimum becomes zero
ss = df.sub(mn)
ss.head()

In [None]:
# Step 3: divide the shifted values by each column's range
mm = ss.div(rg)
mm.round(2)

In [None]:
# Plot: grouped bar chart of the min-max scaled columns
ax = mm.plot.bar()
ax.set_title("Min-Max Scaling [0-1]")
ax.set_xlabel("Index")
ax.set_ylabel('Min-max value')
plt.show()

## Robust Scaling (Outlier-Resistant)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Sample frame for the robust-scaling demo; the last row is far larger than
# the other four in both columns.
df = pd.DataFrame(
    dict(
        h=[150, 160, 170, 180, 300],
        w=[50, 60, 70, 80, 200],
    )
)
df

In [None]:
# Step 1: per-column median, first and third quartiles, and the
# interquartile range (Q3 - Q1)
md = df.median()
q1, q3 = df.quantile(0.25), df.quantile(0.75)
iqr = q3.sub(q1)
(md, q1, q3, iqr)

In [None]:
# Step 2: subtract each column's median so the columns are centred on it
ct = df.sub(md)
ct

In [None]:
# Step 3: scale the centred values by each column's interquartile range
rb = ct.div(iqr)
rb.round(2)

In [None]:
# Grouped bar chart of the robust-scaled columns
ax = rb.plot.bar()
ax.set_title("Robust Scaling")
ax.set_xlabel("Index")
ax.set_ylabel('Robust value')
plt.show()

In [None]:
# Recompute z-score and min-max scaling on the current frame so the three
# scalers can be compared column by column.

# z-score: (x - mean) / std
m, s = df.mean(), df.std()
z = df.sub(m).div(s)

# min-max: (x - min) / (max - min)
mn, mx = df.min(), df.max()
mm = df.sub(mn).div(mx.sub(mn))

# Raw and scaled columns side by side; the prefix identifies the scaler.
# `rb` is the robust-scaled frame produced by the earlier robust-scaling cell.
scaled_frames = [
    df,
    z.add_prefix('z_'),
    mm.add_prefix('mm_'),
    rb.add_prefix('rb_'),
]
out = pd.concat(scaled_frames, axis="columns")
out.round(2)

## One-Hot Encoding

In [None]:
import pandas as pd

# Sample frame with two categorical columns ("color", "size") and two
# numeric ones ("id", "price") for the one-hot encoding demo.
df = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4],
        color=["red", "blue", "green", "red"],
        size=["Small", "Medium", "Large", "Medium"],
        price=[10, 12, 15, 11],
    )
)

In [None]:
# Step 2: one-hot encode the 'color' column into integer 0/1 indicator
# columns, each named with the "C" prefix
color_column = df["color"]
d_color = pd.get_dummies(color_column, prefix="C", dtype=int)

In [None]:
# Step 3: append the indicator columns to the right of the original frame
df_encoded = pd.concat((df, d_color), axis="columns")

In [None]:
# Step 4: Drop the old 'color' column if you no longer need it.
# This cell reassigns df_encoded from itself, so a second run would no longer
# find 'color' and raise KeyError; errors="ignore" keeps the cell re-runnable.
df_encoded = df_encoded.drop(columns="color", errors="ignore")
# Bare last expression gives the notebook's rich table display instead of
# the plain-text print() output.
df_encoded

## Ordinal Encoding (for ordered categories)

In [None]:
import pandas as pd

# Same sample frame as the one-hot demo, rebuilt here so the ordinal
# encoding starts from the original string labels.
df = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4],
        color=["red", "blue", "green", "red"],
        size=["Small", "Medium", "Large", "Medium"],
        price=[10, 12, 15, 11],
    )
)
df

In [None]:
# Step 2: declare the ordinal order — labels listed from smallest to largest
# are numbered 1, 2, 3
order = {
    label: rank
    for rank, label in enumerate(["Small", "Medium", "Large"], start=1)
}

In [None]:
# Step 3: convert the whole feature (overwrites "size" with its integer rank;
# assign to a new column such as "size_encoded" instead to keep the labels).
#
# Series.map turns every value that is not a key of `order` into NaN. That
# happens for unknown labels, and also for every row if this cell is run a
# second time on the already-encoded column. Both cases used to end in an
# opaque astype(int) failure, so they are handled explicitly here.
if not df["size"].isin(list(order.values())).all():
    size_codes = df["size"].map(order)
    unknown = df.loc[size_codes.isna(), "size"].unique().tolist()
    if unknown:
        raise ValueError(f"Unmapped 'size' categories: {unknown}")
    df["size"] = size_codes.astype(int)

In [None]:
# Display the frame after Step 3: "size" now holds the integer ranks from `order`.
df

## Understanding Vectors, Dot Product, and Norms — A Hands-on Python Exploration

In [None]:
#Step 1: Creating Vectors
import numpy as np

# Two tiny 3D vectors
a, b = np.array([2, 1, 3]), np.array([1, 3, 3])
print(a, b, sep="\n")

In [None]:
# Step 2: vector operations — element-wise addition and subtraction
add_ab = np.add(a, b)
sub_ab = np.subtract(a, b)
print(add_ab, sub_ab, sep="\n")

In [None]:
# Step 3: Dot Product (Similarity of Direction)
# Manual dot product: sum of pairwise products. Iterating over the paired
# components (instead of indexing 0, 1, 2 by hand) keeps the manual result
# correct if `a` and `b` are changed to vectors of another length.
dot = sum(a_i * b_i for a_i, b_i in zip(a, b))

# NumPy equivalent; `a @ b` is the same as np.dot(a, b) for 1-D arrays.
dot_np = a @ b
print(dot)
print(dot_np)

In [None]:
# Step 4: norms (length or magnitude of a vector)
# L2 norm (the default): the usual length
l2_a = np.linalg.norm(a)
# L1 norm: requested with ord=1
l1_a = np.linalg.norm(a, 1)
print(l2_a, l1_a, sep="\n")

In [None]:
# Step 5: normalize a vector (unit vector) — divide it by its own L2 length,
# then measure the length of the result as a check
a_length = np.linalg.norm(a)
unit_a = np.divide(a, a_length)
len_unit_a = np.linalg.norm(unit_a)
print(unit_a, len_unit_a, sep="\n")

## Euclidean and Manhattan Distance

In [None]:
import numpy as np

# Five 2-D sample points (rows labelled S1..S5) and one query point q,
# all stored as floats.
samples = {
    "S1": [70, 80],
    "S2": [60, 90],
    "S3": [85, 60],
    "S4": [78, 76],
    "S5": [62, 65],
}
X = np.array(list(samples.values()), dtype=float)

q = np.asarray([75, 70], dtype=float)

print("X shape:", X.shape)
print("q:", q.tolist())

In [None]:
# Euclidean distances (L2): square root of the summed squared differences
# between each row of X and q
offsets = X - q
eu = np.sqrt(np.square(offsets).sum(axis=1))
print("Euclidean:", np.round(eu, 3).tolist())

In [None]:
# Manhattan distances (L1): sum of absolute differences between each row
# of X and q
ma = np.abs(X - q).sum(axis=1)
print("Manhattan:", ma.tolist())