<a href="https://colab.research.google.com/github/yashsinghal11/Data-Science_-Theory-Coding_tutorial-/blob/main/Encodingin_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# ALL-IN-ONE DATA ENCODING MASTER SCRIPT
# ============================================

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline


# ------------------------------------------------------
# (1) LOAD A SAMPLE DATASET
# ------------------------------------------------------
# Toy dataset used by every section below: three categorical features
# (City, Gender, Quality), two numeric features (Age, Salary) and a
# binary "target" column.
df = pd.DataFrame(
    dict(
        City=["Delhi", "Mumbai", "Chennai", "Delhi", "Mumbai"],
        Gender=["Male", "Female", "Female", "Male", "Male"],
        Quality=["Low", "Medium", "High", "Medium", "Low"],
        Age=[22, 34, 26, 29, 31],
        Salary=[40000, 60000, 50000, 65000, 62000],
        target=[1, 0, 1, 1, 0],
    )
)

# Show the untouched data before any encoding is applied.
print("\nOriginal Dataset:", df, sep="\n")


# ------------------------------------------------------
# (2) LABEL ENCODING (For a single column)
# ------------------------------------------------------
# LabelEncoder maps each distinct label to an integer code; the fitted
# encoder is kept in `label` so the mapping can be inspected or inverted.
label = LabelEncoder()

# Fit on the 'Gender' column and store the integer codes alongside the
# original text column (the original column is left in place).
df["Gender_Label"] = label.fit_transform(df["Gender"])

print("\nAfter Label Encoding on 'Gender':", df, sep="\n")


# ------------------------------------------------------
# (3) ORDINAL ENCODING (For ordered categories)
# ------------------------------------------------------
# Explicit category order for 'Quality'. OrdinalEncoder expects one list
# of categories per encoded column, hence the nested list.
order = [["Low", "Medium", "High"]]

ordinal = OrdinalEncoder(categories=order)

# The encoder works on 2-D input and returns an (n_rows, 1) array;
# flatten it so a plain 1-D column is stored next to the original text.
df["Quality_Ordinal"] = ordinal.fit_transform(df[["Quality"]]).ravel()

print("\nAfter Ordinal Encoding on 'Quality':", df, sep="\n")


# ------------------------------------------------------
# (4) ONE-HOT ENCODING (Using pandas get_dummies)
# ------------------------------------------------------
# Expand 'City' into indicator columns on a new frame (df itself is not
# modified). drop_first=True omits the first category level so the
# remaining indicator columns are not perfectly collinear.
df_onehot = pd.get_dummies(
    data=df,
    columns=["City"],
    drop_first=True,
)

print("\nAfter One-Hot Encoding on 'City':", df_onehot, sep="\n")


# ------------------------------------------------------
# (5) FULL PIPELINE → ColumnTransformer + ML Model
# ------------------------------------------------------

# Column groups, one per preprocessing strategy.
categorical_cols = ["City", "Gender"]   # nominal -> one-hot
ordinal_cols = ["Quality"]              # ordered -> integer rank
numeric_cols = ["Age", "Salary"]        # already numeric -> passed through

# Build the feature matrix from the raw feature columns only. By this point
# df may also carry helper columns added by the earlier encoding demos
# (e.g. "Gender_Label", "Quality_Ordinal"); dropping just "target" would let
# those leak into the model as duplicate encodings of the same information.
X = df[categorical_cols + ordinal_cols + numeric_cols]
y = df["target"]

# Explicit rank order for the ordinal column (one list per encoded column).
ordinal_order = [["Low", "Medium", "High"]]

preprocess = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that was not present in the
        # training split is encoded as all zeros instead of raising at
        # predict time (likely with such a small, randomly split dataset).
        (
            "cat",
            OneHotEncoder(drop="first", handle_unknown="ignore"),
            categorical_cols,
        ),
        ("ord", OrdinalEncoder(categories=ordinal_order), ordinal_cols),
        # Numeric columns are forwarded unchanged, explicitly by name.
        ("num", "passthrough", numeric_cols),
    ],
    # Anything not listed above is intentionally excluded from the model.
    remainder="drop",
)

# NOTE(review): "target" holds 0/1 values, so a classifier may be a better
# fit than LinearRegression; kept as-is to preserve the original demo.
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LinearRegression()),
])

# Hold out 20% of the rows for prediction; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fits the encoders and the regressor together on the training split only.
model.fit(X_train, y_train)

print("\nModel Training Completed Successfully!")
print("Pipeline Ready for Production!")


# ------------------------------------------------------
# (6) PREDICTION
# ------------------------------------------------------
# Run the held-out rows through the fitted pipeline: the same preprocessing
# learned during fit is applied before the final estimator predicts.
pred = model.predict(X_test)

print("\nPredictions:", pred, sep="\n")

