# Generic Estimator Pipeline

In [2]:
import pandas as pd
from typing import List, Optional, Dict, Any
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

## Preprocessing

In [3]:
dataset_path: str = "../datasets/flags/flag.data"
column_names: Optional[List[str]] = [
    "name",
    "landmass",
    "zone",
    "area",
    "population",
    "language",
    "religion",
    "bars",
    "stripes",
    "colours",
    "red",
    "green",
    "blue",
    "gold",
    "white",
    "black",
    "orange",
    "mainhue",
    "circles",
    "crosses",
    "saltires",
    "quarters",
    "sunstars",
    "crescent",
    "triangle",
    "icon",
    "animate",
    "text",
    "topleft",
    "botright",
]

item_separator = ","

# A dictionary with keys for each column that should be translated (for example, from a label to a string).
# Each of these keys yields a translation dictionary.
to_translate: Dict[str, Dict[Any, Any]] = {
    "religion": {0: "Catholic", 1: "Other Christian", 2: "Muslim", 3: "Buddhist", 4: "Hindu", 5: "Ethnic", 6: "Marxist", 7: "Others"},
    "language": {1: "English", 2: "Spanish", 3: "French", 4: "German", 5: "Slavic", 6: "Other Indo-European", 7: "Chinese", 8: "Arabic", 9: "Japanese/Turkish/Finnish/Magyar", 10: "Others"},
}

categorical_columns: List[str] = ["mainhue", "landmass", "zone", "language", "topleft", "botright"]
columns_to_normalize: List[str] = ["area", "population", "bars", "stripes", "colours", "circles", "crosses", "saltires", "quarters", "sunstars"]    

transformers = [
    ("onehot", OneHotEncoder(), categorical_columns),
]
encoder = ColumnTransformer(
    transformers=transformers, 
    remainder="passthrough"
)


transformers = [
        ("scaler", MinMaxScaler(), columns_to_normalize),
    ]
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder="passthrough"
)

## Loading

In [4]:
raw_df = pd.read_csv(dataset_path, sep=item_separator, names=column_names)

## Splitting & Cleaning

In [5]:
translated_df = raw_df.copy()
for col in to_translate:
    translated_df[col] = translated_df[col].replace(to_translate[col])

test_size: float = 0.2
y_name = "religion"
X = translated_df.drop(y_name, axis=1)
y = translated_df[y_name]

X_one_hot = pd.DataFrame(encoder.fit(X)
.transform(X), columns=encoder
.get_feature_names_out())
X_one_hot
X_one_hot.columns = [col.split("__")[1] if ("__" in col) else col for col in X_one_hot.columns]


In [6]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_one_hot, y, test_size=test_size)



In [7]:

preprocessor.fit(X_train_raw)
X_train = pd.DataFrame(preprocessor.transform(X_train_raw), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test_raw), columns=preprocessor.get_feature_names_out())
X_train.head(n=10)

Unnamed: 0,scaler__area,scaler__population,scaler__bars,scaler__stripes,scaler__colours,scaler__circles,scaler__crosses,scaler__saltires,scaler__quarters,scaler__sunstars,...,remainder__blue,remainder__gold,remainder__white,remainder__black,remainder__orange,remainder__crescent,remainder__triangle,remainder__icon,remainder__animate,remainder__text
0,0.001027,0.0,0.0,0.142857,1.0,0.25,0.0,0.0,0.0,0.0,...,1,1,1,1,1,0,0,1,1,1
1,0.004375,0.002924,0.0,0.214286,0.428571,0.0,0.0,0.0,0.0,0.045455,...,0,0,1,1,0,0,1,0,0,0
2,0.010535,0.019006,0.0,0.428571,0.571429,0.25,0.0,0.0,0.0,0.0,...,0,1,1,1,0,0,0,0,1,0
3,0.015266,0.002924,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.045455,...,0,1,0,0,0,0,0,1,1,0
4,0.0,0.0,0.0,0.142857,0.285714,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,1,0,0
5,0.056557,0.00731,0.0,0.214286,0.285714,0.25,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,0,0,0
6,0.010579,0.032164,0.6,0.0,0.857143,0.0,0.0,0.0,0.0,0.090909,...,1,1,1,0,1,0,0,1,1,1
7,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
8,0.012677,0.011696,0.0,0.214286,0.285714,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
9,0.0,0.0,0.0,0.071429,0.571429,0.0,0.0,0.0,0.0,0.045455,...,1,1,1,1,0,0,1,0,0,0


In [8]:
y_train.head(n=10)

17     Other Christian
91              Muslim
179             Ethnic
40              Ethnic
101           Catholic
125             Muslim
142            Marxist
148    Other Christian
50            Catholic
7      Other Christian
Name: religion, dtype: object

# Training