# Generic Estimator Pipeline

In [None]:
import pandas as pd
from typing import List, Optional, Dict, Any
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

## Preprocessing

In [None]:
# Where the raw data lives and how its fields are delimited.
dataset_path: str = "../datasets/flags/flag.data"
item_separator = ","

# Name of the target column.
y_name = "religion"

# Header for the data file; handed to the CSV reader as `names`, so the order
# here is the order in which the file's fields are labelled.
column_names: Optional[List[str]] = [
    "name", "landmass", "zone", "area", "population", "language", "religion",
    "bars", "stripes", "colours",
    "red", "green", "blue", "gold", "white", "black", "orange",
    "mainhue",
    "circles", "crosses", "saltires", "quarters", "sunstars",
    "crescent", "triangle", "icon", "animate", "text",
    "topleft", "botright",
]

# Per-column value translations: each key is a column name and each value is
# the {original value: replacement} mapping applied to that column.
to_translate: Dict[str, Dict[Any, Any]] = {
    "religion": {
        0: "Catholic",
        1: "Other Christian",
        2: "Muslim",
        3: "Buddhist",
        4: "Hindu",
        5: "Ethnic",
        6: "Marxist",
        7: "Others",
    },
    "language": {
        1: "English",
        2: "Spanish",
        3: "French",
        4: "German",
        5: "Slavic",
        6: "Other Indo-European",
        7: "Chinese",
        8: "Arabic",
        9: "Japanese/Turkish/Finnish/Magyar",
        10: "Others",
    },
}

# Columns removed before any further processing.
to_drop = ["name"]

# Columns that get one-hot encoded.
categorical_columns: List[str] = [
    "mainhue",
    "landmass",
    "zone",
    "language",
    "topleft",
    "botright",
]

# Columns that get min-max scaled.
columns_to_normalize: List[str] = [
    "area",
    "population",
    "bars",
    "stripes",
    "colours",
    "circles",
    "crosses",
    "saltires",
    "quarters",
    "sunstars",
]

# One-hot encode the categorical columns; every other column is passed
# through unchanged.
transformers = [
    ("onehot", OneHotEncoder(), categorical_columns),
]
encoder = ColumnTransformer(
    transformers=transformers,
    remainder="passthrough",
    # Always produce a dense array. With the default threshold (0.3) the
    # stacked output switches to a SciPy sparse matrix once the one-hot block
    # dominates, and the result is later wrapped in a pandas DataFrame with
    # explicit column names, which needs a dense 2-D array.
    sparse_threshold=0,
)

# Encodes the target labels as integers.
y_encoder = LabelEncoder()

# Min-max scale the numeric columns; every other column is passed through
# unchanged. `transformers` is deliberately rebound to a new list here, so
# `encoder` above keeps its own one-hot configuration.
transformers = [
    ("scaler", MinMaxScaler(), columns_to_normalize),
]
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder="passthrough",
)

## Loading

In [None]:
# Read the delimited data file, labelling its fields with `column_names`.
raw_df = pd.read_csv(
    dataset_path,
    names=column_names,
    sep=item_separator,
)

## Splitting & Cleaning

In [None]:
# Remove the unused columns. `drop` returns a new frame, so `raw_df` itself is
# left untouched and no explicit copy is needed.
translated_df = raw_df.drop(to_drop, axis=1)

# Replace coded values with their readable labels, column by column.
for col, mapping in to_translate.items():
    translated_df[col] = translated_df[col].replace(mapping)

# Fraction of the rows held out for testing.
test_size: float = 0.2

# Separate the features from the target column.
X = translated_df.drop(y_name, axis=1)
y = translated_df[y_name]

# One-hot encode the categorical features. The row index of `X` is carried
# over so the encoded frame stays aligned with the data it came from.
X_one_hot = pd.DataFrame(
    encoder.fit_transform(X),
    columns=encoder.get_feature_names_out(),
    index=X.index,
)

# ColumnTransformer names its outputs "<transformer>__<feature>". Strip only
# that leading prefix: splitting once keeps feature names that themselves
# contain "__" intact, and names without "__" are returned unchanged.
X_one_hot.columns = [col.split("__", 1)[-1] for col in X_one_hot.columns]

# Encode the target labels as integers, keeping the same row index as `y`.
y_encoded = pd.Series(y_encoder.fit_transform(y), index=y.index)


In [None]:
# Seed the shuffle so the same rows land in the train and test sets on every
# run; without it, results differ each time the notebook is executed.
random_state: int = 42
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_one_hot,
    y_encoded,
    test_size=test_size,
    random_state=random_state,
)



In [None]:

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits.
preprocessor.fit(X_train_raw)
feature_names = preprocessor.get_feature_names_out()

# Keep each split's original row index. `y_train` / `y_test` still carry the
# shuffled index produced by the split, so rebuilding the frames with a fresh
# 0..n-1 index would misalign features and labels in any index-based
# operation (boolean masks, concat, joins).
X_train = pd.DataFrame(
    preprocessor.transform(X_train_raw),
    columns=feature_names,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test_raw),
    columns=feature_names,
    index=X_test_raw.index,
)
X_train.head(n=10)

In [None]:

# Preview the first ten training labels.
y_train.head(10)