# 1. Initializations

## 1.1 General imports

In [None]:
### general
import re
import string 

### data management
import pandas as pd
import numpy as np

### machine learning (scikit-learn)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline 
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

### graphical
import matplotlib.pyplot as plt
# for jupyter notebook management
%matplotlib inline
import seaborn as sns


## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General classification functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the data set registered under the 'titanic_data' key through the project helper.
df_tit_raw = dfc.load_dataset_from_config('titanic_data', sep=',')

# Only continue when the loader actually returned a DataFrame.
# NOTE(review): otherwise df_tit is never defined and every later cell fails with NameError.
if df_tit_raw is not None and isinstance(df_tit_raw, pd.DataFrame):
    dfc.log_general_info(df_tit_raw)
    # presumably (rows counted once, total rows) — TODO confirm against smartcheck.dataframe_common
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_tit_raw)
    if nb_first != nb_total:
        print(dfc.duplicates_index_map(df_tit_raw))
    # Work on a copy so the raw frame stays untouched.
    df_tit = df_tit_raw.copy()
    display(df_tit.head())

In [None]:
# Summary statistics restricted to the numeric columns.
df_tit_desc = df_tit.select_dtypes(include=np.number).describe()
display(df_tit_desc)
# Pairwise correlation matrix of the numeric columns.
df_tit_cr = df_tit.select_dtypes(include=np.number).corr()
display(df_tit_cr)

## 2.2 features and target split

In [None]:
# Backup: keep an untouched copy of the working frame so it can be reset later.
df_tit_orig = df_tit.copy()

In [None]:
# Restore (optional): reset the working frame from the df_tit_orig backup.
df_tit = df_tit_orig.copy()

In [None]:
# Target is the 'Survived' column; the features are every other column.
y = df_tit['Survived']
X = df_tit.drop('Survived', axis=1)

# 3. Transformateurs unitaires

## 3.1 Transformateurs spécifiques

In [None]:
class AgeCat(BaseEstimator, TransformerMixin):
    """Bucket the ``Age`` column into six labelled age ranges.

    BaseEstimator provides get_params / set_params and TransformerMixin
    provides fit_transform.
    """

    # Right-inclusive edges: (0, 12], (12, 18], (18, 30], (30, 50], (50, 65], (65, inf].
    # The last edge is open-ended rather than the maximum age of the data:
    # a data-dependent edge makes pd.cut fail as soon as the oldest passenger
    # of the frame being transformed is 65 or younger (edges no longer
    # strictly increasing) or when every age is missing (NaN edge).
    _BIN_EDGES = (0, 12, 18, 30, 50, 65, np.inf)
    _BIN_LABELS = ('Kid', 'Adolescent', 'Adult-', 'Adult', 'Adult+', 'Senior')

    def __init__(self):
        # Stateless transformer: nothing to configure.
        pass

    def fit(self, X, y=None):
        """Do nothing (no state is learned) and return ``self``."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame holding the categorised ``Age``.

        Parameters
        ----------
        X : DataFrame exposing an ``Age`` column.

        Returns
        -------
        DataFrame with a single categorical ``Age`` column; ages that are
        missing or not strictly positive stay NaN.
        """
        return pd.DataFrame(
            pd.cut(
                X.Age,
                bins=list(self._BIN_EDGES),
                labels=list(self._BIN_LABELS),
            )
        )

In [None]:
# Smoke test of AgeCat on the Age column alone.
TestAge = X[['Age']]
# instantiation
age_categorized = AgeCat()
# test: show the first categorised rows and the distinct categories produced
TestAge = age_categorized.fit_transform(TestAge)
display(TestAge.head())
display(TestAge.Age.unique())

In [None]:
class FamilySize(BaseEstimator, TransformerMixin):
    """Collapse the input columns into a single ``FamilySize`` column.

    BaseEstimator provides get_params / set_params and TransformerMixin
    provides fit_transform.
    """

    def __init__(self):
        # Stateless transformer: nothing to configure.
        pass

    def fit(self, X, y=None):
        """Do nothing (no state is learned) and return ``self``."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame: row-wise sum of ``X`` plus one.

        The ``+ 1`` counts the passenger in addition to the relatives
        summed over the input columns.
        """
        relatives = X.sum(axis=1)
        head_count = relatives + 1
        return pd.DataFrame(head_count, columns=["FamilySize"])

In [None]:
# Smoke test of FamilySize on the two relative-count columns.
TestFamilySize = X[['SibSp','Parch']]
# instantiation
size_family = FamilySize()
# tests: show the aggregated column and its distinct values
TestFamilySize = size_family.fit_transform(TestFamilySize)
display(TestFamilySize)
display(TestFamilySize.FamilySize.unique())

In [None]:
def extract_groups(name: str) -> list[str]:
    """Split a ``"Surname, Title. First names"`` string into its three parts.

    Returns ``[surname, title, first_names]``. Parentheses and double quotes
    (straight or curly) are removed from the first names. When the string
    does not follow the expected layout, the two first slots are empty and
    the whole stripped input is returned in the third slot.
    """
    parsed = re.match(r"^\s*([^,]+),\s*(.+?)\.?\s+(.*)$", name)
    if parsed is None:
        return ["", "", name.strip()]

    surname, title, given_names = (part.strip() for part in parsed.groups())
    # Drop the brackets / quotes that wrap maiden names and nicknames.
    given_names = re.sub(r'[\(\)"“”]', '', given_names).strip()
    return [surname, title, given_names]
    
def parse_titanic_names(series: pd.Series) -> pd.DataFrame:
    """
    Applies extract_groups to every value of a Series and returns a DataFrame
    with columns ['Surname', 'Title', 'Firstname(s)'], indexed like the input.

    The frame is built in one go from the parsed rows instead of expanding
    each row through ``pd.Series``: this avoids creating one Series object per
    row and also yields a well-formed (empty, three-column) frame when the
    input Series is empty.
    """
    parsed_rows = [extract_groups(name) for name in series]
    return pd.DataFrame(
        parsed_rows,
        index=series.index,
        columns=["Surname", "Title", "Firstname(s)"],
    )

class SplitName(BaseEstimator, TransformerMixin):
    """Append 'Surname', 'Title' and 'Firstname(s)' columns parsed from a name column.

    BaseEstimator provides get_params / set_params and TransformerMixin
    provides fit_transform.
    """

    def __init__(self, column_name):
        self.column_name = column_name  # name of the column holding the full name

    def fit(self, X, y=None):
        """Do nothing (no state is learned) and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` (all input columns kept) plus the three parsed columns."""
        name_parts = parse_titanic_names(X[self.column_name])
        enriched = X.copy()
        enriched[['Surname', 'Title', 'Firstname(s)']] = name_parts
        return enriched

In [None]:
# Smoke test of SplitName; PassengerId and Cabin are carried along for the cabin tests further down.
TestName=X[['PassengerId','Name','Cabin']]
# instantiation
name_split= SplitName('Name')
# tests: show the enriched frame and the distinct titles extracted
TestName = name_split.fit_transform(TestName)
display(TestName)
display(TestName.Title.unique())

In [None]:
def ReturnCabin(l):
    """Return the first element of ``l`` that is exactly a ``str``, or NaN when there is none."""
    return next((elt for elt in l if type(elt) is str), np.nan)

class AddCabins(BaseEstimator, TransformerMixin):
    """Complete the cabin column with the first cabin known for the passenger's family.

    BaseEstimator provides get_params / set_params and TransformerMixin
    provides fit_transform.

    ``fit`` learns a ``surname -> first known cabin`` mapping and ``transform``
    applies it by surname. Because the lookup is keyed on the surname and not
    on row positions, it also works on rows that were not seen during ``fit``
    (e.g. a test split) and does not require the index to equal
    ``PassengerId - 1``. Rows whose family has no known cabin keep their own
    cabin value, so unseen data may carry cabin values absent from the fitted
    data; downstream encoders should tolerate unseen categories.
    """

    def __init__(self, column_id, column_cabin, column_surname):
        self.column_id = column_id            # name of the passenger id column (kept for API compatibility, not needed by the lookup)
        self.column_cabin = column_cabin      # name of the cabin column
        self.column_surname = column_surname  # name of the surname column used to group families

    def fit(self, X, y=None):
        """Learn, for every surname, the first cabin (in row order) that is a string."""
        surname, cabin = self.column_surname, self.column_cabin

        # Keep only the rows that actually carry a cabin (strings; NaN is skipped).
        has_cabin = X[cabin].map(lambda value: isinstance(value, str)).astype(bool)
        known = X.loc[has_cabin, [surname, cabin]].dropna(subset=[surname])

        # The first occurrence per surname is the family's reference cabin.
        first_per_family = known.drop_duplicates(subset=surname, keep='first')
        self.family_cabin_ = dict(
            zip(first_per_family[surname], first_per_family[cabin])
        )
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose cabin column is completed from the learned mapping.

        Every row takes its family's reference cabin when one was learned;
        otherwise the row's own cabin value is left as is.
        """
        X_t = X.copy()
        inherited = X_t[self.column_surname].map(self.family_cabin_)
        X_t[self.column_cabin] = X_t[self.column_cabin].where(inherited.isna(), inherited)
        return X_t


In [None]:
# instantiation
complete_cabins = AddCabins('PassengerId','Cabin','Surname')
# tests: compare the row of passenger 5 before and after cabin completion
display(TestName.loc[TestName['PassengerId'] == 5])
display(complete_cabins.fit_transform(TestName).loc[TestName['PassengerId'] == 5])


In [None]:
def babtri(x):
    """Map a cabin number to a ship side: even -> 'Babord', odd -> 'Tribord', anything else -> NaN."""
    remainder = x % 2
    if remainder == 0.0:
        return 'Babord'
    # NaN and non-integer numbers match neither parity.
    return 'Tribord' if remainder == 1.0 else np.nan


class SplitCabin(BaseEstimator, TransformerMixin):
    """Derive '<column>_letter' and '<column>_parite' from a cabin column.

    BaseEstimator provides get_params / set_params and TransformerMixin
    provides fit_transform.
    """

    def __init__(self, column_name):
        self.column_name = column_name  # name of the column to split

    def fit(self, X, y=None):
        """Do nothing (no state is learned) and return ``self``."""
        return self

    def transform(self, X):
        """Return a DataFrame holding only the two derived columns."""
        cabins = X[self.column_name]

        # First run of digits found in characters 1..4 of the cabin string, as float.
        # str.extract returns a DataFrame; its only column is taken as a Series.
        cabin_numbers = (
            cabins.str.slice(1, 5)
            .str.extract("([0-9]+)")
            .astype("float")
            .iloc[:, 0]
        )

        return pd.DataFrame(
            {
                self.column_name + '_letter': cabins.str.slice(0, 1),
                self.column_name + "_parite": cabin_numbers.apply(babtri),
            }
        )

In [None]:
# instantiation
cabin_split = SplitCabin('Cabin')
# test: show the letter / parity columns derived from Cabin
display(cabin_split.fit_transform(TestName))

In [None]:
class CategorizeTitle(BaseEstimator, TransformerMixin):
    """Regroup the values of a title column into a smaller set of categories.

    BaseEstimator provides get_params / set_params and TransformerMixin
    provides fit_transform.
    """

    def __init__(self, column_name):
        self.column_name = column_name  # name of the column to recategorise

    def fit(self, X, y=None):
        """Do nothing (no state is learned) and return ``self``."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame holding the recategorised titles.

        The listed titles become 'Special', 'Rev' becomes 'Mr' and 'Ms'
        becomes 'Miss'; every other value is left untouched.
        """
        rare_titles = ['Don', 'Dr', 'Mme', 'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the', 'Jonkheer']
        recoded = X.copy()
        recoded[self.column_name] = (
            recoded[self.column_name]
            .replace(rare_titles, 'Special')
            .replace(['Rev'], 'Mr')
            .replace(['Ms'], 'Miss')
        )
        # Double brackets: return a DataFrame, not a Series.
        return recoded[[self.column_name]]

In [None]:
# NOTE(review): this value is neither displayed nor stored (it is not the last expression of the cell).
TestName['Title'].unique()
# instantiation
cat_title = CategorizeTitle('Title')
# test: distinct titles left after recategorisation
display(cat_title.fit_transform(TestName)['Title'].unique())

## 3.2 Transformateurs Génériques

In [None]:
# Missing cabin information becomes its own explicit "missing" category.
cabin_si = SimpleImputer(strategy='constant', fill_value="missing")
# handle_unknown='ignore': a cabin letter / parity value that never occurs in the
# training split is encoded as all zeros instead of raising at predict time.
cabin_ohe = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Missing titles are replaced by the most frequent one.
title_si=SimpleImputer(strategy='most_frequent')
# handle_unknown='ignore': a title absent from the training split is encoded
# as all zeros instead of raising at predict time.
title_ohe=OneHotEncoder(handle_unknown='ignore')

In [None]:
# Family size: mean imputation followed by standardisation.
size_si = SimpleImputer(strategy='mean')
size_st = StandardScaler()

In [None]:
# Missing age categories are replaced by the most frequent one.
age_si = SimpleImputer(strategy='most_frequent')
# handle_unknown='ignore': an age category absent from the training split is
# encoded as all zeros instead of raising at predict time.
age_ohe = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Remaining columns handled generically: numeric ones and categorical ones.
num = ['Pclass','Fare']
cat = ['Sex','Embarked']
# Numeric: default (mean) imputation followed by standardisation.
num_si = SimpleImputer()
num_st = StandardScaler()
# Categorical: most-frequent imputation followed by one-hot encoding.
cat_si = SimpleImputer(strategy = 'most_frequent')
# handle_unknown='ignore': a category absent from the training split is
# encoded as all zeros instead of raising at predict time.
cat_ohe = OneHotEncoder(handle_unknown='ignore')

# 4. Pipelines

## 4.1 Gestion des données de cabines

In [None]:
# instantiation
# Cabin pipeline: family-based completion -> letter / parity split -> constant imputation -> one-hot encoding.
# The input frame must already contain the surname column (i.e. be the output of SplitName).
CabinsPipeline=Pipeline(
    steps=[
        ('Complétion des Cabines', complete_cabins),
        ('Séparation des Cabines', cabin_split),
        ('Simple Imputer Cabines', cabin_si),
        ('One Hot Encoder Cabines', cabin_ohe)
    ]
)

In [None]:
# test: split the names first so that the surname column exists, then run the cabin pipeline
CabinsTest= X[['PassengerId','Name','Cabin']]
CabinsTest = name_split.fit_transform(CabinsTest)
CabinsPipeline.fit_transform(CabinsTest)

## 4.2 Gestion des données de titres

In [None]:
# instantiation
# Title pipeline: recategorisation -> most-frequent imputation -> one-hot encoding.
TitlePipeline = Pipeline(
    steps=[ 
        ('Catégorisation des Titres', cat_title), 
        ('Simple Imputer Titres', title_si),
        ('One Hot Encoder Titres', title_ohe)
    ]
)

In [None]:
# test: reuses CabinsTest, which already holds the Title column produced by SplitName
TitlePipeline.fit_transform(CabinsTest)

## 4.3 Gestion des données cabines et titres (agrégation par union)

In [None]:
# instantiation
# Runs the cabin and title pipelines on the same input and concatenates their outputs column-wise.
FeatureUnionPipeline = FeatureUnion(
    transformer_list=[
        ("Cabin", CabinsPipeline),
        ("Title",TitlePipeline )
    ]
)

In [None]:
# test: the union expects the columns added by SplitName, so the names are split first
FeatureUnionTest = X[['PassengerId','Name','Cabin']]
FeatureUnionTest = name_split.fit_transform(FeatureUnionTest)
FeatureUnionPipeline.fit_transform(FeatureUnionTest)

## 4.4 Gestion globale des noms puis des cabines et titres

In [None]:
# instantiation
# Full name handling: split the name, then feed the enriched frame to the cabin / title union.
NamePipeline = Pipeline(
    steps=[
        ('Séparation du nom', name_split),
        ('Feature Union', FeatureUnionPipeline)
    ]
)

In [None]:
# test: the raw columns are enough here because the pipeline splits the names itself
NameTest = X[['PassengerId','Name','Cabin']]
NamePipeline.fit_transform(NameTest)

## 4.5 Gestion des familles

In [None]:
# instantiation
# Family pipeline: family size aggregation -> mean imputation -> standardisation.
SizeFamilyPipeline = Pipeline(
    steps=[
        ('Taille Famille', size_family),
        ('Simple Imputer Size', size_si),
        ('Standardisation Size', size_st)
    ]
)

In [None]:
# test: show the first five transformed rows
SizeFamilyTest = X[['SibSp','Parch']]
SizeFamilyPipeline.fit_transform(SizeFamilyTest)[:5]

## 4.6 Gestion des Ages

In [None]:
# instantiation
# Age pipeline: age categorisation -> most-frequent imputation -> one-hot encoding.
AgePipeline = Pipeline(
    steps=[
        ('Catégorisation des Ages', age_categorized),
        ('Simple Imputer Ages', age_si),
        ('One Hot Encoder Ages', age_ohe)
    ]
)

In [None]:
# test: run the age pipeline on the Age column alone
AgeTest=X[['Age']]
AgePipeline.fit_transform(AgeTest)

## 4.7 Gestion des données restantes (numériques et catégorielles)

In [None]:
# instantiation
# Numeric columns: imputation of missing values, then standardisation.
NumericalPipeline = Pipeline(
    steps = [
        ('valeurs_manquantes_num',num_si),
        ('standardisation', num_st)
    ]
)
# Categorical columns: imputation of missing values, then one-hot encoding.
CategorialPipeline = Pipeline(
    steps = [
        ('valeurs_manquantes_cat',cat_si),
        ('encoder', cat_ohe)
    ]
)

In [None]:
# test: print the first three transformed rows of each generic pipeline
NumericalTest = X[num]
CategorialTest = X[cat]
print(NumericalPipeline.fit_transform(NumericalTest)[:3])
print(CategorialPipeline.fit_transform(CategorialTest)[:3])

# 5. Preprocessor complet (combinaison de toutes les pipelines)

In [None]:
# instanciation (le nom des steps ici)
# Each pipeline receives only the columns listed next to it; the outputs are concatenated.
# Columns of X that appear in none of the lists are not passed to any pipeline.
preprocessor = make_column_transformer( 
    (NamePipeline, ['PassengerId','Name','Cabin']),
    (SizeFamilyPipeline,['SibSp','Parch']),
    (AgePipeline, ['Age']),
    (NumericalPipeline, num),
    (CategorialPipeline, cat)
)

In [None]:
# test: run the whole preprocessor on a copy of the features
X_copy = X.copy()
preprocessor.fit_transform(X_copy)


# 6. Pipeline complète avec preprocessor et modèle

In [None]:
# End-to-end estimator: the preprocessor followed by a gradient boosting classifier (default hyper-parameters).
CompletePipeline = Pipeline(
    steps= [
        ('titanic_preprocessor', preprocessor), 
        ('gradient_boosting_classifier_model',GradientBoostingClassifier())
    ]
)

# 7. Test final

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.2, random_state = 210995)

In [None]:
# NOTE(review): accuracy_score is already imported in the general imports cell; this import is redundant.
from sklearn.metrics import accuracy_score
# Fit preprocessing + model on the training split only, then score on the held-out split.
CompletePipeline.fit(X_train, y_train)
y_pred = CompletePipeline.predict(X_test)
accuracy_score(y_test, y_pred)