In [313]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder


In [386]:
# Load both splits with PassengerId as the row index.
train_ds, test_ds = (
    pd.read_csv(csv_path, index_col="PassengerId")
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [846]:
# Separate the target column from the features without touching train_ds.
y = train_ds["Survived"].copy()
X = train_ds.drop(columns="Survived")

In [847]:

class CategoricalDropper(BaseEstimator,TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    dropped : list of str
        Column labels that ``transform`` removes.
    """
    def __init__(self,dropped):
        self.dropped = dropped

    def fit(self,x,y=None):
        """Nothing to learn; returns ``self``.

        ``y`` is accepted and ignored so the transformer also works when a
        pipeline forwards a target to ``fit`` / ``fit_transform``.
        """
        return self

    def transform(self,x):
        """Return a new frame without the ``dropped`` columns (``x`` is not modified)."""
        return x.drop(self.dropped,axis=1)

class CabinExtractor(BaseEstimator,TransformerMixin):
    """Derive ``CabinClass`` and ``CabinNumber`` columns from ``Cabin``.

    Missing cabins are replaced by the placeholder ``"N-1"``, which yields
    class ``"N"`` and number ``-1``.  A cabin string that contains no digits
    gets number ``0``.
    """
    def fit(self,x,y=None):
        """Nothing to learn; returns ``self`` (``y`` is accepted and ignored)."""
        return self

    def transform(self,x):
        """Return a copy of ``x`` with ``Cabin`` filled and the two derived columns added.

        ``CabinClass`` is the first character of the cabin string and
        ``CabinNumber`` is the first (optionally negative) run of digits in
        it, stored as an integer.
        """
        # Work on a copy so the caller's frame is left untouched.
        x = x.copy()
        x["Cabin"] = x["Cabin"].fillna("N-1")
        x["CabinClass"] = x["Cabin"].str.extract(r"(.)",expand=False)
        cabin_number = x["Cabin"].str.extract(r"(-?\d+)",expand=False)
        # Convert the extracted digit strings to real integers instead of
        # leaving a column that mixes strings with the integer fill value.
        x["CabinNumber"] = pd.to_numeric(cabin_number).fillna(0).astype(int)
        return x
    
class pdSimpleImputer(BaseEstimator,TransformerMixin):
    """Fill missing values in selected DataFrame columns with each column's most frequent value.

    Parameters
    ----------
    imputed : list of str
        Column labels to impute.  Each column gets its own
        ``SimpleImputer(strategy="most_frequent")``.
    """
    def __init__(self,imputed):
        self.imputed = imputed

    def _fit_imputers(self,x):
        """Fit one most-frequent imputer per configured column of ``x``."""
        return {
            column: SimpleImputer(strategy="most_frequent").fit(x[[column]].to_numpy())
            for column in self.imputed
        }

    def fit(self,x,y=None):
        """Learn the fill value of every configured column from ``x``.

        ``y`` is accepted and ignored so the transformer works in pipelines
        that forward a target.
        """
        self.imputers_ = self._fit_imputers(x)
        return self

    def transform(self,x):
        """Return a copy of ``x`` with the configured columns imputed.

        Fill values learned by ``fit`` are used.  If ``fit`` was never
        called, the imputers are fitted on ``x`` itself, which keeps the
        previous "impute from the data being transformed" behaviour for
        callers that invoke ``transform`` directly.
        """
        imputers = getattr(self,"imputers_",None)
        if imputers is None:
            imputers = self._fit_imputers(x)
        # Copy so the caller's frame is not modified.
        x = x.copy()
        for column in self.imputed:
            # Assign a flat array (not a new DataFrame) so values are written
            # positionally and cannot be misaligned by the frame's index.
            x[column] = imputers[column].transform(x[[column]].to_numpy()).ravel()
        return x
    
    
class pdOrdinalEncoder(BaseEstimator,TransformerMixin):
    """Ordinal-encode selected DataFrame columns, one ``OrdinalEncoder`` per column.

    Parameters
    ----------
    encoded : list of str
        Column labels to replace by their ordinal codes.
    """
    def __init__(self,encoded):
        self.encoded = encoded

    def _fit_encoders(self,x):
        """Fit one ``OrdinalEncoder`` per configured column of ``x``."""
        return {
            column: OrdinalEncoder().fit(x[[column]].to_numpy())
            for column in self.encoded
        }

    def fit(self,x,y=None):
        """Learn the category-to-code mapping of every configured column.

        ``y`` is accepted and ignored so the transformer works in pipelines
        that forward a target.
        """
        self.encoders_ = self._fit_encoders(x)
        return self

    def transform(self,x):
        """Return a copy of ``x`` with the configured columns replaced by their codes.

        Mappings learned by ``fit`` are reused, so the same category gets the
        same code on every frame; with ``OrdinalEncoder``'s default settings
        a category that was not present during ``fit`` raises an error.  If
        ``fit`` was never called, the encoders are fitted on ``x`` itself,
        which keeps the previous behaviour for direct ``transform`` calls.
        """
        encoders = getattr(self,"encoders_",None)
        if encoders is None:
            encoders = self._fit_encoders(x)
        # Copy so the caller's frame is not modified.
        x = x.copy()
        for column in self.encoded:
            # Assign a flat array (not a new DataFrame) so codes are written
            # positionally and cannot be misaligned by the frame's index.
            x[column] = encoders[column].transform(x[[column]].to_numpy()).ravel()
        return x

In [905]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.base import TransformerMixin, BaseEstimator
       
# get a baseline value
# handling categorical data



In [906]:
from sklearn.impute import SimpleImputer
# handling numerical data

# numerical_columns = [i for i in X.columns if X[i].dtype != "object"]
# numerical_data = pd.DataFrame([X[i].values for i in X.columns if X[i].dtype != "object"]).T
# numerical_data.columns = numerical_columns

# numerical_pipeline = Pipeline([
#     ("impute-age",pdSimpleImputer(["Age"]))
# ])

In [1014]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Steps applied to the object-dtype columns by preprocess_num_cat:
# derive cabin features, drop the raw text columns, impute Embarked,
# then ordinal-encode what is left.
categorical_pipline = Pipeline([
    ("cabin-extractor", CabinExtractor()),
    ("droper", CategoricalDropper(dropped=["Ticket", "Cabin", "Name"])),
    ("imputer", pdSimpleImputer(imputed=["Embarked"])),
    ("encoder", pdOrdinalEncoder(encoded=["Embarked", "Sex", "CabinClass"])),
])

# Steps applied to the non-object columns: fill gaps in Age and Fare.
numerical_pipeline = Pipeline(steps=[
    ("impute-age", pdSimpleImputer(imputed=["Age", "Fare"])),
])

def preprocess_num_cat(X):
    """Split ``X`` by dtype, run each half through its pipeline and re-join.

    Columns whose dtype is ``object`` go through ``categorical_pipline``;
    all other columns go through ``numerical_pipeline``.  The result has the
    numerical columns first, followed by the processed categorical columns,
    on a fresh 0..n-1 index (the index of ``X`` is not carried over).

    NOTE(review): both pipelines are re-fitted via ``fit_transform`` on every
    call, so whatever frame is passed in (training or test) supplies its own
    imputation and encoding statistics.
    """
    categorical_columns = [name for name in X.columns if X[name].dtype == "object"]
    numerical_columns = [name for name in X.columns if X[name].dtype != "object"]

    # Select the columns directly instead of rebuilding frames from a
    # transposed list of arrays; this keeps each column's dtype.  The index
    # is reset, as before, so the final join lines rows up positionally.
    categorical_data = X[categorical_columns].reset_index(drop=True)
    numerical_data = X[numerical_columns].reset_index(drop=True)

    preprocessed_num = numerical_pipeline.fit_transform(numerical_data)
    preprocessed_cat = categorical_pipline.fit_transform(categorical_data)
    return preprocessed_num.join(preprocessed_cat)


# End-to-end model: dtype-based preprocessing followed by a random forest.
# The forest is seeded so that re-running the notebook reproduces the same
# fitted model, cross-validation scores and submission file.
pipeline = Pipeline([
    ("preprocess",FunctionTransformer(preprocess_num_cat)),
    ("model",RandomForestClassifier(random_state=42))
])

pipeline.fit(X,y)

In [1024]:
from sklearn.model_selection import cross_val_score, cross_val_predict
# 5-fold cross-validation of the pipeline's model step on the preprocessed
# training features.
# NOTE(review): preprocess_num_cat(X) runs on all rows before the folds are
# split, so imputation/encoding see the validation rows of every fold.
scores = cross_val_score(
    estimator=pipeline.named_steps["model"],
    X=preprocess_num_cat(X),
    y=y,
    cv=5,
)
print(scores, np.mean(scores))

[0.80446927 0.79775281 0.85393258 0.78089888 0.85393258] 0.8181972255351202


In [1016]:
# Predict on an independent copy of the test frame so test_ds stays pristine.
X_tested = test_ds.copy(deep=True)
y_pred = pipeline.predict(X_tested)

In [1017]:
# Build the submission table.  PassengerId is taken from the index of the
# frame that was predicted on (test.csv is loaded with
# index_col="PassengerId") rather than being reconstructed from a hard-coded
# starting id, so each prediction is paired with its real passenger id.
y_df = pd.DataFrame({
    "PassengerId": X_tested.index.to_numpy(),
    "Survived": y_pred,
})
y_df.to_csv("data/cross_validated_piplined_xd.csv",index=False)