In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the train and test CSV files; both use the "PassengerId" column as
# their row index.
csv_options = {"index_col": "PassengerId"}
train_ds = pd.read_csv("data/train.csv", **csv_options)
test_ds = pd.read_csv("data/test.csv", **csv_options)

In [3]:
# Separate the "Survived" target from the remaining feature columns.
# train_ds itself is left untouched: y is a copy and drop() returns a new frame.
y = train_ds["Survived"].copy()
X = train_ds.drop(columns="Survived")

In [4]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import OneHotEncoder


class CategoricalDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    dropped : list
        Labels of the columns that ``transform`` removes
        (e.g. ``["Ticket", "Cabin", "Name"]``).
    """

    def __init__(self, dropped):
        self.dropped = dropped

    def fit(self, x, y=None):
        """Return ``self`` unchanged; dropping columns needs no fitting.

        ``y`` is accepted and ignored so that the transformer also works when
        the surrounding pipeline is fitted with a target (``fit(X, y)``).
        """
        return self

    def transform(self, x):
        """Return a new frame without the ``dropped`` columns.

        ``x`` itself is not modified. Raises ``KeyError`` (from
        ``DataFrame.drop``) when a listed column is absent.
        """
        return x.drop(self.dropped, axis=1)

class CabinExtractor(BaseEstimator, TransformerMixin):
    """Derive ``CabinClass`` and ``CabinNumber`` columns from ``Cabin``.

    Missing cabins are replaced by the placeholder ``"N-1"``, which yields
    class ``"N"`` and number ``-1``. Cabins that contain no digits get
    number ``0``.
    """

    def fit(self, x, y=None):
        """Return ``self`` unchanged; ``y`` is accepted and ignored."""
        return self

    def transform(self, x):
        """Return a copy of ``x`` with ``Cabin`` filled and two derived columns.

        Parameters
        ----------
        x : pandas.DataFrame
            Must contain a ``Cabin`` column holding strings and/or NaN.

        Returns
        -------
        pandas.DataFrame
            Copy of ``x`` where ``Cabin`` has no missing values,
            ``CabinClass`` is the first character of ``Cabin`` and
            ``CabinNumber`` is the first (optionally negative) integer found
            in ``Cabin``, stored with an integer dtype.
        """
        # Work on a copy so the caller's frame is never modified.
        x = x.copy()
        cabin = x["Cabin"].fillna("N-1")
        x["Cabin"] = cabin
        x["CabinClass"] = cabin.str.extract(r"(.)", expand=False)
        # str.extract returns digit *strings*; convert them so the column is
        # uniformly numeric instead of mixing strings with the integer 0 used
        # for cabins without a number.
        cabin_number = cabin.str.extract(r"(-?\d+)", expand=False)
        x["CabinNumber"] = pd.to_numeric(cabin_number).fillna(0).astype(int)
        return x
    
class pdSimpleImputer(BaseEstimator, TransformerMixin):
    """Fill missing values of selected columns with their most frequent value.

    Each listed column gets its own ``SimpleImputer(strategy="most_frequent")``.
    The imputers are learned in ``fit`` and reused in ``transform``, so data
    transformed later is filled with the values learned from the fitted data.

    Parameters
    ----------
    imputed : list
        Labels of the columns to impute.

    Notes
    -----
    ``SimpleImputer`` (``sklearn.impute``) is looked up when ``fit`` or
    ``transform`` runs, so it must have been imported by then.
    """

    def __init__(self, imputed):
        self.imputed = imputed

    def fit(self, x, y=None):
        """Learn the most frequent value of every column in ``imputed``.

        ``y`` is accepted and ignored. Returns ``self``.
        """
        self.imputers_ = {
            column: SimpleImputer(strategy="most_frequent").fit(
                x[column].to_numpy().reshape(-1, 1)
            )
            for column in self.imputed
        }
        return self

    def transform(self, x):
        """Return a copy of ``x`` with the ``imputed`` columns filled.

        If ``fit`` was never called (or a column was not seen by it), the
        imputer for that column is fitted on ``x`` itself, which is what the
        transformer historically did on every call.
        """
        fitted = getattr(self, "imputers_", {})
        # Copy so the caller's frame is never modified.
        x = x.copy()
        for column in self.imputed:
            values = x[column].to_numpy().reshape(-1, 1)
            imputer = fitted.get(column)
            if imputer is None:
                imputer = SimpleImputer(strategy="most_frequent").fit(values)
            # Assign a plain 1-D array: it is written positionally, whereas a
            # freshly built DataFrame would be aligned on its 0..n-1 index and
            # scramble the values whenever x has a different index.
            x[column] = imputer.transform(values).ravel()
        return x
    
    
class pdOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Replace selected columns by their ordinal (integer-valued) encoding.

    Each listed column gets its own ``OrdinalEncoder``. The category-to-code
    mapping is learned in ``fit`` and reused in ``transform``, so the same
    category receives the same code in every dataset transformed afterwards.
    Categories that were not present during fitting are encoded as ``-1``.

    Parameters
    ----------
    encoded : list
        Labels of the columns to encode.

    Notes
    -----
    ``OrdinalEncoder`` (``sklearn.preprocessing``) is looked up when ``fit``
    or ``transform`` runs, so it must have been imported by then.
    """

    def __init__(self, encoded):
        self.encoded = encoded

    @staticmethod
    def _fit_encoder(values):
        """Fit a fresh encoder on an ``(n_samples, 1)`` array and return it."""
        encoder = OrdinalEncoder(
            handle_unknown="use_encoded_value", unknown_value=-1
        )
        return encoder.fit(values)

    def fit(self, x, y=None):
        """Learn the category codes of every column in ``encoded``.

        ``y`` is accepted and ignored. Returns ``self``.
        """
        self.encoders_ = {
            column: self._fit_encoder(x[column].to_numpy().reshape(-1, 1))
            for column in self.encoded
        }
        return self

    def transform(self, x):
        """Return a copy of ``x`` with the ``encoded`` columns replaced by codes.

        If ``fit`` was never called (or a column was not seen by it), the
        encoder for that column is fitted on ``x`` itself, which is what the
        transformer historically did on every call.
        """
        fitted = getattr(self, "encoders_", {})
        # Copy so the caller's frame is never modified.
        x = x.copy()
        for column in self.encoded:
            values = x[column].to_numpy().reshape(-1, 1)
            encoder = fitted.get(column)
            if encoder is None:
                encoder = self._fit_encoder(values)
            # A plain 1-D array is written positionally; a freshly built
            # DataFrame would instead be aligned on its 0..n-1 index.
            x[column] = encoder.transform(values).ravel()
        return x
    
    
class pdOneHotEncoder(BaseEstimator, TransformerMixin):
    """Replace selected columns by one indicator column per category.

    Each listed column gets its own ``OneHotEncoder``. The categories are
    learned in ``fit`` and reused in ``transform``, so every dataset
    transformed afterwards yields the same indicator columns in the same
    order. Categories not present during fitting produce all-zero rows
    (``handle_unknown='ignore'``).

    Parameters
    ----------
    encoded : list
        Labels of the columns to encode. Each one is removed from the output
        and replaced by columns named after its category values.
    """

    def __init__(self, encoded):
        self.encoded = encoded

    @staticmethod
    def _fit_encoder(values):
        """Fit a fresh encoder on an ``(n_samples, 1)`` array and return it."""
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        return encoder.fit(values)

    def fit(self, x, y=None):
        """Learn the categories of every column in ``encoded``.

        ``y`` is accepted and ignored. Returns ``self``.
        """
        self.encoders_ = {
            column: self._fit_encoder(x[column].to_numpy().reshape(-1, 1))
            for column in self.encoded
        }
        return self

    def transform(self, x):
        """Return a new frame with the ``encoded`` columns one-hot expanded.

        ``x`` itself is not modified. If ``fit`` was never called (or a
        column was not seen by it), the encoder for that column is fitted on
        ``x`` itself, which is what the transformer historically did on
        every call.
        """
        fitted = getattr(self, "encoders_", {})
        for column in self.encoded:
            values = x[column].to_numpy().reshape(-1, 1)
            encoder = fitted.get(column)
            if encoder is None:
                encoder = self._fit_encoder(values)
            indicators = pd.DataFrame(
                encoder.transform(values),
                index=x.index,
                # Label with the encoder's own category order. The output
                # columns follow encoder.categories_, not the order in which
                # values first appear, so Series.unique() would attach the
                # wrong label to the indicator columns.
                columns=encoder.categories_[0],
            )
            # drop() without inplace returns a new frame, leaving the
            # caller's frame intact.
            x = x.drop(columns=column).join(indicators)
        return x

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Object-dtype columns, processed in this order:
#   1. derive CabinClass / CabinNumber from Cabin,
#   2. drop the Ticket, Cabin and Name columns,
#   3. fill missing Embarked values with the most frequent one,
#   4. ordinal-encode CabinClass,
#   5. one-hot encode Embarked and Sex.
categorical_pipline = Pipeline(
    [
        ("cabin-extractor", CabinExtractor()),
        ("droper", CategoricalDropper(["Ticket", "Cabin", "Name"])),
        ("imputer", pdSimpleImputer(["Embarked"])),
        ("encoder", pdOrdinalEncoder(["CabinClass"])),
        ("onehot-encoder", pdOneHotEncoder(["Embarked", "Sex"])),
    ]
)

# Non-object columns: only Age and Fare are imputed (most frequent value).
numerical_pipeline = Pipeline(
    steps=[("impute-age", pdSimpleImputer(["Age", "Fare"]))]
)

def preprocess_num_cat(X, fit=True):
    """Run the numerical and categorical pipelines on ``X`` and join the results.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature frame. Columns with ``object`` dtype are sent through
        ``categorical_pipline``; all other columns through
        ``numerical_pipeline``.
    fit : bool, default True
        When True (the historical behaviour) both pipelines are fitted on
        ``X`` before transforming it. Pass False to transform ``X`` with
        pipelines that were already fitted on another dataset, e.g. to
        prepare a test set with what was learned from the training set.

    Returns
    -------
    pandas.DataFrame
        Numerical columns followed by the processed categorical columns.
        Rows keep the order of ``X`` but are re-indexed ``0..n-1``; the
        index of ``X`` is not carried over.
    """
    # select_dtypes keeps each column's own dtype. The index is reset because
    # this function has always handed positionally indexed frames to the
    # pipelines.
    categorical_data = X.select_dtypes(include="object").reset_index(drop=True)
    numerical_data = X.select_dtypes(exclude="object").reset_index(drop=True)

    if fit:
        preprocessed_num = numerical_pipeline.fit_transform(numerical_data)
        preprocessed_cat = categorical_pipline.fit_transform(categorical_data)
    else:
        preprocessed_num = numerical_pipeline.transform(numerical_data)
        preprocessed_cat = categorical_pipline.transform(categorical_data)
    return preprocessed_num.join(preprocessed_cat)


# random_state pins the forest's bootstrap samples and feature sub-sampling,
# so the scores computed from `rf` are reproducible across runs.
rf = RandomForestClassifier(n_estimators=500, max_depth=9, random_state=42)
X_preprocessed = preprocess_num_cat(X)


from sklearn.preprocessing import StandardScaler
# Standardise every feature; `ss` keeps the mean / scale learned from the
# training features.
ss = StandardScaler()
X_preprocessed = ss.fit_transform(X_preprocessed)

In [262]:
from sklearn.model_selection import cross_val_score, cross_val_predict
# 5-fold cross-validated accuracy of `rf` on the full training matrix,
# followed by the mean over the folds.
scores = cross_val_score(
    estimator=rf,
    X=X_preprocessed,
    y=y,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
print(scores, scores.mean())

[0.79888268 0.78651685 0.85955056 0.78651685 0.84269663] 0.8148327160881301


In [8]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

# Hold out 25% of the rows for a final check. random_state fixes the shuffle
# so the split (and every score derived from it) is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.25, random_state=42
)
# return_estimator=True keeps the model fitted on each fold so they can be
# scored on the held-out rows afterwards.
cross_validaters = cross_validate(rf, X_train, y_train, return_estimator=True)
print(cross_validaters["test_score"], cross_validaters["test_score"].mean())

[0.80597015 0.81343284 0.85074627 0.79699248 0.83458647] 0.8203456402199528


In [268]:
# Score each fold's fitted estimator on the held-out split and report the
# average accuracy.
scores = [
    estimator.score(X_test, y_test)
    for estimator in cross_validaters["estimator"]
]
print(sum(scores) / len(scores))


# Fit the forest on the full training data. The original call used `model`,
# a name that is never defined in this notebook (NameError on a fresh
# kernel); `rf` is the estimator that was cross-validated above.
rf.fit(X_preprocessed, y)

0.7704035874439461


In [24]:
from sklearn.model_selection import GridSearchCV

# Seeded so that every candidate is compared on identical randomness and the
# selected parameters are reproducible.
rf = RandomForestClassifier(random_state=42)
params = {
    "n_estimators": [10, 100, 500, 1000],
    "max_depth": [3, 6, 9, 11],
    "min_samples_split": [2, 10, 50],
}

# 48 parameter combinations x 3 folds: n_jobs=-1 spreads the fits over all
# cores, matching the cross_val_score call earlier in the notebook.
gs = GridSearchCV(estimator=rf, param_grid=params, cv=3, n_jobs=-1)
gs.fit(X_train, y_train)

In [27]:
# Only the last expression of a cell is displayed, so the selected estimator
# and its hold-out accuracy are printed explicitly instead of being computed
# and silently discarded.
best_model = gs.best_estimator_
print(best_model)
pred = best_model.predict(X_test)
print(accuracy_score(y_test, pred))
# Refit the selected configuration on the full training data (in place, so
# gs.best_estimator_ refers to the refitted model afterwards).
best_model.fit(X_preprocessed, y)

In [28]:
X_tested = preprocess_num_cat(test_ds.copy())
# Scale with `ss`, the scaler already fitted on the training features.
# Fitting a new scaler on the test set would standardise it with different
# means / variances than the data the model was trained on.
# The values are passed as a plain array so columns are matched by position,
# exactly as the model itself consumes them; scikit-learn may warn that the
# input has no feature names.
X_tested = ss.transform(X_tested.to_numpy())
y_pred = gs.best_estimator_.predict(X_tested)

In [29]:
# Pair every prediction with the PassengerId of its test row. test_ds is
# indexed by PassengerId and preprocessing keeps the row order, so the ids
# come from the data instead of assuming they start at 892.
y_df = pd.DataFrame(
    {"PassengerId": test_ds.index.to_numpy(), "Survived": y_pred}
)
y_df.to_csv("data/cross_validated_piplined_xd.csv", index=False)