In [None]:
import numpy as np
import pandas as pd 
import os 
import random
import warnings
warnings.filterwarnings(action='ignore')

import seaborn as sns
import matplotlib.pyplot as plt

import scipy
from scipy import stats
from scipy.stats import norm, skew, boxcox
from collections import Counter

from sklearn.preprocessing import StandardScaler, MinMaxScaler,MaxAbsScaler,RobustScaler

# --- Kaggle input files and the submission file this notebook writes ---
TRAIN_PATH = "../input/titanic/train.csv"
TEST_PATH = "../input/titanic/test.csv"
SAMPLE_SUBMISSION_PATH = "../input/titanic/gender_submission.csv"
SUBMISSION_PATH = "submission.csv"

# --- Column roles ---
ID = "PassengerId"
TARGET = "Survived"

# --- Reproducibility ---
SEED = 2022


def seed_everything(seed=SEED):
    """Seed every random source this notebook touches.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both Python's
    ``random`` module and NumPy's legacy global generator with ``seed``.

    Parameters
    ----------
    seed : int, default SEED
        Value used for all three seeding targets.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything()

In [None]:
def plot_hist(df,variable):
    """Print the min/max of one column and draw its histogram.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    variable : str
        Name of the column in ``df``.

    Notes
    -----
    The extremes are computed with ``Series.min`` / ``Series.max``, which skip
    NaN. The builtin ``min`` / ``max`` compare element by element and return
    ``nan`` whenever the first value is NaN, so they are order-dependent on
    columns with missing values.
    """
    values = df[variable]
    print("min {} : {} ".format(variable, values.min()))
    print("max {} : {}".format(variable, values.max()))

    fig, ax = plt.subplots(figsize=(6, 2))
    # Missing values carry no information for the histogram; drop them so the
    # bin range is always derived from real observations.
    ax.hist(values.dropna(), color="darkred")
    ax.set_xlabel(variable)
    ax.set_ylabel("Frequency")
    ax.set_title("{} distribution with hist ".format(variable))
    plt.show()

# Before AutoPreprocess

In [None]:
train = pd.read_csv(TRAIN_PATH)

# Split the feature columns (everything except TARGET and ID) by dtype.
# The dtype of the whole column is inspected rather than the Python type of the
# value at index label 1, so the split does not depend on one row being
# non-null. `DataFrame.items()` replaces `iteritems()`, which pandas 2.0 removed.
str_list = []
num_list = []
for colname, colvalue in train.items():
    if colname == TARGET or colname == ID:
        continue

    if pd.api.types.is_numeric_dtype(colvalue):
        num_list.append(colname)
    else:
        str_list.append(colname)

# One histogram per numeric feature of the raw training data.
for col in num_list:
    plot_hist(train,col)

In [None]:
# Summary table for the raw training data: describe() plus variance, skewness
# and kurtosis rows.
# NOTE: the name `stats` rebinds the `scipy.stats` module imported at the top of
# the notebook; it is kept because other cells use the same name.
stats = train.describe()

# Restrict the extra reductions to exactly the columns describe() reported.
# Calling var()/skew()/kurtosis() on the full frame fails on string columns in
# pandas >= 2.0 and can otherwise yield a different number of values than the
# table has columns.
described = train[stats.columns]
stats.loc['var'] = described.var()
stats.loc['skew'] = described.skew()
stats.loc['kurt'] = described.kurtosis()
stats

In [None]:
# Per-column skewness of the raw training data. numeric_only=True is required
# because the raw frame still has string columns, which pandas >= 2.0 refuses
# to reduce.
train.skew(numeric_only=True)

In [None]:
# Per-column kurtosis of the raw training data. numeric_only=True is required
# because the raw frame still has string columns, which pandas >= 2.0 refuses
# to reduce.
train.kurt(numeric_only=True)

# AutoPreprocess

In [None]:
def autoPreProcess(train,test,DROP_COLS,TARGET,ID_COL=None):
    """Impute, standardize and one-hot encode train and test together.

    The two frames are stacked, ``DROP_COLS`` are removed, missing values are
    filled (median for numeric columns, most frequent value otherwise),
    numeric features are standardized with ``StandardScaler`` and the
    remaining features are one-hot encoded. The result is split back at the
    original train length.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain ``TARGET``.
    test : pandas.DataFrame
        Test rows.
    DROP_COLS : list
        Column labels removed before any processing.
    TARGET : str
        Label column. It is never imputed, scaled or encoded, and it is
        removed from the returned test frame.
    ID_COL : str, optional
        Identifier column to leave untouched if it survives ``DROP_COLS``.
        Defaults to the notebook-level ``ID`` constant when one is defined.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Processed ``train`` (with ``TARGET``) and ``test`` (without it), both
        independent copies that are safe to modify.

    Notes
    -----
    Imputation values and scaler statistics are computed on train and test
    combined, as in the original implementation.
    """
    if ID_COL is None:
        # Keep the historical behaviour of skipping the notebook's ID column,
        # without failing when no such global exists.
        ID_COL = globals().get("ID")

    train_len = len(train)

    train_test = (
        pd.concat(objs=[train, test], axis=0)
        .reset_index(drop=True)
        .drop(DROP_COLS, axis=1)
    )

    def checkNull_fillData(df):
        """Fill missing values of every non-target column of ``df`` in place."""
        for col in df.columns:
            if col == TARGET:
                continue

            missing = df[col].isnull()
            if not missing.any():
                continue

            if pd.api.types.is_numeric_dtype(df[col]):
                fill_value = df[col].median()
            else:
                modes = df[col].mode()
                if modes.empty:
                    # Column has no observed value at all; nothing to fill with.
                    continue
                fill_value = modes[0]
            df.loc[missing, col] = fill_value

    checkNull_fillData(train_test)

    # Classify features by column dtype instead of probing the value at index
    # label 1, and iterate over column labels directly (DataFrame.iteritems
    # was removed in pandas 2.0).
    feature_cols = [c for c in train_test.columns if c != TARGET and c != ID_COL]
    num_list = [c for c in feature_cols if pd.api.types.is_numeric_dtype(train_test[c])]
    str_list = [c for c in feature_cols if c not in num_list]

    if num_list:
        scaler = StandardScaler()
        train_test[num_list] = scaler.fit_transform(train_test[num_list])

    # Pin the indicator dtype so the output is numeric 0/1 on every pandas
    # version (pandas >= 2.0 would otherwise produce booleans).
    train_test = pd.get_dummies(train_test, columns=str_list, dtype=np.uint8)

    # Return real copies rather than views of `train_test`, so callers can
    # assign columns without chained-assignment problems.
    train = train_test.iloc[:train_len].copy()
    test = train_test.iloc[train_len:].drop(columns=[TARGET])

    return train,test

# Columns removed before preprocessing: the passenger id plus the Name, Ticket
# and Cabin columns.
DROP_COLS = [ID, "Name", "Ticket", "Cabin"]

# Load the raw CSVs and replace them with their preprocessed counterparts.
train, test = autoPreProcess(
    pd.read_csv(TRAIN_PATH),
    pd.read_csv(TEST_PATH),
    DROP_COLS,
    TARGET,
)
train.head()

# After AutoPreprocess

In [None]:
# Re-derive the feature lists for the preprocessed frame (everything except
# TARGET and ID), classifying by column dtype. `DataFrame.items()` replaces
# `iteritems()`, which pandas 2.0 removed, and the dtype check avoids relying
# on the value stored at index label 1.
str_list = []
num_list = []
for colname, colvalue in train.items():
    if colname == TARGET or colname == ID:
        continue

    if pd.api.types.is_numeric_dtype(colvalue):
        num_list.append(colname)
    else:
        str_list.append(colname)

# One histogram per numeric feature of the preprocessed training data.
for col in num_list:
    plot_hist(train,col)

In [None]:
# Summary table for the preprocessed training data: describe() plus variance,
# skewness and kurtosis rows.
stats = train.describe()

# describe() can report fewer columns than var()/skew()/kurtosis() reduce
# (for example when the frame holds boolean indicator columns), which makes a
# positional `.tolist()` assignment fail. Reduce exactly the described columns
# and assign by label instead.
described = train[stats.columns]
stats.loc['var'] = described.var()
stats.loc['skew'] = described.skew()
stats.loc['kurt'] = described.kurtosis()
stats

In [None]:
# Per-column skewness of `train` as returned by autoPreProcess.
train.skew()

In [None]:
# Per-column kurtosis of `train` as returned by autoPreProcess.
train.kurt()

# After Log Scale

In [None]:
# Compress the right tails of Fare, SibSp and Parch.
#
# At this point the columns have been standardized by autoPreProcess, so they
# contain negative values; a plain np.log would turn those into NaN (and any
# zero into -inf). Each column is therefore shifted so that its smallest value
# across train and test becomes 0, then log1p (= log(1 + x)) is applied. The
# same offset is used for both frames so they stay on a common scale.
#
# NOTE: this cell modifies `train` / `test` in place; running it twice applies
# the transform twice.
for log_col in ("Fare", "SibSp", "Parch"):
    log_offset = min(train[log_col].min(), test[log_col].min())
    train[log_col] = np.log1p(train[log_col] - log_offset)
    test[log_col] = np.log1p(test[log_col] - log_offset)

In [None]:
# Redraw the histograms after the log transform, reusing the `num_list` built
# in the earlier column-split cell.
for col in num_list:
    plot_hist(train,col)

In [None]:
# Summary table for the log-transformed training data: describe() plus
# variance, skewness and kurtosis rows.
stats = train.describe()

# describe() can report fewer columns than var()/skew()/kurtosis() reduce
# (for example when the frame holds boolean indicator columns), which makes a
# positional `.tolist()` assignment fail. Reduce exactly the described columns
# and assign by label instead.
described = train[stats.columns]
stats.loc['var'] = described.var()
stats.loc['skew'] = described.skew()
stats.loc['kurt'] = described.kurtosis()
stats

In [None]:
# Per-column skewness of `train` after the Fare / SibSp / Parch log transform.
train.skew()

In [None]:
# Per-column kurtosis of `train` after the Fare / SibSp / Parch log transform.
train.kurt()

# Predict Data

In [None]:
# Persist the preprocessed frames as CSV (without the index) so the modelling
# cell can load them from disk.
NEW_TRAIN_PATH, NEW_TEST_PATH = "train.csv", "test.csv"

for frame, out_path in ((train, NEW_TRAIN_PATH), (test, NEW_TEST_PATH)):
    frame.to_csv(out_path, index=False)

In [None]:
# Train an H2O AutoML model on the preprocessed CSVs and score the test set.
import h2o
from h2o.automl import H2OAutoML

# Runtime budget passed to H2OAutoML below, in seconds (three minutes).
MAX_RUNTIME_SECS = 60 * 3

# H2O must be initialised before any frame is imported.
h2o.init()

# NOTE: `train` / `test` are rebound here from the pandas frames to whatever
# h2o.import_file returns for the files written at NEW_TRAIN_PATH / NEW_TEST_PATH.
train = h2o.import_file(NEW_TRAIN_PATH)
test = h2o.import_file(NEW_TEST_PATH)

# Predictors: every column of the imported training frame except the target.
x = train.columns
y = TARGET

x.remove(y)
# x.remove(ID)  # left disabled: ID is listed in DROP_COLS, so it should already be absent — confirm


# Fit AutoML with the notebook-wide SEED and the runtime budget defined above.
aml_y = H2OAutoML(max_runtime_secs=MAX_RUNTIME_SECS, seed=SEED)
aml_y.train(x=x, y=y, training_frame=train)

# Score the test frame and keep its `predict` column; the submission cell
# thresholds these values at 0.5.
# NOTE(review): the target is stored as a number in the CSV, so the predictions
# are presumably continuous scores rather than class labels — confirm.
preds_y = aml_y.predict(test)
pred_test = preds_y.as_data_frame().predict

In [None]:
# Turn the model scores into 0/1 labels: anything above 0.5 counts as survived.
survived_flags = pred_test.gt(0.5).astype(int)

# Start from the sample submission and overwrite its target column.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH).assign(**{TARGET: survived_flags})
submission.to_csv(SUBMISSION_PATH, index=False)
submission.head()