In [None]:
import os
import time
import handcalcs.render
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import display
from autoviz.AutoViz_Class import AutoViz_Class
from sklearn.preprocessing import FunctionTransformer,  OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from pycaret.classification import *
from pycaret.datasets import get_data
%matplotlib inline

In [None]:
def load_data(name='test', data_dir='../data'):
    """Read ``<data_dir>/<name>.csv`` into a DataFrame.

    Parameters
    ----------
    name : str, default 'test'
        File name without the ``.csv`` extension.
    data_dir : str, default '../data'
        Directory containing the CSV file. The default keeps the original
        hard-coded location, so existing ``load_data('train')`` calls are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    path = os.path.join(data_dir, name + '.csv')
    return pd.read_csv(path)

In [None]:
# Load the train and test CSVs and stack them into one frame so that the
# exploration and preprocessing below are applied to both in one pass.
train_data = load_data('train')
test_data = load_data('test')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and produces the same row order with a fresh 0..n-1 index.
df_all = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Preview the first 40 rows of the combined frame.
display(df_all.iloc[:40])

In [None]:
# Summary statistics of the numeric columns.
summary_stats = df_all.describe()
display(summary_stats)

In [None]:
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only added a stray "None" to the cell output.
df_all.info()

In [None]:
# Count missing values per column. Calling .info() on the boolean mask
# reports every column as fully non-null, which hides the gaps being
# looked for here.
df_all.isnull().sum()

In [None]:
# List the column labels of the combined frame.
column_labels = df_all.columns
print(column_labels)

In [None]:
# Overall age distribution. sns.distplot is deprecated; histplot with a KDE
# overlay on a density scale is its documented replacement (seaborn >= 0.11).
# Missing ages are dropped explicitly before plotting.
fig, ax = plt.subplots()
sns.histplot(df_all['Age'].dropna(), kde=True, stat='density',
             color='black', ax=ax)
ax.set(title='Age distribution', xlabel='Age')
plt.show()

# Bucket ages into decades: (0, 10], (10, 20], ..., (70, 80].
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
age_labels = [f'{lo}-{hi}' for lo, hi in zip(age_bins[:-1], age_bins[1:])]
age_cat = pd.cut(df_all['Age'], bins=age_bins, labels=age_labels)

# One panel per sex: all passengers in black, survivors overlaid in colour.
# Rows whose 'Survived' is missing never match `== 1`, so they only
# contribute to the black bars.
fig, axes = plt.subplots(1, 2, figsize=(10, 6))
survived = df_all['Survived'] == 1
for ax, sex, colour in zip(axes, ['female', 'male'], ['pink', 'blue']):
    is_sex = df_all['Sex'] == sex
    # Pass the data as x=...: recent seaborn no longer accepts the vector
    # positionally.
    sns.countplot(x=age_cat[is_sex], color='black', ax=ax)
    sns.countplot(x=age_cat[is_sex & survived], color=colour, ax=ax)
    ax.set_title(sex.capitalize())
plt.show()

In [None]:
# Passenger count per ticket class. The vector is passed as x=... because
# recent seaborn no longer accepts it positionally.
ax = sns.countplot(x=df_all['Pclass'])
ax.set_title('Passengers per class')
plt.show()

In [None]:
# Median age for every (Sex, Pclass) combination, drawn as a bar chart.
# `grouped` is kept as a module-level name because a later cell reuses it.
grouped = df_all.groupby(['Sex', 'Pclass'])
median_age_by_group = grouped['Age'].median()
ax = median_age_by_group.plot(kind='bar', color='black')
ax.set(ylabel='Median Age')
plt.show()

In [None]:
# Feature preparation on the combined frame.
# NOTE(review): imputation statistics are computed over train and test rows
# together; confirm this is acceptable for the evaluation being done.

# Identifier / free-text columns are not used as features.
df_all = df_all.drop(columns=['PassengerId', 'Ticket', 'Name'])

# Cabin -> 1 when a cabin is recorded, 0 when it is missing (kept numeric
# instead of the object-dtype column the two .loc assignments produced).
df_all['Cabin'] = df_all['Cabin'].notnull().astype(int)

# Plain assignment instead of `df[col].fillna(..., inplace=True)`: the
# chained in-place form is not guaranteed to write back to the frame.
most_common_port = df_all['Embarked'].value_counts().index[0]
df_all['Embarked'] = df_all['Embarked'].fillna(most_common_port)
df_all['Fare'] = df_all['Fare'].fillna(df_all['Fare'].median())

# Fill missing ages with the median age of the passenger's (Sex, Pclass)
# group. transform() returns a result aligned to the original row index,
# unlike groupby.apply, which prepends the group keys on current pandas.
# The groupby is rebuilt here so this cell does not depend on a `grouped`
# object created in another cell.
group_median_age = df_all.groupby(['Sex', 'Pclass'])['Age'].transform('median')
df_all['Age'] = df_all['Age'].fillna(group_median_age)

df_all['Sex'] = df_all['Sex'].map({'male': 0, 'female': 1})
df_all = pd.get_dummies(df_all, columns=['Embarked'])

In [None]:
# Quick look at the engineered frame, then a full profiling report of it.
display(df_all.iloc[:4])
report = pandas_profiling.ProfileReport(df_all)
display(report)

In [None]:
# Run AutoViz directly on the training CSV file (not on df_all).
train_csv_path = '../data/train.csv'
AV = AutoViz_Class()
report_av = AV.AutoViz(train_csv_path)

In [None]:
# Split the combined frame back into its train / test rows. The boundary is
# taken from the loaded training frame rather than a hard-coded 891.
n_train = len(train_data)
features = df_all.drop('Survived', axis=1)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = MinMaxScaler()
X = scaler.fit_transform(features.iloc[:n_train].values)
y = df_all['Survived'].iloc[:n_train].values.astype(int)
X_test = scaler.transform(features.iloc[n_train:].values)

# The test rows presumably carry no label (NaN after the concat). Casting
# NaN to int produces meaningless values, so only cast when labels exist.
y_test = df_all['Survived'].iloc[n_train:].values
if not pd.isnull(y_test).any():
    y_test = y_test.astype(int)

In [None]:
# Logistic regression baseline: fit on the scaled training rows and predict
# the test rows.
lr = LogisticRegression()
lr.fit(X, y)
Y_pred = lr.predict(X_test)
# Accuracy on the data the model was fitted on is optimistic, so show the
# 5-fold cross-validated accuracy next to it: (train_acc, cv_acc).
lr.score(X, y), cross_val_score(lr, X, y, cv=5).mean()

In [None]:
# Support vector classifier with default settings.
svc = SVC()
svc.fit(X, y)
Y_pred = svc.predict(X_test)
# Accuracy on the data the model was fitted on is optimistic, so show the
# 5-fold cross-validated accuracy next to it: (train_acc, cv_acc).
svc.score(X, y), cross_val_score(svc, X, y, cv=5).mean()

In [None]:
# 3-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)
Y_pred = knn.predict(X_test)
# Accuracy on the data the model was fitted on is optimistic, so show the
# 5-fold cross-validated accuracy next to it: (train_acc, cv_acc).
knn.score(X, y), cross_val_score(knn, X, y, cv=5).mean()

In [None]:
# Gaussian naive Bayes.
gaussian = GaussianNB()
gaussian.fit(X, y)
Y_pred = gaussian.predict(X_test)
# Accuracy on the data the model was fitted on is optimistic, so show the
# 5-fold cross-validated accuracy next to it: (train_acc, cv_acc).
gaussian.score(X, y), cross_val_score(gaussian, X, y, cv=5).mean()

In [None]:
# Perceptron. It shuffles the data while training, so the seed is fixed to
# make the reported numbers repeatable across runs.
perceptron = Perceptron(random_state=0)
perceptron.fit(X, y)
Y_pred = perceptron.predict(X_test)
# Accuracy on the data the model was fitted on is optimistic, so show the
# 5-fold cross-validated accuracy next to it: (train_acc, cv_acc).
perceptron.score(X, y), cross_val_score(perceptron, X, y, cv=5).mean()

In [None]:
# Linear SVM; the seed is fixed so the reported numbers are repeatable.
linear_svc = LinearSVC(random_state=0)
linear_svc.fit(X, y)
Y_pred = linear_svc.predict(X_test)
# Accuracy on the data the model was fitted on is optimistic, so show the
# 5-fold cross-validated accuracy next to it: (train_acc, cv_acc).
linear_svc.score(X, y), cross_val_score(linear_svc, X, y, cv=5).mean()

In [None]:
# Linear model trained with stochastic gradient descent; the seed is fixed
# because the sample order is shuffled during training.
sgd = SGDClassifier(random_state=0)
sgd.fit(X, y)
Y_pred = sgd.predict(X_test)
# Accuracy on the data the model was fitted on is optimistic, so show the
# 5-fold cross-validated accuracy next to it: (train_acc, cv_acc).
sgd.score(X, y), cross_val_score(sgd, X, y, cv=5).mean()

In [None]:
# Unpruned decision tree; the seed is fixed so ties between equally good
# splits are broken the same way on every run.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X, y)
Y_pred = decision_tree.predict(X_test)
# An unpruned tree can nearly memorise its training rows, so the training
# accuracy is shown together with the 5-fold cross-validated accuracy:
# (train_acc, cv_acc).
decision_tree.score(X, y), cross_val_score(decision_tree, X, y, cv=5).mean()

In [None]:
# Random forest of 100 trees; the seed is fixed because bootstrapping and
# feature sub-sampling are random.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(X, y)
Y_pred = random_forest.predict(X_test)
# Accuracy on the data the model was fitted on is optimistic, so show the
# 5-fold cross-validated accuracy next to it: (train_acc, cv_acc).
random_forest.score(X, y), cross_val_score(random_forest, X, y, cv=5).mean()

## Logistic Regression
\begin{equation}
P\left[Y=1\ \big|\ x;\omega\right]\approx\sigma(\omega^Tx)
\end{equation}
where
\begin{equation}
\sigma(t)=\frac{1}{1+e^{-t}}
\end{equation}
$\omega$ can be obtained using maximum likelihood estimation, i.e. minimizing the negative log-likelihood:
\begin{equation}
J(\omega) = -\frac{1}{m}\sum\limits_{i=1}^{m}\left[y_i\log(\sigma(\omega^Tx_i))+(1-y_i)\log(1-\sigma(\omega^Tx_i))\right]
\end{equation}

In [None]:
def logistic_regression(X, y, alpha=1e-3, num_iter=30, random_state=1):
    """Fit a softmax (multinomial logistic) regression with per-sample SGD.

    Parameters
    ----------
    X : ndarray of shape (d, m)
        Feature matrix with one *column* per sample.
    y : integer ndarray of shape (m,)
        Class labels in ``0 .. K-1`` where ``K = max(y) + 1``.
    alpha : float, default 1e-3
        Step size of each gradient update.
    num_iter : int, default 30
        Number of passes over the samples; each pass visits every sample
        once in a freshly permuted order.
    random_state : int, default 1
        Seed for the weight initialisation and the sample permutations.

    Returns
    -------
    w : ndarray of shape (d, K)
        Learned weights; class scores for a sample ``x`` are ``w.T @ x``.
        No intercept is fitted, so append a constant feature row to ``X``
        if one is needed.
    """
    # A private generator yields the same stream as np.random.seed(seed)
    # followed by np.random.* calls, without reseeding the global NumPy RNG
    # as a side effect for the rest of the notebook.
    rng = np.random.RandomState(random_state)
    d, m = X.shape
    K = int(np.max(y)) + 1  # labels are 0..K-1
    w = rng.randn(d, K)
    one_hot_rows = np.eye(K)  # row k is the one-hot encoding of class k

    def softmax(z):
        # Subtracting the maximum leaves the result unchanged mathematically
        # but keeps exp() from overflowing to inf (and inf/inf = nan).
        shifted = np.exp(z - np.max(z))
        return shifted / np.sum(shifted)

    def grad(w, x, label):
        # Gradient of the cross-entropy for one sample: x (p - onehot)^T.
        residual = softmax(w.T @ x) - one_hot_rows[label]
        return np.outer(x, residual)

    for _ in range(num_iter):
        for j in rng.permutation(m):
            w -= alpha * grad(w, X[:, j], y[j])
    return w

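## Least Squares Ridge Classifier
$X\in \mathbb{R}^{m\times d},\ Y\in \mathbb{R}^{m\times K},\ \omega\in \mathbb{R}^{d\times K}$ (the code below stores $X$ transposed, with shape $d\times m$)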
#### closed form solution
\begin{equation}
\begin{aligned}
J(\omega) &= \|X\omega-Y\|_2^2+\lambda\|\omega\|_F^2 \\
&=(X\omega-Y)^T\cdot(X\omega-Y)+\lambda\omega^T\omega \\
&=(\omega^TX^T-Y^T)\cdot(X\omega-Y)+\lambda\omega^T\omega \\
&=\omega^TX^TX\omega-\omega^TX^TY-Y^TX\omega+Y^TY+\lambda\omega^T\omega
\end{aligned}
\end{equation}
To minimize: $\frac{\partial J}{\partial\omega}=0$
\begin{equation}
\begin{aligned}
&\frac{\partial J}{\partial\omega}=2X^TX\omega-2X^TY+2\lambda\omega=0 \\
\implies & 2(X^TX+\lambda I)\omega=2X^TY \\
\implies & \omega=(X^TX+\lambda I)^{-1}X^TY
\end{aligned}
\end{equation}

In [None]:
def ridge_classifier(X, y, lambd=1e-4):
    """Closed-form least-squares ridge classifier on one-hot targets.

    Parameters
    ----------
    X : ndarray of shape (d, m)
        Feature matrix with one *column* per sample.
    y : integer ndarray of shape (m,)
        Class labels in ``0 .. k-1`` where ``k = max(y) + 1``.
    lambd : float, default 1e-4
        Ridge penalty added to the diagonal of ``X @ X.T``.

    Returns
    -------
    w : ndarray of shape (d, k)
        Solution of ``(X X^T + lambd I) w = X Y`` with ``Y`` the one-hot
        encoding of ``y``.
    """
    d, m = X.shape
    k = int(np.max(y)) + 1
    targets = np.eye(k)[y]  # (m, k) one-hot encoding of the labels
    gram = X @ X.T + lambd * np.eye(d)
    # Solve the linear system directly instead of forming the explicit
    # inverse: same solution, cheaper and numerically better conditioned.
    return np.linalg.solve(gram, X @ targets)

In [None]:
def error(X, y, w):
    """Return the classification accuracy of the linear model ``w``.

    Despite its name, this function returns the fraction of *correct*
    predictions (accuracy), not the error rate.

    Parameters
    ----------
    X : ndarray of shape (d, m)
        Feature matrix with one *column* per sample.
    y : ndarray of shape (m,)
        True class labels.
    w : ndarray of shape (d, K)
        Weight matrix; the predicted class is the arg-max of ``w.T @ x``.

    Returns
    -------
    float
        Share of samples whose predicted class equals ``y``.
    """
    scores = w.T @ X
    y_pred = np.argmax(scores, axis=0)
    # np.shape(y) is a tuple, so dividing by it produced a 1-element array;
    # the mean of the boolean matches gives the same number as a scalar.
    return float(np.mean(y_pred == y))

In [None]:
# 5-fold cross-validation of the two hand-written classifiers. Both expect
# samples in columns, hence the transposes.
scores_lr = []
scores_ls = []

# random_state only has an effect together with shuffle=True; recent
# scikit-learn raises a ValueError when it is given without shuffling.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (tr, val) in enumerate(kfold.split(X, y), start=1):
    X_train, X_val = X[tr], X[val]
    y_train, y_val = y[tr], y[val]

    best_W_LR = logistic_regression(
        X_train.T, y_train, alpha=1e-3, num_iter=300, random_state=42)
    val_acc_LR = error(X_val.T, y_val, best_W_LR)
    scores_lr.append(val_acc_LR)
    print(f'Validation acc LR: Fold {fold}:', val_acc_LR)

    W_LS = ridge_classifier(X_train.T, y_train, lambd=1e-4)
    val_acc_LS = error(X_val.T, y_val, W_LS)
    scores_ls.append(val_acc_LS)
    print(f'Validation acc LS: Fold {fold}:', val_acc_LS)

print('-------------------------------')
print("Accuracy Logistic Regression: %0.2f (+/- %0.2f)" %
      (np.mean(scores_lr), np.std(scores_lr) * 2))
print("Accuracy Least Squares Ridge: %0.2f (+/- %0.2f)" %
      (np.mean(scores_ls), np.std(scores_ls) * 2))

In [None]:
def test_clfs(clfs):
    """Cross-validate each classifier class in ``clfs`` and print a summary.

    Uses the notebook-level ``X`` and ``y`` arrays as the data set.

    Parameters
    ----------
    clfs : iterable of estimator classes
        Classes (not instances) that can be constructed without arguments.
        Estimators exposing a ``random_state`` parameter are seeded with 0;
        the others are used with their defaults.

    Prints, for each classifier, its repr, the mean 5-fold accuracy with a
    two-standard-deviation band, and the elapsed wall-clock time.
    """
    for clf_cls in clfs:
        # `time` is the module here, so the timer must be time.time().
        start = time.time()
        clf = clf_cls()
        # Not every estimator accepts random_state; only set it when the
        # estimator actually has the parameter.
        if 'random_state' in clf.get_params():
            clf.set_params(random_state=0)
        scores = cross_val_score(clf, X, y, cv=5)
        elapsed = time.time() - start
        print(str(clf), 'results:')
        print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
        print('Time: %0.2fs' % elapsed)

In [None]:
# pycaret experiment on the raw training frame, without the identifier and
# free-text columns.
data = train_data.drop(columns=['Name', 'Ticket', 'PassengerId'])
# session_id seeds pycaret's internal randomness so that the model ranking
# below can be reproduced on a fresh kernel.
clf = setup(data=data, target='Survived', session_id=42)
top3 = compare_models(n_select=3, exclude=['catboost'])

In [None]:
# Tune each of the three selected models in turn.
tuned_top3 = list(map(tune_model, top3))

In [None]:
# Wrap every tuned model in a bagging ensemble.
bagged_tuned_top3 = list(
    ensemble_model(tuned, method='Bagging') for tuned in tuned_top3
)

In [None]:
# Blend the three untuned top models into a single estimator.
blender = blend_models(estimator_list=top3)

In [None]:
# Stack the 2nd and 3rd models, with the top-ranked model as meta learner.
stacker = stack_models(
    estimator_list=top3[1:],
    meta_model=top3[0],
)

In [None]:
# Let pycaret choose among the models built in this session, by accuracy.
best_model = automl(optimize='Accuracy')

In [None]:
# Persist the selected model under the name 'model' via pycaret's save_model.
# NOTE(review): this saves best_model, not the finalized `model` created in a
# later cell — confirm that this is the artifact you want to keep.
save_model(best_model, 'model')

In [None]:
# Draw the 'boundary' plot for the selected model.
plot_model(best_model, plot='boundary')

In [None]:
# Open pycaret's evaluation view for the selected model.
evaluate_model(best_model)

In [None]:
# Called without `data`, so predictions are made on whatever data set
# pycaret uses by default — presumably the hold-out split created by setup();
# verify against the pycaret version in use.
predict_model(best_model)

In [None]:
# Finalize the selected model and keep the result as `model`.
# NOTE(review): finalize_model presumably refits on all data passed to
# setup(), including the hold-out part — confirm for the pycaret version used.
model = finalize_model(best_model)

In [None]:
# Build the submission file from the *finalized* model. The original cell
# predicted with best_model, so the refit produced by finalize_model() was
# never used.
predictions = predict_model(model, data=test_data)
# rename() returns a new frame, which avoids assigning a column on a slice
# of the prediction frame (SettingWithCopyWarning).
# NOTE(review): 'Label' is the prediction column name this notebook assumes;
# confirm it matches the installed pycaret version.
y_test_pred = (
    predictions[['PassengerId', 'Label']]
    .rename(columns={'Label': 'Survived'})
)
display(y_test_pred.head())
y_test_pred.to_csv('../data/my_submission.csv', header=True, index=False,
                   encoding='utf-8')