In [24]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


## LinearClassifier models
### Rigde classification
The loss function is given by
 $$L(w)=\|X_{\text{train}}w -y_{\text{train}}\|_2^2+\alpha \|w\|^2_2$$
To minimize the loss function $L$, we use the gradient method, which is defined as follow
$$w_{n+1}=w_n+\lambda \nabla L(w_n)$$

### Logistic regression
The loss function is defined by 
$$L(w)=\frac{1}{n}\sum \left( -y_i\log (f(w_ix_i))-(1-y_i)\log (1-f(w_ix_i)) \right)+ \text{penalty function} $$
where $f$ is the logistic function, $f(s)=\frac{1}{1+e^{-s}}.$
- The penalty function: $l1, l2 \text{ or Elasticnet} $.
-  Optimization method: lbfgs, liblinear, newton-cg, newton-cholesky, sag, saga. Default is 'lbfgs'

### SGDClassification
 - We use the SGD (mini-batch) to optimize the loss function.
 -Loss function: hinge (that gives the SVM), log-loss (logistic regression), perceptron.
 -Penalty function: $l_1, l_2$ or elasticnet.
 
### Support vector machine
The problem of finding the optimal margin 
\begin{align*}
\min_{\gamma, w, b} &\frac{1}{2} \|w\|^2\\
\text{s.t: }& y^{(i)}(wx^{(i)}+b)\geq 1
\end{align*}
The Lagrangian function is given by
$$L(w,b,\alpha)=\frac{1}{2} \|w\|^2-\sum_{i=1}^{m} \alpha_i\left[y^{(i)} (wx^{(i)}+b)-1\right].$$
By solving the dual optimizatio problem, we obtain the problem
 \begin{align*}
 \max_{\alpha} &\sum_{i=1}^{m} \alpha_i-\frac{1}{2}\sum_{i,j=1}^{m} y^iy^j\alpha_i \alpha_j x^i x^j\\
 \text{s.t: }& \sum \alpha_i y^{(i)}=0\\
 &\alpha_i\geq 0.
 \end{align*}
 The product $x^i x^j$ can by generared by any produit scalar $<x^i, x^j>=\Omega(x^i)\Omega^T(x^j)$  
#### Regularization and non-separable
We study the loss function in the form following
 \begin{align*}
 \min_{\gamma, w, b} &\frac{1}{2} \|w\|^2+C\sum \xi_i \\
 \text{s.t: }& y^{(i)}(wx^{(i)}+b)\geq (1-\xi_i)\\
 & \psi_i\geq 0.
 \end{align*}
 The dual optimization problem of Lagrange function is given by
 \begin{align*}
 \max_{\alpha} &\sum_{i=1}^{m} \alpha_i-\frac{1}{2}\sum_{i,j=1}^{m} y^iy^j\alpha_i \alpha_j x^i x^j\\
 \text{s.t: }& \sum \alpha_i y^{(i)}=0\\
 &0\leq \alpha_i\leq C.
 \end{align*}


In [25]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [26]:
train=pd.read_csv('/kaggle/input/titanic/train.csv')
test= pd.read_csv('/kaggle/input/titanic/test.csv')

In [27]:
train=train.drop(['Cabin', 'Ticket', 'PassengerId'], axis=1)
test=test.drop(['Cabin', 'Ticket', 'PassengerId'], axis=1)

In [28]:
train['Title']=train['Name'].str.extract(r'(\w+)\.')
test['Title']=test['Name'].str.extract(r'(\w+)\.')
train=train.drop('Name', axis=1)
test=test.drop('Name', axis=1)

In [29]:
train['Title']=train['Title'].replace([ 'Lady','Don', 'Rev', 'Dr', 'Major','Col', 'Sir','Capt', 'Countess','Jonkheer'], 'rare')
train['Title']=train['Title'].replace(['Mlle','Ms'], 'Mrs')
train['Title']=train['Title'].replace('Mme', 'Miss')
test['Title']=test['Title'].replace([ 'Lady','Don','Dona', 'Rev', 'Dr', 'Major','Col', 'Sir','Capt', 'Countess','Jonkheer'], 'rare')
test['Title']=test['Title'].replace(['Mlle','Ms'], 'Mrs')
test['Title']=test['Title'].replace('Mme', 'Miss')

### Label Encoder

In [30]:
Enco=LabelEncoder()
Enco.fit(train['Sex'])
train['Sex']=Enco.transform(train['Sex'])
test['Sex']=Enco.transform(test['Sex'])

In [31]:
Enco.fit(train['Embarked'])
train['Embarked']=Enco.transform(train['Embarked'])
test['Embarked']=Enco.transform(test['Embarked'])

In [32]:
Enco.fit(train['Title'])
train['Title']=Enco.transform(train['Title'])
test['Title']=Enco.transform(test['Title'])

### KNN imputer
Using k-Nearest Neighbors to complete the missing values

In [33]:
from sklearn.impute import KNNImputer
ImpKnn=KNNImputer(n_neighbors=10)
ImpKnn.fit(train)
train['Age']=ImpKnn.fit_transform(train[['Age']])

In [34]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

In [35]:
X=train.drop(['Survived'], axis=1)
y=train['Survived']

### Models

In [36]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

### Linear models
The linear models set of methods in which the target value is expected to be a linear combination of the features. In mathematical notation, the prediced function is given by 
$$ y_{train}=\omega_0+\omega_1 x_1+\omega_2 x_2+... +\omega_n x_n $$
The loss function is given by
$$L(w)=LossFunction (X_{train}, y_{train}, w)+\text{penalty function}.$$
#### Rigde
Penalty function is defined by $p(\omega)=\|\omega\|^2_2$, thus 
$$L(w)=\frac{1}{m}\|wX-y\|^2+\alpha\|\omega\|^2_2.$$
#### Lasso
$p(\omega)=\|\omega\|_1$, we obtain that
$$L(w)=\frac{1}{m}\|wX-y\|^2+\alpha\|\omega\|_1.$$
#### Elastic
$p(\omega)=\frac{1-\rho}{2}\|\omega\|^2_2+\rho \|\omega\|_1$. The loss function is defined by
$$L(w)=\frac{1}{m}\|wX-y\|^2+\alpha\|\omega\|^2_2+\beta \|\omega\|_1.$$
     

### Rigde classification
The loss function is given by
 $$L(w)=\|X_{\text{train}}w -y_{\text{train}}\|_2^2+\alpha \|w\|^2_2$$
To minimize the loss function $L$, we use the gradient method, which is defined as follow
$$w_{n+1}=w_n+\lambda \nabla L(w_n)$$

In [37]:
from sklearn.metrics import confusion_matrix, auc, accuracy_score,roc_auc_score, precision_score, roc_curve
from sklearn import linear_model
LnRg=linear_model.RidgeClassifier()
LnRg.fit(X_train, y_train)

In [38]:
print('Training set')
print('Matrix confusion: ', confusion_matrix(y_train,LnRg.predict(X_train)))
print('accuracy score: ', accuracy_score(y_train, LnRg.predict(X_train)))
print('Validation set')
print(confusion_matrix(y_val, LnRg.predict(X_val)))
print('Accuracy score', accuracy_score(y_val, LnRg.predict(X_val)))

Training set
Matrix confusion:  [[383  58]
 [ 82 189]]
accuracy score:  0.8033707865168539
Validation set
[[93 15]
 [22 49]]
Accuracy score 0.7932960893854749


### Logistic regression
The loss function is defined by 
$$L(w)=\frac{1}{n}\sum \left( -y_i\log (f(w_ix_i))-(1-y_i)\log (1-f(w_ix_i)) \right)+ \text{penalty function} $$
where $f$ is the logistic function, $f(s)=\frac{1}{1+e^{-s}}.$
- The penalty function: $l1, l2 \text{ or Elasticnet} $.
-  Optimization method: lbfgs, liblinear, newton-cg, newton-cholesky, sag, saga. Default is 'lbfgs'

In [39]:
from sklearn.linear_model import LogisticRegression
LogReg=LogisticRegression(max_iter=10000)
LogReg.fit(X_train, y_train)

In [40]:
print('Resultats on the training set:')
print('Congusion matrix: ', confusion_matrix(y_train, LogReg.predict(X_train)))
print('Accuracy score: ', accuracy_score(y_train, LogReg.predict(X_train)))
print('Roc auc score: ', roc_auc_score(y_train,  LogReg.predict(X_train)))
print('Resultats on the val set:')
print('Congusion matrix: ', confusion_matrix(y_val, LogReg.predict(X_val)))
print('Accuracy score: ', accuracy_score(y_val, LogReg.predict(X_val)))
print('Roc-AUC score: ',roc_auc_score(y_val,  LogReg.predict(X_val)))

Resultats on the training set:
Congusion matrix:  [[387  54]
 [ 82 189]]
Accuracy score:  0.8089887640449438
Roc auc score:  0.7874839972889525
Resultats on the val set:
Congusion matrix:  [[91 17]
 [17 54]]
Accuracy score:  0.8100558659217877
Roc-AUC score:  0.8015779864371413


### SGDClassification
 - We use the SGD (mini-batch) to optimize the loss function.
 -Loss function: hinge (that gives the SVM), log-loss (logistic regression), perceptron.
 -Penalty function: $l_1, l_2 \text{ or elasticnet}$


In [41]:
from sklearn.linear_model import SGDClassifier

In [42]:
SgdClf=SGDClassifier(loss='log_loss', tol=None, max_iter=200000)

In [43]:
SgdClf.fit(X_train, y_train)

In [44]:
print('Resultats on the training set:')
print('Congusion matrix: ', confusion_matrix(y_train, SgdClf.predict(X_train)))
print('Accuracy score: ', accuracy_score(y_train, SgdClf.predict(X_train)))
print('Roc auc score: ', roc_auc_score(y_train,  SgdClf.predict(X_train)))
print('Resultats on the val set:')
print('Congusion matrix: ', confusion_matrix(y_val, SgdClf.predict(X_val)))
print('Accuracy score: ', accuracy_score(y_val, SgdClf.predict(X_val)))
print('Roc-AUC score: ',roc_auc_score(y_val,  SgdClf.predict(X_val)))

Resultats on the training set:
Congusion matrix:  [[396  45]
 [ 91 180]]
Accuracy score:  0.8089887640449438
Roc auc score:  0.781082912869945
Resultats on the val set:
Congusion matrix:  [[94 14]
 [24 47]]
Accuracy score:  0.7877094972067039
Roc-AUC score:  0.7661711006781429


### Support vector machine
The problem of finding the optimal margin 
\begin{align*}
\min_{\gamma, w, b} &\frac{1}{2} \|w\|^2\\
\text{s.t: }& y^{(i)}(wx^{(i)}+b)\geq 1
\end{align*}
The Lagrangian function is given by
$$L(w,b,\alpha)=\frac{1}{2} \|w\|^2-\sum_{i=1}^{m} \alpha_i\left[y^{(i)} (wx^{(i)}+b)-1\right].$$
By solving the dual optimizatio problem, we obtain the problem
 \begin{align*}
 \max_{\alpha} &\sum_{i=1}^{m} \alpha_i-\frac{1}{2}\sum_{i,j=1}^{m} y^iy^j\alpha_i \alpha_j x^i x^j\\
 \text{s.t: }& \sum \alpha_i y^{(i)}=0\\
 &\alpha_i\geq 0.
 \end{align*}
 The product $x^i x^j$ can by generared by any produit scalar $<x^i, x^j>=\Omega(x^i)\Omega^T(x^j)$  
#### Regularization and non-separable
We study the loss function in the form following
 \begin{align*}
 \min_{\gamma, w, b} &\frac{1}{2} \|w\|^2+C\sum \xi_i \\
 \text{s.t: }& y^{(i)}(wx^{(i)}+b)\geq (1-\xi_i)\\
 & \psi_i\geq 0.
 \end{align*}
 The dual optimization problem of Lagrange function is given by
 \begin{align*}
 \max_{\alpha} &\sum_{i=1}^{m} \alpha_i-\frac{1}{2}\sum_{i,j=1}^{m} y^iy^j\alpha_i \alpha_j x^i x^j\\
 \text{s.t: }& \sum \alpha_i y^{(i)}=0\\
 &0\leq \alpha_i\leq C.
 \end{align*}

In [45]:
from sklearn.svm import SVC
svc=SVC(kernel='linear', tol=0.00001)
svc.fit(X_train, y_train)

In [46]:
print('Resultats on the training set:')
print('Congusion matrix: ', confusion_matrix(y_train, svc.predict(X_train)))
print('Accuracy score: ', accuracy_score(y_train, svc.predict(X_train)))
print('Roc auc score: ', roc_auc_score(y_train,  svc.predict(X_train)))
print('Resultats on the val set:')
print('Congusion matrix: ', confusion_matrix(y_val, svc.predict(X_val)))
print('Accuracy score: ', accuracy_score(y_val, svc.predict(X_val)))
print('Roc-AUC score: ',roc_auc_score(y_val,  svc.predict(X_val)))

Resultats on the training set:
Congusion matrix:  [[376  65]
 [ 86 185]]
Accuracy score:  0.7879213483146067
Roc auc score:  0.7676322681594161
Resultats on the val set:
Congusion matrix:  [[92 16]
 [23 48]]
Accuracy score:  0.7821229050279329
Roc-AUC score:  0.7639540949400104
