In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data = pd.read_csv("./datasets/titanic/train.csv").drop(["PassengerId","Ticket","Name","Cabin"],axis=1)
test_data = pd.read_csv("./datasets/titanic/test.csv").drop(["PassengerId","Ticket","Name","Cabin"],axis=1)

In [3]:
print(list(train_data.keys()))
#len(train_data)

['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


In [4]:
k = 0
ind1 = []
for i in train_data["Embarked"]:
    if i != "S" and i != "C" and i != "Q":
        ind1.append(k)
    k += 1
train_data = train_data.drop(ind1)

In [5]:
Xtr = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].iloc[0:711]
Ytr = train_data[['Survived']].iloc[0:711]
Xte = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].iloc[711:887]
Yte = train_data[['Survived']].iloc[711:887]

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown
    def fit(self, X, y=None):
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' ""or 'ordinal', got %s")
            raise ValueError(template % self.handle_unknown)
        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or ""'ignore', got %s")
            raise ValueError(template % self.handle_unknown)
        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"" encoding='ordinal'")
        X = check_array(X, dtype=np.object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"" during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))
        self.categories_ = [le.classes_ for le in self._label_encoders_]
        return self
    def transform(self, X):
        X = check_array(X, accept_sparse='csc', dtype=np.object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=np.int)
        X_mask = np.ones_like(X, dtype=np.bool)
        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])
            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"" during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])
        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)
        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)
        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]
        out = sparse.csc_matrix((data, (row_indices, column_indices)),shape=(n_samples, indices[-1]),dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [8]:
data_cat_tr_pcl = train_data["Pclass"]
data_cat_tr_emb = train_data["Embarked"]
data_cat_tr_sex = train_data["Sex"]
data_cat_te_pcl = test_data["Pclass"]
data_cat_te_emb = test_data["Embarked"]
data_cat_te_sex = test_data["Sex"]

In [9]:
cat_encoder = CategoricalEncoder(encoding="onehot-dense")
data_cat_tr_pcl_1hot = cat_encoder.fit_transform(data_cat_tr_pcl.values.reshape(-1, 1))
data_cat_tr_emb_1hot = cat_encoder.fit_transform(data_cat_tr_emb.values.reshape(-1, 1))
data_cat_tr_sex_1hot = cat_encoder.fit_transform(data_cat_tr_sex.values.reshape(-1, 1))
data_cat_te_pcl_1hot = cat_encoder.fit_transform(data_cat_te_pcl.values.reshape(-1, 1))
data_cat_te_emb_1hot = cat_encoder.fit_transform(data_cat_te_emb.values.reshape(-1, 1))
data_cat_te_sex_1hot = cat_encoder.fit_transform(data_cat_te_sex.values.reshape(-1, 1))

In [10]:
#cat_encoder.categories_

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer,StandardScaler
num_attribs = ["Age","Parch","Fare","SibSp"]
cat_attribs = ["Pclass","Embarked","Sex"]
num_pipeline = Pipeline([('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),('std_scaler', StandardScaler()),])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),])

In [13]:
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[("num_pipeline", num_pipeline),("cat_pipeline", cat_pipeline),])

In [14]:
Xtr_prepared = full_pipeline.fit_transform(Xtr)
Xte_prepared = full_pipeline.fit_transform(Xte)
Ytr_prepared = (Ytr["Survived"] == 1).values
Yte_prepared = (Yte["Survived"] == 1).values

In [15]:
from sklearn.svm import SVC,SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

lin_reg = LinearRegression(n_jobs = -1)
svm1_class = SVC(kernel="linear",)
svm2_class = SVC(kernel="rbf")
tree_clf = DecisionTreeClassifier(max_depth=4,)
ran_for = RandomForestClassifier(n_jobs = -1)

lin_reg.fit(Xtr_prepared,Ytr["Survived"].values)
svm1_class.fit(Xtr_prepared,Ytr_prepared)
svm2_class.fit(Xtr_prepared,Ytr_prepared)
tree_clf.fit(Xtr_prepared,Ytr_prepared)
ran_for.fit(Xtr_prepared,Ytr_prepared)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
Ypredlin = lin_reg.predict(Xte_prepared)
Ypredsvm1 = svm1_class.predict(Xte_prepared)
Ypredsvm2 = svm2_class.predict(Xte_prepared)
Ypredtree = tree_clf.predict(Xte_prepared)
Ypredfor = ran_for.predict(Xte_prepared)
print(Ypredfor)

[False False False False  True  True False False  True False False False
 False False  True False False  True  True  True False False False False
 False False  True  True False False  True False  True False False  True
 False False  True  True False False  True  True  True False False  True
 False False False  True False  True False False False False False False
  True  True  True False False  True False  True  True  True  True False
 False False  True False False False False False False False False False
  True  True False  True False  True  True  True False  True False  True
 False  True False False False False  True False  True False False False
  True False False  True False False  True  True False False  True  True
 False False  True False  True  True False False False  True False False
 False False False False  True False False  True  True  True  True  True
 False  True  True False False  True False False  True  True False False
  True False  True False False  True False False Fa

In [17]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

Y_tr_pred = cross_val_predict(ran_for, Xtr_prepared,Ytr_prepared, cv=3)
confusion_matrix(Ytr_prepared, Y_tr_pred)

array([[377,  57],
       [ 92, 185]])

In [18]:
from sklearn.metrics import precision_score, recall_score, f1_score
precision_score(Ytr_prepared, Y_tr_pred)

0.7644628099173554

In [19]:
recall_score(Ytr_prepared, Y_tr_pred)

0.6678700361010831

In [20]:
f1_score(Ytr_prepared, Y_tr_pred)

0.7129094412331407