In [1]:
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score, GridSearchCV, KFold
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from imblearn.over_sampling import SMOTE, SVMSMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, SelectFpr, f_regression, f_classif
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from collections import Counter
import seaborn as sns

SEED = 5678
np.random.seed(SEED)

In [3]:
# Folder holding the Kaggle Titanic CSVs. Set the TITANIC_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's location, so existing setups keep working unchanged.
DATA_DIR = os.environ.get('TITANIC_DATA_DIR', r'C:\Users\Me\Kaggle\Titanic_revisited\data')
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
print(train.shape, test.shape)

(891, 12) (418, 11)


# SOLS Classifier
> This is a custom classifier built from scikit-learn's project template for custom estimators.
> `fit` fits a statsmodels OLS model, and `predict` uses that same fitted model to make a raw
> prediction. The median of the predictions is subtracted from each prediction to create an estimated
> residual. The estimated residual is then divided by the standard deviation of the residuals from the fit
> (summarised by its mean or median, depending on `est_method`) to create an estimated studentized residual.
> These studentized residuals are then compared against the hyperparameter `threshold` to determine the label:
> values greater than or equal to the threshold are labeled 1 (True), and values less than the threshold are labeled 0 (False).

In [4]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
import pandas as pd
from statsmodels.stats.outliers_influence import OLSInfluence
from statsmodels.regression.linear_model import OLS

class SOLSClassifier(BaseEstimator, ClassifierMixin ):
    """Binary classifier that thresholds pseudo-studentized OLS predictions.

    ``fit`` regresses ``y`` on ``X`` with statsmodels ``OLS``; ``X`` is passed
    as the design matrix unchanged (no intercept column is added here).
    ``predict`` centres the OLS predictions on their median, divides them by a
    summary (mean or median) of the training residual standard deviations, and
    labels a sample ``1`` when the scaled value is >= ``threshold``, else ``0``.

    Parameters
    ----------
    threshold : int or float
        Decision threshold applied to the estimated studentized residuals.
        The default is a non-numeric placeholder kept for backward
        compatibility; a numeric value must be supplied before ``predict``.
    est_method : {'mean', 'median'}
        How the per-observation residual standard deviations from the fit are
        collapsed into the single scale used by ``predict``.  The default is a
        placeholder kept for backward compatibility.

    Attributes
    ----------
    classes_ : ndarray
        Labels seen during ``fit``.
    X_, y_ : pandas.DataFrame
        Training design matrix and target as passed to OLS.
    ols_mod, ols_result
        The statsmodels OLS model and its fitted result.
    preds : list of int
        Labels returned by the most recent ``predict`` call.
    """

    # Accepted values of ``est_method``.
    _EST_METHODS = ('mean', 'median')

    def __init__(self, threshold = 'threshold', est_method = 'est_method'):
        # Only store the hyperparameters.  The estimator has no randomness of
        # its own, so it no longer reseeds NumPy's global RNG from the
        # notebook-level SEED: that hidden side effect fired on every
        # construction (including sklearn clones).
        self.threshold = threshold
        self.est_method = est_method

    def fit(self, X, y):
        """Fit the underlying OLS model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        # statsmodels is fed DataFrames so its results carry pandas indexes
        self.X_ = pd.DataFrame(X)
        self.y_ = pd.DataFrame(y)

        # Fit OLS model
        self.ols_mod = OLS(endog = self.y_, exog = self.X_)
        self.ols_result = self.ols_mod.fit()

        # Return the classifier
        return self

    def predict(self, X):
        """Label each row of ``X`` as 1 or 0.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        list of int
            ``1`` where the estimated studentized residual is >= ``threshold``,
            ``0`` otherwise.  The same list is stored on ``self.preds``.

        Raises
        ------
        ValueError
            If ``est_method`` is not ``'mean'`` or ``'median'``.
        TypeError
            If ``threshold`` is not a number.
        """
        # Check if fit had been called
        check_is_fitted(self, ['X_', 'y_'])

        # Fail with a clear message instead of an unbound-variable error later.
        if self.est_method not in self._EST_METHODS:
            raise ValueError(
                "est_method must be one of {}; got {!r}".format(self._EST_METHODS, self.est_method))
        if not isinstance(self.threshold, (int, float, np.number)):
            raise TypeError(
                "threshold must be a number; got {!r}".format(self.threshold))

        # Input validation
        X = check_array(X)
        X_n = pd.DataFrame(X)

        # OLS prediction
        prediction = self.ols_result.predict(X_n)

        # Per-observation standard deviation of the training residuals
        std = OLSInfluence(self.ols_result).resid_std
        if self.est_method == 'mean':
            scale = np.nanmean(std)
        else:
            scale = np.nanmedian(std)

        # Estimated residual: prediction minus the median prediction.  Both
        # methods use this centred value; the 'median' branch previously
        # divided the raw prediction, contradicting the 'mean' branch and the
        # documented algorithm.
        estimated_residual = np.asarray(prediction, dtype=float)
        estimated_residual = estimated_residual - np.nanmedian(estimated_residual)

        # Estimated studentized residual
        stud_res = estimated_residual / scale

        # Threshold into plain Python ints (NaN compares False -> 0).
        self.preds = (stud_res >= self.threshold).astype(int).tolist()

        return self.preds

# Functions

In [5]:
# Template for the metrics line printed by test_classifier.  Every float field
# takes its precision from the ``display_precision`` keyword passed to format().
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
# Template for the confusion-matrix counts line (integers padded to width 4).
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)

#Get classifier metrics, based on DAND
def test_classifier(clf, dataset, feature_list, folds = 20):
    """Cross-validate ``clf`` and print pooled binary-classification metrics.

    The dataset dictionary is converted with ``featureFormat`` (keys sorted)
    and split by ``targetFeatureSplit`` (first feature is the label).  The
    classifier is refit on each of ``folds`` stratified shuffle splits (seeded
    with ``SEED``, scikit-learn's default test fraction) and the confusion
    counts of all folds are summed before the metrics are computed.

    Parameters
    ----------
    clf : estimator with ``fit(X, y)`` and ``predict(X)``
    dataset : dict
        ``{row_key: {feature_name: value}}`` as produced by ``tester_prep``.
    feature_list : sequence of str
        Feature names; the first one is the 0/1 label.
    folds : int, default 20
        Number of shuffle splits.

    Returns
    -------
    None
        Results are printed.  If a metric is undefined (zero denominator) a
        notice is printed instead.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    sss = StratifiedShuffleSplit(n_splits = folds, random_state = SEED)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in sss.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train   = [labels[ii] for ii in train_idx]
        features_test  = [features[jj] for jj in test_idx]
        labels_test    = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print ("Warning: Found a predicted label not == 0 or 1.")
                print ("All predictions should take value 0 or 1.")
                print ("Evaluating performance for processed predictions:")
                # Stops scoring the current fold only; later folds still run.
                break

    total_predictions = true_negatives + false_negatives + false_positives + true_positives
    # Only a zero denominator is an expected failure here.  Any other error
    # (bad estimator repr, formatting bug, Ctrl-C) must propagate instead of
    # being reported as a divide by zero.
    try:
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        print ("Got a divide by zero when trying out:", clf)
        print ("Precision or recall may be undefined due to a lack of true positive predicitons.")
        return

    print (clf)
    print (PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
    print (RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
    print ("")


#convert dictionary to numpy array of features, from DAND
def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """Convert ``{key: {feature: value}}`` into a 2-D float array.

    Parameters
    ----------
    dictionary : dict
        Maps each row key to a dict of feature values.
    features : sequence of str
        Feature names to extract, in output column order.
    remove_NaN : bool, default True
        Replace the string ``"NaN"`` with 0 before the float conversion.
    remove_all_zeroes : bool, default True
        Skip rows whose tested values are all zero.
    remove_any_zeroes : bool, default False
        Skip rows in which any tested value is zero.
    sort_keys : bool or str, default False
        ``True`` iterates the keys in sorted order; a string is treated as the
        path of a pickle file holding the key order; ``False`` keeps the
        dictionary's own order.

    Returns
    -------
    numpy.ndarray or None
        One row per retained key.  ``None`` (after printing a message) when a
        requested key/feature is missing.

    Notes
    -----
    When the first feature is ``'poi'`` it is excluded from the zero tests.
    """
    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # sort_keys at trusted files.  The handle is now closed deterministically.
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    return_list = []
    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                print ("error: key ", feature, " not present")
                return
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list

        # Values are floats by now, so only numeric zero checks are meaningful
        # (the old comparisons against the string "NaN" could never match).
        append = True
        ### drop the data point when every tested feature is zero
        if remove_all_zeroes:
            append = any(item != 0 for item in test_list)
        ### drop the data point when any tested feature is zero
        if remove_any_zeroes and any(item == 0 for item in test_list):
            append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)

# prepares data for tester returns list and dict 
def tester_prep(dfn):
    features_list = dfn.columns.values
    data_dict = dfn.to_dict('index')
    return features_list, data_dict

#split the FIRST element of every row off as the label, from DAND
def targetFeatureSplit( data ):
    """Separate labels from features.

    For each row in ``data`` the first element becomes the target and the
    remaining elements become that row's feature vector.  Returns
    ``(target, features)`` as two parallel lists.
    """
    rows = list(data)
    target = [row[0] for row in rows]
    features = [row[1:] for row in rows]
    return target, features

#Custom Round
def custom_round(x, base=5):
    """Round ``x`` to the nearest multiple of ``base`` (uses built-in ``round``)."""
    multiples = round(float(x) / base)
    return base * multiples

#fits model to kfolds and returns column with predictions
def fit_folds(model, X_train, y_train,  n_folds):
    """Return out-of-fold predictions as an ``(n_rows, 1)`` array.

    The rows are split into ``n_folds`` shuffled folds (seeded with ``SEED``).
    For each fold the model is refit on the other folds and its predictions
    for the held-out rows are written into the matching positions of the
    result.  ``X_train`` and ``y_train`` must support ``.iloc``.
    """
    splitter = KFold(n_splits = n_folds, shuffle = True, random_state = SEED)
    out_of_fold = np.zeros((X_train.shape[0], 1))

    for fit_idx, holdout_idx in splitter.split(X_train):
        model.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
        holdout_pred = model.predict(X_train.iloc[holdout_idx])
        # column_stack turns the 1-D predictions into a single column
        out_of_fold[holdout_idx] = np.column_stack([holdout_pred])

    return out_of_fold

# New Data

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
# Drop Columns
PassengerId =np.array(test["PassengerId"]).astype(int)
train.drop(labels = [ "PassengerId",  "Cabin"], axis = 1, inplace = True)
test.drop(labels = [ "PassengerId",  "Cabin"], axis = 1, inplace = True)

# Coerce only the columns that are meant to be numeric.  Applying
# pd.to_numeric(errors='coerce') to every column turned Name, Sex, Ticket and
# Embarked into NaN, which breaks the ticket-frequency, title and dummy-encoding
# cells that follow.
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cols = ['Survived'] + numeric_features
train[cols] = train[cols].apply(pd.to_numeric, errors='coerce')
test_cols = numeric_features
test[test_cols] = test[test_cols].apply(pd.to_numeric, errors='coerce')

In [9]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
#Split to X,y
# y is the Survived label; X is every remaining column (train itself is untouched).
y = train['Survived'].copy()
X = train.drop(columns = ['Survived'])
print(y.shape, X.shape)

(891,) (891, 9)


In [11]:
X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [12]:
#ticket frequency
for frame in (X, test):
    # How many rows of this same frame share the row's ticket; the raw ticket
    # string is dropped once it has been counted.
    frame['Ticket_Frequency'] = frame.groupby('Ticket')['Ticket'].transform('count')
    frame.drop(labels = ["Ticket"], axis = 1, inplace = True)

In [13]:
train.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [14]:
#Extract title before dropping name
# Integer codes for the common titles; every other title (Lady, Countess,
# Capt, Col, Don, Dr, Major, Rev, Sir, Jonkheer, Dona, ...) is coded as rare.
TITLE_CODES = {"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2}
RARE_TITLE_CODE = 3

def encode_titles(names):
    """Map a Series of 'Surname, Title. Given names' strings to title codes.

    The title is the text between the first comma and the following period.
    The result keeps the index of ``names`` so it aligns row by row when
    assigned back.  Titles missing from ``TITLE_CODES`` (or names that cannot
    be parsed) get ``RARE_TITLE_CODE`` instead of crashing the int conversion.
    """
    titles = names.str.split(",").str[1].str.split(".").str[0].str.strip()
    return titles.map(TITLE_CODES).fillna(RARE_TITLE_CODE).astype(int)

# Same encoding for train and test
X["Title"] = encode_titles(X["Name"])
test["Title"] = encode_titles(test["Name"])

X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Ticket_Frequency,Title
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1,2
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,1
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,1
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,2


In [15]:
# Create Dummies
dummies = ['Pclass', 'Sex', 'Embarked', 'Title']
for dum in dummies:
    train_dummies = pd.get_dummies(X[dum], prefix = dum)
    # Force the test dummies onto the training columns: a category missing from
    # test becomes an all-zero column and a category unseen in training is
    # dropped.  Later cells stack both frames and relabel them with X's
    # columns, so the two layouts must match exactly.
    test_dummies = pd.get_dummies(test[dum], prefix = dum).reindex(
        columns = train_dummies.columns, fill_value = 0)
    X = pd.concat([X, train_dummies], axis = 1)
    test = pd.concat([test, test_dummies], axis = 1)
print(X.shape, test.shape)

(891, 22) (418, 22)


In [16]:
X.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Ticket_Frequency,Title,...,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1,2,...,1,0,1,0,0,1,0,0,1,0
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,1,...,0,1,0,1,0,0,0,1,0,0
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,1,...,1,1,0,0,0,1,0,1,0,0
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,1,...,0,1,0,0,0,1,0,1,0,0
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,2,...,1,0,1,0,0,1,0,0,1,0


In [17]:
# Drop Columns
# The raw categoricals are now represented by their dummy columns; Name is no
# longer needed once the title has been extracted.
encoded_away = ['Pclass', 'Sex', 'Embarked', 'Name', 'Title']
for frame in (X, test):
    frame.drop(columns = encoded_away, inplace = True)

In [18]:
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Ticket_Frequency,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3
0,22.0,1,0,7.25,1,0,0,1,0,1,0,0,1,0,0,1,0
1,38.0,1,0,71.2833,1,1,0,0,1,0,1,0,0,0,1,0,0
2,26.0,0,0,7.925,1,0,0,1,1,0,0,0,1,0,1,0,0
3,35.0,1,0,53.1,2,1,0,0,1,0,0,0,1,0,1,0,0
4,35.0,0,0,8.05,1,0,0,1,0,1,0,0,1,0,0,1,0


In [19]:
#round fare to nearest 10
for frame in (X, test):
    # ndigits=-1 rounds to the tens place
    frame['Fare'] = frame['Fare'].apply(lambda fare: round(fare, -1))
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Ticket_Frequency,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3
0,22.0,1,0,10.0,1,0,0,1,0,1,0,0,1,0,0,1,0
1,38.0,1,0,70.0,1,1,0,0,1,0,1,0,0,0,1,0,0
2,26.0,0,0,10.0,1,0,0,1,1,0,0,0,1,0,1,0,0
3,35.0,1,0,50.0,2,1,0,0,1,0,0,0,1,0,1,0,0
4,35.0,0,0,10.0,1,0,0,1,0,1,0,0,1,0,0,1,0


# Impute Age (train and test) and Fare (test)

In [20]:
X.isna().any()

Age                  True
SibSp               False
Parch               False
Fare                False
Ticket_Frequency    False
Pclass_1            False
Pclass_2            False
Pclass_3            False
Sex_female          False
Sex_male            False
Embarked_C          False
Embarked_Q          False
Embarked_S          False
Title_0             False
Title_1             False
Title_2             False
Title_3             False
dtype: bool

In [21]:
test.isna().any()

Age                  True
SibSp               False
Parch               False
Fare                 True
Ticket_Frequency    False
Pclass_1            False
Pclass_2            False
Pclass_3            False
Sex_female          False
Sex_male            False
Embarked_C          False
Embarked_Q          False
Embarked_S          False
Title_0             False
Title_1             False
Title_2             False
Title_3             False
dtype: bool

In [22]:
# Stack the train rows on top of the test rows so a single imputer sees both.
# No ignore_index is passed, so each frame's original row labels are kept.
df_train_test = pd.concat([X, test ], axis = 0)
df_train_test.shape

(1309, 17)

In [23]:
# Split point between train and test rows, taken from the data rather than
# hard-coded as 891.
n_train = len(X)

# Median imputation fitted on the stacked train+test frame.
imp = SimpleImputer(missing_values=np.nan, strategy='median')
#imp = IterativeImputer(random_state = SEED)
all_imp = imp.fit_transform(df_train_test)

# Label the imputed array with the columns of the frame that was imputed.
col = df_train_test.columns.values
all_data = pd.DataFrame(all_imp, columns = col)

# .copy() makes X and test independent frames, so the later in-place edits
# (new columns, row drops) do not act on slices of all_data.
X = all_data.iloc[:n_train, :].copy()
test = all_data.iloc[n_train:, :].copy()
print(X.shape, test.shape)

(891, 17) (418, 17)


In [24]:
X.isna().any()

Age                 False
SibSp               False
Parch               False
Fare                False
Ticket_Frequency    False
Pclass_1            False
Pclass_2            False
Pclass_3            False
Sex_female          False
Sex_male            False
Embarked_C          False
Embarked_Q          False
Embarked_S          False
Title_0             False
Title_1             False
Title_2             False
Title_3             False
dtype: bool

In [25]:
test.isna().any()

Age                 False
SibSp               False
Parch               False
Fare                False
Ticket_Frequency    False
Pclass_1            False
Pclass_2            False
Pclass_3            False
Sex_female          False
Sex_male            False
Embarked_C          False
Embarked_Q          False
Embarked_S          False
Title_0             False
Title_1             False
Title_2             False
Title_3             False
dtype: bool

In [26]:
X.describe()

Unnamed: 0,Age,SibSp,Parch,Fare,Ticket_Frequency,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,29.361582,0.523008,0.381594,33.01908,1.787879,0.242424,0.20651,0.551066,0.352413,0.647587,0.188552,0.08642,0.722783,0.044893,0.349046,0.580247,0.025814
std,13.019697,1.102743,0.806057,49.25032,1.361142,0.42879,0.405028,0.497665,0.47799,0.47799,0.391372,0.281141,0.447876,0.207186,0.476936,0.493796,0.158668
min,0.42,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,0.0,10.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,28.0,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,35.0,1.0,0.0,30.0,2.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
max,80.0,8.0,6.0,510.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [27]:
test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Ticket_Frequency,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3
891,34.5,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
892,47.0,1.0,0.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
893,62.0,0.0,0.0,10.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
894,27.0,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
895,22.0,1.0,1.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [28]:
Counter(y)

Counter({0: 549, 1: 342})

# Outliers

In [29]:
#studentized OLS for outliers
# Regress the Survived labels on the training features with statsmodels OLS;
# the fitted result feeds the outlier test and influence measures below.
ols_mod = OLS(endog = y.values, exog = X)
ols_result = ols_mod.fit()

In [30]:
# Keep only the 'bonf(p)' column of the OLS outlier test (per the statsmodels
# docs, the Bonferroni-corrected p-value of each observation).
out_test = ols_result.outlier_test()['bonf(p)']

In [31]:
# Row labels whose 'bonf(p)' value is below 1e-3.
outliers = list(out_test[out_test<1e-3].index) 

outliers

[]

In [32]:
# Internally studentized residuals of the outlier-screening OLS fit.
inf = OLSInfluence(ols_result)
stud = inf.resid_studentized_internal
# Row labels with a studentized residual below -3 / above +3.
less_outliers = list(stud[stud < -3 ].index)  # NOTE(review): the original "#2" here was presumably an alternative cutoff - confirm
more_outliers = list(stud[stud > 3 ].index) 
print(less_outliers, more_outliers)

[] []


In [33]:
print(len(y.loc[more_outliers]))
#print(y.loc[more_outliers])
#print(X.loc[more_outliers])

0


In [34]:
print(len(y.loc[less_outliers]))
print(y.loc[less_outliers])

0
Series([], Name: Survived, dtype: int64)


In [35]:
# Union of the row labels flagged as low-side and high-side outliers.
drop_list = set(X.loc[less_outliers].index).union(X.loc[more_outliers].index)
print(len(drop_list))

0


In [36]:
# Remove the flagged rows from features and labels, then renumber both from 0
# so they stay aligned.
rows_to_drop = list(drop_list)
X = X.drop(index = rows_to_drop).reset_index(drop = True)
y = y.drop(index = rows_to_drop).reset_index(drop = True)
print(X.shape, y.shape)

(891, 17) (891,)


In [37]:
Counter(y)

Counter({0: 549, 1: 342})

# Feature Engineering

In [38]:
#Family Total
for frame in (X, test):
    # parents/children plus siblings/spouses travelling with the passenger
    frame['tot_fam'] = frame['Parch'] + frame['SibSp']
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Ticket_Frequency,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3,tot_fam
0,22.0,1.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,38.0,1.0,0.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,26.0,0.0,0.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,35.0,1.0,0.0,50.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,35.0,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [39]:
#Solo

for frame in (X, test):
    # 1 when the passenger has no family aboard (tot_fam == 0), otherwise 0
    frame['solo'] = (frame['tot_fam'] == 0).astype('int64')
X.head(10)

Unnamed: 0,Age,SibSp,Parch,Fare,Ticket_Frequency,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3,tot_fam,solo
0,22.0,1.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0
1,38.0,1.0,0.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
2,26.0,0.0,0.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1
3,35.0,1.0,0.0,50.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0
4,35.0,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
5,28.0,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
6,54.0,0.0,0.0,50.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
7,2.0,3.0,1.0,20.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,4.0,0
8,27.0,0.0,2.0,10.0,3.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0
9,14.0,1.0,0.0,30.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


In [40]:
#Child
age_var = 10

# 1.0 for passengers younger than age_var, 0.0 otherwise, NaN where Age is
# missing.  Built as a whole column: the previous chained indexing
# (X["Child"][mask] = 1) writes to a temporary under pandas copy-on-write and
# would silently leave the column all-NaN.
for frame in (X, test):
    frame["Child"] = (frame["Age"] < age_var).astype(float).where(frame["Age"].notna())

#round age
# Done after Child so the flag is based on the unrounded age.
X['Age'] = X['Age'].apply(lambda x: custom_round(x, base=20))
test['Age'] = test['Age'].apply(lambda x: custom_round(x, base=20))
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Ticket_Frequency,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3,tot_fam,solo,Child
0,20,1.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0,0.0
1,40,1.0,0.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0.0
2,20,0.0,0.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1,0.0
3,40,1.0,0.0,50.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,0.0
4,40,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0.0


In [41]:
test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Ticket_Frequency,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3,tot_fam,solo,Child
891,40,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0
892,40,1.0,0.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,0.0
893,60,0.0,0.0,10.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.0
894,20,0.0,0.0,10.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1,0.0
895,20,1.0,1.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,0,0.0


In [42]:
print(X.shape, test.shape)

(891, 20) (418, 20)


# Scaling

In [43]:
# Stack train on top of test with a fresh 0..n-1 index; concat builds a new
# frame, so X and test themselves are left untouched.
df_tt2 = pd.concat([X, test], axis = 0, ignore_index = True)
df_tt2.shape

(1309, 20)

In [44]:
length = len(X)
cols = X.columns.values

# Columns rescaled to [0, 1]; every other column passes through unchanged.
col = ['Age', 'SibSp', 'Parch', 'Fare']

# The scaler is fitted on the stacked train+test frame.
sc = preprocessing.MinMaxScaler()
scaled_cols = sc.fit_transform(df_tt2[col])

# Write each scaled column back under its own name.  The previous version
# moved the scaled columns to the front and then relabelled everything by
# position, which is only correct while these four happen to be the first
# four columns of X.
df_tt3 = df_tt2.assign(**{name: scaled_cols[:, i] for i, name in enumerate(col)})

X = df_tt3.iloc[:length, :].copy()
test = df_tt3.iloc[length:, :].copy()
print(X.shape, test.shape)

(891, 20) (418, 20)


# Models

In [45]:
#Combine for tester
# The label column goes first: targetFeatureSplit treats element 0 of each row
# as the target and the rest as features.
df_all = pd.concat([y, X ], axis = 1)
df_all.shape

(891, 21)

In [46]:
 # ols pipe smt
k_final = 16  # number of features SelectKBest keeps in the final pipeline
thresh_f = 0.61  # decision threshold passed to SOLSClassifier

# NOTE(review): the trailing figures below look like thresholds/scores from earlier trials - unverified
ols_m = SOLSClassifier(thresh_f, est_method = 'mean') #0.59 0.82978  0.82933 0.82822 0.61

# Alternative kept for reference: evaluate with feature selection in front.
#model = Pipeline([  ('kBest', SelectKBest( chi2, k = k_final)), ('ols', ols_m)])
model = ols_m
# Convert the labelled frame to the (feature names, row dict) form the tester expects.
feat, dat = tester_prep(df_all)
test_classifier(model, dat, feat, folds = 50)

SOLSClassifier(est_method='mean', threshold=0.61)
	Accuracy: 0.82689	Precision: 0.80401	Recall: 0.73371	F1: 0.76725	F2: 0.74677
	Total predictions: 4500	True positives: 1284	False positives:  313	False negatives:  466	True negatives: 2437



In [47]:
# Score every feature with chi2 (k='all' keeps them all) and list them from
# highest to lowest score, ties broken alphabetically.
kb = SelectKBest(chi2, k='all')
kb.fit(X, y)

support_mask = kb.get_support()
names = X.columns.values[support_mask]
scores = kb.scores_[support_mask]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns = ['Feature','f_score'])
ns_df_sorted = ns_df.sort_values(by = ['f_score','Feature'], ascending = [False, True])
print(ns_df_sorted)

             Feature     f_score
8         Sex_female  170.348127
14           Title_1  166.384570
15           Title_2  112.805785
9           Sex_male   92.702447
5           Pclass_1   55.175151
7           Pclass_3   41.553071
10        Embarked_C   20.464401
18              solo   14.640793
19             Child   13.755156
3               Fare    8.454830
13           Title_0    6.180425
6           Pclass_2    6.160767
12        Embarked_S    5.984840
4   Ticket_Frequency    1.349147
2              Parch    1.121944
17           tot_fam    0.709091
1              SibSp    0.322733
16           Title_3    0.126121
0                Age    0.092273
11        Embarked_Q    0.010847


# Final Data

In [48]:
# Final model: keep the k_final best chi2 features, then classify with SOLS.
# Fit on all training rows and predict the test set.
selector = SelectKBest(score_func = chi2, k = k_final)
ols_f = SOLSClassifier(threshold = thresh_f, est_method = 'mean')
k_mod = Pipeline(steps = [('kBest', selector), ('ols', ols_f)])
final_submit = k_mod.fit(X, y).predict(test)

In [49]:
# NOTE(review): Counter is already imported in the notebook's import cell; this re-import is redundant.
# The figures in the trailing comment look like class counts from earlier runs - unverified.
from collections import Counter #{0: 259, 1: 159}{0: 260, 1: 158} //{0: 268, 1: 150}{0: 260, 1: 158}
# Number of test passengers predicted for each class.
Counter(final_submit)

Counter({0: 259, 1: 159})

In [50]:
#Submission

# One row per test passenger: PassengerId as the index, prediction as Survived.
my_solution = pd.DataFrame(data = final_submit, index = PassengerId, columns = ["Survived"])
my_solution.to_csv("solution.csv", index_label = ["PassengerId"])