In [1]:
import warnings
warnings.filterwarnings('ignore')
import os

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2, SelectFpr, f_regression, f_classif
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from collections import Counter

SEED = 5678
np.random.seed(SEED)

In [3]:
# Load the train and test CSVs into DataFrames and print their shapes.
# NOTE(review): both paths are absolute and machine-specific (C:\Users\Me\...), so this
# cell only runs on a machine with exactly this directory layout.
train = pd.read_csv(r'C:\Users\Me\Kaggle\Titanic_revisited\data\train.csv')
test = pd.read_csv(r'C:\Users\Me\Kaggle\Titanic_revisited\data\test.csv')
print(train.shape, test.shape)

(891, 12) (418, 11)


# SOLS Classifier
> This is a custom classifier created using Scikit Learn's project template for custom estimators.
> It uses statsmodels OLS for the initial fit. Predict then uses the same OLS model to make a 
> prediction. The median of the predictions is subtracted from the prediction to create an estimated 
> residual. The estimated residual is then divided by the standard deviation of the residuals from the fit
> to create an estimated studentized residual. These studentized residuals are then tested against the
> hyperparameter "threshold" to determine the label. All values greater than or equal to the threshold are 
> labeled True, and values less than the threshold are labeled False.

In [4]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
import pandas as pd
from statsmodels.stats.outliers_influence import OLSInfluence
from statsmodels.regression.linear_model import OLS

class SOLSClassifier(BaseEstimator, ClassifierMixin ):
    """Binary classifier that thresholds estimated studentized OLS predictions.

    ``fit`` fits a statsmodels OLS model of ``y`` on ``X``. ``predict`` runs the
    same model on new data, subtracts the median of those predictions to get an
    estimated residual, divides by a summary (mean or median) of the residual
    standard deviations from the training fit, and labels a row 1 when the
    result is >= ``threshold`` and 0 otherwise.

    Parameters
    ----------
    threshold : int or float
        Decision threshold applied to the estimated studentized residuals.
        The default is a placeholder string kept for signature compatibility;
        ``predict`` raises ``TypeError`` unless a number is supplied.
    est_method : {'mean', 'median'}
        How the training residual standard deviations are collapsed into the
        single scale used for studentizing. The default is a placeholder
        string; ``predict`` raises ``ValueError`` for any other value.

    Attributes
    ----------
    classes_ : labels seen during ``fit``.
    X_, y_ : training data stored as DataFrames.
    ols_mod, ols_result : the statsmodels OLS model and its fitted result.
    preds : list of 0/1 labels from the most recent ``predict`` call.
    """

    def __init__(self, threshold = 'threshold', est_method = 'est_method'):

        # decision threshold of studentized residuals
        self.threshold = threshold
        self.est_method = est_method
        # Re-seeds NumPy's global RNG with the notebook-level SEED on every
        # construction (kept from the original).
        np.random.seed(SEED)

    def fit(self, X, y):
        """Fit an OLS model of ``y`` on ``X`` and return ``self``."""

        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # Store the classes seen during fit
        self.classes_ = unique_labels(y)

        # convert to df
        self.X_ = pd.DataFrame(X)
        self.y_ = pd.DataFrame(y)

        # Fit OLS model
        self.ols_mod = OLS(endog = self.y_, exog = self.X_)
        self.ols_result = self.ols_mod.fit()

        # Return the classifier
        return self

    def predict(self, X):
        """Return a list of 0/1 labels for the rows of ``X``.

        Raises
        ------
        ValueError
            If ``est_method`` is not 'mean' or 'median'.
        TypeError
            If ``threshold`` is not a real number.
        """

        # Check if fit had been called
        check_is_fitted(self, ['X_', 'y_'])

        # Fail early with a clear message instead of an unbound local or an
        # opaque comparison error further down.
        if self.est_method not in ('mean', 'median'):
            raise ValueError(
                "est_method must be 'mean' or 'median', got %r" % (self.est_method,))
        if not isinstance(self.threshold, (int, float, np.integer, np.floating)):
            raise TypeError(
                "threshold must be a real number, got %r" % (self.threshold,))

        # Input validation
        X = check_array(X)
        X_n = pd.DataFrame(X)

        # OLS prediction
        prediction = np.asarray(self.ols_result.predict(X_n), dtype=float)

        # Outlier and influence measures for the training fit
        inf = OLSInfluence(self.ols_result)

        # Residual standard deviation estimates from the training fit
        std = inf.resid_std

        # Estimated residual: prediction minus the median of the predictions.
        estimated_residual = prediction - np.nanmedian(prediction)

        # Estimated studentized residual: estimated residual divided by a
        # summary of the training residual standard deviations. Both methods
        # scale the *estimated residual*; the original 'median' branch divided
        # the raw prediction instead, contradicting the documented procedure.
        if self.est_method == 'mean':
            scale = np.nanmean(std)
        else:
            scale = np.nanmedian(std)
        stud_res = estimated_residual / scale

        # Label 1 where the studentized residual reaches the threshold, else 0
        # (NaN compares False and is therefore labelled 0).
        self.preds = (stud_res >= self.threshold).astype(int).tolist()

        return self.preds

# Functions

In [5]:
# Report templates used by test_classifier: one line of metrics (precision is
# supplied via the display_precision keyword) and one line of confusion counts.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)

#Get classifier metrics, based on DAND
def test_classifier(clf, dataset, feature_list, folds = 20):
    """Cross-validate ``clf`` with stratified shuffle splits and print pooled metrics.

    Parameters
    ----------
    clf : estimator exposing ``fit(features, labels)`` and ``predict(features)``.
    dataset : dict of dicts, passed to ``featureFormat``.
    feature_list : sequence of feature names; the first one is the label
        (``targetFeatureSplit`` takes element 0 of every row as the target).
    folds : number of splits for ``StratifiedShuffleSplit``.

    Prints the classifier, accuracy/precision/recall/F1/F2 and the confusion
    counts accumulated over all folds. Returns None.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    sss = StratifiedShuffleSplit(n_splits = folds, random_state = SEED)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in sss.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train   = [labels[ii] for ii in train_idx]
        features_test  = [features[jj] for jj in test_idx]
        labels_test    = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print ("Warning: Found a predicted label not == 0 or 1.")
                print ("All predictions should take value 0 or 1.")
                print ("Evaluating performance for processed predictions:")
                # Stops scoring the current fold only; later folds still run.
                break

    # Only a zero denominator is an expected failure here; anything else is a
    # real error and must not be reported as "divide by zero".
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        print ("Got a divide by zero when trying out:", clf)
        print ("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print (clf)
        print (PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print (RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print ("")


#convert dictionary to numpy array of features, from DAND       
def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False):
    """Convert a dict of per-record feature dicts into a 2-D numpy array.

    Parameters
    ----------
    dictionary : dict mapping record key -> dict mapping feature name -> value.
    features : sequence of feature names; one output column per name, in order.
    remove_NaN : if True, the string "NaN" is replaced by 0 before conversion.
    remove_all_zeroes : if True, drop records whose tested features are all 0.
    remove_any_zeroes : if True, drop records with any tested feature equal to 0.
    sort_keys : False -> dict order; True -> sorted keys; a str -> path of a
        pickle file holding the key order to use.

    When ``features[0] == 'poi'`` that first column is excluded from the
    zero tests. Every value is converted with ``float``.

    Returns
    -------
    numpy.ndarray with one row per kept record, or None (after printing a
    message) if a requested feature is missing from a record.
    """
    return_list = []

    # Key order - first branch is for Python 3 compatibility on mini-projects,
    # second branch is for compatibility on final project.
    if isinstance(sort_keys, str):
        import pickle
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # sort_keys at trusted files.
        with open(sort_keys, "rb") as key_file:
            keys = pickle.load(key_file)
    elif sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    for key in keys:
        tmp_list = []
        for feature in features:
            try:
                value = dictionary[key][feature]
            except KeyError:
                print ("error: key ", feature, " not present")
                return
            if value=="NaN" and remove_NaN:
                value = 0
            tmp_list.append( float(value) )

        # exclude 'poi' class as criteria.
        if features[0] == 'poi':
            test_list = tmp_list[1:]
        else:
            test_list = tmp_list

        # Logic for deciding whether or not to add the data point. Every entry
        # is already a float, so only comparisons against 0 can match.
        append = True
        ### if all features are zero and you want to remove
        ### data points that are all zero, do that here
        if remove_all_zeroes:
            append = any(item != 0 for item in test_list)
        ### if any features for a given data point are zero
        ### and you want to remove data points with any zeroes,
        ### handle that here
        if remove_any_zeroes and 0 in test_list:
            append = False
        ### Append the data point if flagged for addition.
        if append:
            return_list.append( np.array(tmp_list) )

    return np.array(return_list)

# prepares data for tester returns list and dict 
def tester_prep(dfn):
    features_list = dfn.columns.values
    data_dict = dfn.to_dict('index')
    return features_list, data_dict

#split FIRST column off as labels, remaining columns as features, from DAND
def targetFeatureSplit( data ):
    """Split each row into its first element (target) and the rest (features).

    Returns ``(target, features)`` as two parallel lists.
    """
    # Single pass over data so one-shot iterables behave as before.
    pairs = [(row[0], row[1:]) for row in data]
    target = [label for label, _ in pairs]
    features = [rest for _, rest in pairs]
    return target, features

#Custom Round
def custom_round(x, base=5):
    """Round ``x`` to the nearest multiple of ``base`` (ties follow built-in ``round``)."""
    multiples = round(float(x) / base)
    return base * multiples

# New Data

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
# Drop Columns
train.drop(labels = ["Name", "PassengerId", "Ticket", "Cabin"], axis = 1, inplace = True)
test.drop(labels = ["Name", "PassengerId", "Ticket", "Cabin"], axis = 1, inplace = True)

# Coerce the remaining columns to numeric, but leave the string categoricals
# alone: 'Sex' and 'Embarked' are one-hot encoded further down, and
# errors='coerce' would turn every one of their values into NaN.
string_cols = ['Sex', 'Embarked']
cols = [name for name in train.columns if name not in string_cols]
train[cols] = train[cols].apply(pd.to_numeric, errors='coerce')
test_cols = [name for name in test.columns if name not in string_cols]
test[test_cols] = test[test_cols].apply(pd.to_numeric, errors='coerce')

In [9]:
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
#Split to X,y
# y is the 'Survived' label column; X is every other column of train.
y = train['Survived'].copy()
X = train.drop(columns=['Survived'])
print(y.shape, X.shape)

(891,) (891, 7)


In [11]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


#One Hot Cols
train_class = pd.get_dummies(X['Pclass'], prefix = ['Pclass'])
test_class = pd.get_dummies(test['Pclass'], prefix = ['Pclass'])

train_sex = pd.get_dummies(X['Sex'], prefix = ['Sex'])
test_sex = pd.get_dummies(test['Sex'], prefix = ['Sex'])

train_emb = pd.get_dummies(X['Embarked'], prefix = ['Embarked'])
test_emb = pd.get_dummies(test['Embarked'], prefix = ['Embarked'])

X = pd.concat([X, train_class, train_sex, train_emb], axis = 1)
test = pd.concat([test, test_class, test_sex, test_emb], axis = 1)

X.head()

In [12]:
# Create Dummies
# Append one-hot columns (prefixed with the source column name) to X and test.
dummies = ['Pclass', 'Sex', 'Embarked']
for dum in dummies:
    X = pd.concat([X, pd.get_dummies(X[dum], prefix = dum)], axis = 1)
    test = pd.concat([test, pd.get_dummies(test[dum], prefix = dum)], axis = 1)
print(X.shape, test.shape)

(891, 15) (418, 15)


In [13]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,male,22.0,1,0,7.25,S,0,0,1,0,1,0,0,1
1,1,female,38.0,1,0,71.2833,C,1,0,0,1,0,1,0,0
2,3,female,26.0,0,0,7.925,S,0,0,1,1,0,0,0,1
3,1,female,35.0,1,0,53.1,S,1,0,0,1,0,0,0,1
4,3,male,35.0,0,0,8.05,S,0,0,1,0,1,0,0,1


In [14]:
# Drop Columns
# Remove the three source columns from both frames, in place.
for frame in (X, test):
    frame.drop(columns = ['Pclass', 'Sex', 'Embarked'], inplace = True)

In [15]:
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [16]:
#round fare to nearest 10
def _round_to_tens(fare):
    """Round a single fare to the nearest multiple of 10."""
    return round(fare, -1)

X['Fare'] = X['Fare'].map(_round_to_tens)
test['Fare'] = test['Fare'].map(_round_to_tens)
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,10.0,0,0,1,0,1,0,0,1
1,38.0,1,0,70.0,1,0,0,1,0,1,0,0
2,26.0,0,0,10.0,0,0,1,1,0,0,0,1
3,35.0,1,0,50.0,1,0,0,1,0,0,0,1
4,35.0,0,0,10.0,0,0,1,0,1,0,0,1


# Impute Age and fare for test

In [17]:
X.isna().any()

Age            True
SibSp         False
Parch         False
Fare          False
Pclass_1      False
Pclass_2      False
Pclass_3      False
Sex_female    False
Sex_male      False
Embarked_C    False
Embarked_Q    False
Embarked_S    False
dtype: bool

In [18]:
test.isna().any()

Age            True
SibSp         False
Parch         False
Fare           True
Pclass_1      False
Pclass_2      False
Pclass_3      False
Sex_female    False
Sex_male      False
Embarked_C    False
Embarked_Q    False
Embarked_S    False
dtype: bool

In [20]:
# Mean-impute missing values. The imputer is fitted on the training features
# only and then applied to the test set, so both frames are filled with the
# same (training) statistics instead of the test set leaking its own means.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_test = imp  # kept as an alias in case a later cell still refers to it

col = X.columns.values

X_imp = imp.fit_transform(X)
# Select test columns by name in training order; a missing column now raises
# KeyError instead of being silently mislabelled by position.
test_imp = imp.transform(test[col])

X = pd.DataFrame(X_imp, columns = col)
test = pd.DataFrame(test_imp, columns = col)

In [21]:
X.isna().any()

Age           False
SibSp         False
Parch         False
Fare          False
Pclass_1      False
Pclass_2      False
Pclass_3      False
Sex_female    False
Sex_male      False
Embarked_C    False
Embarked_Q    False
Embarked_S    False
dtype: bool

In [22]:
test.isna().any()

Age           False
SibSp         False
Parch         False
Fare          False
Pclass_1      False
Pclass_2      False
Pclass_3      False
Sex_female    False
Sex_male      False
Embarked_C    False
Embarked_Q    False
Embarked_S    False
dtype: bool

In [23]:
X.describe()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,29.699118,0.523008,0.381594,33.01908,0.242424,0.20651,0.551066,0.352413,0.647587,0.188552,0.08642,0.722783
std,13.002015,1.102743,0.806057,49.25032,0.42879,0.405028,0.497665,0.47799,0.47799,0.391372,0.281141,0.447876
min,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29.699118,0.0,0.0,10.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,35.0,1.0,0.0,30.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
max,80.0,8.0,6.0,510.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
#round age
# custom_round defaults to base=5, i.e. ages are snapped to multiples of 5.
X['Age'] = X['Age'].map(custom_round)
test['Age'] = test['Age'].map(custom_round)
X.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,20,1.0,0.0,10.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,40,1.0,0.0,70.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,25,0.0,0.0,10.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,35,1.0,0.0,50.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,35,0.0,0.0,10.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [25]:
test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,35,0.0,0.0,10.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,45,1.0,0.0,10.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,60,0.0,0.0,10.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,25,0.0,0.0,10.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,20,1.0,1.0,10.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [None]:
Counter(y)

# Outliers

# Old Code
> Some of this code is from a submission made over two years ago and I don't recall what is borrowed and what is original. If you see your code in this section, thank you.

In [None]:
#Improved Age Interpolation based on Pclass, Parch, Sibsp
# This cell reads the 'Name', 'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch',
# 'Age' and 'Fare' columns of train/test.
# Every write goes through .loc/.iloc on the frame itself: the chained form
# (df[col][mask] = v, df[col].iloc[i] = v) may assign to a temporary copy and
# silently leave the frame unchanged.

RARE_TITLES = ['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
TITLE_CODES = {"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3}
SEX_CODES = {"male": 0, "female": 1}
EMBARKED_CODES = {"S": 0, "C": 1, "Q": 2}


def _impute_age(df):
    """Fill missing Age in place from rows sharing SibSp/Parch/Pclass, else the overall median."""
    age_pos = df.columns.get_loc("Age")
    for i in np.flatnonzero(df["Age"].isnull().values):
        row = df.iloc[i]
        # Both medians are recomputed every iteration, as in the original
        # loop, so values filled earlier feed into later medians.
        age_med = df["Age"].median()
        age_pred = df["Age"][((df['SibSp'] == row["SibSp"]) &
                              (df['Parch'] == row["Parch"]) &
                              (df['Pclass'] == row["Pclass"]))].median()
        df.iloc[i, age_pos] = age_pred if not np.isnan(age_pred) else age_med


def _add_title(df):
    """Add an integer 'Title' column parsed from 'Name' ("Last, Title. First")."""
    titles = df["Name"].map(lambda name: name.split(",")[1].split(".")[0].strip())
    titles = titles.replace(RARE_TITLES, 'Rare').map(TITLE_CODES)
    df["Title"] = titles.astype(int)


def _add_family_features(df):
    """Add 'Fsize' (SibSp + Parch + 1) and its four 0/1 size-bucket columns."""
    df["Fsize"] = df["SibSp"] + df["Parch"] + 1
    df['Single'] = df['Fsize'].map(lambda s: 1 if s == 1 else 0)
    df['SmallF'] = df['Fsize'].map(lambda s: 1 if s == 2  else 0)
    df['MedF']   = df['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
    df['LargeF'] = df['Fsize'].map(lambda s: 1 if s >= 5 else 0)


def _add_child_flag(df, age_cutoff):
    """Add 'Child': 1 where Age < age_cutoff, 0 where Age >= age_cutoff, NaN otherwise."""
    df["Child"] = float('NaN')
    df.loc[df["Age"] < age_cutoff, "Child"] = 1
    df.loc[df["Age"] >= age_cutoff, "Child"] = 0


def _encode_in_place(df, column, codes):
    """Replace each key of ``codes`` found in ``df[column]`` by its integer code."""
    for raw_value, code in codes.items():
        df.loc[df[column] == raw_value, column] = code


# Assign Child = 1 to passengers younger than age_var, 0 to the rest
age_var = 11 #9

# Impute the Embarked variable (train only)
train["Embarked"] = train["Embarked"].fillna("S")

for frame in (train, test):
    # Filling missing value of Age
    _impute_age(frame)
    # Add title variable, then drop Name variable
    _add_title(frame)
    frame.drop(labels = ["Name"], axis = 1, inplace = True)
    # Create a family size descriptor from SibSp and Parch
    _add_family_features(frame)
    _add_child_flag(frame, age_var)
    # Convert male and female groups to integer form
    _encode_in_place(frame, "Sex", SEX_CODES)
    # Embarked to int
    _encode_in_place(frame, "Embarked", EMBARKED_CODES)

# Fill the Fare of the test row labelled 152 with the test median
test.loc[152, "Fare"] = test["Fare"].median()

In [None]:

# Label-encode every object-dtype column; train and test each get their own
# encoder fitted on that frame's values.
for frame in (train, test):
    for f in frame.columns:
        if frame[f].dtype=='object':
            lbl = preprocessing.LabelEncoder()
            frame[f] = lbl.fit_transform(list(frame[f].values))

In [None]:
train.head()

In [None]:
#make survived the first column for tester
ordered_cols = ['Survived'] + [name for name in train.columns if name != 'Survived']
train = train.reindex(columns=ordered_cols)
train.head()

# Models

In [None]:
train.shape

In [None]:
 # ols pipe smt
# Evaluate the SOLS classifier on `train` with the DAND-style tester (50 stratified shuffle splits).
# k is only used by the SelectKBest pipelines (commented out here, built again in the model list below).
k = 14 #11
# smt is only referenced by the commented-out SMOTE pipeline below.
smt = SMOTE(random_state = SEED)
# NOTE(review): the trailing numbers look like earlier accuracy results for this setting — confirm.
ols_m = SOLSClassifier(0.747, est_method = 'mean')  #82933 0.747, mean    82978 w/age 11
#model = Pipeline([('fpr', SelectFpr()),  ('ols', ols_m)]) 
#model = Pipeline([  ('kBest', SelectKBest( f_classif,k = k)), ('ols', ols_m)])
#model = Pipeline([ ('smt', smt),  ('ols', ols_m)])
model = ols_m
# tester_prep returns (column names, dict of rows); the tester treats the first column as the label.
feat, dat = tester_prep(train)
test_classifier(model, dat, feat, folds = 50)


In [None]:
# y is the 'Survived' label column; X is every other column of train.
y = train['Survived'].copy()
X = train.drop(columns=['Survived'])
print(y.shape, X.shape)

In [None]:
y.head()

In [None]:
X.head()

In [None]:
kfold = StratifiedKFold(n_splits=10)

#Scoring Function**********************************************************************************
def compute_score(clf, X, y, scoring='accuracy'):
    """Return the mean 10-fold cross-validation score of ``clf`` on ``(X, y)``."""
    fold_scores = cross_val_score(clf, X, y, scoring=scoring, cv = 10)
    return np.mean(fold_scores)

In [None]:
clf = SOLSClassifier(0.747, est_method = 'mean')

In [None]:
res = compute_score(clf, X, y, scoring='accuracy')
res

In [None]:
#adapted from https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/
# (display name, estimator) pairs compared by test_models.
models = [
    ('QDA', QuadraticDiscriminantAnalysis()),
    #('GPC', GaussianProcessClassifier(1.0 * RBF(1.0),random_state = SEED)),
    ('KNN', KNeighborsClassifier()),
    ('DTC', DecisionTreeClassifier(random_state = SEED)),
    ('GNB', GaussianNB()),
    ('SVM', SVC(random_state = SEED)),
    ('ABC', AdaBoostClassifier(random_state = SEED)),
    ('MLP', MLPClassifier( random_state = SEED, max_iter=1000)),
    ('RID', RidgeClassifier(random_state = SEED)),
    ('log', LogisticRegression(random_state = SEED)),
    ('SOLS', SOLSClassifier(0.745, est_method = 'mean')),
    ('Pipe', Pipeline([  ('kBest', SelectKBest( f_classif,k = k)), ('ols', ols_m)])),
]


# test and plot all models in models list
def test_models(models, data, target):
    """Cross-validate each model and show a box plot of the fold accuracies.

    Parameters
    ----------
    models : iterable of ``(name, estimator)`` pairs.
    data, target : features and labels passed to ``cross_val_score``.

    Prints ``name: mean (std)`` of the 5-fold accuracy for every model, then
    draws one box per model. Returns None.
    """
    results = []
    names = []
    scoring = 'accuracy'

    # Unshuffled stratified folds are already deterministic. Passing
    # random_state without shuffle=True raises ValueError in current
    # scikit-learn (older releases ignored it), so it is omitted; the folds
    # are the same ones the older releases produced.
    kfold = StratifiedKFold(n_splits=5)

    for name, model in models:
        cv_results = cross_val_score(model, data, target, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

    # boxplot algorithm comparison
    fig, ax = plt.subplots()
    fig.suptitle('Algorithm Comparison')
    ax.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()


#run test    
test_models(models, X, y)

# Final Data

In [None]:
# Fit the SOLS classifier on all of X/y and label the test set.
# NOTE(review): threshold here is 0.745, while the standalone evaluation cells use 0.747 — confirm which is intended.
sols_mod = SOLSClassifier(0.745, est_method = 'mean')
sols_mod.fit(X, y)
final_submit = sols_mod.predict(test)

In [None]:
from collections import Counter
Counter(final_submit)

In [None]:
#Final array*************************************************************************************
# One 'Survived' column indexed by integer PassengerId, written as the submission CSV.
PassengerId = np.asarray(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(data = final_submit, index = PassengerId, columns = ["Survived"])

my_solution.to_csv("solution_SOLSClass_oldData_1.csv", index_label = ["PassengerId"])

# Score
>0.77990. Previous best score: 0.80861, with ExtraTrees on the old data.