In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler

In [2]:
# Read both CSV splits from the local data directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

# Split the training frame into target and features.
Y_train = train_data['Survived']
X_train = train_data.drop(columns="Survived")
# X_test is another name for test_data (no copy is made here).
X_test = test_data

# Copies of the feature frames taken before any preprocessing runs.
X_train_copy, X_test_copy = X_train.copy(), X_test.copy()

In [3]:
# Banner plus the first five rows of the training features.
print("===========================Training Data==============================")
X_train.head(n=5)




Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Banner plus the first five rows of the test features.
print("===========================Test Data==============================")
X_test.head(n=5)



Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# import the BaseEstimator
from sklearn.base import BaseEstimator

class DropColumns(BaseEstimator):
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    cols : list
        Column labels removed by ``transform``.
    """

    def __init__(self, cols):
        self.cols = cols
        # The configured labels are echoed once, at construction time.
        print(self.cols)

    def fit(self, X, y=None):
        """Nothing is learned; the estimator is returned as-is."""
        return self

    def transform(self, X):
        """Drop ``self.cols`` from ``X`` in place and return that same frame."""
        X.drop(columns=self.cols, inplace=True)
        return X
    
#=============================== Cabin column feature engineering
class CabinTransform(BaseEstimator):
    """Add an integer ``Deck`` column derived from the first character of ``Cabin``.

    The distinct deck letters are numbered 1, 2, 3, ... in sorted order.
    Rows whose cabin is missing, or whose deck letter was not present when the
    mapping was learned, get ``Deck == 0``.

    The mapping is learned in ``fit`` so that every frame transformed by the
    same fitted instance uses the same letter -> code table. If ``transform``
    is called on an instance that was never fitted, the mapping is learned
    from the first frame passed to ``transform`` instead.

    Attributes
    ----------
    deckPair : dict
        Deck letter -> integer code, set by ``fit`` (or lazily by ``transform``).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the deck-letter -> code mapping from ``X['Cabin']``."""
        self.deckPair = self._build_deck_codes(X['Cabin'])
        return self

    def transform(self, data):
        """Fill missing cabins with ``'0'`` and add the int32 ``Deck`` column.

        ``data`` is modified in place and also returned.
        """
        if not hasattr(self, 'deckPair'):
            # Never fitted: fall back to learning the mapping from this frame.
            self.deckPair = self._build_deck_codes(data['Cabin'])
        # Assign the filled column back to the frame. A chained
        # ``data['Cabin'].fillna(..., inplace=True)`` acts on a temporary under
        # pandas Copy-on-Write and would leave the NaNs in place.
        data['Cabin'] = data['Cabin'].fillna('0')
        data['Deck'] = data['Cabin'].apply(self.setDeck).astype('int32')
        return data

    def setDeck(self, x):
        """Return the deck code for one cabin string (0 for missing/unknown)."""
        char1 = x[0:1]
        if char1 == '0':
            # '0' is the placeholder written for missing cabins.
            return 0
        # Letters absent from the learned mapping share the "missing" code
        # instead of raising KeyError.
        return self.deckPair.get(char1, 0)

    @staticmethod
    def _build_deck_codes(cabins):
        """Map each distinct first character of ``cabins`` to 1..k in sorted order."""
        letters = cabins.dropna().str[0:1].unique()
        # enumerate() removes the old hard-coded limit of eight deck codes.
        return {letter: code for code, letter in enumerate(sorted(letters), start=1)}


    

In [6]:
# =================================== Pipeline step for the Age column ============================
class AgeTransform(BaseEstimator):
    """Fill missing ``Age`` values with random integers and cast ``Age`` to int32.

    Replacement ages are drawn uniformly from the integer range
    ``[mean - std, mean + std)``, where mean and std are taken from the
    non-missing ages of the frame being transformed. The bounds are truncated
    to integers and the lower bound is clamped at 0 so no negative age can be
    produced.

    Parameters
    ----------
    random_state : int or None, default=None
        Seed for the random draws. ``None`` draws from NumPy's global random
        state, which is what this class did before the parameter existed.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state

    def fit(self, X, y=None):
        """Nothing is learned; statistics come from the frame given to ``transform``."""
        return self

    def transform(self, data):
        """Impute missing ages in ``data`` in place and return the same frame."""
        missing = data['Age'].isna()
        rand_age = self.getStatistic(data)
        # Write through .loc on the frame itself. A chained
        # ``data['Age'].fillna(..., inplace=True)`` acts on a temporary under
        # pandas Copy-on-Write, leaving NaNs that make the int cast below fail.
        data.loc[missing, 'Age'] = rand_age
        data['Age'] = data['Age'].astype('int32')
        return data

    def getStatistic(self, data):
        """Return one random integer age per missing ``Age`` value in ``data``.

        Prints the mean, standard deviation and missing count as a side effect.

        Raises
        ------
        ValueError
            If ages are missing but the column has no non-missing value to
            derive a range from.
        """
        ages = data['Age']
        mean = ages.mean()
        std = ages.std()
        is_null = ages.isnull().sum()
        print("Mean:{mean},Std:{std},isNull:{is_null}".format(mean = mean,std = std,is_null = is_null))
        if is_null == 0:
            # Nothing to fill; also avoids computing bounds on an empty column.
            return np.empty(0, dtype=int)
        if np.isnan(mean):
            raise ValueError("Cannot impute 'Age': the column has no non-missing values")
        # std is NaN when only one age is known; treat that as zero spread.
        spread = 0.0 if np.isnan(std) else std
        # Use explicit integer bounds rather than passing floats to randint.
        low = max(int(mean - spread), 0)
        # randint needs low < high; widen a degenerate range to a single value.
        high = max(int(mean + spread), low + 1)
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        return rng.randint(low, high, size=is_null)
    

class filEmbark(BaseEstimator):
    """Fill missing ``Embarked`` values with ``'S'`` and encode ports as integers.

    The encoding is S -> 0, C -> 1, Q -> 2. Any other value maps to NaN.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; the estimator is returned as-is."""
        return self

    def transform(self, data):
        """Encode ``data['Embarked']`` in place and return the same frame."""
        ports = {"S": 0, "C": 1, "Q": 2}
        # Assign the result back to the frame. A chained
        # ``data['Embarked'].fillna(..., inplace=True)`` acts on a temporary
        # under pandas Copy-on-Write and would leave NaNs after the mapping.
        data['Embarked'] = data['Embarked'].fillna('S').map(ports)
        return data
    
class Gender(BaseEstimator):
    """Pipeline step that replaces the ``Sex`` strings with integer codes."""

    def __init__(self):
        # Lookup table used by transform(); values outside it map to NaN.
        self.genders = {
            "male": 0,
            "female": 1,
        }

    def fit(self, X, y=None):
        """Nothing is learned; the estimator is returned as-is."""
        return self

    def transform(self, data):
        """Overwrite ``data['Sex']`` with its encoded values and return ``data``."""
        encoded = data['Sex'].map(self.genders)
        data['Sex'] = encoded
        return data
    
class SibspParch(BaseEstimator):
    """Pipeline step that adds the ``relatives`` and ``not_alone`` columns."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; the estimator is returned as-is."""
        return self

    def transform(self, data):
        """Add both derived columns to ``data`` in place and return it.

        ``relatives`` is ``SibSp + Parch``. ``not_alone`` is set to 1 where
        ``relatives`` is positive and to 0 where it is exactly zero.
        """
        data['relatives'] = data['SibSp'].add(data['Parch'])
        flagged_masks = (
            (1, data['relatives'] > 0),
            (0, data['relatives'] == 0),
        )
        for flag, mask in flagged_masks:
            data.loc[mask, 'not_alone'] = flag
        return data
    

In [7]:
# Column drop sequence:
# 1. 'PassengerId', 'Name' (no operation applied)
# 2. 'Cabin'
# 3. 'Ticket' (no operation applied)
# 4. 'SibSp', 'Parch', 'not_alone'

In [8]:
# Columns that no pipeline step operates on.
non_operatinal_columns = ['PassengerId', 'Name', 'Ticket']
# Columns that feed the engineered features (or are created along the way).
operatinalcolumns = ['Cabin', 'SibSp', 'Parch', 'not_alone']
# Every column to be dropped, in that order.
drop_columns = [*non_operatinal_columns, *operatinalcolumns]
# Drop columns 


In [9]:
# Preprocessing steps run in the listed order. The column drop comes last so
# the engineered features are built before their source columns are removed.
pipe = Pipeline(steps=[
    ('setCabin', CabinTransform()),
    ('setAge', AgeTransform()),
    ('filEmbark', filEmbark()),
    ('Gender', Gender()),
    ('Relatives', SibspParch()),
    ('dropColumns', DropColumns(drop_columns)),
    # ('scaler', StandardScaler()),
])

# Fit and apply the pipeline to the training features. The result is shown
# but not assigned.
pipe.fit_transform(X_train)

['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'not_alone']
Mean:29.69911764705882,Std:14.526497332334044,isNull:177


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Deck,relatives
0,3,0,22,7.2500,0,0,1
1,1,1,38,71.2833,1,3,1
2,3,1,26,7.9250,0,0,0
3,1,1,35,53.1000,0,3,1
4,3,0,35,8.0500,0,0,0
5,3,0,23,8.4583,2,0,0
6,1,0,54,51.8625,0,5,0
7,3,0,2,21.0750,0,0,4
8,3,1,27,11.1333,0,0,2
9,2,1,14,30.0708,1,0,1


In [10]:
# Runs the whole pipeline (fit, then transform) on the test frame. The return
# value is not assigned, so the transformed data is only kept because the
# pipeline steps modify X_test in place.
# NOTE(review): this fits the preprocessing on the test data again rather than
# reusing what was fitted on the training data — confirm that this is intended.
pipe.fit_transform(X_test)

Mean:30.272590361445783,Std:14.18120923562442,isNull:86


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Deck,relatives
0,3,0,34,7.8292,2,0,0
1,3,1,47,7.0000,0,0,1
2,2,0,62,9.6875,2,0,0
3,3,0,27,8.6625,0,0,0
4,3,1,22,12.2875,0,0,2
5,3,0,14,9.2250,0,0,0
6,3,1,30,7.6292,2,0,0
7,2,0,26,29.0000,0,0,2
8,3,1,18,7.2292,1,0,0
9,3,0,21,24.1500,0,0,2


In [19]:
# X_test.shape
# Count missing values per column. The pipeline run above did not assign its
# result, so this only shows the transformed columns because that run
# modified X_train in place.
X_train.isna().sum()

Pclass       0
Sex          0
Age          0
Fare         0
Embarked     0
Deck         0
relatives    0
dtype: int64

In [12]:
# Scaler instance; the calls that would apply it (just below) are commented out.
sc = StandardScaler()
# trainCleanData = sc.fit_transform(trainCleanData)
# testCleanData = sc.fit_transform(testCleanData)

In [13]:
# # Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB



In [14]:
# Two separate, initially empty lists: one for scores, one for algorithm names.
scoreList, alogoName = [], []

# logreg = LogisticRegression()
# logreg.fit(trainCleanData, Y_train)

# Y_pred = logreg.predict(testCleanData)

# acc_log = round(logreg.score(trainCleanData, Y_train) * 100, 2)
# scoreList.append(acc_log)
# alogoName.append('Logistic regression')
# print(round(acc_log,2,), "%")