In [3]:
import numpy as np 
import pandas as pd
import opendatasets as od

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [4]:
#read the data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

X_train, y_train = train_data.loc[:, train_data.columns != 'Survived'], train_data['Survived']

X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
#show the NA values
pd.DataFrame(X_train.isnull().sum(), columns=['Missing values'])

Unnamed: 0,Missing values
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0
Cabin,687


In [6]:
#get the age mean.
X_train.Age.mean()

29.69911764705882

In [7]:
#get the age mean with more precise classification
train_data.groupby(['Pclass', 'Sex'])['Age'].agg([len, 'mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,len,mean
Pclass,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
1,female,94,34.611765
1,male,122,41.281386
2,female,76,28.722973
2,male,108,30.740707
3,female,144,21.75
3,male,347,26.507589


In [8]:
#class for input the age.
class AgeImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        self.age_means_ = {}

    def fit(self, X, y=None):
        self.age_means_ = X.groupby(['Pclass', 'Sex']).Age.mean()

        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        for key, value in self.age_means_.items():
            X_.loc[((np.isnan(X_["Age"])) & (X.Pclass == key[0]) & (X_.Sex == key[1])), 'Age'] = value

        return X_

In [9]:
#creating the pipeline
numerical_cols = ['Pclass', 'Fare']
categorial_cols = ['Sex', 'Embarked']

model = GradientBoostingClassifier()

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer())
])

categorial_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

age_transformer = Pipeline(steps=[
    ('imputer', AgeImputer())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_cols),
        ('categorial', categorial_transformer, categorial_cols)
    ])

pipeline = Pipeline(steps=[
    ('Age Imputer', age_transformer),
    ('preprocess', preprocessor),
    ('model', model)
])

In [10]:
#find the best parameters in the given parameters.
params = { 
           'model__n_estimators': [300],
           'model__max_depth': [1, 2, 3],
           'model__random_state': [42],
         }

grid_search = GridSearchCV(pipeline, params, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

print('Best accuracy score: {}'.format(grid_search.best_score_))

print('Best params: {}'.format(grid_search.best_params_))

Best accuracy score: 0.8238202247191012
Best params: {'model__max_depth': 2, 'model__n_estimators': 300, 'model__random_state': 42}


In [11]:
predictions = grid_search.predict(test_data)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)

In [12]:
print(predictions)

[0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 0 1 1 1 0 1 0 0 0]


In [19]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [49]:
#try a different to treat the data
train_data2 = train_data.drop(['PassengerId','Survived','Ticket','Cabin'], axis=1)
test_data2 = test_data.drop(['PassengerId','Ticket','Cabin'], axis=1)
#concat the train data with the test data
data1 = pd.concat([train_data2, test_data2], axis=0).reset_index(drop=True)
data1.info()
print('-'*40)
data1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1223 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Fare      1308 non-null   float64
 7   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 81.9+ KB
----------------------------------------


Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
1304,3,"Spector, Mr. Woolf",male,,0,0,8.0500,S
1305,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,108.9000,C
1306,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,7.2500,S
1307,3,"Ware, Mr. Frederick",male,,0,0,8.0500,S


In [54]:
data2 = data1.copy()
#check how many NA data in the dataset
data2.isna().sum()

Pclass       0
Name         0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     2
dtype: int64

In [55]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1223 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Fare      1308 non-null   float64
 7   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 81.9+ KB


In [56]:
data2['Embarked'] = data2["Embarked"].fillna(data2['Embarked'].mode()[0])

In [57]:
#cleaning the numeric missing values
def knn_impute(df,na_target):
    df=df.copy()
    
    numeric_df = df.select_dtypes(np.number)
    non_na_columns = numeric_df.loc[:,numeric_df.isna().sum() == 0].columns
    
    y_train = numeric_df.loc[numeric_df[na_target].isna()==False,na_target]
    X_train = numeric_df.loc[numeric_df[na_target].isna()==False,non_na_columns]
    X_test = numeric_df.loc[numeric_df[na_target].isna()==True,non_na_columns]
    
    knn = KNeighborsRegressor()
    knn.fit(X_train,y_train)
    
    y_pred = knn.predict(X_test)
    
    df.loc[df[na_target].isna()==True,na_target] = y_pred
    
    return df

In [59]:
for column in ['Age','Fare'
]:
    data2 = knn_impute(data2, column)

In [63]:
data2.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [64]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1309 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Fare      1309 non-null   float64
 7   Embarked  1309 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 81.9+ KB


In [85]:
data3 = data2.copy()

In [89]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Sex       1309 non-null   int32  
 2   Age       1309 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Fare      1309 non-null   float64
 6   Embarked  1309 non-null   int32  
 7   Title     1309 non-null   int64  
dtypes: float64(2), int32(2), int64(4)
memory usage: 71.7 KB


In [87]:
#mapping Sex
data3['Sex'] = data3['Sex'].map({'male':1,'female':0}).astype(int)

#mapping Embarked
data3['Embarked'] = data3['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

In [88]:
#replace the Name with Title, to change the data type to float64 for more convenient use
data3['Title'] = data3['Name'].str.extract('([A-Za-z]+)\.',expand=False)

data3['Title'].replace(['Mlle','Ms','Lady'], 'Miss',inplace=True)
data3['Title'].replace(['Mme'], 'Mrs',inplace=True)
data3['Title'].replace(['Countess','Capt','Col','Don','Dr',
                        'Major','Rev',"Sir","Jonkheer",'Dona'], 'Rare',inplace=True)
data3.drop(['Name'],inplace=True, axis=1)
Tempdate = data3.copy()
Tempdate['Survived'] = y_train
Tempdate.loc[:train_data.index.max(),:][['Title','Survived']].groupby(['Title'],as_index=False).mean().sort_values(by='Survived',ascending=True)

data3['Title'] = data3['Title'].map({'Master':0,'Miss':1,'Mrs':1,'Mr':2,'Rare':3})

In [90]:
data4 = data3.copy()

In [93]:
#standardize data
scaler = StandardScaler()
scaler.fit(data4)
data4 = pd.DataFrame(scaler.transform(data4),index=data4.index,columns=data4.columns)

In [96]:
#split the data
train_final = data4.loc[:train_data.index.max(), :].copy()
test_final = data4.loc[train_data.index.max() + 1:, :].reset_index(drop=True).copy()

In [97]:
train_final.info()
print('_'*40)
test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    float64
 1   Sex       891 non-null    float64
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    float64
 4   Parch     891 non-null    float64
 5   Fare      891 non-null    float64
 6   Embarked  891 non-null    float64
 7   Title     891 non-null    float64
dtypes: float64(8)
memory usage: 55.8 KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    float64
 1   Sex       418 non-null    float64
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    float64
 4   Parch     418 non-null    float64
 5   Fare      418 non-null    float64
 6   Emb

In [99]:
max_features = list(range(1,train_final.shape[1]))

In [101]:
#find the best parameters in the given parameters.
parameters = {
    'max_features':max_features,
    'n_estimators':[5,10,20,100,250],
    'max_depth':[1,3,5,7,9],
    'learning_rate':[0.01,0.05,0.1]
}


gbc = GridSearchCV(GradientBoostingClassifier(), parameters, cv=10)


gbc.fit(train_final, y_train)

GridSearchCV(cv=10, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.01, 0.05, 0.1],
                         'max_depth': [1, 3, 5, 7, 9],
                         'max_features': [1, 2, 3, 4, 5, 6, 7],
                         'n_estimators': [5, 10, 20, 100, 250]})

In [103]:
print(gbc.score(train_final, y_train))
print(gbc.best_params_)

0.9326599326599326
{'learning_rate': 0.05, 'max_depth': 5, 'max_features': 6, 'n_estimators': 100}


In [108]:
gbc = GradientBoostingClassifier(learning_rate= 0.05, loss= 'deviance', max_depth=5, n_estimators= 100,max_features=6)
gbc.fit(train_final,y_train)
pred = gbc.predict(train_final)
print(classification_report(y_train,pred))

              precision    recall  f1-score   support

           0       0.92      0.97      0.95       549
           1       0.95      0.86      0.90       342

    accuracy                           0.93       891
   macro avg       0.94      0.92      0.93       891
weighted avg       0.93      0.93      0.93       891



In [109]:
final_predictions = gbc.predict(test_final)

output = pd.concat([test_ids,pd.Series(final_predictions,name="Survived")],axis=1)
output.to_csv("./my_submission2.csv",index=False,header=True)

[0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 1 0 0 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 1]
