In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


In [2]:
df = pd.read_csv("/kaggle/input/titanic/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.isnull().sum() * 100 / len(df)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [4]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string is np.NaN or big_string.find(substring) != -1:
            return substring
    print(big_string)
    return np.nan


In [5]:
def replace_titles(x):
    title=x['Title']
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
        return 'Mr'
    elif title in ['Countess', 'Mme']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='Male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
    

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin


class CusttomAttribTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X):
        title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr','Ms', 
            'Mlle','Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']
        X['Title'] = X['Name'].map(lambda x: substrings_in_string(x, title_list))
        X['Title'] = X.apply(replace_titles, axis=1)
        
        cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
        X['Deck'] = X['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
        X['Family_Size'] = X['SibSp'] + X['Parch']
        X = X.drop(columns=['Cabin', 'Name', 'Parch', 'SibSp', 'Sex'])
        X[['Deck', 'Title']] = OrdinalEncoder().fit_transform(X[['Deck', 'Title']])
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
        X["Age"] = imp.fit_transform(X[["Age"]])
        X["Age"] = pd.cut(X["Age"], bins=[0., 18, 30, 40, 50, 90], labels=[1, 2, 3, 4, 5])

        return X

In [7]:
from sklearn.model_selection import train_test_split

y = df['Survived']
X = df.drop(['Survived'], axis=1)


# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler


categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                    df[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaling', MinMaxScaler())
])


# custom_columns = ['Name', 'SibSp', 'Parch', 'Cabin', 'Sex', 'Age']
# categorical_cols = [c for c in categorical_cols if c not in custom_columns]
# numerical_cols = [c for c in numerical_cols if c not in custom_columns]

my_cols = numerical_cols + categorical_cols

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
#         ('custom', CusttomAttribTransformer(), custom_columns),
        ('num', SimpleImputer(strategy='median'), numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# cols = ['Age', 'Title', 'Deck', 'Family_Size'] + my_cols

In [9]:
X_train_full = pd.DataFrame(preprocessor.fit_transform(X_train_full), columns=my_cols)
X_valid_full = pd.DataFrame(preprocessor.fit_transform(X_valid_full), columns=my_cols)

In [10]:
X_train_full

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
0,141.0,3.0,29.0,0.0,2.0,15.2458,0.0,0.0
1,440.0,2.0,31.0,0.0,0.0,10.5000,1.0,2.0
2,818.0,2.0,31.0,1.0,1.0,37.0042,1.0,0.0
3,379.0,3.0,20.0,0.0,0.0,4.0125,1.0,0.0
4,492.0,3.0,21.0,0.0,0.0,7.2500,1.0,2.0
...,...,...,...,...,...,...,...,...
707,836.0,1.0,39.0,1.0,1.0,83.1583,0.0,0.0
708,193.0,3.0,19.0,1.0,0.0,7.8542,0.0,2.0
709,630.0,3.0,29.0,0.0,0.0,7.7333,1.0,1.0
710,560.0,3.0,36.0,1.0,0.0,17.4000,0.0,2.0


In [11]:
X_train_full.corr()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
PassengerId,1.0,-0.048783,0.059819,-0.057809,0.001263,0.040035,0.030596,0.002252
Pclass,-0.048783,1.0,-0.337849,0.093648,0.017336,-0.534776,0.128916,0.136414
Age,0.059819,-0.337849,1.0,-0.222256,-0.167534,0.098592,0.092772,-0.027153
SibSp,-0.057809,0.093648,-0.222256,1.0,0.423676,0.145538,-0.119615,0.078071
Parch,0.001263,0.017336,-0.167534,0.423676,1.0,0.213608,-0.241624,0.039821
Fare,0.040035,-0.534776,0.098592,0.145538,0.213608,1.0,-0.164523,-0.20086
Sex,0.030596,0.128916,0.092772,-0.119615,-0.241624,-0.164523,1.0,0.110669
Embarked,0.002252,0.136414,-0.027153,0.078071,0.039821,-0.20086,0.110669,1.0


In [12]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data = pd.DataFrame(preprocessor.transform(test_data), columns=my_cols)
test_data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
0,892.0,3.0,34.5,0.0,0.0,7.8292,1.0,1.0
1,893.0,3.0,47.0,1.0,0.0,7.0,0.0,2.0
2,894.0,2.0,62.0,0.0,0.0,9.6875,1.0,1.0
3,895.0,3.0,27.0,0.0,0.0,8.6625,1.0,2.0
4,896.0,3.0,22.0,1.0,1.0,12.2875,0.0,2.0


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error

In [14]:
print(X_valid_full.shape, y_valid.shape)


(179, 8) (179,)


In [15]:
my_model = LogisticRegression()
my_model.fit(X_train_full, y_train)
predictions = my_model.predict(X_valid_full)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))

Mean Absolute Error: 0.2122905027932961


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
my_model = RandomForestClassifier()
my_model.fit(X_train_full.append(X_valid_full), y_train.append(y_valid)) 
predictions = my_model.predict(test_data)


In [17]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId.astype(int), 'Survived': predictions.round().astype(int)})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
