In [349]:
import numpy as np 
import pandas as pd
import matplotlib
import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [350]:
df = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")
gender = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [351]:
df.isnull().sum()
#Returns the number of missing values in each column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [352]:
df.shape
#Shape of the Dataset is 891 rows and 12 columns

(891, 12)

In [353]:
df.head()
#Returns the first 5 rows of the dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [354]:
df.describe()
#Gives a rough idea of the dataset using some important statistical metrics

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [355]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

### Combining the Sibling and Parents columns to one column named Family

In [356]:
df['Family'] = df['SibSp'] + df['Parch']

In [357]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family'],
      dtype='object')

### Dropping the unnecessary columns

In [358]:
df1 = df.drop(labels = ['Parch', 'SibSp', 'Name', 'Cabin', 'Ticket', 'PassengerId'], axis=1)
df1.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family'], dtype='object')

### One Hot Encoding of the Categorical Columns

In [359]:
df1_encoded = pd.get_dummies(df1, columns = ['Sex', 'Embarked'], drop_first='True')
df1_encoded.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,7.25,1,True,False,True
1,1,1,38.0,71.2833,1,False,False,False
2,1,3,26.0,7.925,0,False,False,True
3,1,1,35.0,53.1,1,False,False,True
4,0,3,35.0,8.05,0,True,False,True


In [360]:
df1_encoded['Age'].isnull().sum()
#Number of missing values in the Age column

177

In [361]:
df1_encoded.columns

Index(['Survived', 'Pclass', 'Age', 'Fare', 'Family', 'Sex_male', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

### Using SimpleImputer to impute the missing values with Median

In [362]:
from sklearn.impute import SimpleImputer
med_imputer = SimpleImputer(missing_values = np.nan, strategy='median')
df1_imputed = med_imputer.fit_transform(df1_encoded)

### Checking if the null values are hanndled properly

In [363]:
df1_imputed = pd.DataFrame(df1_imputed)
df1_imputed.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
dtype: int64

### Renaming the Column Indices

In [364]:
df1_imputed.columns = ['Survived', 'Pclass', 'Age', 'Fare', 'Family', 'Sex_male', 'Embarked_Q',
       'Embarked_S']
df1_imputed.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S
0,0.0,3.0,22.0,7.25,1.0,1.0,0.0,1.0
1,1.0,1.0,38.0,71.2833,1.0,0.0,0.0,0.0
2,1.0,3.0,26.0,7.925,0.0,0.0,0.0,1.0
3,1.0,1.0,35.0,53.1,1.0,0.0,0.0,1.0
4,0.0,3.0,35.0,8.05,0.0,1.0,0.0,1.0


### Dropping the target variable from the dataset

In [365]:
X = df1_imputed.drop(labels=['Survived'], axis=1)
y = df1_imputed['Survived']

### Splitting the DataFrame into 75% Training Data and 25% Test Data

In [366]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 5)
X_train.shape, X_test.shape

((668, 7), (223, 7))

### Accuracy of LogisticRegression Model

In [367]:
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression(random_state = 5, max_iter = 500)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8251121076233184


### Accuracy of Decision Tree Classifier

In [368]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state = 5)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8251121076233184


### Accuracy of Random Forest Classifier

In [369]:
from sklearn.ensemble import RandomForestClassifier
classifier1 = RandomForestClassifier(random_state = 5)
classifier1.fit(X_train, y_train)
y_pred = classifier1.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8295964125560538


### Accuracy of Support Vector Classifier

In [370]:
from sklearn.svm import SVC
classifier2 = SVC(random_state = 5)
classifier2.fit(X_train, y_train)
y_pred = classifier2.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7040358744394619


### Accuracy of K-Nearest Neighbors Classifier

In [371]:
from sklearn.neighbors import KNeighborsClassifier
classifier3 = KNeighborsClassifier(n_neighbors = 4)
classifier3.fit(X_train, y_train)
y_pred = classifier3.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.726457399103139


### Different classification metrics

In [372]:
from sklearn.metrics import classification_report as cr
print(cr(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.72      0.94      0.81       140
         1.0       0.78      0.37      0.50        83

    accuracy                           0.73       223
   macro avg       0.75      0.65      0.66       223
weighted avg       0.74      0.73      0.70       223



## Doing similar preprocessing on the Test Data

In [373]:
test['Family'] = test['SibSp'] + test['Parch']
test1 = test.drop(labels = ['Parch', 'SibSp', 'Name', 'Cabin', 'Ticket', 'PassengerId'], axis=1)
test1.columns
#Combining Siblings and Parents column
#Dropping the other unnecessary columns

Index(['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Family'], dtype='object')

In [374]:
test_encoded = pd.get_dummies(test1, columns = ['Sex', 'Embarked'], drop_first='True')
test_encoded.head()
#One Hot Encoding

Unnamed: 0,Pclass,Age,Fare,Family,Sex_male,Embarked_Q,Embarked_S
0,3,34.5,7.8292,0,True,True,False
1,3,47.0,7.0,1,False,False,True
2,2,62.0,9.6875,0,True,True,False
3,3,27.0,8.6625,0,True,False,True
4,3,22.0,12.2875,2,False,False,True


In [375]:
test_encoded['Age'].isnull().sum()
#Shows the number of null values present in the age column

86

In [376]:
test_encoded.columns
#Getting the index names of the dataset

Index(['Pclass', 'Age', 'Fare', 'Family', 'Sex_male', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [377]:
test_imputed = med_imputer.fit_transform(test_encoded)
test_imputed = pd.DataFrame(test_imputed)
test_imputed.columns = ['Pclass', 'Age', 'Fare', 'Family', 'Sex_male', 'Embarked_Q',
       'Embarked_S']
test_imputed.isnull().sum()

#Checking if there are any missing values left

Pclass        0
Age           0
Fare          0
Family        0
Sex_male      0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [378]:
prediction = classifier1.predict(test_imputed)
prediction1 = prediction.astype(int)

In [379]:
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': prediction1
})

# Save the submission file
submission.to_csv('submission.csv', index=False)