## Titanic 2

### Importing the libraries

In [37]:
import pandas as pd 
import numpy as np 

### Importing the dataset

In [38]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
titanic = train.append(test)
train_idx = len(train)
test_idx = len(titanic) - len(train)

In [39]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [40]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [41]:
senior = train.loc[train.Age >= 60]['Survived']
rate = sum(senior)/len(senior)
print(len(senior))
print(rate*100)

26
26.923076923076923


### Data preprocessing

#### Taking care of missing Categorical data

In [42]:
# Check where in which rows the values are missing 
np.where(pd.isnull(titanic.Embarked))

(array([ 61, 829], dtype=int64),)

In [43]:
# Check the most occuring value in the column embarked
titanic.Embarked.mode()

0    S
dtype: object

In [44]:
# Filling the missing values of Embarked column with most occuring element
titanic.Embarked = titanic.Embarked.fillna('S')

In [45]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1309 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


#### Taking care of missing numerical data 

In [46]:
# Filling the missing data of Age column with the median of Age
titanic.Age = titanic.Age.fillna(titanic.Age.median())

In [47]:
# Filling the missing data of Fare column with the median of Age 
titanic.Fare = titanic.Fare.fillna(titanic.Fare.median())

In [48]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1309 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


#### Encoding the Categotical Data

In [49]:
# Making dummy variables for Pclass, Embarked
pclass_dummies = pd.get_dummies(titanic.Pclass, prefix = 'Pclass')
embarked_dummies = pd.get_dummies(titanic.Embarked, prefix = 'Embarked')

In [50]:
# Label Encoder for the Sex column 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
titanic.Sex = le.fit_transform(titanic.Sex)

In [51]:
print(titanic.Sex)

0      1
1      0
2      0
3      0
4      1
      ..
413    1
414    0
415    1
416    1
417    1
Name: Sex, Length: 1309, dtype: int32


#### Adding the dummy columns and dropping some unnecessary features

In [52]:
# Adding the dummy columns
dataset = pd.concat([titanic, pclass_dummies, embarked_dummies], axis = 1)

In [53]:
# Dropping the Passenger_ID, Name, Ticket, Pclass, Cabin and Embarked column
dataset.drop(['PassengerId', 'Pclass', 'Name', 'Ticket', 'Cabin', 'Embarked'], axis = 1, inplace = True)

In [54]:
dataset.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0.0,1,22.0,1,0,7.25,0,0,1,0,0,1
1,1.0,0,38.0,1,0,71.2833,1,0,0,1,0,0
2,1.0,0,26.0,0,0,7.925,0,0,1,0,0,1
3,1.0,0,35.0,1,0,53.1,1,0,0,0,0,1
4,0.0,1,35.0,0,0,8.05,0,0,1,0,0,1


In [55]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    float64
 1   Sex         1309 non-null   int32  
 2   Age         1309 non-null   float64
 3   SibSp       1309 non-null   int64  
 4   Parch       1309 non-null   int64  
 5   Fare        1309 non-null   float64
 6   Pclass_1    1309 non-null   uint8  
 7   Pclass_2    1309 non-null   uint8  
 8   Pclass_3    1309 non-null   uint8  
 9   Embarked_C  1309 non-null   uint8  
 10  Embarked_Q  1309 non-null   uint8  
 11  Embarked_S  1309 non-null   uint8  
dtypes: float64(3), int32(1), int64(2), uint8(6)
memory usage: 74.1 KB


#### Splitting the data into the training set and the test set 

In [56]:
data_train = dataset[ :train_idx]
data_test = dataset[train_idx: ]

In [57]:
# Saving the Survivied column as int type
data_train.Survived = data_train.Survived.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [58]:
# Dropping the survived column from the training set and adding it to a new y_train variable
X_train = data_train.drop(['Survived'], axis = 1).values
y_train = data_train.Survived.values

In [59]:
# Dropping the survived column from the test set
X_test = data_test.drop(['Survived'], axis = 1).values

In [60]:
Z = pd.DataFrame(X_train)
Z

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,22.0,1.0,0.0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,26.0,0.0,0.0,7.9250,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,35.0,1.0,0.0,53.1000,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,35.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,1.0,27.0,0.0,0.0,13.0000,0.0,1.0,0.0,0.0,0.0,1.0
887,0.0,19.0,0.0,0.0,30.0000,1.0,0.0,0.0,0.0,0.0,1.0
888,0.0,28.0,1.0,2.0,23.4500,0.0,0.0,1.0,0.0,0.0,1.0
889,1.0,26.0,0.0,0.0,30.0000,1.0,0.0,0.0,1.0,0.0,0.0


In [61]:
Z = pd.DataFrame(X_test)
Z

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,34.5,0.0,0.0,7.8292,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,47.0,1.0,0.0,7.0000,0.0,0.0,1.0,0.0,0.0,1.0
2,1.0,62.0,0.0,0.0,9.6875,0.0,1.0,0.0,0.0,1.0,0.0
3,1.0,27.0,0.0,0.0,8.6625,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,22.0,1.0,1.0,12.2875,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
413,1.0,28.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
414,0.0,39.0,0.0,0.0,108.9000,1.0,0.0,0.0,1.0,0.0,0.0
415,1.0,38.5,0.0,0.0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
416,1.0,28.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0


#### Feature Scaling

In [67]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [62]:
Z = pd.DataFrame(X_train)
Z

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,22.0,1.0,0.0,7.2500,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,26.0,0.0,0.0,7.9250,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,35.0,1.0,0.0,53.1000,1.0,0.0,0.0,0.0,0.0,1.0
4,1.0,35.0,0.0,0.0,8.0500,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,1.0,27.0,0.0,0.0,13.0000,0.0,1.0,0.0,0.0,0.0,1.0
887,0.0,19.0,0.0,0.0,30.0000,1.0,0.0,0.0,0.0,0.0,1.0
888,0.0,28.0,1.0,2.0,23.4500,0.0,0.0,1.0,0.0,0.0,1.0
889,1.0,26.0,0.0,0.0,30.0000,1.0,0.0,0.0,1.0,0.0,0.0


### Model Selection

#### XG Boost

In [93]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)





#### Logistic Regression Model

In [89]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

#### Random Forest Model

In [91]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

#### Kernel SVM Model

In [69]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [127]:
print(y_pred)

[0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 0]


#### Applying the k-fold cross validation

In [70]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 83.05 %
Standard Deviation: 3.31 %


#### Saving the predictions in a .csv file

In [33]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_pred})
output.to_csv('submission_7.csv', index=False)