# Classification Benchmark

## Importing Libraries

In [1]:
#importing libraries 
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

from sklearn.

## Importing Dataset

In [3]:
data=pd.read_csv('train.csv')
data.shape

(891, 12)

In [4]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Shuffling and Creating Train and Test Set

In [8]:
from sklearn.utils import shuffle

# Shuffling the Dataset
data = shuffle(data, random_state = 42)

#creating 4 divisions
div = int(data.shape[0]/4)

# 3 parts to train set and 1 part to test set
train = data.loc[:3*div+1,:]
test = data.loc[3*div+1:]

train.shape, test.shape

((725, 12), (167, 12))

In [9]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
330,331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q
181,182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C
640,641,0,3,"Jensen, Mr. Hans Peder",male,20.0,0,0,350050,7.8542,,S
522,523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C
206,207,0,3,"Backstrom, Mr. Karl Alfred",male,32.0,1,0,3101278,15.85,,S


In [10]:
test.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
667,668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S
434,435,0,1,"Silvey, Mr. William Baird",male,50.0,1,0,13507,55.9,E44,S
338,339,1,3,"Dahl, Mr. Karl Edwart",male,45.0,0,0,7598,8.05,,S
535,536,1,2,"Hart, Miss. Eva Miriam",female,7.0,0,2,F.C.C. 13529,26.25,,S
230,231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35.0,1,0,36973,83.475,C83,S


## Simple Mode

In [11]:
test['simple_mode'] = train['Survived'].mode()[0]
test['simple_mode'].head()

667    0
434    0
338    0
535    0
230    0
Name: simple_mode, dtype: int64

In [12]:
simple_mode_accuracy = accuracy_score(test['Survived'], test['simple_mode'])
simple_mode_accuracy

0.6287425149700598

## Mode Based on Gender

In [13]:
gender_mode = pd.crosstab(train['Survived'],train['Sex'])
gender_mode

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,66,379
1,192,88


In [14]:
test['gender_mode'] = test['Survived']

# for every unique value in column
for i in test['Sex'].unique():
  # Calculate and Assign mode to new column, corresponding to unique values in "Sex"
  test['gender_mode'][test['Sex'] == str(i)] = train['Survived'][train['Sex'] == str(i)].mode()[0]

In [15]:
gender_accuracy = accuracy_score(test['Survived'], test['gender_mode'])
gender_accuracy

0.7844311377245509

In [16]:
embarked_mode = pd.crosstab(train['Survived'],train['Embarked'])
embarked_mode

Embarked,C,Q,S
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,63,34,348
1,76,21,181


In [17]:
test['embarked_mode'] = test['Survived']

# for every unique value in column
for i in test['Embarked'].unique():
  # Calculate and Assign mode to new column, corresponding to unique values in "embark"
  test['embarked_mode'][test['Embarked'] == str(i)] = train['Survived'][train['Embarked'] == str(i)].mode()[0]

In [18]:
embarked_accuracy = accuracy_score(test['Survived'], test['embarked_mode'])
embarked_accuracy

0.6586826347305389