<a href="https://colab.research.google.com/github/ylfoo/ERA2036/blob/main/Learn_Classification_thru_Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Example for Classification
In this example, the modified Titanic dataset is used to construct two classifiers using k-Nearest Neighbours and Decision Tree to predict the survivality of the passengers.
The modified Titanic dataset consists of the following columns:
- Pclass - ticket class
- Sex - gender of the passenger
- Age - age of passenger
- SibSp - number of siblings / spouses aboard the Titanic
- Parch - number of parents / children aboard the Titanic
- Fare - passenger fare
- Survived - survival (0 = No, 1 = Yes)

In [6]:
# Import the necessary modules and packages
import pandas as pd
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [7]:
# load the dataset from CSV file
df = pd.read_csv("https://raw.githubusercontent.com/wooihaw/datasets/main/titanic.csv")

In [8]:
# Check the number of columns and rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [9]:
# Randomly view 5 data samples from the dataset
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
562,563,0,2,"Norman, Mr. Robert Douglas",male,28.0,0,0,218629,13.5,,S
448,449,1,3,"Baclini, Miss. Marie Catherine",female,5.0,2,1,2666,19.2583,,C
466,467,0,2,"Campbell, Mr. William",male,,0,0,239853,0.0,,S
751,752,1,3,"Moor, Master. Meier",male,6.0,0,1,392096,12.475,E121,S
139,140,0,1,"Giglio, Mr. Victor",male,24.0,0,0,PC 17593,79.2,B86,C


In [10]:
# Check for missing data
# if there is any missing data, they must be handled first
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
# Replace missing values with the median value
df['Age'] = df['Age'].fillna(df['Age'].median())

# Check whether there is any more missing data
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
# Calculate descriptive statistics
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [13]:
# Apply one-hot encoding to convert nominal categorical data to numerical data
df2 = pd.get_dummies(df, drop_first=True)
df2.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel",...,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_Q,Embarked_S
711,712,0,1,28.0,0,0,26.55,0,0,0,...,0,0,0,0,0,0,0,0,0,1
695,696,0,2,52.0,0,0,13.5,0,0,0,...,0,0,0,0,0,0,0,0,0,1
332,333,0,1,38.0,0,1,153.4625,0,0,0,...,0,0,0,0,0,0,0,0,0,1
831,832,1,2,0.83,1,1,18.75,0,0,0,...,0,0,0,0,0,0,0,0,0,1
842,843,1,1,30.0,0,0,31.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Extract the "charges" column (targets) into y
y = df2['Survived'].values

# Delete the "charges" column
del df2['Survived']

# Extract the remaining columns (features) into X
X = df2.values

# Print the dimensions of X and y
print(f"Dimension of X: {X.shape}")
print(f"Dimension of y: {y.shape}")

Dimension of X: (891, 1725)
Dimension of y: (891,)


In [15]:
# Split 75% of the dataset for training and the remaining 25% for testing
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Print the number of data samples for training and testing
print(f"Number of data samples for training: {X_train.shape[0]}")
print(f"Number of data samples for testing: {X_test.shape[0]}")

Number of data samples for training: 668
Number of data samples for testing: 223


In [16]:
# Train a k-NN model with the training data to predict the survivality of the passengers
knn = KNeighborsClassifier().fit(X_train, y_train)

# Evaluate the k-NN model with the testing data and print the accuracy
print(f"knn accuracy: {knn.score(X_test, y_test)}")

knn accuracy: 0.6591928251121076


In [17]:
# Train a decision tree model with the training data to predict the survivality of the passengers
dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Evaluate the decision tree model with the testing data and print the accuracy
print(f"knn accuracy: {dtc.score(X_test, y_test)}")

knn accuracy: 0.820627802690583
