Classes Recap

In [1]:
class KaggleMember():
  """A Kaggle member with a normalised name, a level and a sequential ID.

  Every new instance bumps the class-wide ``kaggle_member_count`` and takes
  the resulting value as its own ``kaggle_id``.
  """

  # Number of members created so far; shared by all instances.
  kaggle_member_count = 0

  def __init__(self, name, surname, level = None):
    """Create a member.

    Args:
      name: First name; stored capitalised.
      surname: Family name; stored in upper case.
      level: Optional level label; see ``set_level`` for the default.
    """
    self.name = name.capitalize()
    self.surname = surname.upper()
    self.set_level(level)

    # Register the new member and use the updated total as its ID.
    updated_total = KaggleMember.kaggle_member_count + 1
    KaggleMember.kaggle_member_count = updated_total
    self.kaggle_id = updated_total

  def set_level(self, level):
    """Store ``level`` in title case, or "Beginner" when it is None."""
    self.level = "Beginner" if level is None else level.title()

  def display_member_info(self):
    """Print the member's full name, level and Kaggle ID."""
    print(f"Full Name: {self.name} {self.surname}")
    print(f"Level : {self.level}")
    print("Kaggle ID: ", self.kaggle_id)

  def display_member_number(self):
    """Print how many members have been created so far."""
    print(f"There are {KaggleMember.kaggle_member_count} members on Kaggle")

# Create a member without passing a level, so the default level is used.
kaggler1 = KaggleMember('Uday', 'Gill')
# Show the member's details, then the running total of members created.
kaggler1.display_member_info()
kaggler1.display_member_number()
# Emit a blank line after the output.
print()

Full Name: Uday GILL
Level : Beginner
Kaggle ID:  1
There are 1 members on Kaggle



Explanation

1. Created a Class named `KaggleMember()`.
2. Class variable `kaggle_member_count` defined just below the Class name.
3. Constructor method defined using `__init__()` to initialize attributes.
4. First parameter `self` is a reference to current instance of the Class.
5. Three more parameters are used; the two without default values (`name` and `surname`) must be passed when the constructor is called.
6. Assigned values to instance variables from the constructor method.
7. The `set_level()` method is used to assign a value to the `level` instance attribute.
8. The remaining methods display the member's attributes and the total member count.



---


Titanic Project Introduction

Problem - To create a model that predicts which passengers survived the Titanic shipwreck.

Datasets - Two, one for training and one for testing. Here, we are only using the training dataset as we are not submitting our solution for testing.

Metric - Percentage of passengers correctly predicted (also known as Accuracy).

We begin by importing the necessary modules and the dataset.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the Titanic training data.
dataset = pd.read_csv('/workspaces/codespaces-jupyter/data/train.csv')
# Keep an independent snapshot of the raw data. A plain assignment would only
# bind a second name to the same DataFrame, so any in-place change made
# through `dataset` would also appear in `datasetcopy`.
datasetcopy = dataset.copy()

# Preview the first rows.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Created a Class `Passenger()` representing a passenger on the Titanic ship with attributes derived from the header row of the dataset.

In [4]:
class Passenger():
  """One Titanic passenger, holding the per-passenger dataset fields.

  The survival status is not a constructor argument: it starts as None and
  is filled in later through ``set_survival_status``.
  """

  def __init__(self, passenger_id, pclass, name, sex, age, sibsp, parch, ticket, fare, cabin, embarked):
    """Store every field as an attribute of the same name."""
    # Identity
    self.passenger_id, self.name = passenger_id, name
    self.sex, self.age = sex, age

    # Relatives counted in the dataset's SibSp / Parch columns
    self.sibsp, self.parch = sibsp, parch

    # Ticketing details
    self.pclass, self.ticket, self.fare = pclass, ticket, fare
    self.cabin, self.embarked = cabin, embarked

    # Unknown until set explicitly
    self.survived = None

  def set_survival_status(self, status):
    """Record whether the passenger survived."""
    self.survived = status

  def __str__(self):
    """Return a short summary with the ID, name and survival status."""
    return f"Passenger({self.passenger_id}, {self.name}, Survived: {self.survived})"

Created a class for processing and preparing a `dataframe`. For processing, we fill in the missing values using `median` and `mode` values, encode the categories containing string values by mapping the string values to integer values, and drop the columns that are not required.

In [5]:
class DataProcessor():
  """Chainable cleaning steps for a dataframe with Titanic-style columns.

  Each step returns ``self`` so the calls can be chained; the result is read
  back with ``get_processed_data``. The processor works on its own copy of
  the input, so the caller's dataframe is never modified.
  """

  def __init__(self, dataframe):
    """Take a private copy of ``dataframe`` to process.

    The steps below assign to columns. Without the copy they would rewrite
    the caller's dataframe in place, and processing the same input a second
    time would map the already-encoded values to NaN.
    """
    self.dataframe = dataframe.copy()

  def fill_missing_values(self):
    """Fill gaps in 'Age' and 'Fare' with the median and in 'Embarked' with the mode.

    Returns:
      This processor, to allow chaining.
    """
    self.dataframe['Age'] = self.dataframe['Age'].fillna(self.dataframe['Age'].median())
    # mode() returns a Series of the most frequent values; take the first one.
    self.dataframe['Embarked'] = self.dataframe['Embarked'].fillna(self.dataframe['Embarked'].mode()[0])
    self.dataframe['Fare'] = self.dataframe['Fare'].fillna(self.dataframe['Fare'].median())

    return self

  def encode_categories(self):
    """Replace the string categories in 'Sex' and 'Embarked' with integer codes.

    Values that are not keys of the mappings below become NaN.

    Returns:
      This processor, to allow chaining.
    """
    self.dataframe['Sex'] = self.dataframe['Sex'].map({'male' : 0, 'female' : 1})
    self.dataframe['Embarked'] = self.dataframe['Embarked'].map({'C' : 0, 'Q' : 1, 'S' : 2})

    return self

  def drop_unnecessary_columns(self):
    """Drop the 'Name', 'Ticket' and 'Cabin' columns.

    Returns:
      This processor, to allow chaining.
    """
    self.dataframe = self.dataframe.drop(['Name', 'Ticket', 'Cabin'], axis = 1)

    return self

  def get_processed_data(self):
    """Return the dataframe in its current state of processing."""
    return self.dataframe

A child class, `TitanicDataProcessor`, inherits from the parent `DataProcessor` class and chains its processing steps in a single `preprocess()` method.

In [6]:
class TitanicDataProcessor(DataProcessor):
  """DataProcessor that runs all of its cleaning steps in one call."""

  def __init__(self, dataframe):
    """Hand ``dataframe`` to the parent initialiser."""
    super().__init__(dataframe)

  def preprocess(self):
    """Fill missing values, encode categories, drop unused columns.

    Returns:
      The processed dataframe.
    """
    filled = self.fill_missing_values()
    encoded = filled.encode_categories()
    encoded.drop_unnecessary_columns()

    return self.get_processed_data()

Created a Class for handling the ML model with training, prediction and evaluation methods.

In [7]:
class Model():
  """Small wrapper around an estimator that offers ``fit`` and ``predict``."""

  def __init__(self, model):
    """Keep a reference to the wrapped estimator."""
    self.model = model

  def train(self, X_train, y_train):
    """Fit the wrapped estimator on the training features and labels."""
    estimator = self.model
    estimator.fit(X_train, y_train)

  def predict(self, X_test):
    """Return the wrapped estimator's predictions for ``X_test``."""
    predicted = self.model.predict(X_test)
    return predicted

  def evaluate(self, y_test, predictions):
    """Return the accuracy score of ``predictions`` as an "Accuracy: ..." string."""
    return f"Accuracy: {accuracy_score(y_test, predictions)}"

Additionally, created a Class for fine-tuning (i.e., finding the best hyperparameters of) the ML model being used. Hyperparameters are parameters whose values we choose before training the model.

In [8]:
class HyperparameterTuner():
  """Wrapper around a search tool that exposes ``fit`` and ``best_params_``."""

  def __init__(self, tool):
    """Keep a reference to the search tool."""
    self.tool = tool

  def train(self, X_train, y_train):
    """Run the search by fitting the tool on the training data."""
    searcher = self.tool
    searcher.fit(X_train, y_train)

  def search(self):
    """Return the tool's ``best_params_`` as a "Best Parameters: ..." string."""
    return f"Best Parameters: {self.tool.best_params_}"

Finally, a `Pipeline()` class is created to tie everything together: it preprocesses the dataset, trains and evaluates the model, and runs the fine-tuning.

In [9]:
class Pipeline():
  """Runs preprocessing, model training/evaluation and hyperparameter search."""

  def __init__(self, processor, model, tool):
    """Store the three collaborators.

    Args:
      processor: Callable that takes the dataset and returns an object with
        a ``preprocess()`` method.
      model: Object offering ``train``, ``predict`` and ``evaluate``.
      tool: Object offering ``train`` and ``search``.
    """
    self.processor, self.model, self.tool = processor, model, tool

  def run(self, dataset):
    """Process ``dataset``, then print the evaluation and the search result."""
    prepared = self.processor(dataset).preprocess()

    # 'Survived' is the target; every other column is a feature.
    target = prepared['Survived']
    features = prepared.drop('Survived', axis = 1)

    # Hold out 20% of the rows for evaluation, with a fixed seed.
    split = train_test_split(features, target, test_size = 0.2, random_state = 42)
    X_train, X_test, y_train, y_test = split

    # Train, predict on the held-out rows and report the metric.
    self.model.train(X_train, y_train)
    held_out_predictions = self.model.predict(X_test)
    print(self.model.evaluate(y_test, held_out_predictions))

    # Run the hyperparameter search on the same training rows.
    self.tool.train(X_train, y_train)
    print(self.tool.search())

We pick the Random Forest classifier and run the pipeline for the results.

In [10]:
# Pass the processor class itself, not an instance: Pipeline.run() calls it
# with the dataset to build the processor.
processor = TitanicDataProcessor
# The model that gets trained and evaluated uses the default hyperparameters.
model_train = Model(RandomForestClassifier())
print("Model: Random Forest Classifier")

# A separate classifier instance is handed to the grid search, which tries
# every combination of the values below and scores each by accuracy.
model_finetune = RandomForestClassifier()
hyperparameters = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 5, 10, 20],
}
finetuner = GridSearchCV(estimator = model_finetune, param_grid = hyperparameters, scoring = 'accuracy')
tool = HyperparameterTuner(finetuner)

# Wire everything together and run it on the loaded dataset.
pipeline = Pipeline(processor, model_train, tool)
pipeline.run(dataset)

Model: Random Forest Classifier
Accuracy: 0.8212290502793296
Best Parameters: {'max_depth': 5, 'n_estimators': 300}
