### Imports

In [None]:
import os
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

#### Verify credentials & env setup


In [23]:
os.path.exists('credentials')

False

In [24]:
# Make sure the service account credentials are available in the credentials folder

if not os.path.exists('../credentials/service-account.json'):
    
    raise ImportError("""Service account credentials not found,\
        \nplease make sure the credentials are available in the credentials folder\
        \nand named as service-account.json""")
else : 
    print("🔥 You are good to go! 🔥")

🔥 You are good to go! 🔥


In [25]:
GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID')
GCP_BUCKET = os.environ.get('GCP_BUCKET')
BQ_DATASET = os.environ.get('BQ_DATASET')
BQ_TABLE = os.environ.get('BQ_TABLE')

print(f"GCP_PROJECT_ID: {GCP_PROJECT_ID}")
print(f"GCP_BUCKET: {GCP_BUCKET}")
print(f"BQ_DATASET: {BQ_DATASET}")
print(f"BQ_TABLE: {BQ_TABLE}")

GCP_PROJECT_ID: mlflowserverbatch55-eb77
GCP_BUCKET: my-awesome-bucket-batch551
BQ_DATASET: titanic
BQ_TABLE: raw-data


# Loading Data

In [26]:

# Install the required package


client = bigquery.Client.from_service_account_json('../credentials/service-account.json')


dataset_ref = bigquery.DatasetReference(GCP_PROJECT_ID, BQ_DATASET)
table_ref = dataset_ref.table(BQ_TABLE)
table = client.get_table(table_ref)

df = client.list_rows(table).to_dataframe()
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,6145_01,Europa,,C/231/S,55 Cancri e,,False,3478.0,10.0,0.0,105.0,2383.0,Benebah Asolipery,False
1,0052_01,Earth,False,G/6/S,TRAPPIST-1e,,False,4.0,0.0,2.0,4683.0,0.0,Elaney Hubbarton,False
2,0068_01,Mars,False,E/4/S,TRAPPIST-1e,,False,793.0,0.0,2.0,253.0,0.0,Cinst Binie,False
3,0202_02,Europa,False,A/2/P,55 Cancri e,,False,0.0,2433.0,,878.0,443.0,Vegas Embleng,True
4,0206_01,Europa,False,C/9/S,55 Cancri e,,False,2.0,1720.0,12.0,1125.0,122.0,Nuson Brugashed,True


In [27]:
df.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

# Preprocessing

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   boolean
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   boolean
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   boolean
dtypes: boolean(3), float64(6), object(5)
memory usage: 798.1+ KB


In [29]:
# (a) Extract Deck, Cabin_num, Side from the 'Cabin' column
df[['Deck', 'CabinNum', 'Side']] = df['Cabin'].str.split('/', expand=True)
df.drop(columns=['Cabin'], inplace=True)  # We won't use the original Cabin anymore

# Convert 'CabinNum' to numeric
df['CabinNum'] = pd.to_numeric(df['CabinNum'], errors='coerce')

# (b) Drop columns that may not be useful (like PassengerId, Name)
df.drop(columns=['PassengerId', 'Name',"CabinNum"], inplace=True, errors='ignore')

# (c) Target variable
# 'Transported' is boolean; convert to 0/1 for classification
df['Transported'] = df['Transported'].astype(int)

# (d) Separate features (X) and target (y)
y = df['Transported']
X = df.drop(columns=['Transported'])

In [30]:
X

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Side
0,Europa,,55 Cancri e,,False,3478.0,10.0,0.0,105.0,2383.0,C,S
1,Earth,False,TRAPPIST-1e,,False,4.0,0.0,2.0,4683.0,0.0,G,S
2,Mars,False,TRAPPIST-1e,,False,793.0,0.0,2.0,253.0,0.0,E,S
3,Europa,False,55 Cancri e,,False,0.0,2433.0,,878.0,443.0,A,P
4,Europa,False,55 Cancri e,,False,2.0,1720.0,12.0,1125.0,122.0,C,S
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Earth,False,55 Cancri e,78.0,False,0.0,213.0,202.0,288.0,0.0,F,S
8689,Earth,True,55 Cancri e,78.0,False,0.0,0.0,0.0,0.0,0.0,G,P
8690,Europa,False,TRAPPIST-1e,79.0,False,5649.0,235.0,7.0,2579.0,2175.0,B,S
8691,Europa,False,TRAPPIST-1e,79.0,False,0.0,0.0,0.0,2210.0,2.0,C,S


### Train Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

cat_cols = X.select_dtypes(include='object').columns.tolist()


### Numerical Features

In [35]:
num_cols = X.select_dtypes(include='number').columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
numeric_transformer

### Categorical Features

In [37]:
#   - Impute missing with most frequent
#   - OneHotEncode (handle_unknown='ignore' will ignore new categories in test data)
categorical_transformer = Pipeline(steps=[
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
categorical_transformer


### Full Preprocessing Pipeline


In [42]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]).set_output(transform="pandas")
preprocessor

In [43]:
preprocessor.fit_transform(X_train)

Unnamed: 0,num__Age,num__RoomService,num__FoodCourt,num__ShoppingMall,num__Spa,num__VRDeck,cat__HomePlanet_Earth,cat__HomePlanet_Europa,cat__HomePlanet_Mars,cat__HomePlanet_None,...,cat__Deck_C,cat__Deck_D,cat__Deck_E,cat__Deck_F,cat__Deck_G,cat__Deck_T,cat__Deck_None,cat__Side_P,cat__Side_S,cat__Side_None
3164,-0.463853,-0.336686,-0.279493,-0.277706,-0.272926,-0.263366,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
941,-1.229242,-0.336686,-0.279493,-0.277706,-0.272926,-0.263366,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8641,2.806449,6.329228,-0.279493,0.582656,-0.272926,0.173263,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4847,-0.046367,-0.336686,-0.279493,-0.277706,-0.272926,-0.263366,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5010,0.023214,0.145080,-0.279493,0.617070,-0.272926,-0.263366,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3757,-0.324691,-0.336686,-0.279493,-0.277706,-0.272926,-0.263366,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4708,-0.046367,-0.336686,-0.279493,-0.277706,0.337875,-0.155082,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7190,0.927765,-0.336686,-0.279493,-0.277706,-0.272926,-0.263366,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
329,-1.994632,-0.336686,-0.279493,-0.277706,-0.272926,-0.263366,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


# Model Selection

In [76]:
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier



# Define the pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression() )
])

# Define the hyperparameter grid
param_grids = [
    {
        'classifier': [LogisticRegression(max_iter=1000)],
        'classifier__C': [0.01, 0.1, 1, 10]
    },
    {
        'classifier': [RandomForestClassifier(random_state=42)], # 42 for reproducibility
        'classifier__n_estimators': [50, 100,150],
        'classifier__max_depth': [None, 5, 10, 15]
    },
    {   'classifier': [AdaBoostClassifier(random_state=42)],
        'classifier__n_estimators': [50, 100,150],
        'classifier__learning_rate': [0.01, 0.1, 1]
    },
    {
        'classifier': [SVC()],
        'classifier__C': [0.1, 1, 10],
        'classifier__kernel': ['linear', 'rbf']
    },
    {
        'classifier': [KNeighborsClassifier()],
        'classifier__n_neighbors': range(1,25)
    }
]

for param_grid in param_grids:
    grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1,verbose=1)
    grid_search.fit(X_train, y_train)
    print(f"Best params:         {grid_search.best_params_}")
    print(f"Best score on Train: {round(grid_search.best_score_,4)}")
    print(f"Test score on Test : {round(grid_search.score(X_test, y_test),4)}")
    print(42*"🌳")
    


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best params:         {'classifier': LogisticRegression(max_iter=1000), 'classifier__C': 1}
Best score on Train: 0.7847
Test score on Test : 0.7901
🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best params:         {'classifier': RandomForestClassifier(random_state=42), 'classifier__max_depth': 10, 'classifier__n_estimators': 100}
Best score on Train: 0.7971
Test score on Test : 0.812
🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best params:         {'classifier': AdaBoostClassifier(random_state=42), 'classifier__learning_rate': 1, 'classifier__n_estimators': 150}
Best score on Train: 0.7831
Test score on Test : 0.7936
🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best params:         {'classifier': SVC(), 'classifier__C': 10, 'classifier__kernel': 'rbf'}
Be

### Ensemble Models

The Voting Classifier is a simple way to combine the predictions of multiple machine learning algorithms. It works by first creating two or more standalone models from your training dataset. A Voting Classifier can then be used to wrap your models and average the predictions of the sub-models when asked to make predictions for new data.

In [71]:
from sklearn.ensemble import VotingClassifier

# Define the pipeline
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('lr', LogisticRegression(C=1, max_iter=1000)),
        ('rf', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
        ('svc', SVC(C=10, kernel='rbf')),
        ('knn', KNeighborsClassifier(n_neighbors=24)),
        ('ada', AdaBoostClassifier(n_estimators=150, learning_rate=1,random_state=42))
    ], voting='hard'))
])

pipe.fit(X_train, y_train)


In [73]:
pipe.score(X_test, y_test) # Not better than RandomForest alone 🥲

0.8056354226566993

In [75]:
print(42*"🌳")

🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳🌳


## Load Final model to GCS 

[READ THE DOC](https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python)

In [81]:
import joblib 
model_path = '../models/titanic_pipeline.joblib'
if not os.path.exists('../models'):
    os.makedirs('../models')
with open(model_path, 'wb') as f:
    joblib.dump(pipe, f)
    

In [None]:
blob_name = "titanic_pipeline.joblib"
storage_client = storage.Client.from_service_account_json('../credentials/service-account.json')
bucket = storage_client.get_bucket(GCP_BUCKET)
blob = bucket.blob(blob_name)
generation_match_precondition = 0
blob.upload_from_filename(model_path, if_generation_match=generation_match_precondition)

print(
    f"File {model_path} uploaded to gs:{blob_name}."
)


File ../models/titanic_pipeline.joblib uploaded to ../models/titanic_pipeline.joblib.
