## **Imports**

In [14]:
import os
import json
import joblib
import pandas as pd

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

## **Paths & Global Variables**

In [15]:
ROOT_PATH = r"C:\Users\mario\OneDrive\Documents\Work\Side Hustles\Kaggle\titanic"

import sys

sys.path.append(ROOT_PATH)

from titanic.config import PROCESSED_DATA_DIR, MODELS_DIR

## **Reading Data**

In [16]:
os.chdir(PROCESSED_DATA_DIR)

X_test = pd.read_parquet("X_test.parquet")
y_test = pd.read_parquet("y_test.parquet")

os.chdir(MODELS_DIR)
lda_model = joblib.load("lda-partial-model.joblib")
lda_transformer = joblib.load("lda-partial-transformer.joblib")

In [17]:
cols_to_reduce = ['cabin_level_1_F', 'embarked_S', 'passenger_class_ord',
       'cabin_level_1_C', 'cabin_level_1_B', 'embarked_Q',
       'cabin_level_2_lcode', 'num_rooms', 'age_min_max', 'siblings_spouses']

X_high_corr = X_test.drop(columns=cols_to_reduce)
X_low_corr = X_test[cols_to_reduce]

## **Model Validation**

In [18]:
COMPONENTS = 1

X_test_lda = lda_transformer.transform(X_low_corr)

for i in range(COMPONENTS):
    X_low_corr[f'lda_{i+1}'] = X_test_lda[:, i]

X_low_corr.drop(columns=cols_to_reduce, inplace=True)
X_new = pd.merge(
    X_high_corr, X_low_corr, left_index=True, right_index=True,
)

y_predicted = lda_model.predict(X_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_low_corr[f'lda_{i+1}'] = X_test_lda[:, i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_low_corr.drop(columns=cols_to_reduce, inplace=True)


## **Evaluation Metrics**

In [19]:
test_conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_predicted),
    index=["actual no", "actual yes"],
    columns=["predicted no", "predicted yes"],
)

test_conf_matrix

Unnamed: 0,predicted no,predicted yes
actual no,232,186
actual yes,0,0


In [20]:
metrics_dict = {
    "Accuracy": accuracy_score(y_test, y_predicted),
    "Precision": precision_score(y_test, y_predicted),
    "Recall": recall_score(y_test, y_predicted),
    "F1 Score": f1_score(y_test, y_predicted),
}

metrics_df = pd.DataFrame(
    metrics_dict.values(), index=metrics_dict.keys(), columns=["Value"]
)

metrics_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Value
Accuracy,0.555024
Precision,0.0
Recall,0.0
F1 Score,0.0


## **Model & Prediction Export**

In [21]:
os.chdir(MODELS_DIR)

kaggle_df = pd.DataFrame(y_predicted, index=X_test.index, columns=["Survived"])
kaggle_df.index.name = "PassengerId"
kaggle_df.replace({False: 0, True: 1}, inplace=True)
kaggle_df.to_csv("lda-partial-kaggle_submission.csv")
kaggle_df.head()

  kaggle_df.replace({False: 0, True: 1}, inplace=True)


Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,1
895,0
896,1
