# Project 2 - Multiple Myeloma Survival


In [None]:
import pandas as pd
import missingno as msno
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import os

In [None]:
# Load the training CSV into `df`, the raw frame the cells below inspect and clean.
df = pd.read_csv('./data/train_data.csv')

In [None]:

def create_submission_file(predictions, filename):
    """
    Create a submission CSV in ./results from a vector of predictions.

    The column layout is taken from ./data/sample_submission.csv. When the
    number of predictions equals the number of rows in that sample, every
    sample column (e.g. the row identifiers) is carried over and only the
    'SurvivalTime' column is replaced. When the counts differ, a warning is
    issued and the remaining columns are left empty, because the sample rows
    cannot be matched to the predictions.

    Args:
        predictions (array-like): The predicted values for SurvivalTime.
            A column vector of shape (n, 1) is flattened to length n.
        filename (str): The name of the output file.
    """
    import warnings

    # Load the sample submission to get the column structure and row identifiers
    sample_submission = pd.read_csv('./data/sample_submission.csv')

    # Ensure that predictions are a single flat column
    predictions = np.asarray(predictions).ravel()

    if len(predictions) == len(sample_submission):
        # Same number of rows: keep the sample's other columns so the file is
        # not written with empty identifiers.
        submission = sample_submission.copy()
    else:
        warnings.warn(
            f"Got {len(predictions)} predictions but the sample submission has "
            f"{len(sample_submission)} rows; columns other than 'SurvivalTime' "
            "are left empty."
        )
        submission = pd.DataFrame(columns=sample_submission.columns)

    submission['SurvivalTime'] = predictions  # Add the predictions to the 'SurvivalTime' column

    # Save the DataFrame to CSV
    os.makedirs("./results", exist_ok=True)
    submission.to_csv(f'./results/{filename}', index=False)

    print(f"File Created: ./results/{filename}")


## Task 1 - Setting the baseline

### Task 1.1 - Data preparation and validation pipeline


In [None]:
# Missing-value overview 1/4: missingno bar chart, one bar per column of df.

msno.bar(df)
plt.show()

In [None]:
# Missing-value overview 2/4: missingno heatmap of how missingness in one column relates to another.

msno.heatmap(df)
plt.show()

In [None]:
# Missing-value overview 3/4: missingno matrix showing where values are missing, row by row.

msno.matrix(df)
plt.show()

In [None]:
# Missing-value overview 4/4: missingno dendrogram grouping columns with similar missingness.

msno.dendrogram(df)
plt.show()

In [None]:
# Baseline cleaning (task 1.1).
# 1) keep only the rows whose target 'SurvivalTime' is known,
# 2) then keep only the columns that have no missing value left.
has_target = df['SurvivalTime'].notna()
df_cleaned = df.loc[has_target].dropna(axis='columns', how='any')

# 3) keep only uncensored rows (Censored == 0). A censored row is one where the
#    exact time of the event of interest (death, disease recurrence) is unknown.
df_cleaned = df_cleaned.loc[df_cleaned['Censored'].eq(0)]

# How many data points remain after dropping?
n_remaining = len(df_cleaned)
print(f"Remaining data points after dropping: {n_remaining}")


# The cleaned frame should show no gaps in the nullity matrix.
msno.matrix(df_cleaned)
plt.show()

In [None]:
# Pairwise scatter plots, with a histogram of each variable on the diagonal.
# 'Censored' is left out on purpose: it is a flag marking censoring,
# not a feature used for prediction.
pairplot_vars = ['Age', 'Gender', 'Stage', 'TreatmentType', 'SurvivalTime']
sns.pairplot(df_cleaned, vars=pairplot_vars)
plt.show()

In [None]:
# Target vector: the survival time to predict.
y = df_cleaned['SurvivalTime']
# Censoring indicator, kept separately for the censored error metric.
censored = df_cleaned['Censored']
# Feature matrix: every remaining column except the target and the censoring flag.
X = df_cleaned.drop(columns=['SurvivalTime', 'Censored'])

In [None]:
# Hold out 20% of the data as the test set ...
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ... then split the remaining 80% again into 80% training / 20% validation
# (64% / 16% / 20% of the data overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=0.2, random_state=42
)

In [None]:
# Fit a plain linear regression on the training split (fit() returns the estimator).
model1 = LinearRegression().fit(X_train, y_train)

# Error on the validation split
y_val_pred = model1.predict(X_val)
val_mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
print(f"Validation Mean Squared Error: {val_mse}")

# Error on the held-out test split
y_test_pred = model1.predict(X_test)
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print(f"Test Mean Squared Error: {test_mse}")

In [None]:
# Estimate generalisation with 5-fold cross-validation on the whole cleaned data set.
# cross_val_score fits clones of `model`, so `model` itself is still unfitted afterwards.
model = LinearRegression()
cv_scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation with default scoring (R^2)
print(f"Cross-validation scores: {cv_scores}")

average_cv_score = np.mean(cv_scores)
print(f"Average Cross-validation Score: {average_cv_score}")

# Final hold-out evaluation.
# Reuse the train/validation/test partition created by the split cell instead of
# drawing a new 70/30 split here: a fresh split would put rows of X_val into the
# new training set (leakage) and would silently replace the X_test that model1
# is evaluated on. The three sets used below are disjoint.
model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
print(f"Validation MSE: {val_mse}")

# Evaluate on the test set
y_test_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
print(f"Test MSE: {test_mse}")

Comparing the average cross-validation score (R², the default scoring of `cross_val_score`) with the MSE obtained from the single train/validation/test split, cross-validation is the more reliable way to evaluate the model: the model is trained and validated multiple times using different splits of the dataset. This means that each model gets to train on almost all of the data and every data point is used for validation once, so the resulting estimate shows better how well the model generalizes.


In [None]:
# Censored Mean Squared Error(we used this because for the censored cases we only know that the survival time is greater than some observed time)
# c = 0 for uncensored data points
# c = 1 for censored data points

def error_metric(y, y_hat, c):
    """
    Censored mean squared error (cMSE).

    For an uncensored point (c = 0) the full squared error (y - y_hat)**2 is
    counted. For a censored point (c = 1) only an under-prediction is counted,
    max(0, y - y_hat)**2, because the true survival time is only known to be
    at least y.

    Args:
        y (array-like): Observed survival times.
        y_hat (array-like): Predicted survival times, one per entry of y.
        c (array-like or scalar): Censoring flags (0 = uncensored, 1 = censored),
            one per entry of y, or a single value applied to every entry.
            If y and c are both pandas Series, c is first aligned to y's index,
            so a full-length flag column can be passed together with a subset of y.

    Returns:
        The sum of the per-sample errors divided by the number of samples in y.

    Raises:
        ValueError: If the inputs cannot be matched one-to-one.
    """
    import numpy as np
    import pandas as pd

    # Element-wise arithmetic on Series with different indexes aligns on the
    # UNION of the labels, which inflates the sample count used as denominator.
    # Align explicitly to y's labels first, then work on plain arrays.
    if isinstance(y, pd.Series):
        aligned = []
        for name, values in (("y_hat", y_hat), ("c", c)):
            if isinstance(values, pd.Series) and not values.index.equals(y.index):
                values = values.reindex(y.index)
                if values.isna().any():
                    raise ValueError(f"{name} has no entry for some rows of y")
            aligned.append(values)
        y_hat, c = aligned

    y_arr = np.asarray(y, dtype=float).ravel()
    y_hat_arr = np.asarray(y_hat, dtype=float).ravel()
    c_arr = np.asarray(c, dtype=float)
    if c_arr.ndim:
        c_arr = c_arr.ravel()

    if y_arr.shape != y_hat_arr.shape:
        raise ValueError(
            f"y and y_hat must have the same length, got {y_arr.shape[0]} and {y_hat_arr.shape[0]}"
        )
    if c_arr.ndim and c_arr.shape != y_arr.shape:
        raise ValueError(
            f"y and c must have the same length, got {y_arr.shape[0]} and {c_arr.shape[0]}"
        )

    err = y_arr - y_hat_arr
    err = (1 - c_arr) * err**2 + c_arr * np.maximum(0, err)**2
    return np.sum(err) / err.shape[0]

# c is the censored variable 
#  y is the true Survival Time, as determined by the ground truth.
#  The variable y_hat contains the predicted Survival Time.

In [None]:
# Calculate Censored Mean Squared Error

# Select the censoring flags row-for-row for each split, so that y, y_hat and c
# passed to error_metric all describe exactly the same samples.
censored_val = censored.loc[y_val.index]
censored_test = censored.loc[y_test.index]

# Without cross-validation
print("Without cross-validation")
y_val_pred = model1.predict(X_val)
val_cmse = error_metric(y_val, y_val_pred, censored_val)
print(f'Validation cMSE: {val_cmse}')

y_test_pred = model1.predict(X_test)
test_cmse = error_metric(y_test, y_test_pred, censored_test)
print(f'Test cMSE: {test_cmse}\n')


# With cross-validation
print("With cross-validation")
y_val_pred = model.predict(X_val)
val_cmse = error_metric(y_val, y_val_pred, censored_val)
print(f"Validation cMSE: {val_cmse}")

y_test_pred = model.predict(X_test)
test_cmse = error_metric(y_test, y_test_pred, censored_test)
print(f"Test cMSE: {test_cmse}")



Without cross-validation, we saw that the validation error is worse than the test error. There are two possible reasons: overfitting to the validation set, or random variability in a single train/test split. This shows again why we use cross-validation: by evaluating over several splits, it covers these cases.

Note: we can change the split percentage to see if we can have a better result without cross-validation

### Task 1.2 - Learn the baseline model


In [None]:
# Create the pipeline with scaling and Linear Regression
pipeline = make_pipeline(
    StandardScaler(),  # Feature scaling
    LinearRegression()  # Linear regression model
)

# First try with R^2 (the default scoring): 5-fold cross-validation of the whole
# pipeline, so the scaler is re-fitted inside every fold. Report the scores
# instead of discarding them.
cv_scores = cross_val_score(pipeline, X, y, cv=5)  # 5-fold cross-validation
print(f"Cross-validation R^2 scores: {cv_scores}")
print(f"Average cross-validation R^2: {np.mean(cv_scores)}")

# Fit on all cleaned data; the predictions below are for those same rows,
# so the two errors printed next are in-sample errors.
pipeline.fit(X, y)
y_pred = pipeline.predict(X)


mse = mean_squared_error(y, y_pred)
print(f"MSE: {mse}")
cMSE = error_metric(y, y_pred,censored)
print(f"cMSE: {cMSE}")

# NOTE(review): y_pred holds predictions for the training rows in X. Confirm that
# this is what should be submitted rather than predictions for the competition's
# test features (not loaded anywhere in the cells visible here).
create_submission_file(y_pred, 'baseline-submission-01.csv')

# True vs predicted survival time; a perfect model would put every point on the diagonal.
plt.scatter(y, y_pred)
plt.xlabel('True Values (y)')
plt.ylabel('Predicted Values (y_hat)')
plt.title('True vs Predicted Survival Time')
plt.show()

In [None]:
# Side-by-side spread of the true survival times and the pipeline's predictions.
fig, ax = plt.subplots()
ax.boxplot([y, y_pred], tick_labels=["True Values", "Predicted Values"])
ax.set_title("Boxplot of True vs Predicted Values")
plt.show()

In [None]:
# Check the unique values in the 'Censored' column of the cleaned frame.
# df_cleaned was filtered to Censored == 0 above, so only that value should be listed.
print(df_cleaned['Censored'].value_counts())
