In [1]:
#import modules
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import HuberRegressor
from sklearn.feature_selection import RFE
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
#loading training dataset
url = 'https://raw.githubusercontent.com/AnuragPhukan/Breast_Cancer_Detection/main/TrainDataset2023.csv'
train_df = pd.read_csv(url)
train_df

Unnamed: 0,ID,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002174,1,144.000000,41.0,0,0,0,1,3,3,...,0.517172,0.375126,3.325332,0.002314,3.880772e+06,473.464852,0.000768,0.182615,0.030508,0.000758
1,TRG002178,0,142.000000,39.0,1,1,0,0,3,3,...,0.444391,0.444391,3.032144,0.005612,2.372010e+06,59.459710,0.004383,0.032012,0.001006,0.003685
2,TRG002204,1,135.000000,31.0,0,0,0,1,2,1,...,0.534549,0.534549,2.485848,0.006752,1.540027e+06,33.935384,0.007584,0.024062,0.000529,0.006447
3,TRG002206,0,12.000000,35.0,0,0,0,1,3,3,...,0.506185,0.506185,2.606255,0.003755,6.936741e+06,46.859265,0.005424,0.013707,0.000178,0.004543
4,TRG002210,0,109.000000,61.0,1,0,0,0,2,1,...,0.462282,0.462282,2.809279,0.006521,1.265399e+06,39.621023,0.006585,0.034148,0.001083,0.005626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,TRG002948,0,54.500000,58.5,1,0,1,0,3,2,...,0.476493,0.476493,2.453583,0.003229,2.327038e+06,18.562377,0.013766,0.018042,0.000288,0.012257
396,TRG002954,0,49.250000,34.3,0,0,0,1,3,3,...,0.418382,0.418382,2.995603,0.004243,1.005061e+06,156.627179,0.002228,0.136015,0.022148,0.002098
397,TRG002958,0,48.500000,53.3,0,0,0,1,2,1,...,0.527779,0.527778,1.500000,0.003728,2.132007e+05,0.996746,0.252582,0.007380,0.000037,0.231059
398,TRG002961,0,47.500000,68.8,1,0,0,0,3,3,...,0.313693,0.313693,3.573557,0.001112,2.008034e+07,204.864200,0.001372,0.054063,0.003697,0.001368


In [3]:
#loading training data
url = 'https://raw.githubusercontent.com/AnuragPhukan/Breast_Cancer_Detection/main/TrainDataset2023.csv'
train_df = pd.read_csv(url)
# Removing unwanted columns
train_df = train_df.drop(columns=['ID'])
train_df = train_df.drop(columns=['pCR (outcome)'])
train_df

#Removing rows with 999 in outputs and replacing 999 values in data with NaN
train_df = train_df[train_df['RelapseFreeSurvival (outcome)'] != 999]
train_df = train_df.replace(999, np.nan)

In [4]:
# Replace 999 with NaN
train_df = train_df.replace(999, np.nan)

# Dropping rows with missing target variable
train_df = train_df.dropna(subset=['RelapseFreeSurvival (outcome)'])

# Splitting dataset into features and target variable
X = train_df.drop(columns=['RelapseFreeSurvival (outcome)'])
y = train_df['RelapseFreeSurvival (outcome)']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Imputation and Scaling
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
#Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

In [5]:
threshold = 0.9
#Correlation Matrix
corr_matrix = train_df.corr()
corr_matrix
# Identify pairs of highly correlated features
high_corr_pairs = set()
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            colname = corr_matrix.columns[i]
            high_corr_pairs.add(colname)

print("Highly correlated features to be removed:", high_corr_pairs)

# Number of features to be removed
num_features_removed = len(high_corr_pairs)
print(f"Number of features to be removed: {num_features_removed}")

Highly correlated features to be removed: {'original_firstorder_TotalEnergy', 'original_glcm_Id', 'original_gldm_DependenceNonUniformityNormalized', 'original_glcm_InverseVariance', 'original_firstorder_Median', 'original_gldm_LargeDependenceEmphasis', 'original_gldm_SmallDependenceLowGrayLevelEmphasis', 'original_glszm_SizeZoneNonUniformity', 'original_glszm_SmallAreaLowGrayLevelEmphasis', 'original_glrlm_GrayLevelVariance', 'original_glcm_Contrast', 'original_glcm_SumAverage', 'original_gldm_HighGrayLevelEmphasis', 'original_glrlm_RunLengthNonUniformity', 'original_glrlm_ShortRunLowGrayLevelEmphasis', 'original_glcm_Idm', 'original_shape_Maximum3DDiameter', 'original_gldm_GrayLevelNonUniformity', 'original_firstorder_Variance', 'original_glrlm_ShortRunEmphasis', 'original_gldm_LargeDependenceLowGrayLevelEmphasis', 'original_gldm_GrayLevelVariance', 'original_glcm_DifferenceEntropy', 'original_glrlm_LongRunHighGrayLevelEmphasis', 'original_glcm_JointEnergy', 'original_firstorder_Entro

In [6]:
# Drop highly correlated features from X_train and X_test
X_train = X_train.drop(columns=high_corr_pairs)
X_test = X_test.drop(columns=high_corr_pairs)

In [7]:
# Imputation
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Scaling
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

## Generating Test Ouptut File

### Ex - FinalTestExampleRFS.csv

In [8]:
#For Uploading the Test File to Colab
from google.colab import files
uploaded = files.upload()

Saving TestDatasetExample.xls to TestDatasetExample.xls


In [9]:
#import modules
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
#Generating Test DataFrame from the file uploaded
test_df = pd.read_excel('/content/TestDatasetExample.xls') #Change the file name that you are uploading
ID = test_df['ID']
test_df = test_df.drop(columns=['ID'])
test_df = test_df.drop(columns=high_corr_pairs)

#Imputation and Scaling
imputer = SimpleImputer(strategy='median')
imputed_data = imputer.fit_transform(test_df)
scaled_data = scaler.fit_transform(imputed_data)

#Best Model that we got after training with hyperparameters
model = GradientBoostingRegressor(learning_rate=0.01, max_depth=3, n_estimators=200, random_state=42)

#Fitting models
model.fit(X_train_scaled, y_train)

#Predicting the output
y_pred=model.predict(scaled_data)

In [10]:
#Generating a CSV File (The File will be automatically created)
print(y_pred)
df = pd.DataFrame({'ID': ID,
                   'RelapseFreeSurvival (outcome)': y_pred})
df.to_csv('FinalTestExampleRFS.csv', index=False)

[61.59463511 56.20875009 47.97301279]


In [11]:
df

Unnamed: 0,ID,RelapseFreeSurvival (outcome)
0,TRG002728,61.594635
1,TRG002649,56.20875
2,TRG002628,47.973013
