In [1]:
# Goal: predict tumor size (the 'Tumor Size' column) with tree-based regression methods.

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _subfolders, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breast-cancer/Breast_Cancer.csv


In [3]:
# Load the raw breast-cancer table from the Kaggle input directory.
dataset = pd.read_csv(
    '/kaggle/input/breast-cancer/Breast_Cancer.csv'
)

In [4]:
# Preview the first five rows of the raw table.
dataset.head(n=5)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [5]:
# Remove the two survival-outcome columns so they are not part of the modelling table.
# Both columns are dropped in a single call, and errors='ignore' keeps this cell
# re-runnable: executing it a second time (when the columns are already gone)
# is a no-op instead of raising KeyError.
dataset = dataset.drop(columns=['Status', 'Survival Months'], errors='ignore')

In [6]:
# List the remaining column labels (note that 'T Stage ' carries a trailing space).
dataset.columns

Index(['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
       'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Reginol Node Positive'],
      dtype='object')

In [7]:
# Summarise each column's non-null count and dtype to check for missing values
# and to see which columns are categorical (object) versus numeric.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
dtypes: int64(4), object(10)
memory usage: 440.2+ KB


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [9]:
# Hold out 25% of the rows as a test set; 'Tumor Size' is the regression target
# and every other column is a predictor. The fixed seed makes the split repeatable.
target_column = 'Tumor Size'
X_train, X_test, y_train, y_test = train_test_split(
    dataset.drop(columns=target_column),
    dataset[target_column],
    test_size=0.25,
    random_state=42,
)

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [11]:
# Categorical predictors to one-hot encode. The trailing space in 'T Stage ' is
# intentional: it matches the column label exactly as listed by dataset.keys().
# NOTE(review): 'T Stage ' and '6th Stage' look like staging labels that may be
# derived from tumor size itself -- confirm they are not leaking the target.
categorical_columns=['Race','Marital Status','T Stage ', 'N Stage',
                    '6th Stage', 'differentiate', 'Grade', 'A Stage',
                    'Estrogen Status','Progesterone Status']
# The remaining numeric predictors ('Age', 'Regional Node Examined',
# 'Reginol Node Positive') are passed through unchanged by remainder='passthrough'.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising at transform time, so a rare level that lands
# only in the test split (or in a validation fold) cannot crash the notebook.
column_transformer_pipeline = ColumnTransformer([
                                        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
                                                ], remainder='passthrough')
pipeline = Pipeline(steps=[('column_transformer', column_transformer_pipeline)])

# Fit the encoder on the training split only, so nothing is learned from the test split.
preprocessor = pipeline.fit(X=X_train)

In [12]:
# Preview the first five training rows (still a DataFrame at this point).
X_train.head(n=5)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive
296,45,White,Married,T1,N1,IIA,Moderately differentiated,2,Regional,Positive,Positive,1,1
1163,61,White,Married,T1,N1,IIA,Moderately differentiated,2,Regional,Positive,Positive,21,2
507,50,White,Married,T1,N1,IIA,Moderately differentiated,2,Regional,Positive,Positive,15,3
2882,69,White,Single,T4,N3,IIIC,Moderately differentiated,2,Regional,Positive,Positive,24,23
7,40,White,Married,T2,N1,IIB,Moderately differentiated,2,Regional,Positive,Positive,9,1


In [13]:
# Preview the first five target values; their index matches the training rows.
y_train.head(n=5)

296     20
1163    18
507     12
2882    80
7       30
Name: Tumor Size, dtype: int64

In [14]:
# Apply the fitted preprocessor to both splits, replacing each DataFrame with its
# encoded matrix. Caution: this rebinds X_train / X_test, so the cell is meant to
# run once after the split cell; re-run the split first before running it again.
X_train, X_test = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [15]:
# Show the encoded training matrix: one-hot columns come first, followed by the
# passed-through numeric columns (NumPy abbreviates the repr of a large array).
X_train

array([[ 0.,  0.,  1., ..., 45.,  1.,  1.],
       [ 0.,  0.,  1., ..., 61., 21.,  2.],
       [ 0.,  0.,  1., ..., 50., 15.,  3.],
       ...,
       [ 0.,  0.,  1., ..., 57., 31.,  2.],
       [ 0.,  1.,  0., ..., 48., 27.,  1.],
       [ 0.,  0.,  1., ..., 53., 14.,  2.]])

# Decision Trees

In [16]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
# Baseline model: a single regression tree with no depth limit set here,
# seeded so that tie-breaking between equally good splits is repeatable.
dtr = DecisionTreeRegressor(
    random_state=42,
    criterion='squared_error',
    splitter='best',
)

In [18]:
# 10-fold cross-validation of the decision tree on the training split.
# cross_validate evaluates both metrics from a single fit per fold; calling
# cross_val_score once per metric would train every fold twice for the same numbers.
cv_scores = cross_validate(dtr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, n_jobs=-1)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']


# Report mean +/- standard deviation across the folds. RMSE is negated by
# scikit-learn's scorer convention (higher is better), hence "-ve RMSE".
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")


R^2: 0.502 +/- 0.095
-ve RMSE: -14.525 +/- 1.534


In [19]:
# Train the tree on the full training split (fit returns the estimator itself).
decision_tree_regressor = dtr.fit(X_train, y_train)

In [20]:
# Predict tumor size for the held-out test rows.
y_pred = decision_tree_regressor.predict(X=X_test)

In [21]:
from sklearn.metrics import r2_score
# Test-set R^2 of the decision tree. The builtin round() is used instead of the
# .round() method because r2_score may return a plain Python float (which has no
# .round attribute) rather than a NumPy scalar; round() handles both.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.53


# Bagging

In [22]:
from sklearn.ensemble import BaggingRegressor

In [23]:
# Bagging ensemble: 100 copies of the decision tree defined above, each trained
# on a bootstrap resample of the training data. Out-of-bag scoring is disabled.
bgr = BaggingRegressor(
    estimator=dtr,
    n_estimators=100,
    bootstrap=True,
    oob_score=False,
    random_state=42,
    n_jobs=-1,
)

In [24]:
# 10-fold cross-validation of the bagging ensemble on the training split.
# cross_validate evaluates both metrics from a single fit per fold; calling
# cross_val_score once per metric would train every fold twice for the same numbers.
cv_scores = cross_validate(bgr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, n_jobs=-1)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']


# Report mean +/- standard deviation across the folds. RMSE is negated by
# scikit-learn's scorer convention (higher is better), hence "-ve RMSE".
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.717 +/- 0.047
-ve RMSE: -10.937 +/- 0.951


In [25]:
# Refit the bagging ensemble on the full training split and score it on the test split.
bagging_regressor = bgr.fit(X=X_train, y=y_train)
y_pred = bagging_regressor.predict(X_test)
# Builtin round() works whether r2_score returns a NumPy scalar or a plain
# Python float; the .round() method exists only on the former.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.718


# Random Forests

In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Random forest: 500 bootstrapped trees, each split drawing its candidates from a
# 'sqrt'-sized random subset of the features. Out-of-bag scoring is disabled.
rfr = RandomForestRegressor(
    n_estimators=500,
    max_features='sqrt',
    criterion='squared_error',
    bootstrap=True,
    oob_score=False,
    random_state=42,
    n_jobs=-1,
)

In [28]:
# 10-fold cross-validation of the random forest on the training split.
# cross_validate evaluates both metrics from a single fit per fold; calling
# cross_val_score once per metric would train the 500-tree forest twice per fold.
# No n_jobs here: the forest itself already runs with n_jobs=-1.
cv_scores = cross_validate(rfr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']


# Report mean +/- standard deviation across the folds. RMSE is negated by
# scikit-learn's scorer convention (higher is better), hence "-ve RMSE".
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.726 +/- 0.038
-ve RMSE: -10.777 +/- 0.72


In [29]:
# Refit the random forest on the full training split and score it on the test split.
randomforest_regressor = rfr.fit(X=X_train, y=y_train)
y_pred = randomforest_regressor.predict(X_test)
# Builtin round() works whether r2_score returns a NumPy scalar or a plain
# Python float; the .round() method exists only on the former.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.716


# Gradient Boosting

In [30]:
from sklearn.ensemble import GradientBoostingRegressor

In [31]:
# Gradient boosting: 100 boosting stages with a learning rate of 0.2, each tree
# considering a 'sqrt'-sized random subset of the features per split.
gbr = GradientBoostingRegressor(
    loss='squared_error',
    n_estimators=100,
    learning_rate=0.2,
    max_features='sqrt',
    random_state=42,
)

In [32]:
# 10-fold cross-validation of the gradient-boosting model on the training split.
# cross_validate evaluates both metrics from a single fit per fold; calling
# cross_val_score once per metric would train every fold twice for the same numbers.
cv_scores = cross_validate(gbr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, n_jobs=-1)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']


# Report mean +/- standard deviation across the folds. RMSE is negated by
# scikit-learn's scorer convention (higher is better), hence "-ve RMSE".
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.736 +/- 0.045
-ve RMSE: -10.563 +/- 0.862


In [33]:
# Refit the gradient-boosting model on the full training split and score it on the test split.
gradboost_regressor = gbr.fit(X=X_train, y=y_train)
y_pred = gradboost_regressor.predict(X_test)
# Builtin round() works whether r2_score returns a NumPy scalar or a plain
# Python float; the .round() method exists only on the former.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.714


# Histogram-Based GBR

In [34]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [35]:
# Histogram-based gradient boosting: a small learning rate (0.01) paired with
# up to 400 boosting iterations.
hgbr = HistGradientBoostingRegressor(
    loss='squared_error',
    max_iter=400,
    learning_rate=0.01,
    random_state=42,
)

In [36]:
# 10-fold cross-validation of the histogram-based booster on the training split.
# cross_validate evaluates both metrics from a single fit per fold; calling
# cross_val_score once per metric would train every fold twice for the same numbers.
cv_scores = cross_validate(hgbr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10, n_jobs=-1)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']


# Report mean +/- standard deviation across the folds. RMSE is negated by
# scikit-learn's scorer convention (higher is better), hence "-ve RMSE".
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.743 +/- 0.04
-ve RMSE: -10.421 +/- 0.858


In [37]:
# Refit the histogram-based booster on the full training split and score it on the test split.
hgradboost_regressor = hgbr.fit(X=X_train, y=y_train)
y_pred = hgradboost_regressor.predict(X_test)
# Builtin round() works whether r2_score returns a NumPy scalar or a plain
# Python float; the .round() method exists only on the former.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.721


# XGBoost

In [38]:
from xgboost import XGBRegressor

In [39]:
# XGBoost: 1000 boosting rounds with a small step size of 0.01.
# learning_rate is the scikit-learn-wrapper name for the native 'eta' parameter,
# so the setting is a declared estimator parameter rather than an extra keyword.
# random_state is fixed like every other model in this notebook so results stay
# repeatable if any sampling option is enabled later.
xgbr = XGBRegressor(n_estimators=1000,
                   learning_rate=0.01,
                   random_state=42)

In [40]:
# 10-fold cross-validation of the XGBoost model on the training split.
# cross_validate evaluates both metrics from a single fit per fold; calling
# cross_val_score once per metric would train the 1000-round model twice per fold.
cv_scores = cross_validate(xgbr, X_train, y_train,
                           scoring=['r2', 'neg_root_mean_squared_error'],
                           cv=10)
r2 = cv_scores['test_r2']
neg_rmse_error = cv_scores['test_neg_root_mean_squared_error']


# Report mean +/- standard deviation across the folds. RMSE is negated by
# scikit-learn's scorer convention (higher is better), hence "-ve RMSE".
print(f"R^2: {np.mean(r2).round(3)} +/- {np.std(r2).round(3)}")
print(f"-ve RMSE: {np.mean(neg_rmse_error).round(3)} +/- {np.std(neg_rmse_error).round(3)}")

R^2: 0.705 +/- 0.057
-ve RMSE: -11.138 +/- 1.03


In [41]:
# Refit the XGBoost model on the full training split and score it on the test split.
xgboost_regressor = xgbr.fit(X=X_train, y=y_train)
y_pred = xgboost_regressor.predict(X_test)
# Builtin round() works whether r2_score returns a NumPy scalar or a plain
# Python float; the .round() method exists only on the former.
print(f"R^2 score:{round(r2_score(y_test, y_pred), 3)}")

R^2 score:0.684
