In [2]:
%matplotlib inline
from scipy.stats import norm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set()

# Data Retrieval

In [3]:
# Directory and file name of the CSV that is read into a DataFrame by the
# load_raw_data(DATA_PATH, DATA_NAME) call further down in this cell.
DATA_PATH=".//dataset//metabric"
DATA_NAME="featSurv.csv"

import pandas as pd

def load_raw_data(data_path, data_name):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that contains the file.
    data_name : str
        Name of the CSV file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the joined path.
    """
    # Imported locally: the notebook never imports ``os`` at the top, so
    # without this the call fails with NameError on a fresh kernel.
    import os

    csv_path = os.path.join(data_path, data_name)
    return pd.read_csv(csv_path)

# Load the raw table; `df` is the DataFrame inspected and subset by the cells that follow.
df=load_raw_data(DATA_PATH,DATA_NAME)

# Data Info

In [4]:
df.head()

Unnamed: 0,ID,age_at_diagnosis,size,lymph_nodes_positive,grade,histological_type,ER_IHC_status,ER.Expr,PR.Expr,HER2_IHC_status,...,NOT_IN_OSLOVAL_P53_mutation_status,NOT_IN_OSLOVAL_P53_mutation_type,NOT_IN_OSLOVAL_P53_mutation_details,NOT_IN_OSLOVAL_Pam50Subtype,NOT_IN_OSLOVAL_IntClustMemb,NOT_IN_OSLOVAL_Site,NOT_IN_OSLOVAL_Genefu,x_Prolif,time,status
0,MB-0000,75.65,22.0,10.0,3.0,IDC,pos,+,-,,...,,,,Normal,,1,,,2999,0
1,MB-0002,43.19,10.0,0.0,3.0,IDC,pos,+,+,,...,MUT,MISSENSE,MB-AD-0002+ex5,6+chr17:7519122+12521A>AC+178H>H/P+MISSENSE+FR+FR,LumA,4,1,ER+/HER2-HighProlif,1484,0
2,MB-0005,48.87,15.0,1.0,2.0,IDC,pos,+,+,,...,,,,LumB,,1,,,3053,0
3,MB-0006,47.68,25.0,3.0,2.0,IDC,pos,+,+,0.0,...,,,,LumB,,1,,,1721,0
4,MB-0008,76.97,40.0,8.0,3.0,IDC,pos,+,+,,...,WT,,,LumB,9,1,ER+/HER2- High Prolif,,1241,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1981 entries, 0 to 1980
Data columns (total 29 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   ID                                         1981 non-null   object 
 1   age_at_diagnosis                           1981 non-null   float64
 2   size                                       1962 non-null   float64
 3   lymph_nodes_positive                       1975 non-null   float64
 4   grade                                      1897 non-null   float64
 5   histological_type                          1981 non-null   object 
 6   ER_IHC_status                              1940 non-null   object 
 7   ER.Expr                                    1981 non-null   object 
 8   PR.Expr                                    1981 non-null   object 
 9   HER2_IHC_status                            821 non-null    float64
 10  HER2_SNP6_state         

# Feature Select Numeric and Categorical

In [6]:
from pandas.api.types import is_numeric_dtype

# Positional indices (into df.columns) of the columns kept for modelling.
featSel = [1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 15, 16, 17, 18, 24, 27, 28]
selected_columns = df.columns[featSel]
df_sel = df[selected_columns].copy()

# Separate categorical and numeric features by dtype, keeping column order.
colNum, colCat = [], []
for colname in df_sel.columns:
    bucket = colNum if is_numeric_dtype(df_sel[colname]) else colCat
    bucket.append(colname)

# Independent copies so later cleaning steps do not touch df_sel.
df_sel_num = df_sel[colNum].copy()  # numeric features
df_sel_cat = df_sel[colCat].copy()  # categorical features

In [None]:
df_sel_cat.info()

# Data Cleaning 1 (Missing Value & Feature Scaling for Numeric Features)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,MinMaxScaler

# NOTE(review): both transformers are fitted on every row of df_sel_num, and
# the train/test split only happens in a later cell, so rows that end up in
# the test set influence the medians and the scaling range.

# Fill missing numeric values with the per-column median.
imputer = SimpleImputer(strategy='median')
arr_sel_num_imp = imputer.fit_transform(df_sel_num)

# Rescale each numeric column with min-max scaling.
scaler = MinMaxScaler()
feat_sel_num_imp = scaler.fit_transform(arr_sel_num_imp)

# Back to a DataFrame so columns can be addressed by name again.
df_sel_num_imp = pd.DataFrame(feat_sel_num_imp, columns=df_sel_num.columns)

# Save the target ('status') separately and remove it from the feature table.
df_status = df_sel_num_imp['status'].copy()
df_sel_num_imp = df_sel_num_imp.drop(columns='status')
feat_sel_num_imp = df_sel_num_imp.to_numpy()

df_sel_num_imp.info()
feat_sel_num_imp

# Data Cleaning 2 (Missing Value for Categorical Features)

In [None]:
# Replace missing categorical values with the literal string 'None', so that
# "missing" becomes its own category instead of being guessed from other rows.
df_sel_cat = df_sel_cat.fillna('None')
df_sel_cat.info()




# Data Cleaning 3 (OneHotEncoder for Categorical Features)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every categorical column into a dense array.
# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` (scikit-learn 1.2) and the old spelling was later removed,
# so try the current name first and fall back for older installations.
try:
    encoderOHE = OneHotEncoder(sparse_output=False)
except TypeError:
    encoderOHE = OneHotEncoder(sparse=False)

encoderOHE.fit(df_sel_cat)
feat_sel_cat = encoderOHE.transform(df_sel_cat)
feat_sel_cat

# Train, Val, Test datasets

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Feature matrix: the scaled numeric columns first, the one-hot columns after them.
X = np.hstack((feat_sel_num_imp, feat_sel_cat))
# Target: the 'status' column that was split off during numeric cleaning.
y = df_status

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41
)

In [None]:
X_train

# Score Func

In [None]:
def display_scores(scores):
    """Print the given scores, then their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    print('Scores:', scores)
    # Each summary statistic is looked up and called only when its line is printed.
    for label, method_name in (('Mean:', 'mean'), ('Standard deviation:', 'std')):
        print(label, getattr(scores, method_name)())

# Classifier ML Models

# Linear/Logistic Reg

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression fitted on the training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# 10-fold cross-validation. scikit-learn scorers follow "greater is better",
# so the MSE comes back negated; flip the sign before taking the root.
# NOTE(review): this is an RMSE computed on predicted class labels - assuming
# `status` is 0/1, that is the square root of the misclassification rate.
scores = cross_val_score(
    log_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10
)
lin_rmse = np.sqrt(-scores)

display_scores(lin_rmse)


# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seeded so that repeated runs of the notebook give the same tree: without a
# random_state, equally good splits may be chosen in a different order each run.
tree_reg = DecisionTreeRegressor(random_state=42)

# 10-fold cross-validation; the scorer returns negated MSE ("greater is
# better"), hence the sign flip before the square root.
scores = cross_val_score(
    tree_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10
)
tree_rmse = np.sqrt(-scores)

display_scores(tree_rmse)

In [None]:
from sklearn.neural_network import MLPClassifier

# Small multilayer perceptron with hidden_layer_sizes=(5, 2), trained with the
# L-BFGS solver; the seed fixes the weight initialisation.
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
)

# 10-fold cross-validation; negated MSE is flipped back before the square root.
scores = cross_val_score(
    clf, X_train, y_train, scoring='neg_mean_squared_error', cv=10
)
MLP_rmse = np.sqrt(-scores)

display_scores(MLP_rmse)

# Ensemble Learning: Building a model on top of many other models

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seeded so the cross-validation scores are reproducible; an unseeded forest
# draws different bootstrap samples and feature subsets on every run.
forest_reg = RandomForestRegressor(random_state=42)

# 10-fold cross-validation; the scorer returns negated MSE ("greater is
# better"), hence the sign flip before the square root.
scores = cross_val_score(
    forest_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10
)
forest_rmse = np.sqrt(-scores)

display_scores(forest_rmse)

# Grid Search CV


In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 combinations with the estimator's default bootstrap
# setting, then 2 x 3 combinations with bootstrapping switched off.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Seeded so that the ranking of parameter combinations - and therefore
# grid_search.best_estimator_ - is the same on every run.
forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Report the cross-validated RMSE of every parameter combination.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)


In [None]:
# Forest with explicitly chosen hyper-parameters.
# Only the non-default arguments are passed. The original call also spelled
# out `criterion='mse'` and `min_impurity_split=1e-07`; both keywords are
# rejected by current scikit-learn releases ('mse' was renamed to
# 'squared_error', which is the default; `min_impurity_split` was removed).
# The remaining dropped keywords were all set to their default values.
forest_reg = RandomForestRegressor(
    bootstrap=False,
    max_features=2,
    n_estimators=3,
    n_jobs=4,
    random_state=42,
)

# 10-fold cross-validation; the scorer returns negated MSE ("greater is
# better"), hence the sign flip before the square root.
scores = cross_val_score(
    forest_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10
)
forest_rmse = np.sqrt(-scores)

display_scores(forest_rmse)

# Random Forest best estimator prediction

In [None]:
from sklearn.metrics import mean_squared_error

# Take the best model found by the grid search and score it on the held-out test split.
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)

# Root of the mean squared error between true and predicted targets.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

final_rmse

#  Visualize features and weights

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

# Names of the one-hot columns. `get_feature_names()` was removed from
# scikit-learn in favour of `get_feature_names_out()`; use whichever exists.
if hasattr(encoderOHE, 'get_feature_names_out'):
    ohe_feature_names = list(encoderOHE.get_feature_names_out())
else:
    ohe_feature_names = list(encoderOHE.get_feature_names())

# Names in the same order as the columns of X: numeric features first (taken
# from df_sel_num_imp, where 'status' has already been dropped), then the
# one-hot columns. The original used df_sel_num.columns, which still contains
# 'status' and therefore shifted every one-hot name by one position.
feature_names = list(df_sel_num_imp.columns) + ohe_feature_names

# `attributes` keeps its original length: the feature names followed by the
# target name as the last element, so the existing `attributes[:-1]` slice in
# the graphviz cell yields exactly the feature names.
attributes = feature_names + [df_status.name]

sorted(zip(feature_importances, feature_names), reverse=True)

# Learning Curve

In [None]:

def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Draw three diagnostic panels for one estimator.

    Panel 0 plots the training and cross-validation scores against the number
    of training examples, panel 1 plots the fit times against the number of
    training examples, and panel 2 plots the cross-validation score against
    the fit times.

    Parameters
    ----------
    estimator : object
        Estimator forwarded unchanged to ``learning_curve``.

    title : string
        Title of the first panel.

    X : array-like
        Feature matrix forwarded to ``learning_curve``.

    y : array-like
        Target forwarded to ``learning_curve``.

    axes : indexable of 3 axes, optional (default=None)
        Axes to draw on; elements 0, 1 and 2 are used. When omitted, a new
        1 x 3 figure with ``figsize=(20, 5)`` is created.

    ylim : tuple (ymin, ymax), optional
        y-limits applied to the first panel only.

    cv : optional
        Cross-validation strategy, forwarded to ``learning_curve``.

    n_jobs : int or None, optional (default=None)
        Parallelism setting, forwarded to ``learning_curve``.

    train_sizes : array-like
        Training-set sizes forwarded to ``learning_curve``
        (default: np.linspace(0.1, 1.0, 5)).

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module (``plt``).
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    score_ax = axes[0]
    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    curve = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                           train_sizes=train_sizes,
                           return_times=True)
    train_sizes, train_scores, test_scores, fit_times, _ = curve

    def _row_stats(values):
        """Return (mean, standard deviation) taken along axis 1."""
        return np.mean(values, axis=1), np.std(values, axis=1)

    train_mean, train_std = _row_stats(train_scores)
    test_mean, test_std = _row_stats(test_scores)
    time_mean, time_std = _row_stats(fit_times)

    # Panel 0: learning curve. Both shaded +/- one-std bands are drawn
    # before the two mean lines.
    score_ax.grid()
    curves = (
        (train_mean, train_std, "r", "Training score"),
        (test_mean, test_std, "g", "Cross-validation score"),
    )
    for centre, spread, colour, _unused_label in curves:
        score_ax.fill_between(train_sizes, centre - spread, centre + spread,
                              alpha=0.1, color=colour)
    for centre, _unused_spread, colour, legend_label in curves:
        score_ax.plot(train_sizes, centre, 'o-', color=colour,
                      label=legend_label)
    score_ax.legend(loc="best")

    # Panel 1: number of training examples vs fit time.
    timing_ax = axes[1]
    timing_ax.grid()
    timing_ax.plot(train_sizes, time_mean, 'o-')
    timing_ax.fill_between(train_sizes, time_mean - time_std,
                           time_mean + time_std, alpha=0.1)
    timing_ax.set_xlabel("Training examples")
    timing_ax.set_ylabel("fit_times")
    timing_ax.set_title("Scalability of the model")

    # Panel 2: fit time vs cross-validation score.
    perf_ax = axes[2]
    perf_ax.grid()
    perf_ax.plot(time_mean, test_mean, 'o-')
    perf_ax.fill_between(time_mean, test_mean - test_std,
                         test_mean + test_std, alpha=0.1)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")

    return plt



from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC as SVM
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# One figure column per model, three rows per column (the three panels drawn
# by plot_learning_curve).
fig, axes = plt.subplots(3, 5, figsize=(20, 25))

# Cross validation with 10 random splits, each time with 20% of the data
# randomly selected as a validation set. (The previous comment said 100
# iterations, but n_splits is 10.)
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

# (label, estimator) pairs, in left-to-right column order. Replaces five
# copy-pasted title/estimator/plot stanzas.
learning_curve_models = [
    ("LogisticRegression", LogisticRegression()),
    ("SVM", SVM(gamma=0.001)),
    ("GaussianNB", GaussianNB()),
    ("DT", DecisionTreeClassifier(max_depth=3)),
    ("MLP", MLPClassifier(solver='lbfgs', alpha=1e-5,
                          hidden_layer_sizes=(5, 2), random_state=1)),
]

for column, (label, estimator) in enumerate(learning_curve_models):
    title = "Learning Curves ({})".format(label)
    plot_learning_curve(estimator, title, X, y, axes=axes[:, column],
                        ylim=(0.0, 1.01), cv=cv, n_jobs=4)

plt.show()

# Decision Tree graphviz

In [None]:
# Shallow tree fitted on the full data set, only for visualisation.
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

from sklearn.tree import export_graphviz

# Feature names in the same order as the columns of X: numeric features
# (df_sel_num_imp, target already dropped) followed by the one-hot columns.
# Built here rather than sliced from `attributes`, whose order did not match X.
if hasattr(encoderOHE, 'get_feature_names_out'):
    tree_ohe_names = list(encoderOHE.get_feature_names_out())
else:
    tree_ohe_names = list(encoderOHE.get_feature_names())
tree_feature_names = list(df_sel_num_imp.columns) + tree_ohe_names

export_graphviz(
        tree_clf,
        out_file="test1.dot",
        feature_names=tree_feature_names,
        # One label per class, in the order of tree_clf.classes_. The original
        # passed the plain string 'status', which is not a list of class names.
        class_names=[str(label) for label in tree_clf.classes_],
        rounded=True,
        filled=True
    )


# Neural Network

# Data Visualization

In [None]:
import kaplanmeier as km