In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import glob
import gc
from IPython.display import display
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, auc, classification_report, confusion_matrix
from tensorflow import keras
from pandas.plotting import scatter_matrix
import seaborn as sn

2022-12-26 19:54:07.887808: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [17]:
# Read the gene-expression table from the Kaggle input directory and preview
# its first rows.
DATA_CSV = "../input/brain-cancer-gene-expression-cumida/Brain_GSE50161.csv"
dataset = pd.read_csv(DATA_CSV)
display(dataset.head())

KeyboardInterrupt: 

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
dataset.info()
#display(dataset.describe())

In [None]:
# Total number of missing cells: per-column null counts, summed over all columns.
dataset.isna().sum().sum()

#### Hence, there are no null values in the dataset

In [None]:
# Class names in order of first appearance; a label's numeric code is its
# position in this list.
labels = dataset["type"]
classes = labels.unique().tolist()

# Feature matrix: every column except the sample id and the class label.
x_data = dataset.drop(columns=["samples", "type"]).to_numpy()


def func(x):
    """Return the numeric code of class name ``x`` (its position in ``classes``)."""
    return classes.index(x)


# Encoded labels, stored as float32.
y_data = np.array(list(map(func, labels)), dtype="float32")

In [None]:
# Report the dimensions of the feature matrix and of the label vector.
for label, arr in (("X_data", x_data), ("Y_data", y_data)):
    print(f"{label} Shape : {arr.shape}")

In [None]:
# Pre-processing pipeline. Only min-max scaling is active; a second step,
# PCA(n_components = 0.9), was tried and is currently disabled.
# NOTE(review): the scaler is fitted on the full data set before any
# train/validation split, so held-out rows influence the scaling.
scaling_steps = [("Scaler", MinMaxScaler())]
pca_scaler = Pipeline(steps=scaling_steps)

x_data = pca_scaler.fit_transform(x_data)

x_data.shape

In [None]:
def KFold_Training(model, model_name, color):
    """Cross-validate ``model`` on the global ``x_data`` / ``y_data`` and report the results.

    The function
      1. prints macro-F1 scores from ``cross_val_score`` over a 10-split
         ``ShuffleSplit`` (30 % of the rows held out per split, seed 42);
      2. fits ``model`` on each ``StratifiedKFold`` training split (shuffled,
         seed 42) and prints ``model.score`` on the matching validation split;
      3. prints a classification report and draws a confusion matrix from the
         out-of-fold predictions gathered in step 2, so every row is judged by
         a model that did not see it during fitting.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``.
    model_name : str
        Label used in the printed headers and in the plot title.
    color : str
        Not used; kept so existing calls keep working.

    Returns
    -------
    estimator
        The same ``model`` object, left fitted on the training split of the
        last stratified fold.

    Notes
    -----
    ``ShuffleSplit``, ``cross_val_score`` and ``StratifiedKFold`` are imported
    in a later cell of this notebook; that cell must run before this function
    is called.
    """
    print(f"===== {model_name} =====")

    cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
    print("Cross Val Score : ",
          cross_val_score(model, x_data, y_data, cv=cv, scoring="f1_macro"))

    stratify_kfold = StratifiedKFold(shuffle=True, random_state=42)
    s_kf = []
    # Each row lands in exactly one validation fold, so this array ends up
    # holding one held-out prediction per row.
    oof_pred = np.empty_like(y_data)
    for train_index, val_index in stratify_kfold.split(x_data, y_data):
        x, x_val = x_data[train_index], x_data[val_index]
        y, y_val = y_data[train_index], y_data[val_index]
        model.fit(x, y)
        s_kf.append(model.score(x_val, y_val))
        oof_pred[val_index] = model.predict(x_val)
    print(f"\nStratifiedKFold Score : {s_kf}")

    # Reporting on out-of-fold predictions avoids scoring the model on rows it
    # was trained on, which would overstate its quality.
    print(classification_report(y_data, oof_pred))

    # Draw on a dedicated figure so the heat map never lands on axes left over
    # from an earlier plot.
    fig, ax = plt.subplots()
    sn.heatmap(confusion_matrix(y_data, oof_pred), annot=True, ax=ax)
    ax.set_title(f"{model_name} - out-of-fold confusion matrix")
    ax.set_xlabel("Predicted class")
    ax.set_ylabel("True class")
    plt.show()

    return model

In [None]:
def plot_important_features(model, limiter):
    """Tabulate and plot the most important features of a fitted model.

    Features whose importance is at least ``max(importance) / limiter`` are
    kept, and of those at most the ten largest are shown (in ascending order
    of importance). For the selected features the function prints a table,
    draws a bar chart, per-feature histograms and a scatter matrix that
    includes the encoded class label, and finally displays each column's
    correlation with that label.

    Parameters
    ----------
    model : estimator
        Fitted model exposing ``feature_importances_``. Assumed to have been
        trained on ``dataset`` without its 'samples' and 'type' columns
        (which is how ``x_data`` is built in this notebook).
    limiter : number
        Divisor applied to the largest importance to obtain the cut-off.

    Raises
    ------
    ValueError
        If the number of importances does not match the number of feature
        columns in ``dataset``.
    """
    # Importance position i refers to column i of the feature matrix, i.e. of
    # `dataset` WITHOUT 'samples' and 'type'. Indexing `dataset.columns`
    # directly would shift every name by two.
    feature_names = dataset.columns.drop(['samples', 'type'])

    all_importancies = np.asarray(model.feature_importances_)
    if len(all_importancies) != len(feature_names):
        raise ValueError(
            f"model reports {len(all_importancies)} importances but dataset has "
            f"{len(feature_names)} feature columns"
        )

    threshold = np.max(all_importancies) / limiter
    candidates = np.flatnonzero(all_importancies >= threshold)
    # Sort the surviving features by importance and keep the ten largest.
    top = candidates[np.argsort(all_importancies[candidates])][-10:]
    features = feature_names[top]
    importancies = all_importancies[top]

    print("Features                  Importancies", end = "\n\n")
    for feature, importancy in zip(features, importancies):
        print(feature, "       ", importancy)

    fig, ax = plt.subplots(figsize = (15, 10))
    ax.bar(features, importancies)
    ax.set_ylabel('Importance')
    ax.set_xlabel('Features')
    plt.tight_layout()
    plt.show()

    # Histogram of every selected feature. Work on a copy so that adding the
    # label column below does not write into a slice of `dataset`.
    df = dataset[features].copy()
    df.hist(figsize = (20, 20))

    # Encoded class label: position of the class name in `classes`.
    df['type'] = dataset['type'].map(classes.index).astype("float32")

    # Pairwise scatter plots of the selected features and the label.
    scatter_matrix(df, figsize = (20, 20))
    plt.show()

    print("Correlation of Type with every other Important Feature")
    display(df.corr()["type"].sort_values(ascending = False))

In [18]:
def plot_corr_matrix(indexes, importancies):
    """Plot the given features and their relationship with the class label.

    Draws a bar chart of ``importancies``, per-feature histograms and a
    scatter matrix that includes the encoded class label, then displays each
    column's correlation with that label.

    Parameters
    ----------
    indexes : array-like of int
        Positions of the features in the model's feature matrix, i.e. in
        ``dataset`` without its 'samples' and 'type' columns (which is how
        ``x_data`` is built in this notebook).
    importancies : array-like of float
        One importance value per entry of ``indexes``, in the same order.
    """
    # Positions refer to the feature matrix, so resolve them against the
    # column list WITHOUT 'samples' and 'type'. Indexing `dataset.columns`
    # directly would shift every name by two.
    feature_names = dataset.columns.drop(['samples', 'type'])
    features = feature_names[np.asarray(indexes, dtype=int)]

    fig, ax = plt.subplots(figsize = (15, 10))
    ax.bar(features, importancies)
    ax.set_ylabel('Importance')
    ax.set_xlabel('Features')
    plt.tight_layout()
    plt.show()

    # Histogram of every selected feature. Work on a copy so that adding the
    # label column below does not write into a slice of `dataset`.
    df = dataset[features].copy()
    df.hist(figsize = (20, 20))

    # Encoded class label: position of the class name in `classes`.
    df['type'] = dataset['type'].map(classes.index).astype("float32")

    # Pairwise scatter plots of the selected features and the label.
    scatter_matrix(df, figsize = (20, 20))
    plt.show()

    print("Correlation of Type with every other Important Feature")
    display(df.corr()["type"].sort_values(ascending = False))

#### After scaling (the PCA step is currently disabled)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, ShuffleSplit, RandomizedSearchCV

In [None]:
# Seeded, shuffled hold-out split that preserves the class proportions of
# y_data in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    stratify=y_data,
    shuffle=True,
    random_state=42,
)

In [None]:
# Regularised logistic regression (C = 0.05) fitted with the saga solver.
# saga shuffles the data while optimising, so the seed is fixed to make the
# fitted coefficients reproducible between runs.
# NOTE(review): with l1_ratio = 0 the elastic-net penalty has no L1 part.
log = LogisticRegression(max_iter = 10000, penalty = "elasticnet", l1_ratio = 0,
                         solver = "saga", n_jobs = -1, C = 0.05, random_state = 42)

In [None]:
# Fit on the hold-out training split.
# NOTE(review): the next cell hands `log` to KFold_Training, which calls
# `fit` again on its own folds, so the parameters learned here are replaced.
log.fit(x_train, y_train)

In [None]:
# Cross-validate the logistic-regression model, then show the intercepts of
# the model as it was last fitted inside KFold_Training.
log = KFold_Training(log, "Logistic Regression", 'r')
print("Intercepts : ", log.intercept_)

## Random Forest

In [None]:
# Base random forest: out-of-bag scoring enabled, all CPU cores, silent.
rf = RandomForestClassifier(
    oob_score=True,
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Search space for the random forest.
# "auto" is no longer listed under max_features: recent scikit-learn releases
# reject it, and for classifiers it meant the same as "sqrt".
# NOTE(review): random_state is searched like a hyper-parameter; consider
# fixing a single seed instead.
random_grid = {'criterion' : ["gini", "entropy"],
               'max_depth' : np.arange(3, 16, 1),
               'min_samples_split' : np.arange(0.1, 1, 0.1),
               'min_samples_leaf' : np.arange(1, 16, 2),
               'n_estimators' : np.arange(50, 600, 50),
               'max_features' : ["sqrt", "log2"],
               'class_weight' : ["balanced", "balanced_subsample"],
               'random_state' : np.arange(35, 47, 1)}

# 100 random candidates, each scored with 5-fold cross-validation.
rcv = RandomizedSearchCV(estimator = rf,
                         param_distributions = random_grid,
                         n_iter = 100, cv = 5, verbose=0, random_state=35, n_jobs = -1)


# KFold_Training fits the whole search object inside every fold; the returned
# object is the search as fitted on the last fold's training split.
rcv = KFold_Training(rcv, "Random Forest", 'darkorange')
rf = rcv.best_estimator_
print(f"Best Params For Random Forest : {rcv.best_params_}")
print(f"\nOOB_Score : {rf.oob_score_}", end = "\n\n")

## Top 10 Features According to Random Forest

#### These features have a high influence on the model's predictions

In [None]:
# Show the features whose importance is at least half of the largest one.
plot_important_features(model=rf, limiter=2)

## XGB

In [None]:
import xgboost
from xgboost import XGBClassifier

In [None]:
# XGBoost classifier configured to train and predict on GPU 0, with library
# logging silenced.
xgb = XGBClassifier(
    tree_method="gpu_hist",
    predictor="gpu_predictor",
    gpu_id=0,
    use_label_encoder=False,
    verbosity=0,
)
xgb

In [None]:
# Disabled hyper-parameter search for XGBoost, kept for reference. Only the
# classifier defined in the previous cell is cross-validated below.
# NOTE(review): if this search is re-enabled, np.arange(0.0, 0.95, 0.5) yields
# the subsample values 0.0 and 0.5 - confirm that 0.0 is acceptable to XGBoost.
#random_grid = {'booster' : ['gbtree', 'gblinear'],
#               'max_depth' : np.arange(1, 10, 1),
#               'grow_policy' : ['depthwise', 'lossguide'],
#               'min_child_weight' : np.arange(1, 20, 1),
#               'subsample' : np.arange(0.0, 0.95, 0.5),
#               'max_delta_step' : np.arange(0, 10, 1),
#               'lambda' : np.arange(0, 1, 0.1),
#               'alpha' : np.arange(0, 1, 0.1),
#              'gamma' : np.arange(1, 20, 1),
#              'random_state' : np.arange(35, 47, 1)}
#rcv = RandomizedSearchCV(estimator = xgb,
#                         param_distributions = random_grid,
#                         n_iter = 100, cv = 5, verbose=0, random_state=35, n_jobs = -1)


# Cross-validate the XGBoost classifier; KFold_Training returns the same
# object after fitting it on its folds.
xgb = KFold_Training(xgb, "XGBoost", 'g')
#print("Best Params For XGB : ", end = str(rcv.best_params_))

## These features had a high influence on the algorithm

### Correlation between highly influential features

In [None]:
# Names of the columns the model was trained on: x_data is `dataset` without
# 'samples' and 'type', so feature position i corresponds to gene_names[i].
gene_names = dataset.columns.drop(['samples', 'type'])

booster = xgb.get_booster()

# Gain-based importance keyed by the booster's own feature ids, rounded to two
# decimals for display.
importance = {
    key: round(gain, 2)
    for key, gain in booster.get_score(importance_type="gain").items()
}

# Position of every scored feature in the feature matrix.
# Assumes the ids look like "f123" (one letter followed by the position),
# which is what the original parsing relied on - TODO confirm.
feature_indexes = [int(key[1:]) for key in importance]

# Same scores, keyed by gene name. Passing this mapping to plot_importance
# lets it label each bar itself, so bars and labels cannot get out of step
# (relabelling the ticks afterwards attached the names in reverse order).
importance_by_gene = {
    gene_names[position]: gain
    for position, gain in zip(feature_indexes, importance.values())
}

ax = xgboost.plot_importance(importance_by_gene, max_num_features=10,
                             importance_type='gain', show_values=True,
                             xlabel="Gain")
plt.show()

In [None]:
# Show the features whose importance is at least a tenth of the largest one.
plot_important_features(model=xgb, limiter=10)

#### Correlation Plot of all top 10 important Features

## SVC

In [None]:
# Base support-vector classifier; the remaining settings are chosen by the
# randomized search in the next cell.
svc = SVC(
    max_iter=-1,
)
svc

In [None]:
# Search space for the support-vector classifier.
random_grid = dict(
    C=np.arange(1, 100, 5),
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=np.arange(1, 10, 1),
    gamma=['scale', 'auto'],
    coef0=np.arange(0.0, 1.0, 0.1),
    random_state=np.arange(35, 47, 1),
)

# 100 random candidates, each scored with 5-fold cross-validation.
rcv = RandomizedSearchCV(
    estimator=svc,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=0,
    random_state=35,
    n_jobs=-1,
)

# Cross-validate the whole search, then keep the best estimator it found.
rcv = KFold_Training(rcv, "SVC_RandomizedSearchCV", 'g')
svc = rcv.best_estimator_
print(f"Best Params For SVC : {rcv.best_params_}", end="")
print(f"\n\nIntercepts : {svc.intercept_}", end="")


### Feature Importances according to SVC

In [None]:
# Names of the columns the model was trained on: x_data is `dataset` without
# 'samples' and 'type', so coefficient position i corresponds to gene_names[i].
gene_names = dataset.columns.drop(['samples', 'type'])

# Positional id of every feature column. Kept under the name `features`
# because the following cell uses it as the index of its coefficient Series.
# (list.remove() returns None, so the previous assignments left it as None.)
features = np.arange(len(gene_names))

# NOTE(review): scikit-learn documents `coef_` for the linear kernel only, so
# this cell fails if the search picked another kernel. `coef_[0]` is the first
# row only - confirm that a single row is the intended importance measure.
top_coefs = pd.Series(abs(svc.coef_[0]), index=features).nlargest(10)

# Re-index the ten largest coefficients by gene name so the bars are labelled
# with the columns they actually belong to.
top_by_gene = pd.Series(top_coefs.values,
                        index=gene_names[top_coefs.index.to_numpy()])

fig, ax = plt.subplots()
feature_importance = top_by_gene.plot(kind='barh', ax=ax)
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()

In [None]:
# Ten largest absolute coefficients of the first coefficient row, computed once
# and passed on as (feature ids, values).
top_svc_coefs = pd.Series(abs(svc.coef_[0]), index=features).nlargest(10)
plot_corr_matrix(top_svc_coefs.index, top_svc_coefs.values)