<a href="https://colab.research.google.com/github/team0243/Project_ML/blob/main/RCC_UCUT_Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing the Python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import plot_tree

Perform machine learning using the biobank_RCC_UTUC dataset

In [None]:
# Load the RCC/UTUC dataset from the Excel workbook (path is relative to the working directory)
df = pd.read_excel('Dataset_RCC_UTUC.xlsx')

In [None]:
# Print a concise summary of df (DataFrame.info): column names, non-null
# counts, dtypes and memory usage
df.info()

In [None]:
# Number of missing (NaN) entries in each column of the DataFrame
missing_per_column = df.isna().sum()
missing_per_column


In [None]:
# Descriptive statistics of the numeric columns, rounded to 2 decimals
summary_stats = df.describe()
summary_stats.round(2)

In [None]:
# Pairwise scatter plots of the columns, coloured by the 'Diagnosis' column
sns.pairplot(data=df, hue='Diagnosis')
plt.show()

In [None]:
# Print the DataFrame's column labels so the exact names can be checked
# before columns are selected by name
print(df.columns)


In [None]:
# Pass the columns to be dropped as a list, ensuring correct names
#columns_to_drop = ['HN', 'PLR']  # Adjust if needed based on actual column names
#X = df['NLR']
#y = df['PLR']
#Z = df.drop(columns_to_drop, axis='columns')

In [None]:
# Predictor columns used for feature selection; 'Age ' is spelled with a
# trailing space exactly as it is looked up in df
feature_columns = ['Age ', 'NLR', 'PLR', 'WBC', 'PLT', 'PMN', 'Lymp']

# Feature matrix and target vector
X = df[feature_columns]
y = df['Diagnosis_num']

In [None]:
# Preview the first 5 rows of the feature matrix X
X.head(5)

In [None]:
# Show the shapes of X and y as a quick sanity check
X.shape, y.shape

In [None]:
# Hold out the test set BEFORE oversampling. SMOTE creates synthetic minority
# samples by interpolating between existing rows, so resampling the full data
# set first would let rows that end up in the test set shape the training
# data (data leakage) and make the reported scores optimistic.
# stratify=y keeps the original class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=50, stratify=y
)

# To solve the imbalance problem between categories 0 and 1, apply SMOTE
# (Synthetic Minority Oversampling Technique) to the TRAINING split only.
# sampling_strategy=0.96 is the target minority/majority ratio after resampling.
sm = SMOTE(sampling_strategy=0.96, random_state=55)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Train on the balanced data; X_test / y_test keep their natural distribution.
X_train, y_train = X_resampled, y_resampled

In [None]:
#X.shape, y.shape, X_resampled.shape, y_resampled.shape

In [None]:
#y.value_counts(), y_resampled.value_counts()

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 50)

In [None]:
# Class counts of the target in the training and test splits
y_train.value_counts(), y_test.value_counts()

In [None]:
# Numbers of features to keep in the RFE sweep, from all 7 down to 1
n_features = [7, 6, 5, 4, 3, 2, 1]

# Base estimator used by RFE to rank features. A fixed random_state makes the
# tree (and therefore the selected feature subsets and accuracies) repeatable
# between runs; without it, ties between equally good splits are broken
# randomly and the ranking can change from run to run.
model = DecisionTreeClassifier(random_state=50)
column_names = X.columns.tolist()

In [None]:
# RFE sweep: for every requested subset size, select features with RFE on the
# training data, refit the tree on just those columns and report test metrics.
divider = "-------------------------------------------"

for n in n_features:
    rfe = RFE(estimator=model, n_features_to_select=n)
    rfe.fit(X_train, y_train)

    # Positions of the columns that RFE kept (support_ is a boolean mask)
    selected_feature_indices = [
        position for position, kept in enumerate(rfe.support_) if kept
    ]
    X_train_selected = X_train.iloc[:, selected_feature_indices]
    X_test_selected = X_test.iloc[:, selected_feature_indices]

    # Refit the base model on the reduced feature set and score it on the test split
    y_pred = model.fit(X_train_selected, y_train).predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)

    # Report: chosen features, accuracy and the per-class metrics
    print(f"Number of Selected Features = {n}")
    print("Selected Features")
    for feature in X_train_selected.columns:
        print(feature)
    print(divider)
    print("Accuracy:", accuracy)
    print(divider)
    print("Classification report")
    print()
    print(classification_report(y_test, y_pred))
    print(divider)

In [None]:
# Select   6 best features from RFE
#cols = ['Age ','NLR','PLR','WBC','PLT','PMN']

In [None]:
# Feature subset used for the final models: the 4 features chosen from the RFE sweep
# NOTE(review): 'Age ' keeps a trailing space, presumably matching the column header in the workbook; confirm
cols = ['Age ','PLR','WBC','PLT']

In [None]:
# Target vector and the feature matrix restricted to the selected columns
y_bf = df.loc[:, 'Diagnosis_num']
X_bf = df.loc[:, cols]

In [None]:
# Split data into train and test set BEFORE oversampling, so that no synthetic
# training sample is interpolated from a row that is later used for evaluation
# (resampling the full data set first leaks test information into training).
# stratify=y_bf keeps the original class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_bf, y_bf, test_size=0.3, random_state=50, stratify=y_bf
)

# Oversample the minority class with SMOTE on the TRAINING split only;
# sampling_strategy=0.95 is the target minority/majority ratio after resampling.
sm = SMOTE(sampling_strategy=0.95, random_state=50)
X_resampled_bf, y_resampled_bf = sm.fit_resample(X_train, y_train)

# Train on the balanced data; X_test / y_test keep their natural distribution.
X_train, y_train = X_resampled_bf, y_resampled_bf

In [None]:
# Class counts of the original target and of the training / test targets
y_bf.value_counts(),y_train.value_counts(),y_test.value_counts()

In [None]:
# Standardize the features with StandardScaler.
# The scaler is fitted on the training split only; the test split is
# transformed with the training mean/std. Refitting on the test split
# (fit_transform) would scale the two sets differently and use test-set
# statistics during preprocessing.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Train supervised machine learning algorithms
* Logistic regression
* K-nearest neighbors
* Decision tree
* Random forest
* Support vector machine
* Artificial neural network (multilayer perceptron)

In [None]:
# Candidate classifiers, keyed by the label shown in the score table.
# All estimators use their default hyperparameters.
models = {}
models['Logistic Regression'] = LogisticRegression()
models['K-Nearest Neighbors '] = KNeighborsClassifier()
models['Decision Tree'] = DecisionTreeClassifier()
models['Random Forest'] = RandomForestClassifier()
models['Support Vector Machine'] = SVC()
models['Multilayer Perceptron'] = MLPClassifier()

In [None]:
# Fit every candidate model and rank the models by their test score
def model_score(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training data and rank them by test score.

    Parameters:
        models: mapping of display name -> estimator exposing ``fit(X, y)``
            and ``score(X, y)``. The estimators are fitted in place.
        X_train, y_train: data passed to each estimator's ``fit``.
        X_test, y_test: data passed to each estimator's ``score``.

    Returns:
        A DataFrame indexed by model name with a single 'Score' column,
        sorted from highest to lowest score.
    """
    # Seed NumPy's global RNG once so repeated calls start from the same state
    np.random.seed(50)

    def _fit_then_score(estimator):
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    # Models are processed one after another, in the mapping's own order
    scores = {label: _fit_then_score(est) for label, est in models.items()}

    ranking = pd.DataFrame(scores, index=['Score']).T
    return ranking.sort_values(by='Score', ascending=False)

In [None]:
# Score all candidate models on the current split and show the ranking as a
# table shaded with a green gradient
supervised_model_scores = model_score(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
supervised_model_scores.style.background_gradient(cmap='Greens')

In [None]:
# 3-fold stratified cross-validation with shuffling, seeded for repeatability
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=50)

# Decision tree to tune and the hyperparameter grid to search
dt = DecisionTreeClassifier(random_state=50)
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

In [None]:
# Exhaustive search over param_grid with the cross-validation splitter `cv`,
# using all available cores (n_jobs=-1) and verbose progress output
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score
print("Best parameters for decision tree:", grid_search.best_params_, sep="\n")
best_cv_score = grid_search.best_score_
print("Best score for decision tree: {:.2f}%".format(best_cv_score*100))

In [None]:
# Score the tuned decision tree on the held-out test data: predict through the
# fitted grid search, then report accuracy and the per-class metrics
y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy of decision tree on testing data: {:.2f}%".format(test_accuracy*100))
print()
print(classification_report(y_test, y_pred))

In [None]:
# Import necessary libraries
from sklearn.metrics import confusion_matrix,RocCurveDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the confusion matrix.
# confusion_matrix takes (y_true, y_pred): rows are the true labels and
# columns the predicted labels, which is what the axis labels below state.
# Passing the arguments the other way round transposes the matrix and swaps
# false positives with false negatives.
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix using Seaborn; y_pred comes from the tuned
# decision tree, so the title names that model
sns.heatmap(cm, annot=True, cmap=plt.cm.Blues, fmt='g')
plt.title('Confusion Matrix (Decision Tree)', fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.show()

In [None]:
# ROC curve of the fitted grid search on the test data
RocCurveDisplay.from_estimator(estimator=grid_search, X=X_test, y=y_test)
plt.show()

In [None]:
# Import necessary libraries
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Plot the decision tree
plt.figure(figsize = (10, 8))

# Number of features the tree was trained on (columns of X_train)
num_features = X_train.shape[1]

# Label the nodes with the real feature names from `cols` so the plot is
# readable; X_train itself carries no column labels once it has been through
# the scaler. Fall back to generic placeholders if the lengths do not match.
if len(cols) == num_features:
    feature_names = list(cols)
else:
    feature_names = [f'feature_{i}' for i in range(num_features)]

# NOTE: the tree was fitted on standardized data, so the split thresholds in
# the plot are in standardized units, not the original measurement units.
plot_tree(grid_search.best_estimator_, feature_names=feature_names,
          class_names=['Class 0', 'Class 1'], filled=True)
plt.title('Decision Tree', fontsize=16)
plt.show()