<a href="https://colab.research.google.com/github/team0243/Project_ML/blob/main/Classification_RCC_UTUC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing the Python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import plot_tree

## Perform machine learning on the RCC_UTUC dataset

In [None]:
# Read the Excel workbook into a dataframe and preview its first five rows
df2 = pd.read_excel(io='Dataset_RCC_UTUC.xlsx')
df2.head(n=5)

In [None]:
# Number of rows per value of the Diagnosis column
df2['Diagnosis'].value_counts()

In [None]:
# Show the column labels of the loaded dataframe
df2.columns

In [None]:
# Feature columns used for modelling (assumed to be the six best features).
# NOTE(review): 'Age ' carries a trailing space — presumably it matches the
# spreadsheet header exactly; confirm against df2.columns.
cols_df2 = [
    'Age ',
    'PLR',
    'WBC',
    'PLT',
    'PMN',
    'Lymp',
]

In [None]:
# Target variable: the Diagnosis column
y2 = df2['Diagnosis']
# Feature matrix: the columns listed in cols_df2
X2 = df2[cols_df2]

In [None]:
# Address the imbalance between categories 0 and 1 by oversampling with
# SMOTE (Synthetic Minority Oversampling Technique).
# NOTE(review): this resamples the full dataset; synthetic rows produced here
# should not end up in a test set — confirm how X_resampled is used downstream.
sm = SMOTE(random_state=30, sampling_strategy=0.95)
X_resampled, y_resampled = sm.fit_resample(X2, y2)

In [None]:
# Shapes before (X2, y2) and after (X_resampled, y_resampled) oversampling
tuple(data.shape for data in (X2, y2, X_resampled, y_resampled))

In [None]:
# Class counts before and after oversampling
tuple(labels.value_counts() for labels in (y2, y_resampled))

In [None]:
# Split the ORIGINAL data into train and test sets first, so that the test set
# contains only real (non-synthetic) samples. The split is stratified on y2 to
# keep the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X2, y2, test_size=0.3, stratify=y2, random_state=50
)

# Oversample the TRAINING portion only. Splitting data that was already
# resampled puts synthetic points (interpolated between real rows) into the
# test set and lets information about test rows leak into training, which
# inflates every metric reported below.
X_train, y_train = SMOTE(sampling_strategy=0.95, random_state=30).fit_resample(X_train, y_train)

In [None]:
# Class counts in the training and test labels
tuple(labels.value_counts() for labels in (y_train, y_test))

In [None]:
# Candidate models: logistic regression, decision tree and random forest are
# good choices for small datasets, whereas more complex models (such as SVM
# or MLP) need a large amount of data to generalize well.
# All estimators are created with their default settings.
models2 = {}
models2['Logistic Regression'] = LogisticRegression()
models2['Decision Tree'] = DecisionTreeClassifier()
models2['Random Forest'] = RandomForestClassifier()
models2['Gradient Boosting'] = GradientBoostingClassifier()

In [None]:
def model_score(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training data and tabulate its test score.

    Args:
        models: Mapping of display name -> estimator exposing ``fit(X, y)``
            and ``score(X, y)``. The estimators are fitted in place.
        X_train: Training features passed to ``fit``.
        X_test: Test features passed to ``score``.
        y_train: Training labels passed to ``fit``.
        y_test: Test labels passed to ``score``.

    Returns:
        A DataFrame indexed by model name with a single ``'Score'`` column
        holding whatever each estimator's ``score`` returned.
    """
    # Seed NumPy's global random generator before any fitting happens
    np.random.seed(40)

    results = {}
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)

    # Build one 'Score' row with a column per model, then flip it so that
    # the models become the rows
    score_row = pd.DataFrame(results, index=['Score'])
    return score_row.T


In [None]:
# Score every candidate in models2 and show the table with a blue gradient
supervised_model_scores = model_score(
    models=models2,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
supervised_model_scores.style.background_gradient(cmap='Blues')

In [None]:
# Data visualization: horizontal bar chart of the model scores
supervised_model_scores.plot.barh(figsize=(5, 3))
plt.title(label='Model Scores')
plt.xlabel(xlabel='Score')
plt.ylabel(ylabel='Models')
plt.show()


## Random Forest

In [None]:
# Hyper-parameter grid for tuning the Random Forest with GridSearchCV.
# 'auto' is intentionally not listed under max_features: for classifiers it
# was only an alias of 'sqrt', and it is no longer accepted by current
# scikit-learn releases, where every candidate using it fails to fit and is
# scored as NaN (silently here, since warnings are suppressed).
grid_values = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy'],
}
# 3-fold stratified cross-validation with a fixed shuffle seed
cross_validation = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# Seed the forest as well (as the decision-tree section does for its
# estimator); otherwise the selected hyper-parameters and reported scores can
# change from one run of the notebook to the next.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over grid_values using all CPU cores, then fit on the
# training data
grid_search_RF = GridSearchCV(
    estimator=rfc,
    param_grid=grid_values,
    cv=cross_validation,
    n_jobs=-1,
    verbose=3,
)
grid_search_RF.fit(X_train, y_train)

In [None]:
# Report the best parameter combination and its mean cross-validated score
# (best_score_), expressed as a percentage
print("Best parameters for Random Forest:", grid_search_RF.best_params_, sep="\n")
accuracy = 100 * grid_search_RF.best_score_
print(f"Accuracy: {accuracy:.2f}%")

In [None]:
# Predict the test set with the tuned random forest
y_prediction = grid_search_RF.predict(X_test)

# Test accuracy as a percentage
test_accuracy = 100 * accuracy_score(y_test, y_prediction)

# Accuracy line, an empty line, then the per-class report
print(f"Accuracy of RF is: {test_accuracy:.2f}%", end="\n\n")
print(classification_report(y_test, y_prediction))

In [None]:
# Libraries needed by this cell (RocCurveDisplay is used by the ROC cells)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, RocCurveDisplay

# Confusion matrix of the random forest predictions on the test set
cm = confusion_matrix(y_true=y_test, y_pred=y_prediction)

# Draw it as an annotated Seaborn heatmap
sns.heatmap(data=cm, annot=True, fmt='g', cmap=plt.cm.Blues)
plt.title(label='Confusion Matrix (RF)', fontsize=16)
plt.xlabel(xlabel='Predicted Label', fontsize=14)
plt.ylabel(ylabel='True Label', fontsize=14)
plt.show()

In [None]:
# ROC curve of the tuned random forest on the test set
RocCurveDisplay.from_estimator(estimator=grid_search_RF, X=X_test, y=y_test)
plt.show()

## Logistic Regression

In [None]:
# Base logistic regression with a generous iteration limit
logreg = LogisticRegression(max_iter=10_000)
# Hyper-parameter grid for logistic regression, written as a list of sub-grids
# so that only solver/penalty pairs supported by scikit-learn are searched.
# A single flat grid crosses every penalty with every solver; the unsupported
# pairs fail to fit and are scored as NaN, which wastes most of the search.
param_grid = [
    # lbfgs and sag only support the L2 penalty
    {'solver': ['lbfgs', 'sag'],
     'penalty': ['l2'],
     'C': [0.1, 1, 10, 100]},
    # liblinear and saga support both L1 and L2
    {'solver': ['liblinear', 'saga'],
     'penalty': ['l1', 'l2'],
     'C': [0.1, 1, 10, 100]},
    # elastic net is only available with saga and needs an explicit l1_ratio
    {'solver': ['saga'],
     'penalty': ['elasticnet'],
     'C': [0.1, 1, 10, 100],
     'l1_ratio': [0.25, 0.5, 0.75]},
    # no regularization: spelled None (the string 'none' was removed from
    # scikit-learn); C has no effect here, so it is not searched
    {'solver': ['lbfgs', 'sag', 'saga'],
     'penalty': [None]},
]
# 3-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=3, random_state=50, shuffle=True)

In [None]:
# Search param_grid with GridSearchCV (all CPU cores) and fit on the training data
grid_search_lr = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=3,
)
grid_search_lr.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score in percent
print("Best parameters for logistic regression:", grid_search_lr.best_params_, sep="\n")
print(f"Best score for logistic regression: {grid_search_lr.best_score_ * 100:.2f}%")

In [None]:
# Measure how the tuned logistic regression performs on the testing data
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    RocCurveDisplay,
)

y_pred_lr = grid_search_lr.predict(X_test)
# Kept as a fraction; converted to a percentage only when printed
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(f"Accuracy of logistic regression on testing data: {test_accuracy * 100:.2f}%", end="\n\n")
print(classification_report(y_test, y_pred_lr))

In [None]:
# Confusion matrix of the logistic regression predictions on the test set
cm_lr = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)

# Draw it as an annotated Seaborn heatmap
sns.heatmap(data=cm_lr, annot=True, fmt='g', cmap=plt.cm.Blues)
plt.title(label='Confusion Matrix (logistic regression)', fontsize=16)
plt.xlabel(xlabel='Predicted Label', fontsize=14)
plt.ylabel(ylabel='True Label', fontsize=14)
plt.show()

In [None]:
y_test.value_counts() # class counts of the true test labels

In [None]:
# Wrap the predictions in a pandas Series so value_counts() is available
y_pred_lr_series = pd.Series(data=y_pred_lr)

# Class counts of the predicted labels
y_pred_lr_series.value_counts()

In [None]:
# ROC curve of the tuned logistic regression on the test set
RocCurveDisplay.from_estimator(estimator=grid_search_lr, X=X_test, y=y_test)
plt.show()

## Decision Tree

In [None]:
# Seeded base decision tree
dt = DecisionTreeClassifier(random_state=10)

# Hyper-parameter grid explored for the tree
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# 3-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=3, random_state=10, shuffle=True)

In [None]:
# Search param_grid with GridSearchCV (all CPU cores) and fit on the training data
grid_search_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=3,
)
grid_search_dt.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score in percent
print("Best parameters for decision tree:", grid_search_dt.best_params_, sep="\n")
print(f"Best score for decision tree: {grid_search_dt.best_score_ * 100:.2f}%")

In [None]:
# Measure how the tuned decision tree performs on the testing data

y_pred_dt = grid_search_dt.predict(X_test)
# Kept as a fraction; converted to a percentage only when printed
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print(f"Accuracy of decision tree on testing data: {test_accuracy * 100:.2f}%", end="\n\n")
print(classification_report(y_test, y_pred_dt))

In [None]:
# Confusion matrix of the decision tree predictions on the test set
cm_dt = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)

# Draw it as an annotated Seaborn heatmap
sns.heatmap(data=cm_dt, annot=True, fmt='g', cmap=plt.cm.Blues)
plt.title(label='Confusion Matrix (decision tree)', fontsize=16)
plt.xlabel(xlabel='Predicted Label', fontsize=14)
plt.ylabel(ylabel='True Label', fontsize=14)
plt.show()

In [None]:
# ROC curve of the tuned decision tree on the test set
RocCurveDisplay.from_estimator(estimator=grid_search_dt, X=X_test, y=y_test)
plt.show()

## Gradient Boosting

In [None]:
# Seed the booster: the grid below uses subsample < 1.0 and max_features, both
# of which draw random subsets, so an unseeded estimator would make the
# selected hyper-parameters and scores vary between runs.
gbc = GradientBoostingClassifier(random_state=50)

# Hyper-parameter grid explored for gradient boosting
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth' : [3, 4, 5, 6, 7, 8],
    'subsample' : [0.6, 0.7, 0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2']
}

# 3-fold stratified cross-validation with a fixed shuffle seed
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=50)

In [None]:
# Exhaustive search over param_grid using all CPU cores, then fit on the
# training data
grid_search_GB = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=3,
)
grid_search_GB.fit(X_train, y_train)

In [None]:
# Report the best parameter combination and its mean cross-validated score
# (best_score_), expressed as a percentage
print("Best parameters for Gradient Boosting:", grid_search_GB.best_params_, sep="\n")
accuracy = 100 * grid_search_GB.best_score_
print(f"Accuracy: {accuracy:.2f}%")

In [None]:
# Predict the test set with the tuned gradient boosting model
y_prediction_GB = grid_search_GB.predict(X_test)
# Test accuracy as a percentage
test_accuracy = 100 * accuracy_score(y_test, y_prediction_GB)
# Accuracy line, an empty line, then the per-class report
print(f"Accuracy of GB is: {test_accuracy:.2f}%", end="\n\n")
print(classification_report(y_test, y_prediction_GB))

In [None]:
# Calculate the confusion matrix of the gradient boosting predictions
cm_gb = confusion_matrix(y_test, y_prediction_GB)

# Plot the confusion matrix using Seaborn
sns.heatmap(cm_gb, annot=True, cmap=plt.cm.Blues, fmt='g')
# The title names gradient boosting, the model whose predictions are plotted
# (it previously carried the decision-tree label from a copied cell)
plt.title('Confusion Matrix (gradient boosting)', fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.show()

In [None]:
# ROC curve of the tuned gradient boosting model on the test set, with
# descriptive title and axis labels instead of template placeholders
RocCurveDisplay.from_estimator(grid_search_GB, X_test, y_test)
plt.title('ROC Curve (Gradient Boosting)', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=14)
plt.ylabel('True Positive Rate', fontsize=14)
plt.show()

In [None]:
# Overlay the test-set ROC curves of all four tuned models on a single figure
plt.figure(figsize=(8, 6))

# Random Forest
RocCurveDisplay.from_estimator(
    estimator=grid_search_RF, X=X_test, y=y_test,
    name="Random Forest", ax=plt.gca(),
)

# Logistic Regression
RocCurveDisplay.from_estimator(
    estimator=grid_search_lr, X=X_test, y=y_test,
    name="Logistic Regression", ax=plt.gca(),
)

# Decision Tree
RocCurveDisplay.from_estimator(
    estimator=grid_search_dt, X=X_test, y=y_test,
    name="Decision Tree", ax=plt.gca(),
)

# Gradient Boosting
RocCurveDisplay.from_estimator(
    estimator=grid_search_GB, X=X_test, y=y_test,
    name="Gradient Boosting", ax=plt.gca(),
)

# Dashed black diagonal as the reference line
plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.title(label='ROC Curves for Multiple Models', fontsize=16)
plt.xlabel(xlabel='False Positive Rate', fontsize=14)
plt.ylabel(ylabel='True Positive Rate', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.show()


**End**