In [2]:
# importing the required libraries to perform feature selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# importing the dataset
input_file = "chowdary.csv"
df = pd.read_csv(input_file)

# separating the independent and dependent variables:
# the features are every column from position 3 onward, the label is column 1
X = df.iloc[:, 3:]
Y = df.iloc[:, 1]

# splitting the dataset into training and testing set (70% / 30%, fixed seed)
# NOTE(review): the split is not stratified on Y, so the class ratio may differ
# between the two sets — consider stratify=Y if that matters for the analysis
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# standardizing the features; the scaler is fitted on the training set only and
# then re-used on the testing set, so no testing-set statistics are learned
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Convert back to DataFrame. The original row labels are passed through so the
# scaled frames stay index-aligned with Y_train / Y_test (which keep the
# shuffled labels from df); without index= the frames would be renumbered
# 0..n-1 and any index-based join or mask against Y would pair the wrong rows.
X_train_normalized_df = pd.DataFrame(
    X_train_normalized, columns=X_train.columns, index=X_train.index
)
X_test_normalized_df = pd.DataFrame(
    X_test_normalized, columns=X_test.columns, index=X_test.index
)

# summarizing the dataset
# number of 'B' and 'C' in training and testing set
print("The number of 'B' and 'C' in training set: ", Y_train.value_counts())
print("The number of 'B' and 'C' in testing set: ", Y_test.value_counts())
print("The shape of the training set is: ", X_train_normalized_df.shape)
print("The shape of the testing set is: ", X_test_normalized_df.shape)

The number of 'B' and 'C' in training set:  tumour
B    44
C    28
Name: count, dtype: int64
The number of 'B' and 'C' in testing set:  tumour
B    18
C    14
Name: count, dtype: int64
The shape of the training set is:  (72, 181)
The shape of the testing set is:  (32, 181)


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the Random Forest classifier.
# The seed is fixed (same value as the train/test split) so that the forest,
# and therefore the hyperparameters picked by the search, are the same on
# every run of the notebook.
rf = RandomForestClassifier(random_state=0)

# Define the hyperparameters grid (3 x 3 x 3 = 27 candidates)
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
}

# Create the GridSearchCV object: 5-fold cross-validation, all CPU cores
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_rf.fit(X_train_normalized_df, Y_train)

# Print the best hyperparameters
print("Best parameters: ", grid_search_rf.best_params_)

Best parameters:  {'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 100}


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# Define the Random Forest classifier with the hyperparameters reported by the
# previous grid search. The seed is fixed so that feature importances, and
# hence the features eliminated by RFECV, are reproducible between runs.
estimator = RandomForestClassifier(
    max_depth=10, min_samples_leaf=1, n_estimators=100, random_state=0
)
cv = StratifiedKFold(n_splits=5)
rfecv = RFECV(estimator, step=1, cv=cv, scoring="accuracy")

# Define the hyperparameters grid: elimination step size and the inner
# cross-validation strategy used by RFECV (3 x 3 = 9 candidates)
param_grid = {
    'step': [1, 5, 10],
    'cv': [StratifiedKFold(n_splits=2), StratifiedKFold(n_splits=5), StratifiedKFold(n_splits=10)]
}

# Create the GridSearchCV object.
# This is a nested search: every candidate is fitted once per outer fold, and
# each of those fits is itself a complete cross-validated recursive feature
# elimination with a forest refitted at every step. Run serially this is very
# slow, so the candidate/fold fits are spread over all CPU cores (n_jobs=-1),
# as in the Random Forest grid search.
grid_search_rf = GridSearchCV(rfecv, param_grid, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search_rf.fit(X_train_normalized_df, Y_train)

# Print the best hyperparameters
print("Best parameters: ", grid_search_rf.best_params_)

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

# initializing the estimator model; the seed is fixed so that the feature
# importances, and therefore the selected feature subset, are reproducible
estimator = RandomForestClassifier(
    max_depth=10, min_samples_leaf=1, n_estimators=100, random_state=0
)

# setting up the RFECV with the estimator and cross-validation strategy
# (2 stratified folds, one feature removed per elimination step)
cv = StratifiedKFold(n_splits=2)
rfecv = RFECV(estimator, step=1, cv=cv, scoring='accuracy')

# fitting the data to the model
rfecv.fit(X_train_normalized_df, Y_train)

# getting the optimal number of features
optimal_n_features = rfecv.n_features_
print("Optimal number of features : %d" % optimal_n_features)

# naming of the features selected (support_ is a boolean mask over the columns)
selected_features = X_train_normalized_df.columns[rfecv.support_]
print("Selected features : %s" % selected_features)

# getting the cross-validated scores for each number of features
cv_scores = rfecv.cv_results_['mean_test_score']

# getting the number of features for each iteration
# NOTE: the 1..len(cv_scores) numbering assumes step=1 (one score per feature
# count); it has to be recomputed if the step size above is changed
n_features = list(range(1, len(cv_scores) + 1))
