<a href="https://colab.research.google.com/github/usermar445/aml_final_project/blob/main/scripts/RandomForest_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [40]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
import seaborn as sns
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb

from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import math

from tqdm import tqdm
from time import sleep

from sklearn.model_selection import GridSearchCV, train_test_split

from sklearn.model_selection import RandomizedSearchCV

In [3]:
from imblearn.over_sampling import SMOTE

# 0. Functions

**Tree data functions**

In [4]:
# re one-hot-encodes wilderness area and soiltype for tree data set

# has to be executed on the original df
def re_one_hot_encode_soiltype(data, drop=True):
  soil = data.iloc[:, 14:54]
  data['soil_type'] = soil.idxmax(1)
  data['soil_type'] = data['soil_type'].replace(data['soil_type'].unique(), np.arange(1, data['soil_type'].nunique()+1))
  if drop:
    cols = data.iloc[:, 14:54].columns.to_list()
    data = data.drop(columns=cols)
  return data

def re_one_hot_encode_wilderness(data, drop=True):
  wilderness = data[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].copy()
  data['wilderness_area'] = wilderness.idxmax(1)
  data['wilderness_area'] = data['wilderness_area'].replace(data['wilderness_area'].unique(), np.arange(1, data['wilderness_area'].nunique()+1))
  if drop:
    data = data.drop(['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4'], axis=1)
  return data

def re_one_hot_encode_categorical(data, drop=True):
  return re_one_hot_encode_wilderness(re_one_hot_encode_soiltype(data, drop), drop)

# helper method to split in features and labels
def get_features_labels_tree(data, values=False):
  X = data.drop("Cover_Type", axis=1)
  y = data.loc[:, 'Cover_Type']
  if values:
    return[X.values, y.values]
  return [X, y]

**Heloc functions**

In [5]:
# helper function to determine indexes of rows containing only na values
def get_indexes_nan_rows(data):
  all_na = data.isna().all(axis=1)
  all_na = all_na[all_na==True]
  return all_na.index

# returns percentage of missing values per column
def get_missing_values_per_columns(data, columns, threshold):
  missing = {}
  problematic = []
  for column in columns:
    col = data.loc[:, column]
    nans = np.isnan(col)
    nans = nans[nans == True]
    missing_values = len(nans)/len(col)
    missing.update({column: missing_values})
    if missing_values >= threshold:
      problematic.append(column)
  return [missing, problematic]

#imputes missing values
#because most of the features are very skewed, median seems more reasonable
def impute_heloc(data):
  imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
  columns = data.columns
  indexes = data.index
  imp_median.fit(data.values)
  data = imp_median.transform(data.values)
  data = pd.DataFrame(data, columns=columns)
  data = data.set_index(indexes)
  return data

# wrapper function that prepares complete data set
def prepare_data_heloc_train(data, drop_columns):
  print("Inital dimensions:", data.values.shape)
  print("Recode labels")
  # replace labels with int values (because final submission needs to be 0, 1)
  data['RiskPerformance'] = data['RiskPerformance'].replace(('Bad','Good'), (0,1))
  # columns were determined in EDA
  print("Drop columns")
  data = data.drop(drop_columns, axis=1)
  print("New dimensions:", data.values.shape)
  print("Split in features and labels")
  # split to replace nas -> has to be done because complete NaS rows are excluded from prediction/training and will just be assigned fixed binomial probability classification
  dfX = data.drop("RiskPerformance", axis=1)
  dfy = pd.DataFrame(data.loc[:, 'RiskPerformance'])
  print("Replace nas")
  dfX[dfX<0] = np.nan
  # get na rows index
  na_indexes = get_indexes_nan_rows(dfX)
  #drop na rows
  dfX = dfX.drop(na_indexes)
  dfy = dfy.drop(na_indexes)
  print("Impute nas")
  dfX = impute_heloc(dfX)
  return dfX.join(dfy)

# wrapper function that prepares complete data set
def prepare_data_heloc_test(data, drop_columns):
  print("Inital dimensions:", data.values.shape)
  print("Recode labels")
  # replace labels with int values (because final submission needs to be 0, 1)
  #data['RiskPerformance'] = data['RiskPerformance'].replace(('Bad','Good'), (0,1))
  # columns were determined in EDA
  print("Drop columns")
  data = data.drop(drop_columns, axis=1)
  print("New dimensions:", data.values.shape)
  print("Split in features and labels")
  print("Replace nas")
  data[data<0] = np.nan
  data = impute_heloc(data)
  return data

# helper method to split in features and labels
def get_features_labels_heloc(data, values=False):
  X = data.drop("RiskPerformance", axis=1)
  y = data.loc[:, 'RiskPerformance']
  if values:
    return[X.values, y.values]
  return [X, y]

**Higgs functions**

In [6]:

# cleans and prepares data
def clean_higgs_data(data, drop_columns, train=True):
  print("Inital dimensions:", data.values.shape)
  if train:
    # replace labels with int values (because final submission needs to be 0, 1)
    data['Label'] = data['Label'].replace(('b','s'), (0,1))
  #drop columns
  data = data.drop(drop_columns, axis=1)
  print("New dimensions:", data.values.shape)
  return data

# helper function to get features and labels split
def get_features_labels_higgs(data, values=False):
  X = data.drop("Label", axis=1)
  y = data.loc[:, 'Label']
  if values:
    return[X.values, y.values]
  return [X, y]

**Prepare data**

In [7]:
def make_datasets_ready(tree, heloc, drop_columns_heloc, higgs, drop_columns_higgs):
  # two versions of tree data: one with one-hot-encoding one with categorical (lgb)
  print("Preparing tree data...")
  print("Inital dimensions ", tree.values.shape)
  print("New dimensions ", tree.values.shape)
  print("Preparing tree data 2nd version...")
  print("Inital dimensions ", tree.values.shape)
  lgb = re_one_hot_encode_categorical(tree)
  print("New dimensions ", lgb.values.shape)
  print("Preparing heloc data...")
  heloc_train = prepare_data_heloc_train(heloc, drop_columns_heloc)
  print("Preparing higgs data...")
  higgs_train = clean_higgs_data(higgs, drop_columns_higgs)
  return [tree, heloc_train, higgs_train, lgb]

In [8]:


def make_datasets_ready_test(tree, heloc, drop_columns_heloc, higgs, drop_columns_higgs):
  # two versions of tree data: one with one-hot-encoding one with categorical (lgb)
  print("Preparing tree data...")
  print("Inital dimensions ", tree.values.shape)
  print("New dimensions ", tree.values.shape)
  print("Preparing tree data 2nd version...")
  print("Inital dimensions ", tree.values.shape)
  lgb = re_one_hot_encode_categorical(tree)
  print("New dimensions ", lgb.values.shape)
  print("Preparing heloc data...")
  heloc_train = prepare_data_heloc_test(heloc, drop_columns_heloc)
  print("Preparing higgs data...")
  higgs_train = clean_higgs_data(higgs, drop_columns_higgs, train=False)
  return [tree, heloc_train, higgs_train, lgb]

In [9]:
def get_training_ready_data(data, full_train=False, split=0.2):
  # first keep data as dataframes to extract columns names (used for LightGBM)
  dfXs = [get_features_labels_tree(data[0])[0], get_features_labels_heloc(data[1])[0], get_features_labels_higgs(data[2])[0], get_features_labels_tree(data[3])[0]]
  dfys = [get_features_labels_tree(data[0])[1], get_features_labels_heloc(data[1])[1], get_features_labels_higgs(data[2])[1], get_features_labels_tree(data[3])[1]]
  column_names = [dat.columns.to_list() for dat in dfXs]

  #only get values
  X = [get_features_labels_tree(data[0], values=True)[0], get_features_labels_heloc(data[1], values=True)[0], get_features_labels_higgs(data[2], values=True)[0], get_features_labels_tree(data[3], values=True)[0]]
  y = [get_features_labels_tree(data[0], values=True)[1], get_features_labels_heloc(data[1], values=True)[1], get_features_labels_higgs(data[2], values=True)[1], get_features_labels_tree(data[3], values=True)[1]]

  if full_train:
      n_features = [x.shape[1] for x in X]
      n_rows = [x.shape[0] for x in X]
      print("number of features ", n_features)
      print("number of rows ", n_rows)
      labels = [np.unique(goal) for goal in y]
      n_labels = [len(np.unique(goal)) for goal in y]
      print("labels: ", labels)
      print("number of labels ", n_labels)
      return X, y
  else:
    #split in train and validation set
    X_train_tree, X_test_tree, y_train_tree, y_test_tree = train_test_split(X[0], y[0], test_size=split, random_state=42)
    X_train_heloc, X_test_heloc, y_train_heloc, y_test_heloc = train_test_split(X[1], y[1], test_size=split, random_state=42)
    X_train_higgs, X_test_higgs, y_train_higgs, y_test_higgs = train_test_split(X[2], y[2], test_size=split, random_state=42)
    X_train_tree_lgb, X_test_tree_lgb, y_train_tree_lgb, y_test_tree_lgb = train_test_split(X[3], y[3], test_size=split, random_state=42)

    # create lists
    X_train = [X_train_tree, X_train_heloc, X_train_higgs, X_train_tree_lgb]
    X_test = [X_test_tree, X_test_heloc, X_test_higgs, X_test_tree_lgb]
    y_train = [y_train_tree, y_train_heloc, y_train_higgs, y_train_tree_lgb]
    y_test = [y_test_tree, y_test_heloc, y_test_higgs, y_test_tree_lgb]


    # extract some controlling numbers for better overivew
    n_features = [x.shape[1] for x in X]
    n_rows = [x.shape[0] for x in X]
    print("number of features ", n_features)
    print("number of rows ", n_rows)
    labels = [np.unique(goal) for goal in y]
    n_labels = [len(np.unique(goal)) for goal in y]
    print("labels: ", labels)
    print("number of labels ", n_labels)

    return X_train, X_test, y_train, y_test, column_names

**Compare classifiers**

In [10]:
def compare_standard_classifiers(classifiers, classifier_names, x_train, x_test, y_train, y_test, data_names):
  scores = []
  for name, clf in zip(classifier_names, classifiers):
          print("Classifer " + name)
          results = {"model": name, "parameters": clf.get_params()}
          score = []
          for xtrain, xtest, ytrain, ytest, data_name in zip(x_train, x_test, y_train, y_test, data_names):
            print("Train ", data_name)
            clf = make_pipeline(StandardScaler(), clf)
            clf.fit(xtrain, ytrain)
            print("Test")
            acc = clf.score(xtest, ytest)
            score.append(acc)
            results.update({data_name: acc})
            print("Done")
          results.update({"overall score": np.mean(score)})
          scores.append(results)
  return scores

**Make predictions**

In [12]:
def make_prediction_heloc(train_set, drop_columns,  test_set, classifier, test_submission):
  train = prepare_data_heloc_train(train_set, drop_columns)
  test, na_indexes = prepare_data_heloc_test(test_set, drop_columns)
  X_test = test.values
  X,y = get_features_labels_heloc(train)
  print("train")
  classifier = make_pipeline(StandardScaler(), classifier)
  classifier.fit(X, y)
  print("predict")
  y_pred = classifier.predict(X_test)
  p = 5000/5459
  na_preds = np.random.binomial(1, p, size=len(na_indexes))
  y_pred[na_indexes] = na_preds
  test_submission['pred'] = y_pred.astype(int)
  test_submission = test_submission.drop('Prediction', axis=1)
  test_submission = test_submission.rename(columns={'pred': 'Prediction'})
  return test_submission

In [54]:
def write_pred_file(output, type, path):
  predictions = pd.DataFrame(output)
  if type == 'tree':
    predictions['ID'] = np.arange(1,3501)
  elif type == 'heloc':
    predictions['ID'] = np.arange(3501, 3501+1046)
  elif type == 'higgs':
    predictions['ID'] = np.arange(4547, 4547+75000)
  predictions = predictions.rename({0: 'Prediction'}, axis=1)
  predictions = predictions[['ID', 'Prediction']]
  predictions.to_csv(path, index=False)
  return predictions

# 1. Load Data

In [13]:
tree_train = pd.read_csv("/content/drive/MyDrive/aml/data/covtype_train.csv")
heloc_train = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_train.csv")
higgs_train = pd.read_csv("/content/drive/MyDrive/aml/data/higgs_train.csv")

In [14]:
tree_test = pd.read_csv("/content/drive/MyDrive/aml/data/covtype_test.csv")
heloc_test = pd.read_csv("/content/drive/MyDrive/aml/data/heloc_test.csv")
higgs_test = pd.read_csv("/content/drive/MyDrive/aml/data/higgs_test.csv")

In [48]:
path = "/content/drive/MyDrive/aml/"

# 2. Prepare data sets

In [15]:
# final columns to drop
# first 3 because of missing values
# the rest because of multicorriliarity
drop_columns_heloc2 = [
                      ]

In [16]:
# final list of columns to drop
higgs_drop_columns2 = ['EventId']

In [17]:
data_train = make_datasets_ready(tree_train, heloc_train, drop_columns_heloc2, higgs_train, higgs_drop_columns2)

Preparing tree data...
Inital dimensions  (58101, 55)
New dimensions  (58101, 55)
Preparing tree data 2nd version...
Inital dimensions  (58101, 55)
New dimensions  (58101, 13)
Preparing heloc data...
Inital dimensions: (9413, 24)
Recode labels
Drop columns
New dimensions: (9413, 24)
Split in features and labels
Replace nas
Impute nas
Preparing higgs data...
Inital dimensions: (175000, 33)
New dimensions: (175000, 32)


In [18]:
data_test = make_datasets_ready_test(tree_test, heloc_test, drop_columns_heloc2, higgs_test, higgs_drop_columns2)

Preparing tree data...
Inital dimensions  (3500, 54)
New dimensions  (3500, 54)
Preparing tree data 2nd version...
Inital dimensions  (3500, 54)
New dimensions  (3500, 12)
Preparing heloc data...
Inital dimensions: (1046, 23)
Recode labels
Drop columns
New dimensions: (1046, 23)
Split in features and labels
Replace nas
Preparing higgs data...
Inital dimensions: (75000, 32)
New dimensions: (75000, 31)


# Predict

In [21]:
X, y = get_training_ready_data(data_train, full_train=True)

number of features  [55, 23, 31, 12]
number of rows  [58101, 8876, 175000, 58101]
labels:  [array([1, 2, 3, 4, 5, 6, 7]), array([0, 1]), array([0, 1]), array([1, 2, 3, 4, 5, 6, 7])]
number of labels  [7, 2, 2, 7]


## Higgs

In [33]:
rand_indexes = np.random.randint(0, X[2].shape[0], 10000)

In [34]:
X_sample = X[2][rand_indexes, :]
y_sample = y[2][rand_indexes]

In [43]:
# Define the parameter grid for Random Search CV
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}

# Initialize a Random Forest classifier
rf = RandomForestClassifier()

# Initialize GridSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, n_iter = 10, cv = 2, verbose=2, random_state=42, n_jobs = -1)

# Fit the grid search to the data
rf_random.fit(X_sample, y_sample)

# Best parameters and best score
print("Best parameters found: ", rf_random.best_params_)
print("Best cross-validation score: {:.4f}".format(rf_random.best_score_))





Fitting 2 folds for each of 10 candidates, totalling 20 fits
Best parameters found:  {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 30, 'bootstrap': False}
Best cross-validation score: 1.0000


In [44]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [200, 300, 400],  # Number of trees in the forest
    'max_depth': [20, 30, 40],  # Maximum depth of the tree
    'min_samples_split': [5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [False]  # Method of selecting samples for training each tree
}

# Initialize a Random Forest classifier
rf = RandomForestClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_sample, y_sample)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))


Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best parameters found:  {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation score: 1.0000


In [45]:
# Predict on test data with the best model
best_rf_model = grid_search.best_estimator_
pred_higgs = best_rf_model.predict(data_test[2].values)

In [55]:
write_pred_file(pred_higgs, 'higgs', path+"pred_higgs_rf.csv")

Unnamed: 0,ID,Prediction
0,4547,0
1,4548,0
2,4549,1
3,4550,0
4,4551,0
...,...,...
74995,79542,1
74996,79543,0
74997,79544,0
74998,79545,1


## Heloc

In [95]:
X_heloc = heloc_train.drop('RiskPerformance', axis=1)
y_heloc = heloc_train[['RiskPerformance']]

X_heloc = X_heloc.values
y_heloc = y_heloc.values

In [96]:
rand_indexes = np.random.randint(0, X_heloc.shape[0], 10000)

In [97]:
X_sample = X_heloc[rand_indexes, :]
y_sample = y_heloc[rand_indexes]

In [98]:
# Define the parameter grid for Random Search CV
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}

# Initialize a Random Forest classifier
rf = RandomForestClassifier()

# Initialize GridSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, n_iter = 10, cv = 2, verbose=2, random_state=42, n_jobs = -1)

# Fit the grid search to the data
rf_random.fit(X_sample, y_sample)

# Best parameters and best score
print("Best parameters found: ", rf_random.best_params_)
print("Best cross-validation score: {:.4f}".format(rf_random.best_score_))





Fitting 2 folds for each of 10 candidates, totalling 20 fits


  self.best_estimator_.fit(X, y, **fit_params)


Best parameters found:  {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 30, 'bootstrap': False}
Best cross-validation score: 0.8191


In [99]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [20, 30, 40],  # Maximum depth of the tree
    'min_samples_split': [1,2,3],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1,2,3],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [False]  # Method of selecting samples for training each tree
}

# Initialize a Random Forest classifier
rf = RandomForestClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_sample, y_sample)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))


Fitting 3 folds for each of 81 candidates, totalling 243 fits


81 fits failed out of a total of 243.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
81 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sk

Best parameters found:  {'bootstrap': False, 'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 200}
Best cross-validation score: 0.8451


In [100]:
param_grid = {
    'n_estimators': 200,  # Number of trees in the forest
    'max_depth': 40,  # Maximum depth of the tree
    'min_samples_split': 3,  # Minimum number of samples required to split an internal node
    'min_samples_leaf': 1,  # Minimum number of samples required to be at a leaf node
    'bootstrap': False  # Method of selecting samples for training each tree
}


In [101]:
clf = RandomForestClassifier(**param_grid)

In [102]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_heloc, y_heloc)

In [103]:
clf.fit(X_train_smote, y_train_smote)

In [104]:
pred_heloc = clf.predict(heloc_test.values)

In [105]:
write_pred_file(pred_heloc, 'heloc', path+"pred_heloc_rf_2.csv")

Unnamed: 0,ID,Prediction
0,3501,1
1,3502,0
2,3503,0
3,3504,0
4,3505,0
...,...,...
1041,4542,1
1042,4543,0
1043,4544,1
1044,4545,1


# Covtype

In [93]:
rand_indexes = np.random.randint(0, X[0].shape[0], 10000)

In [94]:
X_sample = X[0][rand_indexes, :]
y_sample = y[0][rand_indexes]

IndexError: ignored

In [None]:
# Define the parameter grid for Random Search CV
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}

# Initialize a Random Forest classifier
rf = RandomForestClassifier()

# Initialize GridSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, n_iter = 10, cv = 2, verbose=2, random_state=42, n_jobs = -1)

# Fit the grid search to the data
rf_random.fit(X_sample, y_sample)

# Best parameters and best score
print("Best parameters found: ", rf_random.best_params_)
print("Best cross-validation score: {:.4f}".format(rf_random.best_score_))





Fitting 2 folds for each of 10 candidates, totalling 20 fits


  self.best_estimator_.fit(X, y, **fit_params)


Best parameters found:  {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 30, 'bootstrap': False}
Best cross-validation score: 1.0000


In [None]:
# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [200, 300, 400],  # Number of trees in the forest
    'max_depth': [20, 30, 40],  # Maximum depth of the tree
    'min_samples_split': [5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [False]  # Method of selecting samples for training each tree
}

# Initialize a Random Forest classifier
rf = RandomForestClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_sample, y_sample)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.4f}".format(grid_search.best_score_))


Fitting 3 folds for each of 9 candidates, totalling 27 fits


  self.best_estimator_.fit(X, y, **fit_params)


Best parameters found:  {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}
Best cross-validation score: 1.0000


In [None]:
param_grid = {
    'n_estimators': 200,  # Number of trees in the forest
    'max_depth': 20,  # Maximum depth of the tree
    'min_samples_split': 5,  # Minimum number of samples required to split an internal node
    'min_samples_leaf': 4,  # Minimum number of samples required to be at a leaf node
    'bootstrap': False  # Method of selecting samples for training each tree
}


In [None]:
clf = RandomForestClassifier(**param_grid)

In [None]:
clf.fit(X_heloc, y_heloc)

  clf.fit(X_heloc, y_heloc)


In [None]:
pred_heloc = clf.predict(heloc_test.values)

In [None]:
write_pred_file(pred_heloc, 'heloc', path+"pred_heloc_rf.csv")

Unnamed: 0,ID,Prediction
0,3501,1
1,3502,0
2,3503,0
3,3504,0
4,3505,0
...,...,...
1041,4542,1
1042,4543,0
1043,4544,1
1044,4545,1
