In [16]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler

### Random Forest on Heloc ###

In [4]:
data= pd.read_csv('heloc_train.csv')
df= pd.DataFrame(data)
df_encoded = pd.get_dummies(df, columns=['RiskPerformance'], drop_first=True)
df_encoded["RiskPerformance_Good"] = df_encoded["RiskPerformance_Good"].astype(int)

In [7]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

x = df_encoded.drop('RiskPerformance_Good', axis=1)
y= df_encoded['RiskPerformance_Good']

selector = RFE(RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1))
selector = selector.fit(x,y)

# Print the selected features
selected_features = [index for index, value in enumerate(selector.get_support()) if value]
print(f"Selected features: {selected_features}")

x_train, x_test, y_train, y_test = train_test_split(x.iloc[:, selected_features], y, test_size=0.1, random_state=42)

# Display the shapes of the resulting sets
print("X_train shape:", x_train.shape)
print("X_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

#balance the dataset
from imblearn.over_sampling import SMOTE
from collections import Counter
counter = Counter(y_train)
#print('Before', counter)
smt = SMOTE()
xtrain, ytrain = smt.fit_resample(x_train, y_train)
counter = Counter(y_train)

from sklearn.metrics import classification_report

# Base model to tune
base_model = RandomForestClassifier(random_state=42) 


from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distributions
param_dist = {
    'n_estimators': randint(100, 1000),
    'max_features': ['sqrt', 'log2'],
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
}

# Create the randomized search
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=100,  
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Use all available processors
    random_state=42
)

# Fit the randomized search
random_search.fit(x,y)

# Print the best hyperparameters
print("Best hyperparameters:", random_search.best_params_)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Create a Random Forest classifier
random_forest_clf = RandomForestClassifier(max_depth=13,
                                           max_features='sqrt',
                                           min_samples_leaf=3, 
                                           min_samples_split=6, 
                                           n_estimators=135,
                                           random_state=42)

# Fit the Random Forest classifier on the training data
random_forest_clf.fit(xtrain, ytrain) #fit model on only selected features

Selected features: [0, 1, 2, 3, 4, 7, 11, 13, 14, 17, 22]
X_train shape: (8471, 11)
X_test shape: (942, 11)
y_train shape: (8471,)
y_test shape: (942,)
Best hyperparameters: {'max_depth': 13, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 135}


In [None]:
inpTest = pd.read_csv('heloc_test.csv')

inp_df = pd.DataFrame(inpTest)

inp_selected = inp_df.iloc[:, selected_features]

y_prob_inp = random_forest_clf.predict_proba(inp_selected)[:, 1]
custom_threshold = 0.43
binary_predictions = (y_prob_inp > custom_threshold).astype(int)
pd.DataFrame(binary_predictions).to_csv('AMLHeloc_RF_predictions_1412.csv', 
                                               index=True, 
                                               header=["prediction"])

### Random forest on CovType ###

In [8]:
data = pd.read_csv("covtype_train.csv")
data.drop(columns=['Soil_Type15'])
x, y = data.drop("Cover_Type", axis=1), data["Cover_Type"]
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.1, random_state=23)

counter = Counter(ytrain)
#print('Before', counter)

smt = SMOTE()
xtrain, ytrain = smt.fit_resample(xtrain, ytrain)
counter = Counter(ytrain)

categoricals = []
numericals = []
for col in x.columns:
    if col[:9]=="Soil_Type" or col[:15]=='Wilderness_Area':
        categoricals.append(col)
    else:
        numericals.append(col)
        
from sklearn.preprocessing import RobustScaler
ss = RobustScaler()

xtrain[numericals] = ss.fit_transform(xtrain[numericals])
xtest[numericals] = ss.transform(xtest[numericals])





Best hyperparameters: {'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 289}


In [None]:
random_search.fit(x,y)

print("Best hyperparameters:", random_search.best_params_)

In [9]:
random_forest_clf = RandomForestClassifier(max_depth=19,
                                           max_features='sqrt',
                                           min_samples_leaf=2, 
                                           min_samples_split=8, 
                                           n_estimators=289,
                                           random_state=42)

# Fit the Random Forest classifier on the training data
random_forest_clf.fit(xtrain, ytrain) #fit model on only selected features

In [10]:
inpTest = pd.read_csv('covtype_test.csv')

inp_df = pd.DataFrame(inpTest)

inp_selected = inp_df.iloc[:, selected_features]

y_prob_inp = random_forest_clf.predict_proba(inp_selected)[:, 1]
custom_threshold = 0.43
binary_predictions = (y_prob_inp > custom_threshold).astype(int)
pd.DataFrame(binary_predictions).to_csv('AMLCovType_RF_predictions_1412.csv', 
                                               index=True, 
                                               header=["prediction"])

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Hillshade_3pm
- Hillshade_9am
- Horizontal_Distance_To_Fire_Points
- Horizontal_Distance_To_Roadways
- Soil_Type10
- ...


### Random Forest on Higgs ###

In [13]:
df_higgs = pd.read_csv ("higgs_train.csv")
higgs_test = pd.read_csv ("higgs_test.csv")
data = df_higgs
data = data.drop(['EventId'], axis=1)
test_data = higgs_test.drop(['EventId'], axis=1)
data['Label'] = data['Label'].replace({'b': 0, 's': 1})

In [None]:

X_train = data.drop('Label', axis=1).values  # Replace 'label' with your target column name
y_train = data['Label'].values
X_test = test_data.values

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

random_search.fit(X_train_scaled,y_train)

print("Best hyperparameters:", random_search.best_params_)

In [None]:
random_forest_clf = RandomForestClassifier(max_depth=19,
                                           max_features='sqrt',
                                           min_samples_leaf=2, 
                                           min_samples_split=8, 
                                           n_estimators=289,
                                           random_state=42)

# Fit the Random Forest classifier on the training data
random_forest_clf.fit(X_train_scaled, y_train) #fit model on only selected features

In [None]:
inpTest = pd.read_csv('higgs_test.csv')

inp_df = pd.DataFrame(inpTest)

inp_selected = inp_df.iloc[:, selected_features]

y_prob_inp = random_forest_clf.predict_proba(inp_selected)[:, 1]
custom_threshold = 0.43
binary_predictions = (y_prob_inp > custom_threshold).astype(int)
pd.DataFrame(binary_predictions).to_csv('AMLHeloc_RF_predictions_1412.csv', 
                                               index=True, 
                                               header=["prediction"])