# Random Forests (RF)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from warnings import filterwarnings
# Install a blanket "ignore" filter: every warning category is silenced for the rest of the
# session, including deprecation notices raised by pandas and scikit-learn.
filterwarnings("ignore")

In [2]:
# Path of the CSV file loaded with pd.read_csv; it is relative, so it resolves against the
# kernel's current working directory.
data_file = "diabetes.csv"

In [3]:
# Read the dataset into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=data_file)
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Separate the "Outcome" target from the remaining feature columns.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Reserve 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3232,
)

In [5]:
# Baseline model: a random forest with default hyperparameters, fitted on the training split.
# random_state pins the bootstrap sampling and per-split feature selection so the baseline
# accuracy is reproducible across kernel restarts; 3232 matches the seed used for the
# train/test split.
rf_model = RandomForestClassifier(random_state=3232).fit(X_train, y_train)

# Prediction

In [6]:
# Score the baseline forest on the held-out rows: fraction of correctly predicted labels.
y_pred = rf_model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7792207792207793

# Model Tuning

In [7]:
# Hyperparameter grid searched exhaustively: 4 * 3 * 3 * 3 = 108 candidate combinations.
rf_params = {
    "max_depth": [2, 5, 8, 10],
    "max_features": [2, 5, 8],
    "n_estimators": [100, 500, 1000],
    "min_samples_split": [2, 4, 5],
}

# Seed the estimator so candidates differ only in their hyperparameters, not in random
# bootstrap draws; without it best_params_ can change from one run to the next.
# NOTE: this rebinds rf_model to an *unfitted* estimator (GridSearchCV fits clones of it),
# so re-run the baseline cell before calling rf_model.predict again.
rf_model = RandomForestClassifier(random_state=3232)

# 10-fold cross-validation per candidate (1080 fits plus the final refit), using all cores.
rf_cv_model = GridSearchCV(rf_model, rf_params, cv=10, n_jobs=-1).fit(X_train, y_train)

In [8]:
# Hyperparameter combination selected by the grid search fitted on the training split.
rf_cv_model.best_params_

{'max_depth': 5,
 'max_features': 2,
 'min_samples_split': 4,
 'n_estimators': 500}

In [9]:
# Refit a forest with the hyperparameters the grid search selected. They are read from
# rf_cv_model.best_params_ instead of being copied by hand, so this cell cannot drift out
# of sync when the search is re-run; random_state keeps the reported accuracy reproducible.
rf_tuned = RandomForestClassifier(**rf_cv_model.best_params_, random_state=3232).fit(X_train, y_train)

# Accuracy of the tuned forest on the held-out test rows.
y_pred = rf_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

0.7792207792207793