# **`RANDOM FOREST CLASSIFIER IMPLEMENTATION WITH PIPELINE AND HYPERPARAMETER TUNING`**

In [1]:
# Load seaborn's "tips" example dataset into a DataFrame and preview the first rows.
import seaborn as sns
df = sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
# Inspect the distinct labels of `time`, the column used later as the classification target.
df["time"].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [5]:
# Preview again before preprocessing (the output is identical to the first preview).
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [18]:
## Preprocessing plan:
##   1. Handle missing values
##   2. Encode categorical features
##   3. Handle outliers (not required for this dataset)
##   4. Scale numerical features
##   5. Automate all of the above in a single pipeline

In [6]:
# Check column dtypes and non-null counts before choosing imputation and encoding steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [7]:
from sklearn.preprocessing import LabelEncoder
# LabelEncoder maps each distinct label to an integer code; it is applied to the target column only.
encoder = LabelEncoder()

In [9]:
# Replace the categorical target with integer codes, overwriting the column in place.
# NOTE(review): LabelEncoder numbers classes in sorted order, so presumably
# Dinner -> 0 and Lunch -> 1 — confirm with encoder.classes_.
df["time"] = encoder.fit_transform(df["time"])

In [11]:
# Class balance of the encoded target: the two classes are imbalanced (176 vs 68 rows).
df.time.value_counts()

0    176
1     68
Name: time, dtype: int64

In [12]:
## independent and dependent features
# Features are every column except the target; the target is the encoded `time` column.
X, y = df.drop(columns=["time"]), df["time"]

In [14]:
# Sanity check: the target column is no longer among the features.
X.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.5,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4


In [15]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Confirm the split sizes: 195 training rows and 49 test rows, 6 feature columns each.
X_train.shape , X_test.shape

((195, 6), (49, 6))

In [25]:
from sklearn.impute import SimpleImputer ## handling missing values
from sklearn.preprocessing import OneHotEncoder## handling categorical features
## handling outliers not required here in this dataset
from sklearn.preprocessing import StandardScaler## Feature scaling
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
## Automating the entire process

In [21]:
# Split the *feature* columns by dtype so each group gets its own preprocessing.
# Deriving the lists from X (which never contains the target) removes the need to
# special-case "time" and keeps the result correct whether or not the target
# column has already been label-encoded when this cell runs.
categorical_cols = X.select_dtypes(include="category").columns.tolist()
numerical_cols = X.select_dtypes(exclude="category").columns.tolist()


In [26]:
## Feature Engineering Automation
# Numerical columns: fill missing values with the median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),  ## missing values
        ("scaler", StandardScaler()),  ## feature scaling
    ]
)
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" encodes a category that was
# absent from the fitted data as all zeros instead of raising at transform time.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),  ## handling missing values
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),  ## categorical features to numerical
    ]
)



In [27]:
# Route each column group through its matching sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_cols),
        ("cat_pipeline", cat_pipeline, categorical_cols),
    ]
)

In [29]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split so no test-set statistics leak into training.
# NOTE(review): both names are rebound to the transformed output, so re-running
# this cell likely fails unless the train/test split cell is executed again first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [41]:
## Automate my model training process
# Candidate estimators keyed by display name. The tree-based models are seeded
# (same seed as the train/test split) so repeated runs report the same scores.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVC": SVC(),
}

In [32]:
from sklearn.metrics import accuracy_score

In [33]:
def evaluate_model(X_train, y_train, X_test, y_test, models):
    """Fit every candidate model and score it on the held-out test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to each model's ``fit``.
    X_test, y_test
        Test features passed to ``predict`` and the true labels the
        predictions are compared against.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.

    Returns
    -------
    dict
        ``{name: accuracy}`` holding the test-set accuracy for every entry of
        ``models``, in the same order.
    """
    report = {}
    # Iterate name/estimator pairs directly instead of rebuilding the key and
    # value lists on every pass.
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        report[name] = accuracy_score(y_test, y_test_pred)

    return report


In [42]:
# Fit and score all candidate models on the preprocessed splits (the estimators in `models` are fitted in place).
evaluate_model(X_train,y_train,X_test,y_test,models)

{'Random Forest': 0.9591836734693877,
 'Decision Tree': 0.9387755102040817,
 'SVC': 0.9591836734693877}

In [43]:
# Base estimator for the hyperparameter search, seeded so search results are repeatable.
# The variable name is kept as-is because the search cell references it.
classifer = RandomForestClassifier(random_state=42)

In [44]:
## Hyperparameter tuning
# Search space for the random forest: tree depth cap (None = no limit),
# number of trees, and the split-quality criterion.
params = dict(
    max_depth=[3, 5, 10, None],
    n_estimators=[100, 200, 300],
    criterion=["gini", "entropy"],
)

In [45]:
from sklearn.model_selection import RandomizedSearchCV

In [46]:
# Randomised search over `params` with 5-fold cross-validation. n_iter is left at
# its default (10 candidates); random_state pins which candidates are drawn so
# reruns evaluate the same combinations.
cv = RandomizedSearchCV(
    classifer,
    param_distributions=params,
    cv=5,
    scoring="accuracy",
    verbose=3,
    random_state=42,
)

In [47]:
# Run the search on the preprocessed training split (50 fits: 10 candidates x 5 folds).
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=None, n_estimators=300;, score=0.974 total time=   0.6s
[CV 2/5] END criterion=gini, max_depth=None, n_estimators=300;, score=0.923 total time=   0.6s
[CV 3/5] END criterion=gini, max_depth=None, n_estimators=300;, score=1.000 total time=   0.6s
[CV 4/5] END criterion=gini, max_depth=None, n_estimators=300;, score=0.949 total time=   0.6s
[CV 5/5] END criterion=gini, max_depth=None, n_estimators=300;, score=0.923 total time=   0.6s
[CV 1/5] END criterion=entropy, max_depth=None, n_estimators=200;, score=0.974 total time=   0.4s
[CV 2/5] END criterion=entropy, max_depth=None, n_estimators=200;, score=0.923 total time=   0.4s
[CV 3/5] END criterion=entropy, max_depth=None, n_estimators=200;, score=1.000 total time=   0.4s
[CV 4/5] END criterion=entropy, max_depth=None, n_estimators=200;, score=0.949 total time=   0.4s
[CV 5/5] END criterion=entropy, max_depth=None, n_estimators=200;, score

In [None]:
# Internal Assignment 
# Decision Regression