# Naive Bayes Algorithm:
 Naive Bayes is a classification algorithm based on Bayes' Theorem. It is called "naive" because it assumes that the features in a dataset are independent of each other given the class. This assumption rarely holds in real life, but it simplifies the computation and gives good results in many cases.

 # Bayes Theorem:
 Bayes' Theorem is a mathematical formula used for calculating conditional probability. It is defined as:
 $$P(A|B) = \frac{P(B|A)\,P(A)}{P(B)}$$

 where $A$ and $B$ are events and $P(B) \neq 0$.


In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris

In [6]:
# Fetch the bundled iris dataset and pull out the feature matrix and labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [19]:
%%time
# model
gnb= GaussianNB()

# train the model
gnb.fit(X_train, y_train)

# prediction
y_pred= gnb.predict(X_test)

# evaluate the model
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))

Accuracy Score: 1.0
Confusion Matrix: 
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

CPU times: total: 31.2 ms
Wall time: 37.9 ms


In [17]:
%%time
# model
mul= MultinomialNB()

# train the model
mul.fit(X_train, y_train)

# prediction
y_pred= mul.predict(X_test)

# evaluate the model
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n ', confusion_matrix(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))

Accuracy Score: 0.9
Confusion Matrix: 
  [[10  0  0]
 [ 0  9  0]
 [ 0  3  8]]
Classification Report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.75      1.00      0.86         9
           2       1.00      0.73      0.84        11

    accuracy                           0.90        30
   macro avg       0.92      0.91      0.90        30
weighted avg       0.93      0.90      0.90        30

CPU times: total: 31.2 ms
Wall time: 44.4 ms


In [18]:
%%time
# model
bin= BernoulliNB()

# train the model
bin.fit(X_train, y_train)

# prediction
y_pred= bin.predict(X_test)

# evaluate the model
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
print('Classification Report:', classification_report(y_test, y_pred))

Accuracy Score: 0.3
Confusion Matrix: 
 [[ 0 10  0]
 [ 0  9  0]
 [ 0 11  0]]
Classification Report:               precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.30      1.00      0.46         9
           2       0.00      0.00      0.00        11

    accuracy                           0.30        30
   macro avg       0.10      0.33      0.15        30
weighted avg       0.09      0.30      0.14        30

CPU times: total: 46.9 ms
Wall time: 43.3 ms


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Compare the models and select the best one:

In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRFRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# import grid search cv for cv:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer

In [31]:
# load the dataset
df= sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [32]:
# Show the column labels of the loaded DataFrame
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Tasks:

In [33]:
# Select the features (everything except 'tip') and the target ('tip')
X = df.drop('tip', axis=1)
y = df['tip']

# Integer-encode each categorical column with its own LabelEncoder and keep
# the fitted encoders, so every mapping can be inspected or inverted later.
# (Refitting one shared encoder would discard all but the last mapping.)
categorical_cols = ['sex', 'smoker', 'day', 'time']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(df[col])
    encoders[col] = le

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
# Candidate regressors, keyed by the label used in the report below.
# The randomized estimators get a fixed seed so reruns give the same ranking.
models = {
    'Linear Regressor': LinearRegression(),
    'SVR': SVR(),
    'K-Nearest_Neighbour': KNeighborsRegressor(),
    'decision_tree': DecisionTreeRegressor(random_state=42),
    'Random_Forest_tree': RandomForestRegressor(random_state=42),
    'GradientBoostRegressor': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRFRegressor(random_state=42),
}

# Train each model and score it on the held-out data.
# The loop variable is deliberately not called ``models``/``model`` so the
# dictionary is not overwritten and the cell can be rerun.
model_scores = []
for name, estimator in models.items():
    # fit the estimator on the training data
    estimator.fit(X_train, y_train)

    # predict on the test data and record the mean absolute error
    y_pred = estimator.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, mae))

# Rank the models from lowest (best) to highest mean absolute error
sorted_models = sorted(model_scores, key=lambda pair: pair[1])
for name, mae in sorted_models:
    print('Mean Absolute Error', f'{name} is {mae: .2f}')


Mean Absolute Error SVR is  0.57
Mean Absolute Error Linear Regressor is  0.67
Mean Absolute Error K-Nearest_Neighbour is  0.73
Mean Absolute Error GradientBoostRegressor is  0.73
Mean Absolute Error XGBoost is  0.75
Mean Absolute Error Random_Forest_tree is  0.79
Mean Absolute Error decision_tree is  0.95
