# Naive Bayes Algorithm:
 Naive Bayes is a classification algorithm based on Bayes' theorem. It is called "naive" because it assumes that the features in a dataset are independent of each other given the class. This assumption rarely holds in real-world data, but it simplifies the computation and still gives good results in many cases.

 # Bayes Theorem:
 Bayes' theorem is a mathematical formula used for calculating conditional probability. It is defined as:
 $$P(A \mid B) = \frac{P(B \mid A)\,P(A)}{P(B)}$$

 where $A$ and $B$ are events and $P(B) \neq 0$.

In [7]:
# import libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import load_iris

In [8]:
# Fetch the iris dataset and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [11]:
%%time
# Build the Gaussian Naive Bayes classifier and train it on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
gnb = GaussianNB().fit(X_train, y_train)

# Predict class labels for the held-out test split
y_pred = gnb.predict(X_test)

# Evaluation: overall accuracy, confusion matrix and per-class metrics
for label, value in (
    ('Accuracy Score:', accuracy_score(y_test, y_pred)),
    ('Confusion Matrix:\n ', confusion_matrix(y_test, y_pred)),
    ('Classification Report:\n ', classification_report(y_test, y_pred)),
):
    print(label, value)

Accuracy Score: 1.0
Confusion Matrix:
  [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Classification Report:
                precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

CPU times: total: 31.2 ms
Wall time: 37.7 ms


In [12]:
%%time
# Build the Multinomial Naive Bayes classifier and train it on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
mnb = MultinomialNB().fit(X_train, y_train)

# Predict class labels for the held-out test split
y_pred = mnb.predict(X_test)

# Evaluation: overall accuracy, confusion matrix and per-class metrics
for label, value in (
    ('Accuracy Score:', accuracy_score(y_test, y_pred)),
    ('Confusion Matrix:\n ', confusion_matrix(y_test, y_pred)),
    ('Classification Report:\n ', classification_report(y_test, y_pred)),
):
    print(label, value)

Accuracy Score: 0.9
Confusion Matrix:
  [[10  0  0]
 [ 0  9  0]
 [ 0  3  8]]
Classification Report:
                precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.75      1.00      0.86         9
           2       1.00      0.73      0.84        11

    accuracy                           0.90        30
   macro avg       0.92      0.91      0.90        30
weighted avg       0.93      0.90      0.90        30

CPU times: total: 31.2 ms
Wall time: 39.9 ms


In [13]:
%%time
# model initialize
# NOTE: BernoulliNB models binary (0/1) features. With its default
# binarize=0.0 every strictly positive value is mapped to 1, so the positive,
# continuous iris measurements all collapse to the same constant and the
# classifier can effectively only fall back on the class priors. It is kept
# here purely for comparison with GaussianNB / MultinomialNB.
bnb= BernoulliNB()

# train the model
bnb.fit(X_train, y_train)

# predict the test data
y_pred= bnb.predict(X_test)

# evaluation
print('Accuracy Score:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:\n ',confusion_matrix(y_test, y_pred))
# Classes that are never predicted have an undefined precision; zero_division=0
# reports them as 0.00 without emitting an UndefinedMetricWarning per metric.
print('Classification Report:\n ', classification_report(y_test, y_pred, zero_division=0))



Accuracy Score: 0.3
Confusion Matrix:
  [[ 0 10  0]
 [ 0  9  0]
 [ 0 11  0]]
Classification Report:
                precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.30      1.00      0.46         9
           2       0.00      0.00      0.00        11

    accuracy                           0.30        30
   macro avg       0.10      0.33      0.15        30
weighted avg       0.09      0.30      0.14        30

CPU times: total: 46.9 ms
Wall time: 51.2 ms


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# Select the best model with best hyperparameters:

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# train/test split of the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [30]:
# Load seaborn's 'tips' example dataset into a DataFrame
df = sns.load_dataset('tips')

# Preview the first five rows
print(df.head(5))

# Last expression of the cell: the notebook displays the column index
df.columns

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regressor Tasks:

In [31]:
# Separate the target column ('tip') from the predictor columns
y = df['tip']
X = df.drop(columns='tip')

# Integer-encode every categorical predictor; the single encoder instance is
# refit on each column in turn, so afterwards it only remembers the last one
le = LabelEncoder()
for col in ('sex', 'smoker', 'day', 'time'):
    X[col] = le.fit_transform(X[col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [34]:
# Candidate regressors to compare, keyed by the name used in the report.
# Estimators with internal randomness are seeded (same seed as the
# train/test split) so that the scores and the ranking are reproducible.
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'XGBRegressor': XGBRegressor(random_state=42),
}

# Train each model and record its Mean Absolute Error on the test split
model_scores = []
for name, model in models.items():
    # Fit the current model on the training data
    model.fit(X_train, y_train)

    # Predict the held-out test data with the current model
    y_pred = model.predict(X_test)

    # Lower MAE is better
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))

# Rank the models from best (lowest MAE) to worst and print the report.
# Distinct loop names avoid rebinding `model` from an estimator to a tuple.
sorted_models = sorted(model_scores, key=lambda x: x[1])
for model_name, mae in sorted_models:
    print('Mean Absolute Error for', f'{model_name} is {mae:.2f}')

Mean Absolute Error for SVR is 0.57
Mean Absolute Error for Linear Regression is 0.67
Mean Absolute Error for XGBRegressor is 0.67
Mean Absolute Error for GradientBoostingRegressor is 0.72
Mean Absolute Error for KNeighborsRegressor is 0.73
Mean Absolute Error for RandomForestRegressor is 0.77
Mean Absolute Error for Decision Tree is 0.89
