# NAIVE BAYES CLASSIFIER

## Understand the data

### Automobile accidents

The file Accidents.csv contains information on 42,183 actual automobile accidents in 2001 in the United States that involved one of three levels of injury: NO INJURY, INJURY, or FATALITY. For each accident, additional information is recorded, such as day of week, weather conditions, and road type. <br>
A firm might be interested in developing a system for quickly classifying the severity of an
accident based on initial reports and associated data in the system (some of which rely
on GPS-assisted reporting). <br>

## Objective
* Predict whether an accident will be a serious one, based on the provided data.

## Implement the model using Python

#### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix,auc,roc_auc_score
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

#### Load Accidents Dataset

In [None]:
# Read the accident records into a DataFrame; the path is relative to the
# notebook's working directory.
# NOTE(review): the description above names the file "Accidents.csv" while this
# reads 'accidents.csv' — confirm the case on case-sensitive filesystems.
df=pd.read_csv('accidents.csv')
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

#### Number of rows and columns in the data

In [None]:
df.shape

#### Summary of the data

In [None]:
df.describe()

#### Check whether each column is numeric or categorical

In [None]:
df.info()

In [None]:
# Cast the listed columns (including the MAX_SEV target) to pandas'
# 'category' dtype, one column at a time and in this order.
categorical_columns = [
    'RushHour',
    'WRK_ZONE',
    'WKDY',
    'INT_HWY',
    'LGTCON_day',
    'LEVEL',
    'SUR_COND_dry',
    'TRAF_two_way',
    'WEATHER_adverse',
    'MAX_SEV',
]
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype('category')

#### Check if any Null values?

In [None]:
df.isnull().sum()

#### NO Null Values in the data

In [None]:
df['MAX_SEV'].value_counts(normalize=True)

#### Grouping Based On Outcome

In [None]:
# Mean of every numeric column within each MAX_SEV class.
# numeric_only=True is spelled out because several columns were cast to
# 'category' earlier in this notebook; recent pandas releases raise a
# TypeError when asked for the mean of a categorical column instead of
# silently dropping it.
df.groupby("MAX_SEV").mean(numeric_only=True)

### Histogram on the Data

In [None]:
df.hist(figsize=(10,8), grid=False)
plt.show()

#### BOXPLOT on the data (Outlier Detection)

In [None]:
df.plot(kind= 'box',subplots=True, layout=(3,3), sharex=False, sharey=False, figsize=(10,8))

#### Features

In [None]:
# Separate the predictors (x) from the target (y).
# y is kept as a one-column DataFrame rather than a Series.
target_columns = ['MAX_SEV']
x = df.drop(columns=target_columns)
y = df[target_columns]

In [None]:
## Convert x to dummy (indicator) variables.
## drop_first=True omits the first level of every encoded column.
x=pd.get_dummies(x, drop_first=True)

In [None]:
##Train test split: hold out 30% of the rows for testing
from sklearn.model_selection import train_test_split
# NOTE(review): this seeds NumPy's global RNG with 7, but the split below is
# driven by its own random_state=123, so `seed` does not affect the partition.
seed = 7
np.random.seed(seed)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state = 123)

#### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sc=StandardScaler()

In [None]:
# Learn the scaling parameters from the training rows only, then apply the
# same fitted transform to the test rows.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#### Classifier Invoking

In [None]:
classifier = GaussianNB()
classifier.fit(X_train, y_train)


In [None]:
y_pred_G = classifier.predict(X_test)

#### Evaluation Metrics

In [None]:
cm_G = confusion_matrix(y_test, y_pred_G)
cm_G

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Smoothing parameter handed to BernoulliNB.
# NOTE(review): no tuning step is visible in this notebook, so "optimal" is
# presumably carried over from elsewhere — confirm.
optimal_alpha = 1
NB_optimal = BernoulliNB(alpha=optimal_alpha)

# fitting the model
# NOTE(review): X_train was standardized earlier and BernoulliNB binarizes its
# inputs (default threshold 0.0), so each feature is effectively split around
# its training mean — confirm this is intended.
NB_optimal.fit(X_train, y_train)

In [None]:
y_pred_B = NB_optimal.predict(X_test)

In [None]:
NB_optimal.feature_count_

In [None]:
sorted(zip(NB_optimal.feature_count_[1], x.columns), reverse=True)

In [None]:
# Twelve largest (count, column name) pairs taken from row index 1 of
# feature_count_, sorted in descending order.
topn_class2 = sorted(zip(NB_optimal.feature_count_[1], x.columns),reverse=True)[:12]
topn_class2

In [None]:
# Twelve largest (count, column name) pairs taken from row index 0 of
# feature_count_, sorted in descending order.
topn_class1 = sorted(zip(NB_optimal.feature_count_[0], x.columns),reverse=True)[:12]
topn_class1

In [None]:
# Confusion matrices on the test set: Gaussian model first, Bernoulli second.
cm_G, cm_B = (
    confusion_matrix(y_test, predicted) for predicted in (y_pred_G, y_pred_B)
)

for matrix in (cm_G, cm_B):
    print(matrix)

In [None]:
# Test-set accuracy: Gaussian model first, Bernoulli second.
for predicted in (y_pred_G, y_pred_B):
    print(accuracy_score(y_test, predicted))

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix



In [None]:
y_scores = NB_optimal.predict_proba(X_test)[:, 1]

In [None]:
p, r, thresholds = precision_recall_curve(y_test, y_scores)

In [None]:
def adjusted_classes(y_scores, t):
    """
    Turn scores into hard 0/1 labels using the cut-off t.

    A score greater than or equal to t becomes 1; anything else becomes 0.
    Returns a plain list with one label per score, in the same order.
    Only meaningful for binary classification problems.
    """
    labels = []
    for score in y_scores:
        if score >= t:
            labels.append(1)
        else:
            labels.append(0)
    return labels

def precision_recall_threshold(p, r, thresholds, t=0.5):
    """
    Print the confusion matrix obtained when scores are re-labelled at the
    classifier threshold (t).

    The p, r and thresholds arguments are accepted but not used by the body;
    the scores and true labels are read from the notebook-level names
    y_scores and y_test. Nothing is returned.
    """
    # Re-label the scores at threshold t via adjusted_classes, then tabulate
    # the result against the true labels.
    adjusted_predictions = adjusted_classes(y_scores, t)
    matrix = confusion_matrix(y_test, adjusted_predictions)
    labelled_matrix = pd.DataFrame(
        matrix,
        index=['neg', 'pos'],
        columns=['pred_neg', 'pred_pos'],
    )
    print(labelled_matrix)
    
    
   

In [None]:
precision_recall_threshold(p, r, thresholds, 0.17)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """
    Draw precision and recall against the decision threshold on one figure.

    The last element of precisions and of recalls is dropped before plotting
    (both are sliced with [:-1]) and the remainder is plotted over thresholds.

    Modified from:
    Hands-On Machine learning with Scikit-Learn
    and TensorFlow; p.89
    """
    plt.figure(figsize=(8, 8))
    plt.title("Precision and Recall Scores as a function of the decision threshold")

    # (values, line format, legend label) for each curve, in drawing order
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_format, legend_label in curves:
        plt.plot(thresholds, values[:-1], line_format, label=legend_label)

    plt.xlabel("Decision Threshold")
    plt.ylabel("Score")
    plt.legend(loc='best')

In [None]:
# use the same p, r, thresholds that were previously calculated
plot_precision_recall_vs_threshold(p, r, thresholds)

In [None]:
# store the predicted probabilities for class 1
y_pred_prob = NB_optimal.predict_proba(X_test)[:, 1]
# NOTE(review): the slice starts at index 1, so this shows 19 values and skips
# the first prediction — use [:20] if the first twenty were intended.
y_pred_prob[1:20]

In [None]:
# histogram of predicted probabilities (class 1, from y_pred_prob)

# 20 bins
plt.hist(y_pred_prob, bins=20)

# x-axis limit from 0 to 1
plt.xlim(0,1)
plt.title('Histogram of predicted probabilities')
# Label refers to class 1 of this accident model; the previous text mentioned
# "diabetes", which does not belong to this dataset.
plt.xlabel('Predicted probability of class 1')
plt.ylabel('Frequency')

In [None]:
## Changing the cut off value for prediction
# For each cut-off, label a row 1 when its predicted probability in column 1
# exceeds the cut-off, then print the accuracy and confusion matrix.
# NOTE(review): this sweeps `classifier` (the GaussianNB model), whereas the
# precision/recall cells use NB_optimal — confirm that is intended.
pred_proba_df = pd.DataFrame(classifier.predict_proba(X_test))
threshold_list = [0.3,0.4,0.41, 0.45, 0.5,0.55, 0.6]
# The true labels are the same for every threshold, so flatten them once.
y_true = np.asarray(y_test).ravel()
for i in threshold_list:
    print ('\n******** For i = {} ******'.format(i))
    # Vectorized comparison: 1 where probability > i, else 0. Replaces the
    # elementwise DataFrame.applymap call, which recent pandas deprecates.
    y_test_pred = (pred_proba_df > i).astype(int)
    y_pred_class1 = y_test_pred.iloc[:, 1].values
    test_accuracy = metrics.accuracy_score(y_true, y_pred_class1)
    print('Our testing accuracy is {:.2f}'.format(test_accuracy))

    print(confusion_matrix(y_true, y_pred_class1))