In [None]:
#---------------------------Header Comment------------------------------#
# implements a naive bayes model with tuning to get best accuracy       #
#                                                                       #
# authors involved: Emmanuelle Trudel                                   #
#-----------------------------------------------------------------------#

In [14]:
#libraries
import pandas as pd
import sys 
import numpy as np # linear algebra
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt # this is used for the plot the graph 
import pickle
import time
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import learning_curve


In [4]:
##original datasets used

testx=pd.read_csv("data files/testx.csv",index_col=0);
testy=pd.read_csv("data files/testy.csv",index_col=0);
trainx=pd.read_csv("data files/trainx.csv",index_col=0);
trainy=pd.read_csv("data files/trainy.csv",index_col=0);

In [4]:
trainx.head()

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),precip
0,27.777778,26.966667,0.31,4.186,270.0,16.1,1016.8,1.0
1,12.072222,12.072222,0.64,13.685,210.0,16.1,1016.88,1.0
2,17.55,17.55,0.74,7.7763,171.0,9.982,1013.55,1.0
3,15.088889,15.088889,0.93,3.4615,97.0,11.4471,1006.14,1.0
4,22.233333,22.233333,0.76,5.7155,57.0,6.1019,1014.39,1.0


In [5]:
trainy.head()

Unnamed: 0,summary
0,1
1,1
2,1
3,2
4,1


In [7]:
##first attempt at implementing model
classifier=GaussianNB();
classifier.fit(trainx,trainy.values.ravel()); ##ravel flattens the array for ytrain
prediction=classifier.predict(testx);

In [8]:
print(confusion_matrix(testy,prediction))
print(classification_report(testy,prediction))

[[8360 2684 1528   65  124   14    0   21]
 [5223 3559 2185   66  162   39    1   18]
 [1007 2329 2674   67  465   29    0    7]
 [2144  887  936  296  122    4    0    0]
 [   1   12  144   51 2596    0    0    0]
 [   0    1   20    1    0  143   12   18]
 [   3   16    2    0    0  102   19   65]
 [   6    1    0    1    0   40    8   79]]
              precision    recall  f1-score   support

           1       0.50      0.65      0.57     12796
           2       0.38      0.32      0.34     11253
           3       0.36      0.41      0.38      6578
           4       0.54      0.07      0.12      4389
           5       0.75      0.93      0.83      2804
           6       0.39      0.73      0.51       195
           7       0.47      0.09      0.15       207
           8       0.38      0.59      0.46       135

    accuracy                           0.46     38357
   macro avg       0.47      0.47      0.42     38357
weighted avg       0.46      0.46      0.43     38357



In [5]:
##other preprocessed data to try
testx_std=pd.read_csv("data files/std_testx.csv",index_col=0);
trainx_std=pd.read_csv("data files/std_trainx.csv",index_col=0);
testx_norm=pd.read_csv("data files/norm_testx.csv",index_col=0);
trainx_norm=pd.read_csv("data files/norm_trainx.csv",index_col=0);


In [53]:
##first attempt at changing var smoothing value
classifier=GaussianNB(var_smoothing=0);
classifier.fit(trainx,trainy.values.ravel());
prediction=classifier.predict(testx);

print(classification_report(testy,prediction))

              precision    recall  f1-score   support

           1       0.52      0.59      0.55     12646
           2       0.39      0.27      0.32     11207
           3       0.34      0.56      0.42      6710
           4       0.35      0.06      0.11      4351
           5       0.66      0.94      0.77      2871

    accuracy                           0.46     37785
   macro avg       0.45      0.48      0.44     37785
weighted avg       0.44      0.46      0.43     37785



In [20]:
classifier=GaussianNB();
classifier.fit(trainx_std,trainy.values.ravel());
prediction=classifier.predict(testx_std);

print(classification_report(testy,prediction))

              precision    recall  f1-score   support

           1       0.52      0.59      0.55     12646
           2       0.39      0.27      0.32     11207
           3       0.34      0.56      0.42      6710
           4       0.35      0.06      0.11      4351
           5       0.66      0.94      0.77      2871

    accuracy                           0.46     37785
   macro avg       0.45      0.48      0.44     37785
weighted avg       0.44      0.46      0.43     37785



In [54]:
classifier=GaussianNB(var_smoothing=1e-10);
classifier.fit(trainx_norm,trainy.values.ravel());
prediction=classifier.predict(testx_norm);

print(classification_report(testy,prediction))

              precision    recall  f1-score   support

           1       0.52      0.59      0.55     12646
           2       0.39      0.27      0.32     11207
           3       0.34      0.56      0.42      6710
           4       0.35      0.06      0.11      4351
           5       0.66      0.94      0.77      2871

    accuracy                           0.46     37785
   macro avg       0.45      0.48      0.44     37785
weighted avg       0.44      0.46      0.43     37785



In [13]:
##using grid search to find the best var smoothing value to increase accuracy

start=time.time()

nb=GaussianNB();

params={'var_smoothing':np.logspace(0,-15,num=1000)}## log space like num space but on log scale

gridsearch=GridSearchCV(nb,param_grid=params,cv=10,scoring='accuracy'); ##attempts to find best accuracy while performing gridseach

newdata=PowerTransformer().fit_transform(trainx) ##makes data more Gaussian

gridsearch.fit(newdata,trainy.values.ravel());

end=time.time()
time=end-start
print("Elapsed Time:")
print(time)

Elapsed Time:
140.91886115074158


In [15]:
gridsearch.best_params_ ##best value for var_smoothing

{'var_smoothing': 1e-05}

In [16]:
gridsearch.best_score_##highest accuracy determined


0.4728022492970948

In [None]:
##the var smoothing value did not increase the accuracy that much so 
##     other methods were researched

In [28]:
##first function created to call the naive bayes model
def naiveBayesModel1(xtrain, ytrain, xtest,ytest):
    
    #knn model using best parameters found above
    
    nb=GaussianNB(var_smoothing=1e-05);
    #newdata=PowerTransformer().fit_transform(trainx)

    nb.fit(xtrain,ytrain.values.ravel());
    
    prediction=nb.predict(xtest);
    
    print(prediction)

    #print(classification_report(ytest,prediction))
    
    

In [29]:
naiveBayesModel1(trainx_std,trainy, testx_std,testy)

[3 5 3 ... 1 1 1]


In [30]:
naiveBayesModel1(trainx,trainy, testx,testy)

[3 5 3 ... 1 1 1]


In [31]:
naiveBayesModel1(trainx_norm,trainy, testx_norm,testy)

[3 5 3 ... 1 1 1]


In [5]:
corr=trainx.corr()
print(corr)
## checking correlation becasue removing columns with high correlation 
##        may increase the accuracy

                          Temperature (C)  Apparent Temperature (C)  Humidity  \
Temperature (C)                  1.000000                  0.993461 -0.644691   
Apparent Temperature (C)         0.993461                  1.000000 -0.617055   
Humidity                        -0.644691                 -0.617055  1.000000   
Wind Speed (km/h)                0.021735                 -0.038487 -0.225668   
Wind Bearing (degrees)           0.023323                  0.023512  0.012827   
Visibility (km)                  0.387733                  0.379922 -0.362583   
Pressure (millibars)            -0.307039                 -0.291250  0.035957   

                          Wind Speed (km/h)  Wind Bearing (degrees)  \
Temperature (C)                    0.021735                0.023323   
Apparent Temperature (C)          -0.038487                0.023512   
Humidity                          -0.225668                0.012827   
Wind Speed (km/h)                  1.000000                0.082181

In [11]:
trainx.head()
dropped=trainx.drop(columns=['Temperature (C)','Apparent Temperature (C)'])
dropped.head()

Unnamed: 0,Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,0.56,12.4936,151.0,16.1,1009.03
1,0.85,8.5813,145.0,11.4471,1006.98
2,0.9,10.4006,296.0,15.1823,1004.15
3,0.87,5.796,189.0,15.8746,1017.15
4,0.69,8.05,340.0,9.982,1029.0


In [22]:
## remove columns with high correlation to hopefully increase tuning

## things wiht high correlation:
## apparent temp and temp --->just try removing one and see

def naiveBayesModel2(xtrain, ytrain, xtest,ytest):
    
    #knn model using best parameters found above
    dropped=xtrain.drop(columns=['Temperature (C)'])##,'Apparent Temperature (C)'])
    droppedtest=xtest.drop(columns=['Temperature (C)'])##,'Apparent Temperature (C)'])
    nb=GaussianNB(var_smoothing=1e-05);

    nb.fit(dropped,ytrain.values.ravel());
    
    prediction=nb.predict(droppedtest);
    
    print(classification_report(ytest,prediction))
    
    

In [23]:
naiveBayesModel2(trainx,trainy, testx,testy)

              precision    recall  f1-score   support

           1       0.49      0.66      0.56     12646
           2       0.41      0.33      0.37     11207
           3       0.41      0.43      0.42      6710
           4       0.36      0.07      0.11      4351
           5       0.69      0.95      0.80      2871

    accuracy                           0.47     37785
   macro avg       0.47      0.49      0.45     37785
weighted avg       0.45      0.47      0.44     37785



In [24]:
## remove columns with high correlation to hopefully increase tuning

## things wiht high correlation:
## apparent temp and temp-->try mremovieng both

def naiveBayesModel3(xtrain, ytrain, xtest,ytest):
    
    #knn model using best parameters found above
    dropped=xtrain.drop(columns=['Temperature (C)','Apparent Temperature (C)'])
    droppedtest=xtest.drop(columns=['Temperature (C)','Apparent Temperature (C)'])
    nb=GaussianNB(var_smoothing=1e-05);

    nb.fit(dropped,ytrain.values.ravel());
    
    prediction=nb.predict(droppedtest);
    
    print(classification_report(ytest,prediction))
    
naiveBayesModel3(trainx,trainy, testx,testy)

              precision    recall  f1-score   support

           1       0.46      0.70      0.55     12646
           2       0.42      0.35      0.38     11207
           3       0.49      0.33      0.39      6710
           4       0.30      0.04      0.07      4351
           5       0.71      0.96      0.81      2871

    accuracy                           0.47     37785
   macro avg       0.47      0.48      0.44     37785
weighted avg       0.45      0.47      0.44     37785



In [32]:
## Final Naive Bayes Model that uses all tuning done above


def naiveBayesModel(xtrain, ytrain, xtest,ytest):
    
    #knn model using best parameters found above
    dropped=xtrain.drop(columns=['Temperature (C)'])##,'Apparent Temperature (C)'])
    droppedtest=xtest.drop(columns=['Temperature (C)'])##,'Apparent Temperature (C)'])
    nb=GaussianNB(var_smoothing=1e-05);

    nb.fit(dropped,ytrain.values.ravel());
    
    prediction=nb.predict(droppedtest);
    
    print(prediction)
    #print(classification_report(ytest,prediction))
    
    
naiveBayesModel(trainx,trainy, testx,testy)

[3 5 2 ... 1 1 1]


In [33]:
naiveBayesModel(trainx_std,trainy, testx_std,testy)

[3 5 3 ... 1 1 1]


In [34]:
naiveBayesModel(trainx_norm,trainy, testx_norm,testy)

[3 5 3 ... 1 1 1]


In [6]:
##making labels so output is label not number
labels={
    1: 'Partly Cloudy',
    2: 'Mostly Cloudy',
    3: 'Overcast',
    4: 'Clear',
    5: 'Foggy',
}

In [13]:

#navie bayes model using best parameters found above
dropped=trainx_norm.drop(columns=['Temperature (C)'])##,'Apparent Temperature (C)'])
droppedtest=testx_norm.drop(columns=['Temperature (C)'])##,'Apparent Temperature (C)'])

k_fold=KFold(n_splits=5, shuffle=True, random_state=0);

nb=GaussianNB(var_smoothing=1e-05);

nb.fit(dropped,trainy.values.ravel());
    
print (cross_val_score (nb, trainx_norm, trainy.values.ravel(), cv=k_fold, n_jobs=1));    
    
prediction=nb.predict(droppedtest);
    

[0.45866917 0.468791   0.46429241 0.46401125 0.47038425]


In [16]:
learning_curve(nb(kernel='linear'), trainx_norm, trainy.values.ravel());

TypeError: validation_curve() got an unexpected keyword argument 'params_name'

In [None]:
filenamenb='naive_bayes_final_model.sav'
pickle.dump(nb, open(filenamenb, 'wb')) ##save the model

In [65]:
##loads the model saved above --> final function 

def loadNaiveBayes(testx):
    loaded= pickle.load(open(filenamenb,'rb'))
    droppedtest=testx.drop(columns=['Temperature (C)'])##,'Apparent Temperature (C)'])
    pred=loaded.predict(droppedtest)
    
    labels={
    1: 'Partly Cloudy',
    2: 'Mostly Cloudy',
    3: 'Overcast',
    4: 'Clear',
    5: 'Foggy',
    }
    
    lableprint=pd.DataFrame(data=pred,columns=["columns"])
    lableprint["columns"]=lableprint["columns"].map(labels)
    #print(lableprint["columns"])
    idk=lableprint["columns"].value_counts()
    print(idk)

In [69]:
loadNaiveBayes(trainx)

Foggy    53350
Name: columns, dtype: int64


In [67]:
loadNaiveBayes(trainx_std)

Partly Cloudy    18977
Overcast         17792
Clear            12864
Mostly Cloudy     3484
Foggy              233
Name: columns, dtype: int64


In [68]:
loadNaiveBayes(trainx_norm)

Partly Cloudy    22676
Overcast         12530
Mostly Cloudy    12172
Foggy             4661
Clear             1311
Name: columns, dtype: int64
