In [None]:
#---------------------------------------Header Comments-------------------------------------------------------------
#This notebook develops a decision tree classifier and compares it with each possible way of evaluating the dataset
#by using 5 fold cross validation and every combination of input features
#
#Developed By: Shaun Cyr (2020)
#
#Accuracy: 47.8%
#Training Time: 114 seconds
#-------------------------------------------------------------------------------------------------------------------

In [1]:
#Import required packages

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
import numpy as np # linear algebra
from sklearn.model_selection import train_test_split

In [2]:
#Read the training features and the training targets, then join them column-wise into one frame
d1 = pd.read_csv("data files/trainx.csv")
d2 = pd.read_csv("data files/trainy.csv")
data = pd.concat([d1, d2], axis="columns")

#Show how many rows fall into each 'summary' class
#NOTE: It was decided as part of the project to select the top 5 of this result
data['summary'].value_counts()

1    18530
2    16143
3     8981
4     5882
5     3814
Name: summary, dtype: int64

In [3]:
#Copy the class labels into the 'SummaryCat' column that is used as the training target.
#No conversion actually takes place here: the value_counts() output of the previous cell
#shows that 'summary' already holds the integer codes 1-5, so this is a straight copy.
data["SummaryCat"] = data["summary"]

#Check the results: the counts should be identical to those of 'summary'
data['SummaryCat'].value_counts()

1    18530
2    16143
3     8981
4     5882
5     3814
Name: SummaryCat, dtype: int64

In [31]:
#This function will train a DTC model using the given train and test data, returning its accuracy and the model
def trainRound(trainx, trainy, testx, testy):
    """Fit a new DecisionTreeClassifier on one split and score it on the held-out part.

    Parameters
    ----------
    trainx, trainy : array-like
        Training features and labels; passed unchanged to ``fit``.
    testx, testy : array-like
        Held-out features and labels; passed unchanged to ``score``.

    Returns
    -------
    tuple
        ``(accuracy, model)`` where ``accuracy`` is the value returned by
        ``model.score(testx, testy)`` and ``model`` is the fitted classifier.
    """
    #One classifier per call, with default hyper-parameters.
    #NOTE(review): no random_state is passed, so results may differ slightly between runs.
    clf = DecisionTreeClassifier()

    #Fit the model on the training part of the split
    clf.fit(trainx, trainy)

    #score() accepts the labels as given, so no DataFrame wrapping is needed
    accuracy = clf.score(testx, testy)

    #Return the score and the model
    return accuracy, clf

#This function will use the given values to split the data and train a DTC on each fold, returning the best fold results
def _take_rows(container, positions):
    """Select rows by integer position from a pandas object or anything NumPy can index."""
    if hasattr(container, "iloc"):
        #pandas DataFrame / Series: positional selection must go through .iloc
        return container.iloc[positions]
    #NumPy arrays (and lists, via asarray) accept the index array directly
    return np.asarray(container)[positions]


def kfoldApproach(folds, X, Y):
    """Train one decision tree per cross-validation fold and return the best-scoring one.

    Parameters
    ----------
    folds : int
        Number of folds handed to ``KFold(n_splits=...)``.
    X : pandas.DataFrame or numpy.ndarray
        Predictor values, one row per sample.
    Y : pandas.Series or numpy.ndarray
        Target values, in the same row order as ``X``.

    Returns
    -------
    tuple
        ``(model, accuracy)`` of the fold whose held-out accuracy was highest
        (the first such fold when several tie).

    Notes
    -----
    The returned accuracy is the maximum over the folds, not their mean, so it is
    an optimistic figure compared with the usual cross-validation average.
    """
    #Shuffled split with a fixed seed so that the fold membership is repeatable
    kf = KFold(n_splits=folds, shuffle=True, random_state=1999)

    #The list variables that will store each of the fold information
    acuracies = []
    models = []

    for train_index, test_index in kf.split(X):
        a, m = trainRound(
            _take_rows(X, train_index),
            _take_rows(Y, train_index),
            _take_rows(X, test_index),
            _take_rows(Y, test_index),
        )
        acuracies.append(a)
        models.append(m)

    #Find the fold that is the most accurate (list.index returns the first maximum)
    highestIndex = acuracies.index(max(acuracies))

    #Return the most accurate model and its accuracy
    return models[highestIndex], acuracies[highestIndex]

In [5]:
#Already done above: data = data.drop(['Formatted Date','Daily Summary','Loud Cover', 'Precip Type'],axis=1)

import itertools

#This will try each combination of predictor attributes to determine which gives the most accurate model
def modelExecution():
    """Search every non-empty subset of the predictor columns for the most accurate tree.

    For each subset, ``kfoldApproach`` is run with 5 folds on the module-level
    ``data`` frame, using ``data['SummaryCat']`` as the target. The best accuracy
    and its predictor list are printed.

    Returns
    -------
    tuple
        ``(selectedModel, selectedPredictors)``: the fitted model with the highest
        accuracy returned by ``kfoldApproach`` and the list of column names it was
        trained on. When several subsets tie, the first one tried wins.
    """
    #A list of the predictor attributes
    a = ['Temperature (C)','Apparent Temperature (C)','Humidity','Wind Speed (km/h)','Wind Bearing (degrees)','Visibility (km)','Pressure (millibars)']

    #The target does not depend on the subset, so select it once.
    #It is kept as a pandas Series because kfoldApproach selects rows positionally.
    Y = data['SummaryCat'] #target variable

    #Only the best result so far is kept instead of one fitted model per subset
    selectedModel = None
    selectedAccuracy = None
    selectedPredictors = None

    #Do every combination of input predictor length (this is how the itertools.combinations function works)
    for L in range(1, len(a)+1):

        #Do every combination of that length L
        for subset in itertools.combinations(a, L):

            #get the subset of columns we are currently using
            s = list(subset)

            #Get the predictor dataframe from this subset of columns
            X = data[s] #independent variable

            #Find the best model for this combination of predictors
            m, ac = kfoldApproach(5, X, Y)

            #Strict '>' keeps the first subset that reached the maximum
            if selectedAccuracy is None or ac > selectedAccuracy:
                selectedModel = m
                selectedAccuracy = ac
                selectedPredictors = s

    print('The highest accuracy of the folds and combinations is: ')
    print(selectedAccuracy)
    print(selectedPredictors)

    return selectedModel, selectedPredictors
    
#This portion of the code will measure the entire process started in the above functions
import time

#Record the wall-clock time immediately before and after the full predictor search
start = time.time()
finalDTM, selectedPredictors = modelExecution()
end = time.time()

#Report the duration in seconds, label and value on separate lines
print("Elapsed Time:", end - start, sep="\n")

The highest accuracy of the folds and combinations is: 
0.487535145267104
['Visibility (km)']
Elapsed Time:
114.37338852882385


In [6]:
import pickle

# save the model to disk; the with-block guarantees the file is flushed and closed
# before any later cell tries to read it back
filename = 'DecisionTreeClassifier.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(finalDTM, model_file)

In [46]:
import pickle

#This is a function that can be used to load the saved model and predict results from the given data
def predictFromBestDecisionTreeModel(TestX):
    """Load the saved model from 'DecisionTreeClassifier.sav' and label the given rows.

    Parameters
    ----------
    TestX : array-like
        Predictor values handed unchanged to the loaded model's ``predict``.
        NOTE(review): presumably this must contain the same columns the saved
        model was fitted on - verify against the training cell.

    Returns
    -------
    tuple
        ``(p, mod)`` where ``p`` is a pandas Series named ``"predicted"`` holding the
        text label of every prediction (NaN for a code missing from the label map)
        and ``mod`` is the unpickled model. ``p`` has a fresh 0..n-1 index, not the
        index of ``TestX``.
    """
    #Load the model
    filename = 'DecisionTreeClassifier.sav'
    #SECURITY: pickle.load can execute arbitrary code - only load a file this notebook wrote itself
    with open(filename, 'rb') as model_file:
        mod = pickle.load(model_file) #This is the model that was best

    #Predict the output for the given data
    predicted = mod.predict(TestX)

    #Convert back to the label types
    label_map={
        1: 'Partly Cloudy',
        2: 'Mostly Cloudy',
        3: 'Overcast',
        4: 'Clear',
        5: 'Foggy',
    }

    #Put the labels back in place of their numeric equivalents
    p = pd.Series(predicted, name="predicted").map(label_map)
    return p, mod

In [None]:
#Below is an example of how you can use the above function
#You can now use the following to get the predictions from the given testX set
#(selectedPredictors must already have been defined by an earlier cell)
testx=pd.read_csv("data files/testx.csv",index_col=0)
testy=pd.read_csv("data files/testy.csv",index_col=0)

#The function returns (labels, model); only the labels are needed for this table
pred, _model = predictFromBestDecisionTreeModel(testx[selectedPredictors])

#The labels come back with a fresh 0..n-1 index; pd.concat(axis=1) aligns on the index,
#so give them the index of the rows they were predicted from
pred.index = testx.index

d = pd.concat([testx, testy, pred], axis=1)
print(d)

In [43]:
#It's strange that our best model only used the Visibility column.
#NOTE(review): this cell was described as checking the accuracy when all columns are used, but
#selectedPredictors below contains only 'Visibility (km)', so it actually re-runs the 5-fold
#evaluation for that single feature. Confirm which of the two was intended.
#testx and testy are loaded here but are not used by the rest of this cell.
trainx=pd.read_csv("data files/trainx.csv",index_col=0);
trainy=pd.read_csv("data files/trainy.csv",index_col=0);
testx=pd.read_csv("data files/testx.csv",index_col=0);
testy=pd.read_csv("data files/testy.csv",index_col=0);

#Overwrites the module-level selectedPredictors that other cells read
selectedPredictors = ['Visibility (km)']

Y = trainy['summary']
X = trainx[selectedPredictors]

#m is the best fold's model, a is that fold's held-out accuracy
m, a = kfoldApproach(5, X, Y)

print("Accuracy: ")
print(a)

Accuracy: 
0.487535145267104


In [49]:
#Score the saved model on the reserved test set so the figure is comparable between models
testx = pd.read_csv("data files/testx.csv", index_col=0)
testy = pd.read_csv("data files/testy.csv", index_col=0)

#Keep only the visibility column, as a one-column DataFrame
testx = testx[['Visibility (km)']]

#pred holds the text labels, mod is the model loaded from disk
pred, mod = predictFromBestDecisionTreeModel(testx)

print(mod.score(testx, testy))

0.4777821886992193
0        Partly Cloudy
1                Foggy
2        Mostly Cloudy
3             Overcast
4                Foggy
             ...      
37780    Partly Cloudy
37781    Partly Cloudy
37782    Partly Cloudy
37783    Partly Cloudy
37784    Partly Cloudy
Name: predicted, Length: 37785, dtype: object
