In [6]:
#Import required packages

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
import numpy as np # linear algebra
from sklearn.model_selection import train_test_split

In [7]:
#Read wh.csv into a DataFrame
data = pd.read_csv("wh.csv")

#Count how many rows carry each 'Summary' label.
#NOTE: It was decided as part of the project to keep only the 5 most frequent labels from this result.
data["Summary"].value_counts()

Partly Cloudy                          31733
Mostly Cloudy                          28094
Overcast                               16597
Clear                                  10890
Foggy                                   7148
Breezy and Overcast                      528
Breezy and Mostly Cloudy                 516
Breezy and Partly Cloudy                 386
Dry and Partly Cloudy                     86
Windy and Partly Cloudy                   67
Light Rain                                63
Breezy                                    54
Windy and Overcast                        45
Humid and Mostly Cloudy                   40
Drizzle                                   39
Windy and Mostly Cloudy                   35
Breezy and Foggy                          35
Dry                                       34
Humid and Partly Cloudy                   17
Dry and Mostly Cloudy                     14
Rain                                      10
Windy                                      8
Humid and 

In [8]:
#Select the category values to use
b = ['Partly Cloudy','Mostly Cloudy','Overcast','Clear','Foggy']

#Take an explicit copy of the filtered rows: a column is added below, and
#assigning into a boolean-mask slice is what triggers SettingWithCopyWarning.
data = data[data['Summary'].isin(b)].copy()

#Convert to numeric values (integer codes of the pandas categorical built from 'Summary')
data["SummaryCat"] = data["Summary"].astype('category').cat.codes

#Check the results
data['SummaryCat'].value_counts()

4    31733
2    28094
3    16597
0    10890
1     7148
Name: SummaryCat, dtype: int64

In [9]:
#This function will train a DTC model using the given train data and score it on the given test data, returning its accuracy and the model
def trainRound(trainx, trainy, testx, testy):
    """Fit a new DecisionTreeClassifier on the training split and score it on the test split.

    Parameters
    ----------
    trainx, trainy
        Predictor values and target labels passed to ``fit``.
    testx, testy
        Predictor values and target labels passed to ``score``.

    Returns
    -------
    tuple
        ``(score, clf)``: the value returned by ``clf.score(testx, testy)``
        followed by the fitted classifier.
    """
    #Create exactly one classifier (default hyper-parameters) and fit it
    clf = DecisionTreeClassifier()
    clf.fit(trainx, trainy)

    #Score on the held-out split; the labels are passed through as given,
    #no DataFrame wrapping is needed for scoring
    x = clf.score(testx, testy)

    #Return the score and the model
    return x, clf

#This function will use the given values to split the data and train a DTC on each fold, returning the best fold results
def kfoldApproach(folds, X, Y):
    """Train one model per K-fold split and return the best-scoring one.

    Parameters
    ----------
    folds : int
        Number of splits handed to ``KFold`` (``n_splits``).
    X : array-like
        Predictor values; rows are selected by integer position.
    Y : array-like
        Target labels aligned row-for-row with ``X``.

    Returns
    -------
    tuple
        ``(selectedModel, selectedAccuracy)``: the model from the fold with the
        highest score returned by ``trainRound`` and that score. On ties the
        earliest fold wins.
    """
    #Positional indexing with index arrays below requires NumPy arrays
    #(a no-op when the caller already passes arrays)
    X = np.asarray(X)
    Y = np.asarray(Y)

    #Get the fold indexes for the split to happen on; honour the requested fold count
    kf = KFold(n_splits=folds, shuffle=True, random_state=1999)

    #Track only the best fold seen so far instead of keeping every model
    selectedModel = None
    selectedAccuracy = None

    for train_index, test_index in kf.split(X):
        a, m = trainRound(X[train_index], Y[train_index], X[test_index], Y[test_index])

        #Strict '>' keeps the first fold on ties
        if selectedAccuracy is None or a > selectedAccuracy:
            selectedModel = m
            selectedAccuracy = a

    #Return the most accurate model and its accuracy
    return selectedModel, selectedAccuracy

In [10]:
#Already done above: data = data.drop(['Formatted Date','Daily Summary','Loud Cover', 'Precip Type'],axis=1)

import itertools

#This will try each combination of predictor attributes to determine which gives the most accurate model
def modelExecution():
    """Try every non-empty combination of predictor columns and keep the best model.

    Reads the module-level DataFrame ``data`` (predictor columns listed below
    plus the ``'SummaryCat'`` target column) and calls ``kfoldApproach(5, X, Y)``
    for each column subset. Prints the winning accuracy and predictor list.

    Returns
    -------
    tuple
        ``(selectedModel, selectedPredictors)``: the model returned by
        ``kfoldApproach`` for the highest-accuracy subset and that subset's
        column names. On ties the earliest subset tried wins.
    """
    #A list of the predictor attributes
    a = ['Temperature (C)','Apparent Temperature (C)','Humidity','Wind Speed (km/h)','Wind Bearing (degrees)','Visibility (km)','Pressure (millibars)']

    #The target variable is the same for every subset, so build it once
    Y = np.array(data['SummaryCat'])

    #Keep only the best result so far rather than every fitted model
    selectedModel = None
    selectedAccuracy = None
    selectedPredictors = None

    #Do every combination of input predictor length (this is how the itertools.combinations function works)
    for L in range(1, len(a)+1):

        #Do every combination of that length L
        for subset in itertools.combinations(a, L):

            #get the subset of columns we are currently using
            s = list(subset)

            #Get the predictor array from this subset of columns
            X = np.array(data[s]) #independent variable

            #Find the best model for this combination of predictors
            m, ac = kfoldApproach(5, X, Y)

            #Strict '>' keeps the first subset on ties
            if selectedAccuracy is None or ac > selectedAccuracy:
                selectedModel = m
                selectedAccuracy = ac
                selectedPredictors = s

    print('The highest accuracy of the folds and combinations is: ')
    print(selectedAccuracy)
    print(selectedPredictors)

    return selectedModel, selectedPredictors
    
#This portion of the code will measure the entire process started in the above functions
import time

#Get the start time. perf_counter() is a monotonic, high-resolution clock intended
#for measuring intervals; time.time() is a wall clock that can jump if the system
#clock is adjusted mid-run.
start = time.perf_counter()

#Run our model training procedure defined above
finalDTM, selectedPredictors = modelExecution()

#Get the end time
end = time.perf_counter()
print("Elapsed Time:")
print(end - start)

The highest accuracy of the folds and combinations is: 
0.5061930976074529
['Temperature (C)', 'Apparent Temperature (C)', 'Humidity', 'Wind Bearing (degrees)', 'Visibility (km)', 'Pressure (millibars)']
Elapsed Time:
202.09453129768372


In [11]:
import pickle

# save the model to disk
filename = 'DecisionTreeClassifier.sav'

# Use a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving the handle open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(finalDTM, model_file)

In [12]:
import pickle

#This is a function that can be used to load the saved model and predict results from the given data
def predictFromBestDecisionTreeModel(TestX, filename='DecisionTreeClassifier.sav'):
    """Load the pickled model and return its predictions as text labels.

    Parameters
    ----------
    TestX
        Predictor values passed unchanged to the loaded model's ``predict``.
        NOTE(review): the columns presumably have to match the predictors the
        saved model was trained on -- confirm against the training cell.
    filename : str, optional
        Path of the pickled model; defaults to the file written by this notebook.

    Returns
    -------
    pandas.Series
        Series named ``"predicted"`` holding the label for each predicted code.
        Codes that are not in the label map become NaN.
    """
    #NOTE(security): pickle.load can execute arbitrary code from the file;
    #only load model files that you created yourself.
    #The context manager closes the file handle once the model is loaded.
    with open(filename, 'rb') as model_file:
        mod = pickle.load(model_file) #This is the model that was best

    #Predict the output for the given data
    predicted = mod.predict(TestX)

    #Convert back to the label types
    label_map = {
        4:'Partly Cloudy',
        2:'Mostly Cloudy',
        3:'Overcast',
        0:'Clear',
        1:'Foggy'
    }

    #Put the labels back in place of their numeric equivalents
    df = pd.DataFrame(data=predicted,  columns=["predicted"])
    p = df["predicted"].map(label_map)
    return p

#Example usage of the function defined above:
#read a test set from disk and get predictions for the selected predictor columns
testx = pd.read_csv("data files/testx.csv", index_col=0)
pred = predictFromBestDecisionTreeModel(testx[selectedPredictors])

print(pred)

0             Overcast
1                Foggy
2                Clear
3                Clear
4                Foggy
             ...      
37780    Partly Cloudy
37781    Partly Cloudy
37782    Mostly Cloudy
37783            Clear
37784    Partly Cloudy
Name: predicted, Length: 37785, dtype: object
