In [1]:
#Import required packages

import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np # linear algebra
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw weather observations and discard three columns that are not
# used as predictors later in this notebook.
data = pd.read_csv("wh.csv").drop(
    columns=['Formatted Date', 'Daily Summary', 'Loud Cover']
)

# Cell output: number of rows recorded under each 'Summary' label.
data['Summary'].value_counts()

Partly Cloudy                          31733
Mostly Cloudy                          28094
Overcast                               16597
Clear                                  10890
Foggy                                   7148
Breezy and Overcast                      528
Breezy and Mostly Cloudy                 516
Breezy and Partly Cloudy                 386
Dry and Partly Cloudy                     86
Windy and Partly Cloudy                   67
Light Rain                                63
Breezy                                    54
Windy and Overcast                        45
Humid and Mostly Cloudy                   40
Drizzle                                   39
Breezy and Foggy                          35
Windy and Mostly Cloudy                   35
Dry                                       34
Humid and Partly Cloudy                   17
Dry and Mostly Cloudy                     14
Rain                                      10
Windy                                      8
Humid and 

In [3]:
# Keep only the five Summary labels used as classification targets.
b = ['Partly Cloudy','Mostly Cloudy','Overcast','Clear','Foggy']

# .copy() makes the filtered frame an independent object, so adding a column
# below does not raise pandas' SettingWithCopyWarning (assignment onto a slice).
data = data[data['Summary'].isin(b)].copy()

# Encode each remaining label as an integer category code. The label_map used
# at prediction time (Clear=0, Foggy=1, Mostly Cloudy=2, Overcast=3,
# Partly Cloudy=4) must stay in sync with this encoding.
data["SummaryCat"] = data["Summary"].astype('category').cat.codes

In [4]:
 def trainRound(trainx, trainy, testx, testy, max_depth=2):
    """Fit a random forest on one train/test split and score it.

    Parameters
    ----------
    trainx, trainy : array-like
        Training features and training labels passed to ``clf.fit``.
    testx, testy : array-like
        Held-out features and labels passed to ``clf.score``.
    max_depth : int, optional
        Maximum tree depth handed to ``RandomForestClassifier``. Defaults to 2,
        the value this notebook has always used.

    Returns
    -------
    tuple
        ``(score, clf)``: the value returned by ``clf.score(testx, testy)``
        and the fitted classifier.
    """
    # random_state is fixed so repeated runs build the same forest.
    clf = RandomForestClassifier(max_depth=max_depth, random_state=0)

    # NOTE: the original author observed that fitting seemed not to finish on
    # the full dataset; no cause is established here.
    clf.fit(trainx, trainy)

    # Score exactly once, on the labels as given. The previous version scored
    # twice and built prediction DataFrames that were never used.
    return clf.score(testx, testy), clf

def kfoldApproach(folds, X, Y):
    """Run shuffled K-fold training and return the best-scoring fold's model.

    Parameters
    ----------
    folds : int
        Number of K-fold splits. (Previously ignored: the split count was
        hard-coded to 5. Existing callers pass 5, so their results are
        unchanged.)
    X, Y : indexable by integer index arrays (e.g. numpy arrays)
        Features and labels; both are indexed with the fold index arrays.

    Returns
    -------
    tuple
        ``(selectedModel, selectedAccuracy)``: the classifier from the fold
        with the highest score and that fold's score. On ties the earliest
        fold wins.

    Notes
    -----
    The returned score is the maximum over folds, not the mean, so it is an
    optimistic estimate of how the model performs on unseen data.
    """
    # Fixed seed keeps the fold assignment reproducible between runs.
    kf = KFold(n_splits=folds, shuffle=True, random_state=1999)

    acuracies = []
    models = []

    for train_index, test_index in kf.split(X):
        a, m = trainRound(
            X[train_index], Y[train_index],
            X[test_index], Y[test_index],
        )
        acuracies.append(a)
        models.append(m)

    # list.index returns the first position of the maximum.
    highestIndex = acuracies.index(max(acuracies))

    return models[highestIndex], acuracies[highestIndex]
 
#This function will instead be called below
#kfoldApproach(5, X, Y)

In [5]:
#Already done above
#data = data.drop(['Formatted Date','Daily Summary','Loud Cover', 'Precip Type'],axis=1)

import itertools

def modelExecution():
    """Search every non-empty subset of the candidate predictors for the best model.

    Reads the module-level ``data`` frame (it must already contain the
    ``SummaryCat`` column). For each of the 2**7 - 1 = 127 predictor subsets it
    calls ``kfoldApproach(5, X, Y)`` and keeps the subset whose returned score
    is highest; on ties the earliest subset wins.

    Prints the winning score and predictor list.

    Returns
    -------
    tuple
        ``(selectedModel, selectedPredictors)``: the fitted model returned by
        ``kfoldApproach`` for the winning subset, and that subset as a list of
        column names.
    """
    a = ['Temperature (C)','Apparent Temperature (C)','Humidity','Wind Speed (km/h)','Wind Bearing (degrees)','Visibility (km)','Pressure (millibars)']

    # The target does not depend on the predictor subset, so build it once.
    Y = np.array(data['SummaryCat']) #target variable

    selectedModel = None
    selectedAccuracy = None
    selectedPredictors = None

    #Do every combination of input predictors
    for L in range(1, len(a)+1):
        for subset in itertools.combinations(a, L):
            s = list(subset)
            X = np.array(data[s]) #independent variable

            m, ac = kfoldApproach(5, X, Y)

            # Strict '>' keeps the first subset that reaches the maximum, and
            # only the current best model is retained instead of all 127.
            if selectedAccuracy is None or ac > selectedAccuracy:
                selectedModel = m
                selectedAccuracy = ac
                selectedPredictors = s

    print('The highest accuracy of the folds and combinations is: ')
    print(selectedAccuracy)
    print(selectedPredictors)

    return selectedModel, selectedPredictors
    
import time

# Run the full predictor-subset search and report its wall-clock duration in seconds.
start = time.time()
finalDTM, selectedPredictors = modelExecution()
end = time.time()
print("Elapsed Time:", end - start, sep="\n")

The highest accuracy of the folds and combinations is: 
0.4816853694685581
['Humidity', 'Wind Speed (km/h)', 'Visibility (km)', 'Pressure (millibars)']
Elapsed Time:
1490.3931801319122


In [6]:
import pickle

# save the model to disk
filename = 'RandomForestClassifier.sav'
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open() used before never closed the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(finalDTM, model_file)

In [8]:
import pickle

def predictFromBestRandomForestModel(TestX):
    """Predict weather summary labels using the model saved on disk.

    Loads the pickled model from 'RandomForestClassifier.sav' in the current
    working directory, calls its ``predict`` on ``TestX`` and converts the
    integer class codes back to label strings.

    Parameters
    ----------
    TestX : array-like
        Feature rows passed unchanged to the model's ``predict``. The saved
        model is presumably the one trained on 'Humidity',
        'Wind Speed (km/h)', 'Visibility (km)', 'Pressure (millibars)' in that
        order (per the recorded search output) -- confirm before use.

    Returns
    -------
    pandas.Series
        Series named "predicted" with a fresh 0..n-1 index holding the label
        for each row. Codes missing from the label map become NaN.
    """
    filename = 'RandomForestClassifier.sav'

    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # that this notebook (or another trusted source) produced.
    with open(filename, 'rb') as model_file:
        mod = pickle.load(model_file) #This is the model that was best

    predicted = mod.predict(TestX) #Predict stuff

    #Convert back to the label types
    label_map = {
        0:'Clear',
        1:'Foggy',
        2:'Mostly Cloudy',
        3:'Overcast',
        4:'Partly Cloudy'
    }
    return pd.Series(predicted, name="predicted").map(label_map)

# Example usage: read the held-out feature table and label every row with the saved model.
# selectedPredictors is the global assigned by the modelExecution() cell, so that
# cell must have been run in this kernel first.
testx = pd.read_csv("data files/testx.csv", index_col=0)
pred = predictFromBestRandomForestModel(testx.loc[:, selectedPredictors])

print(pred)

0        Mostly Cloudy
1                Foggy
2        Mostly Cloudy
3        Mostly Cloudy
4                Foggy
             ...      
37780    Mostly Cloudy
37781    Partly Cloudy
37782    Partly Cloudy
37783    Partly Cloudy
37784    Mostly Cloudy
Name: predicted, Length: 37785, dtype: object
