In [143]:
import numpy as np
import math
import pandas as pd
import pickle
import random
import dill
import sys

In [278]:
# dill.dump_session('notebook_Q1.db')

In [2]:
# dill.load_session('notebook_Q1.db')

In [5]:
def saveInPickle(data, pickleFile):
    file = open(pickleFile,"wb")
    pickle.dump(data,file)
    file.close()
def loadFromPickle(pickleFile):
    file = open(pickleFile,'rb')
    pickleData = pickle.load(file)
    file.close()
    return pickleData

In [6]:
def handleCategoricalFeature(dataSet, feature):
        featureValues = list(set(dataSet[feature]))
        featureMap = {}
        featureMap[feature] = {}
        i = 0
        for f in featureValues:
            featureMap[feature][f] =  i
            i+=1
        print(featureMap)
        dataSet.replace(featureMap, inplace=True)
        return dataSet[feature]

In [172]:
def getTrainTestData(dataSetFile):
    dataSet = pd.read_csv(dataSetFile)
    naColumnMean = dataSet["pm2.5"].mean(axis = 0, skipna = True)
    dataSet["pm2.5"] = dataSet["pm2.5"].fillna(naColumnMean)
    dataSet["cbwd"] =    handleCategoricalFeature(dataSet, "cbwd")
    dataSet = dataSet.drop(["No"], axis = 1)
    trainSet = dataSet.loc[dataSet['year'].isin([2012,2010])]
    testSet = dataSet.loc[dataSet['year'].isin([2013,2011])]
    return trainSet, testSet

In [13]:
def getFeaturesAndLabels(dataSet, targetColumn):
    dataLabels = dataSet[targetColumn]
    data = dataSet
    data = data.drop([targetColumn], axis = 1)
    data = data.drop(["No"], axis = 1)
    return data, dataLabels

def getFeaturesListAndValues(data):
    featuresMap = {}
    features = list(data.columns)
    features.pop(1)
    for feature in features:
        featuresMap[feature] = []
        featureValues = data[feature]
        featureValues = list(set(featureValues))
        featureValues = sorted(featureValues)
        featuresMap[feature] = featureValues
    return featuresMap

In [78]:
def calculateGiniImpurity(R1, R2, classes):
    if(len(R1)==0 or len(R2)==0):
        return 1
    features = list(R1.columns)
    features.pop(1)
    gini1 = gini2 = 0
    total = len(R1) + len(R2)
    l1 = len(R1)
    l2 = len(R2)
    for c in classes:
        p = len(R1.loc[R1['month'] == c])/l1
        gini1 += p*(1-p)
    
    for c in classes:
        p = len(R2.loc[R2['month'] == c])/l2
        gini2 += p*(1-p)
    
    gini = (gini1*l1+gini2*l2)/(l1+l2)
    return gini

def getGiniValue(nLeft, nRight):
    l1 = np.sum(nLeft)
    l2 = np.sum(nRight)
    l = l1+l2
    g1 = nLeft/l1
    g2 = nRight/l2
    g1 = g1*(1-g1)
    g2 = g2*(1-g2)
    g1 = np.sum(g1)
    g2 = np.sum(g2)
    g = (g1*l1+g2*l2)/l
    return g

def getParentGini(data):
    df = data.groupby(['month'], sort=True).size().reset_index(name='Count')
    g1 = df/len(data)
    g1 = g1*(1-g1)
    g1 = g1["Count"].sum()
    return g1

{'cbwd': {'NW': 0, 'NE': 1, 'cv': 2, 'SE': 3}}


In [113]:
def generateDecisionTree(data, maxDepth):
    dTree = []
    minGini = getParentGini(data)
    parentGini = minGini
    l = len(data)
#     minGini = 1
    classes = sorted(list(set(data["month"])))
    df1 = data.groupby(['month'], sort=True).size().reset_index(name='Count')
    for feature in features:
        dataSorted = data.sort_values([feature])
        nRight = np.array(df1["Count"])
        nLeft = np.array([0]*len(classes))
        for i in range(0,l-1):
            row = dataSorted.iloc[i]
            c = row["month"]
            indx = classes.index(c)
            nRight[indx]-=1
            nLeft[indx]+=1
            if(row[feature] == dataSorted.iloc[i+1][feature]):
                continue
            gini = getGiniValue(nLeft, nRight)
            if(minGini> gini):
                minGini = gini
                minFeature = feature
                threshold = (row[feature] + dataSorted.iloc[i+1][feature])/2
#                 if(minGini==0):
#                     break
#         if(minGini==0):
#             break
    
#     print("split at", maxDepth)
   
#     print("minGini", minGini)

    if(parentGini==minGini):
        df = data.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        return df.loc[0]["month"]
    
    R1 = data.loc[data[minFeature] < threshold]
    R2 = data.loc[data[minFeature] >= threshold]
    
#     if(len(R1)==0 or len(R2)==0):
#         df = data.groupby(['month'], sort=True).size().reset_index(name='Count')
#         df = df.sort_values(by ='Count' , ascending=False)
#         return df.loc[0]["month"]
    
    if(minGini == 0 or maxDepth == 0):    
        df = R1.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        DT1 = df.loc[0]["month"]
        df = R2.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        DT2 = df.loc[0]["month"]
    else:    
        DT1 = generateDecisionTree(R1,maxDepth-1)
        DT2 = generateDecisionTree(R2,maxDepth-1)
    
    dTree = [minFeature, threshold, DT1, DT2]
    
    return dTree
    

In [118]:
def generateDTreeWithParams(data, maxDepth, maxWidth = math.pow(2, 500)):
    dTree = []
    minGini = getParentGini(data)
    parentGini = minGini
    l = len(data)
    classes = sorted(list(set(data["month"])))
    df1 = data.groupby(['month'], sort=True).size().reset_index(name='Count')
    for feature in features:
        dataSorted = data.sort_values([feature])
        nRight = np.array(df1["Count"])
        nLeft = np.array([0]*len(classes))
        for i in range(0,l-1):
            row = dataSorted.iloc[i]
            c = row["month"]
            indx = classes.index(c)
            nRight[indx]-=1
            nLeft[indx]+=1
            if(row[feature] == dataSorted.iloc[i+1][feature]):
                continue
            gini = getGiniValue(nLeft, nRight)
            if(minGini> gini):
                minGini = gini
                minFeature = feature
                threshold = (row[feature] + dataSorted.iloc[i+1][feature])/2

    if(parentGini==minGini or maxWidth<=0):
        df = data.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        return df.loc[0]["month"], maxWidth-1
    
    R1 = data.loc[data[minFeature] < threshold]
    R2 = data.loc[data[minFeature] >= threshold]
    
    if(minGini == 0 or maxDepth == 0):    
        df = R1.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        DT1 = df.loc[0]["month"]
        df = R2.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        DT2 = df.loc[0]["month"]
        width = maxWidth - 2
    else:    
        DT1, width = generateDTreeWithParams(R1,maxDepth-1, maxWidth)
        DT2, width = generateDTreeWithParams(R2,maxDepth-1, width)
    
    dTree = [minFeature, threshold, DT1, DT2]
    
    return dTree, width
    

In [122]:
def trainDecisionTree(trainSet, maxDepth, maxWidth = 0):
    target = "month"
    dTree = generateDecisionTree(trainSet, maxDepth)
    return dTree
    
def testDecisionTree(test, dTree):
    if(isinstance(dTree, list)):
        minFeature = dTree[0]
        threshold = dTree[1]
        if(test[minFeature] < threshold):
            return testDecisionTree(test, dTree[2])
        else:
            return testDecisionTree(test, dTree[3])
    else:
        return dTree
    
def getAccuracy(results):
    count = 0
    for i in range(len(results)):
        if(results[i][0]==results[i][1]):
            count+=1
    return count

def getTestResults(testSet, dTree):
    results = []
    for i in range(len(testSet)):
        testData = testSet.iloc[i]
        result = testDecisionTree(testData, dTree)
        results.append((result, testData["month"]))
    count = getAccuracy(results)
    print("Accuracy", 100*count/len(testSet))

In [155]:
######################-----classification using Bagged Decision Tree ----########################################

def generateBaggedDataSet(trainSet, bagSize, n):
    bagTrainSet = []
    l = len(trainSet)
    size = int(l*bagSize/100)
    for i in range(n):
        bagTrainSet.append(trainSet.sample(n = size))
    
    return bagTrainSet

def trainBagDecisionTree(bagTrainSet, maxDepth):
    bagDtrees = []
    for i in range(len(bagTrainSet)):
        dTree = generateDecisionTree(bagTrainSet[i], maxDepth)
        print("done")
        bagDtrees.append(dTree)
    return bagDtrees

def testBagDtree(bagDtrees, testSet):
    bagResults = []
    for i in range(len(testSet)):
        testData = testSet.iloc[i]
        result = []
        for dTree in bagDtrees:
            y = testDecisionTree(testData, dTree)
            result.append(y)
        bagResults.append((result, testData["month"]))
#     print(bagResults)
    count = getBagAccuracy(bagResults)
    print("Accuracy", 100*count/len(testSet))
    
def testDecisionTree(test, dTree):
    if(isinstance(dTree, list)):
        minFeature = dTree[0]
        threshold = dTree[1]
        if(test[minFeature] < threshold):
            return testDecisionTree(test, dTree[2])
        else:
            return testDecisionTree(test, dTree[3])
    else:
        return dTree
    
def getBagAccuracy(bagResults):
    accuracy = 0
    count = []
    classes = []
    for results in bagResults:
        count = []
        classes = []
        result = results[0]
        truth = results[1]
        for y in result:
            if(y not in classes):
                count.append(0)
                classes.append(y)
            indx =  classes.index(y)
            count[indx]+=1
        
        indx = count.index(max(count))
        if(classes[indx] == truth):
            accuracy+=1
    return accuracy


In [272]:
def generateRandomForestTree(data, maxDepth):
    dTree = []
    minGini = getParentGini(data)
    parentGini = minGini
    l = len(data)
    classes = sorted(list(set(data["month"])))
    df1 = data.groupby(['month'], sort=True).size().reset_index(name='Count')
    randomFeatures = random.sample(features, math.ceil(math.sqrt(len(features))))
    print(randomFeatures)
    for feature in randomFeatures:
        dataSorted = data.sort_values([feature])
        nRight = np.array(df1["Count"])
        nLeft = np.array([0]*len(classes))
        for i in range(0,l-1):
            row = dataSorted.iloc[i]
            c = row["month"]
            indx = classes.index(c)
            nRight[indx]-=1
            nLeft[indx]+=1
            if(row[feature] == dataSorted.iloc[i+1][feature]):
                continue
            gini = getGiniValue(nLeft, nRight)
            if(minGini> gini):
                minGini = gini
                minFeature = feature
                threshold = (row[feature] + dataSorted.iloc[i+1][feature])/2

    if(parentGini==minGini):
        df = data.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        return df.loc[0]["month"]
    
    R1 = data.loc[data[minFeature] < threshold]
    R2 = data.loc[data[minFeature] >= threshold]

    if(minGini == 0 or maxDepth == 0):    
        df = R1.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        DT1 = df.loc[0]["month"]
        df = R2.groupby(['month'], sort=True).size().reset_index(name='Count')
        df = df.sort_values(by ='Count' , ascending=False)
        DT2 = df.loc[0]["month"]
    else:    
        DT1 = generateDecisionTree(R1,maxDepth-1)
        DT2 = generateDecisionTree(R2,maxDepth-1)
    
    dTree = [minFeature, threshold, DT1, DT2]
    
    return dTree
    

In [273]:
##### random forest classification########################################

def generateRandomDataSet(trainSet, bagSize, n):
    bagTrainSet = []
    l = len(trainSet)
    size = int(l*bagSize/100)
    for i in range(n):
        bagTrainSet.append(trainSet.sample(n = size))
    
    return bagTrainSet

def trainRandomForest(bagTrainSet, maxDepth):
    bagDtrees = []
    for i in range(len(bagTrainSet)):
        dTree = generateRandomForestTree(bagTrainSet[i], maxDepth)
        print("done")
        bagDtrees.append(dTree)
    return bagDtrees

def testRandomForest(bagDtrees, testSet):
    bagResults = []
    for i in range(len(testSet)):
        testData = testSet.iloc[i]
        result = []
        for dTree in bagDtrees:
            y = testDecisionTree(testData, dTree)
            result.append(y)
        bagResults.append((result, testData["month"]))
    print(bagResults)
    count = getBagAccuracy(bagResults)
    print("Accuracy", count/len(testSet))
    
    
def getBagAccuracy(bagResults):
    accuracy = 0
    count = []
    classes = []
    for results in bagResults:
        count = []
        classes = []
        result = results[0]
        truth = results[1]
        for y in result:
            if(y not in classes):
                count.append(0)
                classes.append(y)
            indx =  classes.index(y)
            count[indx]+=1
        
        indx = count.index(max(count))
        if(classes[indx] == truth):
            accuracy+=1
    return accuracy



In [None]:
###################################### REGRESSION ############################################################

In [5]:
##########################################      REGRESSION   ######################################################

def getRegressionFeatures(features, target):
    features.remove(target)
    return features

def getRegionError(nLeft, nRight):
    meanLeft = np.mean(nLeft)
    meanRight = np.mean(nRight)
    nLeft = nLeft-meanLeft
    nRight = nRight - meanRight
    nLeft = np.square(nLeft)
    nRight = np.square(nRight)
    error = np.sum(nLeft) + np.sum(nRight)
    return error

def getParentError(regionValues):
    meanRegion = np.mean(regionValues)
    regionValues = regionValues-meanRegion
    regionValues = np.square(regionValues)
    error = np.sum(regionValues)
    return error

In [195]:
def trainRegressionTree(trainSet, maxDepth, target,  maxWidth = 0):
    dTree = generateRegressionTree(trainSet, maxDepth, target)
    return dTree
    
def testRegressionTree(test, dTree):
    if(isinstance(dTree, list)):
        minFeature = dTree[0]
        threshold = dTree[1]
        if(test[minFeature] < threshold):
            return testRegressionTree(test, dTree[2])
        else:
            return testRegressionTree(test, dTree[3])
    else:
        return dTree
    
def getMSE(results):
    mse = 0
    error = 0
    sd = 0
    elements = []
    for i in range(len(results)):
        elements.append(results[i][0]-results[i][1])
        error+= results[i][0]-results[i][1]
        mse += np.square(results[i][0]-results[i][1])
    sd = np.std(elements)
    error = error/len(elements)
    return mse, [error, sd]

def getRegressionResults(testSet, dTree, target):
    results = []
    for i in range(len(testSet)):
        testData = testSet.iloc[i]
        result = testRegressionTree(testData, dTree)
#         print(result, testData[target])
        results.append((result, testData[target]))
    count, error = getMSE(results)
#     print(count)
    print("error mean", error[0])
    print("standard deviation error", error[1])
    print("MSE", count/len(testSet))

In [95]:
##### ---------- Regression Decision Tree -----------

def generateRegressionTree(data, maxDepth, target):
    dTree = []
    targetValues = sorted(list(data[target]))
#     print(targetValues)
#     minError = getParentError(targetValues)
#     parentError = minError
    minError = math.inf
    l = len(data)
#     print(regressionFeatures)
    for feature in regressionFeatures:
        dataSorted = data.sort_values([feature])
        nRight = []
        nRight[:] = targetValues
        nLeft = []
        for i in range(0,l-1):
            row = dataSorted.iloc[i]
            y = row[target]
#             print(y)
            nRight.remove(y)
            nLeft.append(y)
            if(row[feature] == dataSorted.iloc[i+1][feature]):
                continue
            error = getRegionError(nLeft, nRight)
            if(minError> error):
                minError = error
                minFeature = feature
                threshold = (row[feature] + dataSorted.iloc[i+1][feature])/2

    
#     print("split at", maxDepth)

    if(minError==math.inf):
        targetMean = data[target].mean()
        return targetMean
    
    R1 = data.loc[data[minFeature] < threshold]
    R2 = data.loc[data[minFeature] >= threshold]
    
    if(maxDepth == 0):    
        DT1 = R1[target].mean()
        DT2 = R2[target].mean()
    else:    
        DT1 = generateRegressionTree(R1,maxDepth-1, target)
        DT2 = generateRegressionTree(R2,maxDepth-1, target)
    
    dTree = [minFeature, threshold, DT1, DT2]
    
    return dTree
    

In [132]:
##### ---------- Regression Decision Tree -----------

def generateRTreeWithWidth(data, maxDepth, target, maxWidth = math.pow(2, 500)):
    dTree = []
    targetValues = sorted(list(data[target]))
    minError = math.inf
    l = len(data)
#     print(regressionFeatures)
    for feature in regressionFeatures:
        dataSorted = data.sort_values([feature])
        nRight = []
        nRight[:] = targetValues
        nLeft = []
        for i in range(0,l-1):
            row = dataSorted.iloc[i]
            y = row[target]
#             print(y)
            nRight.remove(y)
            nLeft.append(y)
            if(row[feature] == dataSorted.iloc[i+1][feature]):
                continue
            error = getRegionError(nLeft, nRight)
            if(minError> error):
                minError = error
                minFeature = feature
                threshold = (row[feature] + dataSorted.iloc[i+1][feature])/2

    
#     print("split at", maxDepth)

    if(minError==math.inf or maxWidth <= 0):
        targetMean = data[target].mean()
        return targetMean, maxWidth-1
    
    R1 = data.loc[data[minFeature] < threshold]
    R2 = data.loc[data[minFeature] >= threshold]
    
    if(maxDepth == 0):    
        DT1 = R1[target].mean()
        DT2 = R2[target].mean()
        width = maxWidth - 2
    else:    
        DT1, width = generateRTreeWithWidth(R1,maxDepth-1, target, maxWidth)
        DT2, width = generateRTreeWithWidth(R2,maxDepth-1, target, width)
    
    dTree = [minFeature, threshold, DT1, DT2]
    
    return dTree, width
    


In [186]:
###################### -----  Regression using Bagged Decision Tree  ----- ########################################

def generateBaggedDataSet(trainSet, bagSize, n):
    bagTrainSet = []
    l = len(trainSet)
    size = int(l*bagSize/100)
    for i in range(n):
        bagTrainSet.append(trainSet.sample(n = size))
 
    return bagTrainSet

def trainBagRegressionTree(bagTrainSet, maxDepth, target):
    bagDtrees = []
    for i in range(len(bagTrainSet)):
        dTree = generateRegressionTree(bagTrainSet[i], maxDepth, target)
        print("done")
        bagDtrees.append(dTree)
    return bagDtrees

def testBagRtree(bagDtrees, testSet, target):
    bagResults = []
    for i in range(len(testSet)):
        testData = testSet.iloc[i]
        result = []
        for dTree in bagDtrees:
            y = testRegressionTree(testData, dTree)
            result.append(y)
        result = np.array(result)
        resultMean = np.mean(result)
        bagResults.append((resultMean, testData[target]))
    count, error = getBagMSE(bagResults)
    print("error mean", error[0])
    print("standard deviation error", error[1])
    print("MSE", count/len(testSet))
    

def getBagMSE(bagResults):
    elements = []
    error = 0
    bagMse = 0
    for i in range(len(bagResults)):
        error += bagResults[i][0]-bagResults[i][1]
        elements.append(bagResults[i][0]-bagResults[i][1])
        bagMse += np.square(bagResults[i][0]-bagResults[i][1])
    sd = np.std(elements)
    error = error/len(elements)
    return bagMse, [error, sd]

In [190]:
################################  Regression Random Forest ###################################

def generateRandomForestRegTree(data, maxDepth, target):
    dTree = []
    targetValues = sorted(list(data[target]))
    minError = math.inf
    l = len(data)
    randomFeatures = random.sample(regressionFeatures, math.ceil(math.sqrt(len(regressionFeatures))))
    for feature in randomFeatures:
        dataSorted = data.sort_values([feature])
        nRight = []
        nRight[:] = targetValues
        nLeft = []
        for i in range(0,l-1):
            row = dataSorted.iloc[i]
            y = row[target]
#             print(y)
            nRight.remove(y)
            nLeft.append(y)
            if(row[feature] == dataSorted.iloc[i+1][feature]):
                continue
            error = getRegionError(nLeft, nRight)
            if(minError> error):
                minError = error
                minFeature = feature
                threshold = (row[feature] + dataSorted.iloc[i+1][feature])/2

    
#     print("split at", maxDepth)

    if(minError==math.inf):
        targetMean = data[target].mean()
        return targetMean
    
    R1 = data.loc[data[minFeature] < threshold]
    R2 = data.loc[data[minFeature] >= threshold]
    
    if(maxDepth == 0):    
        DT1 = R1[target].mean()
        DT2 = R2[target].mean()
    else:    
        DT1 = generateRandomForestRegTree(R1,maxDepth-1, target)
        DT2 = generateRandomForestRegTree(R2,maxDepth-1, target)
    
    dTree = [minFeature, threshold, DT1, DT2]
    
    return dTree


def trainRegRandomForest(bagTrainSet, maxDepth, target):
    bagDtrees = []
    for i in range(len(bagTrainSet)):
        dTree = generateRandomForestRegTree(bagTrainSet[i], maxDepth, target)
        print("done")
        bagDtrees.append(dTree)
    return bagDtrees

def testRegRandomForest(bagDtrees, testSet):
    bagResults = []
    for i in range(len(testSet)):
        testData = testSet.iloc[i]
        result = []
        for dTree in bagDtrees:
            y = testRegressionTree(testData, dTree)
            result.append(y)
        result = np.array(result)
        resultMean = np.mean(result)
        bagResults.append((resultMean, testData[target]))
    count, error = getBagMSE(bagResults)
    print("error mean", error[0])
    print("standard deviation error", error[1])
    print("MSE", count/len(testSet))


In [None]:
##################################################################################################################
### command line argument 
argumentList = sys.argv 
dataSetFile = sys.argv[1] 

In [None]:
### code start from here
trainSet, testSet = getTrainTestData(dataSetFile)
featuresMap = getFeaturesListAndValues(trainSet)
featuresMap["DEWP"] = sorted(featuresMap["DEWP"])
features = list(featuresMap.keys())

In [147]:
####using precomputed decision trees
##### Classification using Decision Tree with maxDepth param
print("Classification using Decision Tree with maxDepth param")
path = "decision_trees_pickle"
depthList = [3,5,7,10,20,30,50,100]

for depth in depthList:
    file = path+"/dTree_d"+str(depth)
    dTree = loadFromPickle(file)
    print("depth =",depth,)
    getTestResults(testSet, dTree)
    
### run this code to mnaually test a decision tree
# dTree = trainDecisionTree(trainSet, 25)
# print("Training Accuracy :")
# getTestResults(trainSet, dTree)
# print("Testing Accuracy :")
# getTestResults(testSet, dTree)

depth = 3
Accuracy 10.468036529680365
depth = 5
Accuracy 15.256849315068493
depth = 7
Accuracy 20.422374429223744
depth = 10
Accuracy 32.19178082191781
depth = 20
Accuracy 38.89840182648402
depth = 30
Accuracy 38.89840182648402
depth = 50
Accuracy 38.89840182648402
depth = 100
Accuracy 38.89840182648402


In [149]:
##### Classification using Decision Tree with maxDepth param and maxWidth param

print("\nClassification using Decision Tree with maxDepth param and maxWidth param")

depth = 5
widthList = [5, 10, 20]
print("depth =",depth,)
for width in widthList:
    file = path+"/dTree_d"+str(depth)+"_w"+str(width)
    dTree = loadFromPickle(file)
    print("width =",width)
    getTestResults(testSet, dTree)

########## run this code to manually test 
# maxDepth = 5
# maxWidth = 20 
# dTree, width = generateDTreeWithParams(trainSet, maxDepth, maxWidth)
# print("Training Accuracy :")
# getTestResults(trainSet, dTree)
# print("Testing Accuracy :")
# getTestResults(testSet, dTree)

depth = 5
width = 5
Accuracy 8.43607305936073
width = 10
Accuracy 8.43607305936073
width = 20
Accuracy 8.05365296803653


In [156]:
### bagged decision tree classification

print("\nbagged decision tree classification")

depthList = [3, 5]
nList = [5, 10, 15]
for depth in depthList:
    for n in nList:
        file = path+"/baggedDTree_d"+str(depth)+"_n"+str(n)
        dTree = loadFromPickle(file)
        print("depth =",depth, "n =", n)
        testBagDtree(dTree, testSet)


### run this code to manually test
# bagSize = 10 ###### bagSize denotes the perentage of training data to be taken for training
# n = 15  #### number of decision trees taking part in bagged decision tree training
# bagTrainSet = generateBaggedDataSet(trainSet, bagSize, n)
# bagDtrees = trainBagDecisionTree(bagTrainSet, maxDepth = 5)
# saveInPickle(bagDtrees, "baggedDTree_d5_n15")
# testBagDtree(bagDtrees, testSet)

depth = 3 n = 5
Accuracy 10.673515981735159
depth = 3 n = 10
Accuracy 12.174657534246576
depth = 3 n = 15
Accuracy 14.126712328767123
depth = 5 n = 5
Accuracy 16.48972602739726
depth = 5 n = 10
Accuracy 19.337899543378995
depth = 5 n = 15
Accuracy 19.726027397260275


In [158]:
##### random forest classification  ########################################
print("\nrandom forest classification")

depthList = [5]
nList = [5, 10, 15]
for depth in depthList:
    for n in nList:
        file = path+"/randomForest_d"+str(depth)+"_n"+str(n)
        dTree = loadFromPickle(file)
        print("depth =",depth, "n =", n)
        testBagDtree(dTree, testSet)

### run this code to manually test        
# bagSize = 10 ###### bagSize denotes the perentage of training data to be taken for training
# n = 10  #### number of decision trees taking part in bagged decision tree training
# bagTrainSet = generateRandomDataSet(trainSet, bagSize, n)
# randomForest = trainRandomForest(bagTrainSet, maxDepth = 5)
# saveInPickle(randomForest, "randomForest_d5_n10")
# testBagDtree(randomForest, testSet)

depth = 5 n = 5
Accuracy 20.8675799086758
depth = 5 n = 10
Accuracy 18.46461187214612
depth = 5 n = 15
Accuracy 19.09246575342466


In [196]:
#### Regression using decision tree with maxDepth

print("\nregression using decision tree")

depthList = [3,5,10,20]
target="pm2.5"
for depth in depthList:
    file = path+"/RTree_d"+str(depth)
    dTree = loadFromPickle(file)
    print("depth =",depth,)
    getRegressionResults(testSet, dTree, target)
    
# regressionFeatures = list(trainSet.columns)
# regressionFeatures.remove(target)
# print(regressionFeatures)
# target = "pm2.5"
# maxDepth = 20

# dTree = trainRegressionTree(trainSet, maxDepth, target)
# print("Training Accuracy :")
# getRegressionResults(trainSet, dTree, target)
# print("Testing Accuracy :")
# getRegressionResults(testSet, dTree, target)
# saveInPickle(dTree, "RTree_d"+str(maxDepth))


regression using decision tree
depth = 3
error mean -98.00118172295589
standard deviation error 93.41026603879334
MSE 18329.70942053402
depth = 5
error mean -6.492356414352403
standard deviation error 79.49131000272936
MSE 6361.019057760899
depth = 10
error mean -14.107162193792345
standard deviation error 85.16963327443298
MSE 7452.878457263348
depth = 20
error mean -13.658723239657148
standard deviation error 92.11119361907664
MSE 8671.03271046856


In [187]:
#### regression using bagged decision tree
print("\nregression using bagged decision tree")
depthList = [10]
nList = [5, 10, 15]
for depth in depthList:
    for n in nList:
        file = path+"/baggedRegTree_d"+str(depth)+"_n"+str(n)
        dTree = loadFromPickle(file)
        print("depth =",depth, "n =", n)
        testBagRtree(dTree, testSet, target)

        
# bagSize = 10 ###### bagSize denotes the perentage of training data to be taken for training
# n = 10  #### number of decision trees taking part in bagged decision tree training
# target = "pm2.5"
# maxDepth = 10
# bagTrainSet = generateBaggedDataSet(trainSet, bagSize, n)
# bagDtrees = trainBagRegressionTree(bagTrainSet,maxDepth, target)
# testBagRtree(bagDtrees, testSet, target)


regression using bagged decision tree
depth = 10 n = 5
error mean -8.734479877863011
standard deviation error 75.8623815644502
MSE 5831.392075366973
depth = 10 n = 10
error mean -8.618251012317113
standard deviation error 72.4729240413074
MSE 5326.598969608431
depth = 10 n = 15
error mean -8.656993967499071
standard deviation error 72.1550563083018
MSE 5281.295695407537


In [191]:
#### regression using Random forest
print("\nregression using Random forest")
depthList = [10]
nList = [5, 10, 15]
for depth in depthList:
    for n in nList:
        file = path+"/Reg_RandomForest_d"+str(depth)+"_n"+str(n)
        dTree = loadFromPickle(file)
        print("depth =",depth, "n =", n)
        testRegRandomForest(dTree, testSet)
        
        
# bagSize = 10 ###### bagSize denotes the perentage of training data to be taken for training
# n = 10  #### number of decision trees taking part in bagged decision tree training
# maxDepth = 10
# bagTrainSet = generateRandomDataSet(trainSet, bagSize, n)
# randomForest = trainRegRandomForest(bagTrainSet, maxDepth, target)
# testRegRandomForest(randomForest, testSet)


regression using Random forest
depth = 10 n = 5
error mean -7.653774585011568
standard deviation error 77.10757638595479
MSE 6004.15860151401
depth = 10 n = 10
error mean -6.010047673735416
standard deviation error 75.01752098500705
MSE 5663.749127776518
depth = 10 n = 15
error mean -7.026487414689301
standard deviation error 74.23671597993476
MSE 5560.461524874287


In [14]:
##### Classification using Decision Tree with maxDepth param

# depthList = [3,5,7,10,20,30,50,100]

# for depth in depthList:
#     file = path+"/dTree_d"+str(depth)
#     dTree = loadFromPickle(file)
#     print("depth =",depth,)
#     getTestResults(testSet, dTree)

    
# dTree = trainDecisionTree(trainSet, 25)
# print("Training Accuracy :")
# getTestResults(trainSet, dTree)
# print("Testing Accuracy :")
# getTestResults(testSet, dTree)
# saveInPickle(dTree, "DTree_d25")

# ##### Classification using Decision Tree with maxDepth param and maxWidth param
# maxDepth = 5
# maxWidth = 
# dTree, width = generateDTreeWithParams(trainSet, maxDepth, maxWidth)
# print("Training Accuracy :")
# getTestResults(trainSet, dTree)
# print("Testing Accuracy :")
# getTestResults(testSet, dTree)
# saveInPickle(dTree, "DTree_d"+str(maxDepth)+"_w"+str(maxWidth))

# ### bagged decision tree classification
# bagSize = 10 ###### bagSize denotes the perentage of training data to be taken for training
# n = 15  #### number of decision trees taking part in bagged decision tree training
# bagTrainSet = generateBaggedDataSet(trainSet, bagSize, n)
# bagDtrees = trainBagDecisionTree(bagTrainSet, maxDepth = 5)
# saveInPickle(bagDtrees, "baggedDTree_d5_n15")
# testBagDtree(bagDtrees, testSet)

# ##### random forest classification########################################
# bagSize = 10 ###### bagSize denotes the perentage of training data to be taken for training
# n = 10  #### number of decision trees taking part in bagged decision tree training
# bagTrainSet = generateRandomDataSet(trainSet, bagSize, n)
# randomForest = trainRandomForest(bagTrainSet, maxDepth = 5)
# saveInPickle(randomForest, "randomForest_d5_n10")
# testBagDtree(randomForest, testSet)