# Problem 2

Use this notebook to write your code for problem 3.

In [19]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=' ')

# Load the training and test data
data_train = load_data('data/training_data.txt', 1)
X_train = data_train[:, 1:]
y_train = data_train[:, 0]
#y_train = keras.utils.np_utils.to_categorical(y_train_pre,num_classes=2)

data_test = load_data('data/test_data.txt', 1)
X_test = data_test[:,:]

In [20]:
xMean = np.mean(X_train,axis=0)
xStd = np.std(X_train,axis=0)
X_train = (X_train - xMean)/xStd
X_test = (X_test - xMean)/xStd

## 2C - Depth vs Width for MNIST

As in problem 2, we have conveniently provided for your use code that loads, preprocesses, and deals with the uglies of the MNIST data.

In [2]:
# load MNIST data into Keras format
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from sklearn.cross_validation import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [21]:
class AdaBoost():
    def __init__(self, n_clfs=100):
        '''
        Initialize the AdaBoost model.

        Inputs:
            n_clfs (default 100): Initializer for self.n_clfs.        
                
        Attributes:
            self.n_clfs: The number of DT weak classifiers.
            self.coefs: A list of the AdaBoost coefficients.
            self.clfs: A list of the DT weak classifiers, initialized as empty.
        '''
        self.n_clfs = n_clfs
        self.coefs = []
        self.clfs = []

    def fit(self, X, Y, n_nodes=4):
        '''
        Fit the AdaBoost model. Note that since we are implementing this method in a class, rather
        than having a bunch of inputs and outputs, you will deal with the attributes of the class.
        (see the __init__() method).
        
        This method should thus train self.n_clfs DT weak classifiers and store them in self.clfs,
        with their coefficients in self.coefs.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.
            Y: A (N, ) shaped numpy array containing the (float) labels of the data points.
               (Even though the labels are ints, we treat them as floats.)
            n_nodes: The max number of nodes that the DT weak classifiers are allowed to have.
            
        Outputs:
            A (N, T) shaped numpy array, where T is the number of iterations / DT weak classifiers,
            such that the t^th column contains D_{t+1} (the dataset weights at iteration t+1).
        '''
        
        N = X.shape[0];
        
        D = np.zeros((N,self.n_clfs+1))        
        D[:,0] = 1./N*np.ones((N,))  
        
        for t in np.arange(0,self.n_clfs):
            print(t)
            clf = DecisionTreeClassifier(max_leaf_nodes=n_nodes)
            self.clfs.append(clf.fit(X,Y,sample_weight=D[:,t]))
            h = self.clfs[t].predict(X)
 
            e = np.sum(D[:,t]*np.sign(np.abs(Y-h)))
            a = 0.5*np.log((1-e)/e)
            self.coefs.append(a)
            
            D[:,t+1] = D[:,t]*np.exp(-a*Y*h)
            D[:,t+1] /= np.sum(D[:,t+1])          
             
        return D
    
    pass

    
    def predict(self, X):
        '''
        Predict on the given dataset.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.
            
        Outputs:
            A (N, ) shaped numpy array containing the (float) labels of the data points.
            (Even though the labels are ints, we treat them as floats.)
        '''
        # Initialize predictions.
        Y_pred = np.zeros(len(X))
        
        # Add predictions from each DT weak classifier.
        for i, clf in enumerate(self.clfs):
            Y_curr = self.coefs[i] * clf.predict(X)
            Y_pred += Y_curr

        # Return the sign of the predictions.
        return np.sign(Y_pred)

    def loss(self, X, Y):
        '''
        Calculate the classification loss.

        Inputs:
            X: A (N, D) shaped numpy array containing the data points.
            Y: A (N, ) shaped numpy array containing the (float) labels of the data points.
               (Even though the labels are ints, we treat them as floats.)
            
        Outputs:
            The classification loss.
        '''
        # Calculate the points where the predictions and the ground truths don't match.
        Y_pred = self.predict(X)
        misclassified = np.where(Y_pred != Y)[0]

        # Return the fraction of such points.
        return float(len(misclassified)) / len(X)

In [12]:
train = np.arange(0,16000)
test = np.arange(16000,20000)

model = AdaBoost(n_clfs=500)
D = model.fit(X_train[train], y_train_pre[train])

print('Training loss: %f' % model.loss(X_train[train], y_train_pre[train]))
print('Test loss: %f' % model.loss(X_train[test], y_train_pre[test]))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [181]:
## In the line below we have specified the loss function as 'mse' (Mean Squared Error) because in the above code we did not one-hot encode the labels.
## In your implementation, since you are one-hot encoding the labels, you should use 'categorical_crossentropy' as your loss.
## You will likely have the best results with RMS prop or Adam as your optimizer.  In the line below we use Adadelta

n_folds = 10
skf = StratifiedKFold(y_train_pre, n_folds=n_folds, shuffle=True)

train = np.arange(0,16000)
test = np.arange(16000,20000)

#for i, (train, test) in enumerate(skf):
print("Running Fold", i+1, "/", n_folds)
model = None # Clearing the NN.
model = create_model()

model.compile(loss='binary_crossentropy',optimizer='rmsprop', metrics=['accuracy'])

fit = model.fit(X_train[train], y_train[train], batch_size=128, epochs=10, verbose=1)    
score = model.evaluate(X_train[test], y_train[test], verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])


Running Fold 10 / 10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 1.1143082467913628
Test accuracy: 0.8225


In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import zero_one_loss

n_folds = 5
skf = StratifiedKFold(y_train, n_folds=n_folds, shuffle=True)

pred = np.zeros(10000)

for i, (train, test) in enumerate(skf):
    print("Running Fold", i+1, "/", n_folds)
    forest = RandomForestClassifier(n_estimators = 500, criterion='entropy',max_features='log2')

    forest = forest.fit(X_train[train], y_train[train]) 
    acc = forest.score(X_train[test],y_train[test])
    
    pred += forest.predict(X_test)
    
    print('Test accuracy:', acc)

y_labs = np.maximum(0.,np.sign(pred-2.5))



Running Fold 1 / 5
Test accuracy: 0.8482879280179955
Running Fold 2 / 5
Test accuracy: 0.8395401149712571
Running Fold 3 / 5
Test accuracy: 0.84725
Running Fold 4 / 5
Test accuracy: 0.8537134283570893
Running Fold 5 / 5
Test accuracy: 0.8394598649662416


In [38]:
file = open("labels.txt","w") 
file.write("Id,Prediction\n")
for i,x in enumerate(y_labs):
    file.write("%i,%i\n" % (i+1,x))

In [39]:
y_labs = np.maximum(0.,np.sign(pred-2.5))

In [40]:
y_labs

array([1., 1., 0., ..., 0., 1., 0.])