In [1]:
import sys
import os
import gc
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
df_data = pd.read_csv("./training_data.csv", header=None)
df_label = pd.read_csv("./training_labels.csv", header=None)
# combine data with label by app name
df = pd.merge(df_data, df_label, left_on=0, right_on=0, how="inner")
df.rename(columns={'1_x':'1', '1_y':'label'}, inplace=True)
df.drop([0], axis=1, inplace=True)
# encode label 
categories = list(sorted(set(df_label[1])))
new_col = pd.Series(map(lambda x : categories.index(x), df.iloc[:,-1].tolist()))
df = df.assign(label = new_col)
df.drop(df.columns[-2], axis=1, inplace=True)
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,13617,13618,13619,13620,13621,13622,13623,13624,13625,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [3]:
data=df.values
#release memory
del df_data,df_label,df,categories
gc.collect()

72

In [4]:
X = data[:,0:-1]
print(X)

y = data[:,-1]
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[19. 23. 12. ... 14.  9. 19.]


In [5]:
print(X.shape)

(20104L, 13625L)


In [6]:
def sigmoid(z):  
    return 1 / (1 + np.exp(-z))

def cost(theta, X, y, learningRate):  
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
    reg = (learningRate / 2 * len(X)) * np.sum(np.power(
        theta[:,1:theta.shape[1]], 2))
    return np.sum(first - second) / (len(X)) + reg

def gradient(theta, X, y, learningRate):  
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    parameters = int(theta.ravel().shape[1])
    error = sigmoid(X * theta.T) - y

    grad = ((X.T * error) / len(X)).T + ((learningRate / len(X)) * theta)

    # intercept gradient is not regularized
    grad[0, 0] = np.sum(np.multiply(error, X[:,0])) / len(X)

    return np.array(grad).ravel()

In [7]:
from scipy.optimize import minimize

def one_vs_all(X, y, num_labels, learning_rate):  
    rows = X.shape[0]
    params = X.shape[1]

    # k X (n + 1) array for the parameters of each of the k classifiers
    all_theta = np.zeros((num_labels, params + 1))

    # insert a column of ones at the beginning for the intercept term
    X = np.insert(X, 0, values=np.ones(rows), axis=1)

    # labels are 1-indexed instead of 0-indexed
    for i in range(1, num_labels + 1):
        theta = np.zeros(params + 1)
        y_i = np.array([1 if label == i else 0 for label in y])
        y_i = np.reshape(y_i, (rows, 1))

        # minimize the objective function
        fmin = minimize(fun=cost, x0=theta, args=(X, y_i, learning_rate), method='TNC', jac=gradient)
        all_theta[i-1,:] = fmin.x
        print('processing:', i)
    return all_theta

In [8]:
rows = X.shape[0]  
params = X.shape[1]

# all_theta = np.zeros((30, params + 1))

X = np.insert(X, 0, values=np.ones(rows), axis=1)

theta = np.zeros(params + 1)

y_0 = np.array([1 if label == 0 else 0 for label in y])  
y_0 = np.reshape(y_0, (rows, 1))

# X.shape, y_0.shape, theta.shape, all_theta.shape


In [9]:
np.unique(y)

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29.])

In [10]:
all_theta = one_vs_all(X, y, 30, 1)  
all_theta

  if __name__ == '__main__':
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


('processing:', 1)
('processing:', 2)
('processing:', 3)
('processing:', 4)
('processing:', 5)
('processing:', 6)
('processing:', 7)
('processing:', 8)
('processing:', 9)
('processing:', 10)
('processing:', 11)
('processing:', 12)
('processing:', 13)
('processing:', 14)
('processing:', 15)
('processing:', 16)
('processing:', 17)
('processing:', 18)
('processing:', 19)
('processing:', 20)
('processing:', 21)
('processing:', 22)
('processing:', 23)
('processing:', 24)
('processing:', 25)
('processing:', 26)
('processing:', 27)
('processing:', 28)
('processing:', 29)
('processing:', 30)


array([[-1.92647351e+00, -1.92589859e+00, -2.16890259e-01, ...,
         1.90368422e+00,  9.84874841e-01,  5.79213319e-01],
       [-1.91684298e+00, -1.91787793e+00, -2.88948033e-02, ...,
        -7.34479296e-02, -3.20928969e-02, -2.79270457e-02],
       [-2.48723376e+00, -2.47938971e+00, -2.36315839e-01, ...,
        -1.25856853e-01, -6.46599775e-02, -4.97163512e-02],
       ...,
       [-4.25566904e+00, -4.25532222e+00,  3.20313220e+00, ...,
        -1.76429875e+00, -1.62449236e-01, -1.59822825e-01],
       [-2.17653927e+00, -2.17656534e+00, -5.03572114e-02, ...,
        -3.63372710e-02, -2.12351118e-02, -1.84706962e-02],
       [-7.36106328e+00, -7.36117617e+00, -9.27118851e-04, ...,
        -1.27405651e-04, -9.96096820e-04, -1.00017388e-03]])

In [14]:
def predict_all(X, all_theta):  
    rows = X.shape[0]
    params = X.shape[1]
    num_labels = all_theta.shape[0]

    # same as before, insert ones to match the shape
    X = np.insert(X, 0, values=np.ones(rows), axis=1)

    # convert to matrices
    X = np.matrix(X)
    all_theta = np.matrix(all_theta)

    # compute the class probability for each class on each training instance
    h = sigmoid(X * all_theta.T)

    # create array of the index with the maximum probability
    h_argmax = np.argmax(h, axis=1)

    # because our array was zero-indexed we need to add one for the true label prediction
    h_argmax = h_argmax + 1

    return h_argmax

In [11]:
all_theta.shape

(30L, 13627L)

In [15]:
y_pred = predict_all(X, all_theta)  
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, y)]  
accuracy = (sum(map(int, correct)) / float(len(correct)))  
print 'accuracy = {0}%'.format(accuracy * 100)

accuracy = 82.7795463589%
