# SGD 

In [1]:
import numpy as np
import pandas as pd
import cudf
import os
from cuml.solvers import SGD as cumlSGD
from sklearn.linear_model import SGDRegressor

# Select a particular GPU to run the notebook  
os.environ["CUDA_VISIBLE_DEVICES"]="1"

# Helper Functions

In [2]:

from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start


In [58]:
import gzip
def load_data(nrows, ncols, cached = '/rapids/notebooks/wip/notebooks/cuml/data/mortgage.npy.gz'):
    if os.path.exists(cached):
        print('use mortgage data')

        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity'
        # used as the label
        X = X[:,[i for i in range(X.shape[1]) if i!=4]]
        y = X[:,4:5]
        rindices = np.random.randint(0,X.shape[0]-1,nrows)
        X = X[rindices,:ncols]
        y = y[rindices]

    else:
        print('use random data')
        X = np.random.rand(nrows,ncols)
        y = np.random.randint(10,size=(nrows,1))
    train_rows = int(nrows*0.8)
    df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})
    df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})
    df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})
    df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})
    return df_X_train, df_X_test, df_y_train, df_y_test


In [59]:

from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=2e-3,with_sign=True):
    a = to_nparray(a).ravel()
    b = to_nparray(b).ravel()
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    error = mean_squared_error(a,b)
    res = error<threshold
    return res

def to_nparray(x):
    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
        return np.array(x)
    elif isinstance(x,np.float64):
        return np.array([x])
    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
        return x.to_pandas().values
    return x


# Run tests

In [60]:
%%time
nrows = 2**20
ncols = 399

X_train, X_test, y_train, y_test = load_data(nrows,ncols)
y_train_ser = y_train['fea0']
print('training data',X_train.shape)
print('training label',y_train.shape)
print('testing data',X_test.shape)
print('testing label',y_test.shape)

use mortgage data
training data (838860, 399)
training label (838860, 1)
testing data (209716, 399)
testing label (209716, 1)
CPU times: user 13.9 s, sys: 1.99 s, total: 15.9 s
Wall time: 15.9 s


In [61]:
#set parameters 
learning_rate = 'adaptive'
datatype = np.float32
penalty = 'elasticnet'
loss = 'squared_loss'

Note :setting n_iter to the number of times sgd should iterate through the entire dataset, 
      setting max_iter will not ensure that sgd will iterate through the entire data that many times and 
      therefore a DeprecationWarning will occur


In [62]:
%%time

sk_sgd = SGDRegressor(learning_rate=learning_rate, eta0=0.07,
                       n_iter=5, tol=0.0, fit_intercept=True,
                       penalty=penalty, loss=loss)
sk_sgd.fit(X_train, y_train_ser)




CPU times: user 14.8 s, sys: 697 ms, total: 15.5 s
Wall time: 15.5 s


In [63]:
%%time
y_sk = sk_sgd.predict(X_test)
error_sk = mean_squared_error(y_test,y_sk)


CPU times: user 315 ms, sys: 180 ms, total: 496 ms
Wall time: 299 ms


In [64]:
%%time
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
y_cudf = cudf.Series(y_train_ser)

CPU times: user 1.69 s, sys: 406 ms, total: 2.1 s
Wall time: 2.11 s


In [65]:
%%time
cu_sgd = cumlSGD(learning_rate=learning_rate, eta0=0.07, epochs=500,
                 batch_size=512,
                 tol=0.0, penalty=penalty, loss=loss)
cu_sgd.fit(X_cudf, y_cudf)


CPU times: user 1min 36s, sys: 2min 44s, total: 4min 20s
Wall time: 4min 23s


In [66]:
%%time
y_pred = cu_sgd.predict(X_cudf_test)
y_pred = to_nparray(y_pred).ravel()
error_cu = mean_squared_error(y_test,y_pred)

CPU times: user 379 ms, sys: 25.8 ms, total: 404 ms
Wall time: 405 ms


In [67]:
print("SKL MSE(y):")
print(error_sk)
print("CUML MSE(y):")
print(error_cu)

SKL MSE(y):
1.5865602700295393e-07
CUML MSE(y):
1.04534735e-07


In [68]:

y_cuml_class = cu_sgd.predictClass(X_cudf_test)
y_cuml_class = to_nparray(y_cuml_class).ravel()
error_cu_pc = mean_squared_error(y_test,y_cuml_class)

In [69]:
np.unique(y_cuml_class)

array([0., 1.], dtype=float32)

In [71]:
y_pred

array([5.9840787e-04, 5.3116089e-01, 2.1306269e-01, ..., 7.6330650e-01,
       8.4202766e-01, 2.4278395e-01], dtype=float32)

In [72]:
y_sk 

array([8.14354375e-04, 5.31264145e-01, 2.13235212e-01, ...,
       7.63361033e-01, 8.42065215e-01, 2.42950042e-01])

In [73]:
error_cu_pc

0.08312841

In [70]:
np.unique(y_train)

array([0.        , 0.20466426, 0.20868516, 0.20948935, 0.2110977 ,
       0.21190189, 0.21270607, 0.2159228 , 0.21672697, 0.21913953,
       0.22155207, 0.22316043, 0.2247688 , 0.22637716, 0.22718135,
       0.22798553, 0.23039807, 0.23120224, 0.23200643, 0.23401688,
       0.23522316, 0.24165662, 0.2424608 , 0.24326497, 0.24567752,
       0.2464817 , 0.24969843, 0.252111  , 0.25291514, 0.25371933,
       0.2553277 , 0.2601528 , 0.26095697, 0.26176116, 0.26417372,
       0.26578206, 0.2689988 , 0.27060714, 0.27623641, 0.2770406 ,
       0.2778448 , 0.27945316, 0.28106153, 0.28166467, 0.28186572,
       0.28266987, 0.28347406, 0.2866908 , 0.2907117 , 0.2915159 ,
       0.29232007, 0.29634097, 0.29875353, 0.305187  , 0.30679533,
       0.3092079 , 0.3116204 , 0.3124246 , 0.32046643, 0.3212706 ,
       0.3220748 , 0.32287896, 0.3252915 , 0.3260957 , 0.32770407,
       0.33252916, 0.33333334, 0.33413753, 0.33574587, 0.33655006,
       0.33815843, 0.3389626 , 0.34057096, 0.34137514, 0.34217