In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

#os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"


In [2]:
## Where can we get the data ? 

In [3]:
# Generate predictors
X_raw = np.random.random(100*9)
X_raw = np.reshape(X_raw, (100, 9))

# Standardize the predictors
scaler = StandardScaler().fit(X_raw)
X = scaler.transform(X_raw)

# Add an intercept column to the model.  Essentially add 1.
#X = np.abs(np.concatenate((np.ones((X.shape[0],1)), X), axis=1))

print(X.shape)
#print(X)

(100, 9)


In [4]:
# We have the data.  How about the target ?
# Lets generate target

# Define coefficients
some_weights = np.array([2,6,7,3,5,7,1,2,4])
# Y = Xb
Y_truth = np.matmul(X,some_weights)

print(Y_truth)


[  3.22837193 -13.28519907 -12.04249453   3.25367343 -13.73964687
  10.62541322 -10.27630005  -2.13634762 -23.37443697   1.44027384
 -14.54257482  -4.31415827   3.61238716 -24.30138388  -2.6051898
  13.12320224   9.47060367   6.89827267  26.87800061  15.20467588
  11.45982132 -15.76661224  -2.33532527 -12.31019993 -13.10080956
  20.76860392 -13.23811085   2.93862903  17.46179181   9.73393013
  -1.9473759  -12.05232674 -11.9552322   19.68929573  12.19413759
  -4.00540253   9.72235637  -6.4919078   -1.0462246    8.3476282
  -7.55838013   2.88029136 -20.33610587  -7.79637239   8.11505389
 -21.51576329 -31.4239922  -10.09427758   2.60203556   7.12002922
 -38.21378718  17.44682659 -12.05881451  22.01628766  22.20822399
 -20.60085557  14.44873096  10.18232719  14.87996322  -3.13307234
  29.81996721   3.10815652  -2.328849     3.5401399  -17.64742408
  -7.16543961  -6.67958679  -7.91741451   2.95967729  11.51399966
  -2.16546046   5.06157388  15.18635625  -0.10514701  -0.61872241
  21.6454500

In [5]:
def mean_absolute_percentage_error(y_pred, y_truth, w=None):
    y_true = np.array(y_truth)
    y_pred = np.array(y_pred)
    if np.any(y_true==0):
        print("Remove zeros from set...")
        idx = np.where(y_true==0)
        y_true = np.delete(y_true, idx)
        y_pred = np.delete(y_pred, idx)
        if type(w) != type(None):
            w = np.array(w)
            w = np.delete(w, idx)     
    if type(w) == type(None):
        return(np.mean(np.abs((y_true - y_pred) / y_true)) * 100)
    return min(100,100/sum(w)*np.dot(w, (np.abs((y_true - y_pred) / y_true))))
    
custom_loss_function = mean_absolute_percentage_error


In [6]:
from scipy.optimize import minimize

def objective_function(w, X, Y):
    return custom_loss_function(np.matmul(X,w), Y)

Y = Y_truth
# add some noise
#Y = Y_truth*np.exp(np.random.normal(loc=0.0, scale=0.2, size=100))

#print(Y)

# provide a starting weights at which to initialize
# the parameter search space
initial_weights = np.array([1]*X.shape[1])
#initial_weights = np.array([np.mean(X)]*X.shape[1])

result = minimize(objective_function, initial_weights, args=(X,Y),
                  method='BFGS', options={'maxiter': 1000})

# The optimal values for the input parameters are stored
# in result.x
estimated_weights = result.x

print(estimated_weights)


[1.99999991 6.00000006 6.99999995 3.00000006 5.00000002 7.
 0.99999996 1.99999999 3.99999999]


In [7]:
pd.DataFrame({
    "initial_weights": some_weights, 
    "estimated_weights": estimated_weights,
    "error": np.array(np.around(some_weights-estimated_weights,2))
})

#[["y_truth", "y_pred", "error"]]


Unnamed: 0,initial_weights,estimated_weights,error
0,2,2.0,0.0
1,6,6.0,-0.0
2,7,7.0,0.0
3,3,3.0,-0.0
4,5,5.0,-0.0
5,7,7.0,0.0
6,1,1.0,0.0
7,2,2.0,0.0
8,4,4.0,0.0


In [8]:
round(custom_loss_function(np.matmul(X,estimated_weights), Y),2)

0.0

In [9]:
class ManyTargetsModel:
    """
    Linear model: Y = XB, fit by minimizing the provided loss_function
    with L2 regularization
    """
    def __init__(self, loss_function, regularization=0.00012):
        self.regularization = regularization
        self.loss_function = loss_function


    def predict(self, X):
        prediction = np.matmul(X, self.beta)
        return(prediction)

#     def score(self, X, y_true):
#         return(sum(self.predict(X)))


    def model_error(self):
        error = self.loss_function(
            self.predict(self.X), self.Y, w=self.sample_weights
        )
        return(error)
    
    def l2_regularized_loss(self, beta):
        self.beta = beta
        m = len(self.X)
#        return (self.model_error())
        return(self.model_error()/m + \
                sum(((self.regularization)/2*m)*(np.array(self.beta)**2)))
    
    def fit(self, X, Y, maxiter=250, sample_weights=None, initial_weights=None):        
        self.X = X
        self.Y = Y

        self.beta = None  #latest weights
        self.sample_weights = sample_weights
        self.estimated_weights = initial_weights
        
        # Initialize estimated_weights
        if type(self.estimated_weights)==type(None):
            # set estimated_weights = 1 for every feature
            self.estimated_weights = np.array([1]*self.X.shape[1])
            
        res = minimize(self.l2_regularized_loss, self.estimated_weights
                       #, args=(X,Y),
                       ,method='BFGS', options={'maxiter': maxiter})
        self.beta = res.x
        self.estimated_weights = self.beta

In [10]:
model = ManyTargetsModel(mean_absolute_percentage_error, regularization=0.000012)
model.fit( X, Y)
model.estimated_weights

array([1.99999864, 5.99999705, 6.99999644, 2.99999825, 4.99999785,
       6.99999648, 0.9999998 , 1.99999899, 3.99999836])

In [11]:
pd.DataFrame({
    "initial_weights": some_weights, 
    "estimated_weights": model.estimated_weights,
    "error": np.array(np.around(some_weights-model.estimated_weights,6))
})

Unnamed: 0,initial_weights,estimated_weights,error
0,2,1.999999,1e-06
1,6,5.999997,3e-06
2,7,6.999996,4e-06
3,3,2.999998,2e-06
4,5,4.999998,2e-06
5,7,6.999996,4e-06
6,1,1.0,0.0
7,2,1.999999,1e-06
8,4,3.999998,2e-06


In [12]:
# Predicted Y vs. observed Y
#plt.scatter(model.predict(X), Y)
round(custom_loss_function(np.matmul(X,model.estimated_weights), Y),4)

0.0

In [13]:
from sklearn.model_selection import KFold

# Used to cross-validate models and identify optimal lambda
class CustomCrossValidator:
    
    """
    Cross validates arbitrary model using MAPE criterion on
    list of lambdas.
    """
    def __init__(self):
        pass
        
    def cross_validate(self, ModelClass, X, Y, lambdas, 
                        loss_function, 
                        sample_weights=None,
                        num_folds=10):
        """
        lambdas: set of regularization parameters to try
        num_folds: number of folds to cross-validate against
        """
        
        self.X = X
        self.Y = Y
        self.ModelClass = ModelClass
        self.loss_function = loss_function
        self.sample_weights = sample_weights
    

        self.lambdas = lambdas
        self.cv_scores = []
        X = self.X
        Y = self.Y 
        
        # Beta values are not likely to differ dramatically
        # between differnt folds. Keeping track of the estimated
        # beta coefficients and passing them as starting values
        # to the .fit() operator on our model class can significantly
        # lower the time it takes for the minimize() function to run
        beta_init = None
        
        for lam in self.lambdas:
            print("Lambda: {}".format(lam))
            
            # Split data into training/holdout sets
            kf = KFold(n_splits=num_folds, shuffle=True)
            kf.get_n_splits(X)
            
            # Keep track of the error for each holdout fold
            k_fold_scores = []
            
            # Iterate over folds, using k-1 folds for training
            # and the k-th fold for validation
            f = 1
            for train_index, test_index in kf.split(X):
                # Training data
                CV_X = X[train_index,:]
                CV_Y = Y[train_index]
                CV_weights = None
                if type(self.sample_weights) != type(None):
                    CV_weights = self.sample_weights[train_index]
                
                # Holdout data
                holdout_X = X[test_index,:]
                holdout_Y = Y[test_index]
                holdout_weights = None
                if type(self.sample_weights) != type(None):
                    holdout_weights = self.sample_weights[test_index]
                
                # Fit model to training sample
                lambda_fold_model = self.ModelClass(self.loss_function, regularization=lam)
                lambda_fold_model.fit(CV_X, CV_Y, sample_weights=CV_weights, initial_weights=beta_init)
                
                # Extract beta values to pass as beta_init 
                # to speed up estimation of the next fold
                beta_init = lambda_fold_model.beta
                
                # Calculate holdout error
                fold_preds = lambda_fold_model.predict(holdout_X)
                fold_mape = self.loss_function(holdout_Y, fold_preds, w=holdout_weights)
                k_fold_scores.append(fold_mape)
                print("Fold: {}. Error: {}".format( f, fold_mape))
                f += 1
            
            # Error associated with each lambda is the average
            # of the errors across the k folds
            lambda_scores = np.mean(k_fold_scores)
            print("** AVERAGE: {}".format(lambda_scores))
            self.cv_scores.append(lambda_scores)
        
        # Optimal lambda is that which minimizes the cross-validation error
        self.lambda_star_index = np.argmin(self.cv_scores)
        self.lambda_star = self.lambdas[self.lambda_star_index]
        print("\n\n**BEST LAMBDA: {}**".format(self.lambda_star))

In [14]:
# specify lambdas values to search
lambdas = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]

cross_validator = CustomCrossValidator()
cross_validator.cross_validate(ManyTargetsModel, X, Y,  lambdas, custom_loss_function,  num_folds=5)

Lambda: 1
Fold: 1. Error: 336858.6669361465
Fold: 2. Error: 373631.2280348293
Fold: 3. Error: 1650006.1947128961
Fold: 4. Error: 1536954.6373374881
Fold: 5. Error: 996228.6772652713
** AVERAGE: 978735.8808573263
Lambda: 0.1
Fold: 1. Error: 120947.39792130326
Fold: 2. Error: 51381.13491326714
Fold: 3. Error: 36458.61144592528
Fold: 4. Error: 44678.51463257665
Fold: 5. Error: 172117.82334977048
** AVERAGE: 85116.69645256856
Lambda: 0.01
Fold: 1. Error: 14096.839790096441
Fold: 2. Error: 12315.339566307
Fold: 3. Error: 52721.62694026774
Fold: 4. Error: 10260.738284255334
Fold: 5. Error: 13897.174068118533
** AVERAGE: 20658.34372980901
Lambda: 0.001
Fold: 1. Error: 4566.477059083119
Fold: 2. Error: 1212.0620956125717
Fold: 3. Error: 4832.549165204718
Fold: 4. Error: 8375.716414623304
Fold: 5. Error: 6776.8599809071175
** AVERAGE: 5152.732943086166
Lambda: 0.0001
Fold: 1. Error: 35.731281298015475
Fold: 2. Error: 86.25535128920625
Fold: 3. Error: 38.84530753007775
Fold: 4. Error: 37.9950324

In [15]:
lambda_star = cross_validator.lambda_star
print('with regularization: ', lambda_star)
final_model = ManyTargetsModel(custom_loss_function, regularization=lambda_star)
final_model.fit(X, Y)
final_model.estimated_weights
y_pred = final_model.predict(X)
train_data = pd.DataFrame(X)
train_data['y_truth'] = Y
train_data['y_pred'] = y_pred
train_data

with regularization:  1e-06


Unnamed: 0,0,1,2,3,4,5,6,7,8,y_truth,y_pred
0,1.507579,-0.563400,0.562276,-1.332146,1.050077,-0.462330,-1.296277,-0.530634,0.999397,3.228372,3.228372
1,-1.691418,-1.092415,1.645547,0.116214,-1.293266,-0.630865,-0.540870,0.685703,-1.290875,-13.285199,-13.285200
2,1.534903,-0.941776,-0.516393,-0.662857,-0.619402,-1.427847,1.506040,0.650407,1.606690,-12.042495,-12.042495
3,0.489120,-0.057204,1.180967,1.490712,1.513530,-1.624721,-1.450672,0.805173,-1.618630,3.253673,3.253674
4,-1.216013,0.609950,-0.881171,-1.621498,-0.214712,0.924015,-1.567411,-0.730678,-1.575103,-13.739647,-13.739647
5,-0.317969,1.177619,1.751039,-0.159548,0.355435,-0.866265,0.005105,-0.055901,-0.797403,10.625413,10.625413
6,-0.667827,-0.597424,0.336092,0.023883,-1.019887,-1.158811,-0.978959,0.644807,1.280015,-10.276300,-10.276301
7,1.093962,0.855995,-1.441042,0.546202,-1.096045,0.998796,-0.887760,-0.833294,0.007861,-2.136348,-2.136348
8,0.147885,-0.102030,-1.711481,0.757459,0.293677,-1.325290,0.609164,-1.115484,-0.979897,-23.374437,-23.374438
9,-1.212376,1.297297,-0.711397,1.496720,-0.461206,-1.283986,-0.620666,1.444033,1.399348,1.440274,1.440273


In [16]:
round(custom_loss_function(np.matmul(X,final_model.estimated_weights), Y),4)

0.0

In [17]:
test_data = np.random.random((10,9))
test_result = final_model.predict(test_data)

In [18]:
data = pd.DataFrame(test_data)
data['y_pred'] = test_result
data


Unnamed: 0,0,1,2,3,4,5,6,7,8,y_pred
0,0.60209,0.812987,0.846952,0.857944,0.214467,0.141252,0.302948,0.897758,0.797035,21.932302
1,0.62433,0.850129,0.332856,0.265681,0.33401,0.640477,0.829841,0.048767,0.982292,20.486398
2,0.805075,0.324719,0.812155,0.780388,0.794551,0.12076,0.726676,0.207888,0.719619,20.423726
3,0.610952,0.475557,0.880631,0.367347,0.134853,0.345883,0.659849,0.435708,0.051033,16.172549
4,0.287632,0.150329,0.781215,0.571158,0.578206,0.126166,0.697782,0.367559,0.676292,16.571479
5,0.555329,0.247484,0.776044,0.373284,0.840344,0.380229,0.958696,0.498656,0.223507,18.861082
6,0.92878,0.693301,0.145977,0.212827,0.52459,0.026931,0.352008,0.391696,0.954128,15.441055
7,0.19492,0.582053,0.691961,0.3366,0.038425,0.476979,0.483514,0.141998,0.501276,16.039267
8,0.231129,0.198635,0.422906,0.407853,0.423211,0.03553,0.174594,0.068946,0.309196,9.752001
9,0.276302,0.997535,0.905833,0.946538,0.180229,0.654463,0.229402,0.985819,0.303154,24.614303


In [19]:
import numpy as np
from sklearn.metrics import make_scorer
def my_custom_loss_func(ground_truth, predictions):
    diff = np.abs(ground_truth - predictions).max()
    return np.log(1 + diff)

# loss_func will negate the return value of my_custom_loss_func,
#  which will be np.log(2), 0.693, given the values for ground_truth
#  and predictions defined below.
loss  = make_scorer(my_custom_loss_func, greater_is_better=False)
score = make_scorer(my_custom_loss_func, greater_is_better=True)
ground_truth = [[1, 1],[2,1]]
predictions  = [0, 1]
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent', random_state=0)
clf = clf.fit(ground_truth, predictions)
loss(clf,ground_truth, predictions) 

score(clf,ground_truth, predictions)

0.6931471805599453

In [20]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load dataset
data = load_breast_cancer()

# Organize our data
label_names = data['target_names']
labels = data['target']
feature_names = data['feature_names']
features = data['data']

# Look at our data
print(label_names)
print('Class label = ', labels[0])
print(feature_names)
print(features[0])

# Split our data
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.33,
                                                          random_state=42)

# Initialize our classifier
gnb = GaussianNB()

# Train our classifier
model = gnb.fit(train, train_labels)

# Make predictions
preds = gnb.predict(test)
print(preds)

# Evaluate accuracy
print(accuracy_score(test_labels, preds))

['malignant' 'benign']
Class label =  0
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
[1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1

In [21]:
from functools import reduce
y_true = [[1,2],[0,0],[2,6,7,0]]
y_pred = np.array([2,4,7])


#print(y_pred[y_pred != 0])
y_true_excluding_zeros = [np.array(v)[np.array(v)!=0] for v in y_true]

y_idx = []
for idx, v in enumerate(y_true):
    if v.__contains__(2):
        print("Remove zeros from set...")
        y_idx.append(idx)

print(y_true)
print(y_pred)
print('Idx to delete ', y_idx)
y_true = y_true_excluding_zeros

#y_true = np.delete(y_true,y_idx,axis=0)
#y_pred = np.delete(y_pred,y_idx)
print(y_true)
print(y_pred)
matched_index = [t.__contains__(p) for (t,p) in zip(y_true, y_pred)]
print(matched_index)
print(sum(matched_index)/len(matched_index))


#matched_values = [reduce(np.intersect1d, (p, a)) for (p,a) in zip(y_true, y_pred)]
#print(matched_values)



Remove zeros from set...
Remove zeros from set...
[[1, 2], [0, 0], [2, 6, 7, 0]]
[2 4 7]
Idx to delete  [0, 2]
[array([1, 2]), array([], dtype=int64), array([2, 6, 7])]
[2 4 7]
[True, False, True]
0.6666666666666666


In [22]:
from sklearn.metrics.scorer import make_scorer
def multi_targets_scorer_function(y_true, y_pred):
    y_true_excluding_zeros = [np.array(v)[np.array(v)!=0] for v in y_true]
    matched_index = [t.__contains__(p) for (t,p) in zip(y_true_excluding_zeros, y_pred)]
    return sum(matched_index)/len(y_true_excluding_zeros)

multi_targets_scorer = make_scorer(multi_targets_scorer_function, greater_is_better=True)

In [23]:
#!conda install -n mldds -c anaconda joblib
import os
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='once')

import multiprocessing
num_cores = multiprocessing.cpu_count()

print("Cores: ", num_cores)

import time
import keras
# import tensorflow as tf
# config = tf.ConfigProto( device_count = {'GPU': 0 , 'CPU': num_cores} )
# sess = tf.Session(config=config) 
# keras.backend.set_session(sess)

# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from MyTotoResearchv4 import *

Cores:  12


  return f(*args, **kwds)
Using TensorFlow backend.
  return f(*args, **kwds)
  return _inspect.getargspec(target)
  _config = json.load(open(_config_path))


Done.


In [24]:
#Install autograd
#!conda install -c omnia autograd


In [25]:
%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler



In [26]:
def wTx(w, x):
    return np.dot(x, w)

def sigmoid_range(z,bottom,top):
#    return 1./(1+np.exp(-z))
    return bottom + (top - bottom) / (1 + np.exp(-z))

def sigmoid_range_inverse(y, bottom,top):
    return np.log((y - bottom) / (top - y))

def custom_predictions(w, x):
    predictions = sigmoid_range(wTx(w, x),1,49)
    return predictions
#     global i
#     if ( i < 10 ):
#         print(X)
#     print(predictions)
#     return predictions.clip(eps, 1-eps)

def custom_loss(y, y_predicted):
#    return -(y*np.log(y_predicted) - (1-y)*np.log(1-y_predicted)**2).mean()

    return -(y*np.log(y_predicted) - (1-y)*np.log(1-y_predicted)**2).mean()

i = 0
def custom_loss_given_weights(w):
#     global i
#     if ( i < 10 ):
#         i = i + 1
#         print(X)
    y_predicted = custom_predictions(w, X)
    y_matched = y_predicted[np.abs(y_predicted-y).argmin()]
    return custom_loss(y_matched, y_predicted)
    
gradient = grad(custom_loss_given_weights)

NameError: name 'grad' is not defined

In [None]:
%pylab inline

import pandas as pd

from autograd import grad
import autograd.numpy as np


def getAllData(df):
    drop_cols = ['T', 'D', 'N1','N2','N3','N4','N5','N6','N7','L','M','S','R','E','A','V' ,'J','U']
    X = df.drop(drop_cols, axis=1)
    return X



In [None]:
def wTx(w, x):
    return np.dot(x, w)

def sigmoid_range(z,bottom,top):
#    return 1./(1+np.exp(-z))
    return bottom + (top - bottom) / (1 + np.exp(-z))

def sigmoid_range_inverse(y, bottom,top):
    return np.log((y - bottom) / (top - y))

def custom_predictions(w, x):
    predictions = sigmoid_range(wTx(w, x),1,49)
    return predictions
#     global i
#     if ( i < 10 ):
#         print(X)
#     print(predictions)
#     return predictions.clip(eps, 1-eps)

def custom_loss(y, y_predicted):
    return -(y*np.log(y_predicted) - (1-y)*np.log(1-y_predicted)**2).mean()

i = 0
def custom_loss_given_weights(w):
#     global i
#     if ( i < 10 ):
#         i = i + 1
#         print(X)
    y_predicted = custom_predictions(w, X)
    y_matched = y_predicted[np.abs(y_predicted-y).argmin()]
    return custom_loss(y_matched, y_predicted)
    
gradient = grad(custom_loss_given_weights)

In [None]:
X = np.array([
    [ 0.3213,  0.4856,  0.2995,  2.5044],
    [ 0.3005,  0.4757,  0.2974,  2.4691],
    [ 0.5638,  0.8005,  0.3381,  2.3102],
    [ 0.5281,  0.6542,  0.3129,  2.1298],
    [ 0.3221,  0.5126,  0.3085,  2.6147],
    [ 0.3055,  0.4885,  0.289 ,  2.4957],
    [ 0.3276,  0.5185,  0.3218,  2.6013],
    [ 0.5313,  0.7028,  0.3266,  2.1543],
    [ 0.4728,  0.6399,  0.3062,  2.0597],
    [ 0.3221,  0.5126,  0.3085,  2.6147]
])
y = np.array([1., 1., 0., 0., 1., 1., 1., 1., 0., 0.])

weights = np.zeros(X.shape[1])
eps = 1e-15


In [None]:
df = pd.DataFrame(
    [(y_hat, custom_loss(False, y_hat)) for y_hat in np.linspace(0, 1, 101)],
    columns=['y_hat', 'loss']
).plot(x='y_hat', title='y_hat vs. Loss for y=0')

In [None]:
df = pd.DataFrame(
    [(y_hat, custom_loss(True, y_hat)) for y_hat in np.linspace(0, 1, 101)],
    columns=['y_hat', 'loss']
).plot(x='y_hat', title='y_hat vs. Loss for y=1')

In [None]:
for i in range(1000):
    if i % 100 == 0:
        print('Iteration %-4d | Loss: %.4f' % (i, custom_loss_given_weights(weights)))
    weights -= gradient(weights) * .05

In [None]:
def store_prediction(mrt, model, f, scaler=None, name='unnamed'):
    def getAllData(df):
        drop_cols = ['T', 'L','M','S','R','E','A','V' ,'J','U']
        X = df.drop(drop_cols, axis=1)
#        print(df.head())
        use_cols = ['Ph','il','age','dist','adia','sundist','sunadia']
        X = df[use_cols]
        return X

    test_data = mtr.get_test_data()
    X = mtr.modified_dataset(getAllData(test_data)) #
#    X = getAdjustedDataF(test_data,f)


    if ( scaler == None ):
        Z = X
    else:
        scaler.fit(X)
        Z = scaler.transform(X)

    predictions = model.predict(Z)

    dfResult= pd.DataFrame(predictions, columns=['N1', 'N2', 'N3', 'N4', 'N5','N6', 'N7'])
#    mtr.print_predictions(dfResult)

    global df_predictions
    global prev_r
    r = mtr.getAccuracyCount(np.array(dfResult)) ;
#    if ( r > prev_r ):
#        df_predictions = []
    df_predictions.append(dfResult)
    g_all_pred.update({name : dfResult})



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from keras.models import Input, Model
import keras
from keras.layers import Dense
import time
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
import json as simplejson
from keras import regularizers
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVR

from sklearn.linear_model import SGDRegressor, SGDClassifier, LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge, RidgeClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC, SVR, LinearSVC
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

seed = 42

mtr = MyTotoResearch(algo_no=1)
lresult, df = mtr.load_totodata()

df_predictions = []


all_models = []

#all_models.append(('SVCpoly01', SVC(kernel='poly', coef0=0.05, probability=True, degree=2, random_state=seed)))
#all_models.append(('SVCrbf010', SVC(kernel='rbf', coef0=0.75, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf011', SVC(kernel='rbf', coef0=0.5, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf012', SVC(kernel='rbf', coef0=0.25, probability=True, degree=2, random_state=seed)))

# all_models.append(('SVCrbf0103', SVC(kernel='rbf', coef0=0.75, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0113', SVC(kernel='rbf', coef0=0.5, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0123', SVC(kernel='rbf', coef0=0.25, probability=True, degree=3, random_state=seed)))


#all_models.append(('SVCrbf020', SVC(kernel='sigmoid', coef0=0.75, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf021', SVC(kernel='sigmoid', coef0=0.5, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf022', SVC(kernel='sigmoid', coef0=0.25, probability=True, degree=2, random_state=seed)))

# all_models.append(('SVCrbf0203', SVC(kernel='sigmoid', coef0=0.75, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0213', SVC(kernel='sigmoid', coef0=0.5, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0223', SVC(kernel='sigmoid', coef0=0.25, probability=True, degree=3, random_state=seed)))


# all_models.append(('SVCrbf030', SVC(kernel='linear', coef0=0.75, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf031', SVC(kernel='linear', coef0=0.5, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf032', SVC(kernel='linear', coef0=0.25, probability=True, degree=2, random_state=seed)))

# all_models.append(('SVCrbf0303', SVC(kernel='linear', coef0=0.75, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0313', SVC(kernel='linear', coef0=0.5, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0323', SVC(kernel='linear', coef0=0.25, probability=True, degree=3, random_state=seed)))



# all_models.append(('LR', (LogisticRegression(random_state=seed))))

#all_models.append(('KNNC', KNeighborsClassifier()))
#all_models.append(('KNNR', KNeighborsRegressor()))
#all_models.append(('RC', RidgeClassifier(random_state=seed)))
# all_models.append(('LR', LogisticRegression(random_state=seed)))
# all_models.append(('LDA', LinearDiscriminantAnalysis()))
# all_models.append(('DTR', DecisionTreeRegressor()))
# all_models.append(('ETR', ExtraTreesRegressor(n_estimators=5)))
#all_models.append(('ETC', ExtraTreesClassifier(n_estimators=5)))
# all_models.append(('EN', ElasticNet()))
#all_models.append(('CART', DecisionTreeClassifier()))
# all_models.append(('NB', GaussianNB()))
# all_models.append(('Lasso', Lasso()))
all_models.append(('GBR', GradientBoostingRegressor()))
#all_models.append(('RFR5', RandomForestClassifier(n_estimators=5, n_jobs=5, random_state=seed)))
# all_models.append(('RFR5', RandomForestClassifier(n_estimators=5, n_jobs=5, random_state=seed)))
# all_models.append(('RFR3', RandomForestRegressor(n_estimators=3, n_jobs=5, random_state=seed)))
# all_models.append(('SGDR', SGDRegressor(random_state=seed)))
#all_models.append(('AdaB', AdaBoostClassifier(RandomForestClassifier(n_estimators=3))))
#all_models.append(('MLPC', MLPClassifier(hidden_layer_sizes=(500,500,500), max_iter=2000, alpha=0.001, activation='tanh', learning_rate='adaptive', solver='sgd', verbose=0,  random_state=42,tol=0.000000001)))

#92.45 accuracy
#all_models.append(('MLPC', MLPClassifier(hidden_layer_sizes=(490,490,490,490,490,490,490), max_iter=500000, alpha=0.001, activation='relu', learning_rate='adaptive', solver='adam', verbose=10,  random_state=42,tol=0.000000001)))


all_models.append(('MLPC', MLPClassifier(hidden_layer_sizes=(780,490,780,490,780,490,280), max_iter=500000, alpha=0.001, activation='relu', learning_rate='adaptive', solver='adam', verbose=10,  random_state=42,tol=0.000000001)))





In [None]:
# evaluate each model in turn
from sklearn import model_selection
results = []
names = []
scoring = 'accuracy'

g_all_pred = {}

X = mtr.modified_dataset(getAllData(df)) #
f = 1.0 #365/27.58
#    X = getAdjustedDataF(df,f)

scaler = StandardScaler()
scaler = RobustScaler()
scaler.fit(X)
Z = scaler.transform(X)

for name, model in all_models:
    
    
#    scaler = None
#    Z = X

#     kfold = model_selection.KFold(n_splits=3, random_state=seed)
#     cv_results = model_selection.cross_val_score(model, Z, mtr.getTarget(3), cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
#     print(msg)
    
    oClassifier = MultiOutputClassifier(model, n_jobs=7)
    oClassifier.fit(Z, mtr.getTargets()) 
    print(oClassifier)
    s = oClassifier.score(Z, mtr.getTargets())
    if(oClassifier.score(Z, mtr.getTargets()) == 1.0):
        print( name, ' ', str(f), ' ', str(s))
    store_prediction(mtr, oClassifier, f, scaler=scaler, name=name)
    start = time.clock()
    print(str(f), " Time taken: ", (time.clock() - start),  " ")

# for n in range(len(df_predictions)):
#     print( mtr.getAccuracyCount(np.array(df_predictions[n])))
#     mtr.print_predictions(df_predictions[n])






# boxplot algorithm comparison
# fig = plt.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# plt.show()

print('Done')

In [None]:
def combine_prediction(arr, initial_pred=[]):
    global s
    if ( isinstance(arr, list) ):
        for a in arr:
            combine_prediction(a, initial_pred)
        return 
    if ( len(s) > 1 ):
        s += '_'
    s += arr
    initial_pred.append(g_all_pred[arr])
    return 



In [None]:
import itertools
from itertools import combinations
import operator 
from itertools import islice

name_ = []

lst = [name for name, model in all_models]
iBestIndex = -1
iBestN = []
#print("List ", lst)
top_n = 12


dict_accuracy = {}
for z in range(5, 0,-1):
    a = [list(x) for x in itertools.combinations(lst, z) if len(x) > 1 ] 
#    print(a)

    for xx in a:
        test_pred = []
        s = ''
        combine_prediction(xx, test_pred)
#        print(s)

        #print(len(test_pred))

        all_pred = [] ;
        for i in range(len(test_pred)):
            if ( i == 0 ):
                all_pred = test_pred[i]
            else:
                all_pred = np.column_stack((all_pred, test_pred[i]) )

        top_seven = []
        for i in range(len(all_pred)):
            unique, counts = np.unique(all_pred[i], return_counts=True)
            x = dict(zip(unique, counts))
            sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True) # sorted by value
            l = list(islice([int(x) for x,y in sorted_x],top_n))
            while ( len(l) < top_n ):
                l.append(-1)

            top_seven.append(l)
            

#        print(len(top_seven))
#         if(len(top_seven[0]) < top_n ):
#             print("*** Caught ", )
        columns = ['N'+str(i+1) for i in range(len(top_seven[0]))]
#        print(columns)
        df_top_seven = pd.DataFrame(top_seven, columns=columns)
        r = mtr.getAccuracyCount(np.array(df_top_seven)) ;
        matched, weighted_match = mtr.print_weighted_numbers(df_top_seven.values)
        r = sum(weighted_match)

        dict_accuracy.update({s: r})

t_accuracy = sorted(dict_accuracy.items(),key=operator.itemgetter(1), reverse=True)
print('Done')


In [None]:
matched, weighted_match = mtr.print_weighted_numbers(df_top_seven.values)
print(matched)
print(weighted_match)

In [None]:
n = 7
print(t_accuracy[:n])

a = [x[0].split('_') for x in t_accuracy[:n] ] 
print(a)
for xx in a:
    test_pred = []
    s = ''
    combine_prediction(xx, test_pred)
    all_pred = [] ;
    for i in range(len(test_pred)):
        if ( i == 0 ):
            all_pred = test_pred[i]
        else:
            all_pred = np.column_stack((all_pred, test_pred[i]) )

    top_seven = []
    for i in range(len(all_pred)):
        unique, counts = np.unique(all_pred[i], return_counts=True)
        x = dict(zip(unique, counts))
        sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True) # sorted by value
        l = list(islice([int(x) for x,y in sorted_x],top_n))
        while ( len(l) < top_n ):
          l.append(-1)
        top_seven.append(l)


    columns = ['N'+str(i+1) for i in range(len(top_seven[0]))]
    df_top_seven = pd.DataFrame(top_seven, columns=columns)
    r = mtr.getAccuracyCount(np.array(df_top_seven)) ;
    print ( "Accuracy: ",  r)
    dict_accuracy.update({s: r})
    mtr.plot_matched_counts(df_top_seven.values)




In [None]:

# Nov 26
# 16 22 28 31 38 46 33

In [None]:
#Keep track of all results
#df_predictions = []

#print(df_predictions)
#mtr = MyTotoResearch(algo_no=1)
def getAllData(df):
#     drop_cols = ['T', 'L','M','S','R','E','A','V' ,'J','U','K']
#     X = df.drop(drop_cols, axis=1)

    use_cols = ['Ph','il','age','dist','adia','sundist','sunadia']
    X = df[use_cols]
    return X

lresult, df = mtr.load_totodata()

test_data = mtr.get_test_data()
X = mtr.modified_dataset(getAllData(test_data)) #

print(len(df_predictions))
for n in range(len(df_predictions)):
    print( mtr.getAccuracyCount(np.array(df_predictions[n])))
    mtr.print_predictions(df_predictions[n])

