In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler

#os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"


In [47]:
# Generate predictors
X_raw = np.reshape(np.random.random(100*9), (100, 9))

# Standardize the predictors
scaler = StandardScaler().fit(X_raw)
X = scaler.transform(X_raw)

print(X.shape)

(100, 9)


In [48]:
# Lets generate target

# Define coefficients
some_weights = np.array([2,6,7,3,5,7,1,2,4])
Y_truth = np.matmul(X,some_weights)

print(Y_truth)

[ -2.04110388  -8.5724505   -7.23085155   6.87484683  -7.42147162
   5.5858523   21.31690013  -0.84706693 -22.61726735  -4.77937261
 -22.7455036   -6.8954092    0.07074924   2.43202914 -10.76550653
 -41.35170936  -5.65726034  -2.3728673    8.14786604  14.96951487
   4.47156115 -29.28242162   5.49649993 -26.4347966    7.57117087
  22.54042899  -6.33131555  32.09758329 -10.86437351  21.40124674
 -12.59003552  -3.37852505  15.62912372 -32.56274255   8.62355047
  16.23333353  10.43385017 -16.4257275    3.40393089 -11.57441275
   0.3948683   -4.35615823 -23.8913256  -10.57974635 -26.56892105
  24.51586948  14.85344067  29.83525859 -10.60900179   7.30337067
  -4.65095041  -0.40459003  20.18433457  17.57085456  15.21616621
 -18.54668836 -10.18576955  13.72151126   1.05325125  38.88514474
  19.76540699  13.61383201  -0.73306291   6.07779034   4.45245843
 -31.60316399  10.52112459 -14.54845361  -7.23491191 -19.31832958
  23.67091443  -4.33569764  -1.90458896   0.90209002   4.15632061
   1.63650

In [52]:
def mean_absolute_percentage_error(y_pred, y_truth, w=None):
    y_true = np.array(y_truth)
    y_pred = np.array(y_pred)
    if np.any(y_true==0):
        print("Remove zeros from set...")
        idx = np.where(y_true==0)
        y_true = np.delete(y_true, idx)
        y_pred = np.delete(y_pred, idx)
        if type(w) != type(None):
            w = np.array(w)
            w = np.delete(w, idx)     
    if type(w) == type(None):
        return(np.mean(np.abs((y_true - y_pred) / y_true)) * 100)
    return 100/sum(w)*np.dot(w, (np.abs((y_true - y_pred) / y_true)))
    
custom_loss_function = mean_absolute_percentage_error


In [53]:
from scipy.optimize import minimize

def objective_function(w, X, Y):
    return custom_loss_function(np.matmul(X,w), Y)

Y = Y_truth
# add some noise
#Y = Y_truth*np.exp(np.random.normal(loc=0.0, scale=0.2, size=100))

#print(Y)

# provide a starting weights at which to initialize
# the parameter search space
initial_weights = np.array([1]*X.shape[1])
#initial_weights = np.array([np.mean(X)]*X.shape[1])

result = minimize(objective_function, initial_weights, args=(X,Y),
                  method='BFGS', options={'maxiter': 1000})

# The optimal values for the input parameters are stored
# in result.x
estimated_weights = result.x

print(estimated_weights)


[1.99999998 5.99999997 6.99999996 2.99999998 4.99999997 6.99999995
 1.         1.99999998 3.99999997]


In [55]:
pd.DataFrame({
    "correcxt_weights": some_weights, 
    "estimated_weights": estimated_weights,
    "error": np.array(np.around(some_weights-estimated_weights,2))
})

Unnamed: 0,correcxt_weights,estimated_weights,error
0,2,2.0,0.0
1,6,6.0,0.0
2,7,7.0,0.0
3,3,3.0,0.0
4,5,5.0,0.0
5,7,7.0,0.0
6,1,1.0,0.0
7,2,2.0,0.0
8,4,4.0,0.0


In [56]:
round(custom_loss_function(np.matmul(X,estimated_weights), Y),4)

0.0

In [None]:
#How to factor in the regularization
#We need to create a Class

In [57]:
class ManyTargetsModel:
    """
    Linear model: Y = XB, fit by minimizing the provided loss_function
    with L2 regularization
    """
    def __init__(self, loss_function, regularization=0.00012):
        self.regularization = regularization
        self.loss_function = loss_function


    def predict(self, X):
        prediction = np.matmul(X, self.beta)
        return(prediction)

#     def score(self, X, y_true):
#         return(sum(self.predict(X)))


    def model_error(self):
        error = self.loss_function(
            self.predict(self.X), self.Y, w=self.sample_weights
        )
        return(error)
    
    def l2_regularized_loss(self, beta):
        self.beta = beta
        m = len(self.X)
#        return (self.model_error())
        return(self.model_error()/m + \
                sum(((self.regularization)/2*m)*(np.array(self.beta)**2)))
    
    def fit(self, X, Y, maxiter=250, sample_weights=None, initial_weights=None):        
        self.X = X
        self.Y = Y

        self.beta = None  #latest weights
        self.sample_weights = sample_weights
        self.estimated_weights = initial_weights
        
        # Initialize estimated_weights
        if type(self.estimated_weights)==type(None):
            # set estimated_weights = 1 for every feature
            self.estimated_weights = np.array([1]*self.X.shape[1])
            
        res = minimize(self.l2_regularized_loss, self.estimated_weights
                       #, args=(X,Y),
                       ,method='BFGS', options={'maxiter': maxiter})
        self.beta = res.x
        self.estimated_weights = self.beta

In [37]:
model = ManyTargetsModel(mean_absolute_percentage_error, regularization=0.000012)
model.fit( X, Y)
model.estimated_weights

array([1.9999998 , 5.99999991, 6.99999968, 2.99999989, 4.99999973,
       6.99999967, 1.0000003 , 1.99999989, 4.00000013])

In [38]:
pd.DataFrame({
    "initial_weights": some_weights, 
    "estimated_weights": model.estimated_weights,
    "error": np.array(np.around(some_weights-model.estimated_weights,6))
})

Unnamed: 0,initial_weights,estimated_weights,error
0,2,2.0,0.0
1,6,6.0,0.0
2,7,7.0,0.0
3,3,3.0,0.0
4,5,5.0,0.0
5,7,7.0,0.0
6,1,1.0,-0.0
7,2,2.0,0.0
8,4,4.0,-0.0


In [39]:
# Predicted Y vs. observed Y
#plt.scatter(model.predict(X), Y)
round(custom_loss_function(np.matmul(X,model.estimated_weights), Y),4)

0.0

In [40]:
from sklearn.model_selection import KFold

# Used to cross-validate models and identify optimal lambda
class CustomCrossValidator:
    
    """
    Cross validates arbitrary model using MAPE criterion on
    list of lambdas.
    """
    def __init__(self):
        pass
        
    def cross_validate(self, ModelClass, X, Y, lambdas, 
                        loss_function, 
                        sample_weights=None,
                        num_folds=10):
        """
        lambdas: set of regularization parameters to try
        num_folds: number of folds to cross-validate against
        """
        
        self.X = X
        self.Y = Y
        self.ModelClass = ModelClass
        self.loss_function = loss_function
        self.sample_weights = sample_weights
    

        self.lambdas = lambdas
        self.cv_scores = []
        X = self.X
        Y = self.Y 
        
        # Beta values are not likely to differ dramatically
        # between differnt folds. Keeping track of the estimated
        # beta coefficients and passing them as starting values
        # to the .fit() operator on our model class can significantly
        # lower the time it takes for the minimize() function to run
        beta_init = None
        
        for lam in self.lambdas:
            print("Lambda: {}".format(lam))
            
            # Split data into training/holdout sets
            kf = KFold(n_splits=num_folds, shuffle=True)
            kf.get_n_splits(X)
            
            # Keep track of the error for each holdout fold
            k_fold_scores = []
            
            # Iterate over folds, using k-1 folds for training
            # and the k-th fold for validation
            f = 1
            for train_index, test_index in kf.split(X):
                # Training data
                CV_X = X[train_index,:]
                CV_Y = Y[train_index]
                CV_weights = None
                if type(self.sample_weights) != type(None):
                    CV_weights = self.sample_weights[train_index]
                
                # Holdout data
                holdout_X = X[test_index,:]
                holdout_Y = Y[test_index]
                holdout_weights = None
                if type(self.sample_weights) != type(None):
                    holdout_weights = self.sample_weights[test_index]
                
                # Fit model to training sample
                lambda_fold_model = self.ModelClass(self.loss_function, regularization=lam)
                lambda_fold_model.fit(CV_X, CV_Y, sample_weights=CV_weights, initial_weights=beta_init)
                
                # Extract beta values to pass as beta_init 
                # to speed up estimation of the next fold
                beta_init = lambda_fold_model.beta
                
                # Calculate holdout error
                fold_preds = lambda_fold_model.predict(holdout_X)
                fold_mape = self.loss_function(holdout_Y, fold_preds, w=holdout_weights)
                k_fold_scores.append(fold_mape)
                print("Fold: {}. Error: {}".format( f, fold_mape))
                f += 1
            
            # Error associated with each lambda is the average
            # of the errors across the k folds
            lambda_scores = np.mean(k_fold_scores)
            print("** AVERAGE: {}".format(lambda_scores))
            self.cv_scores.append(lambda_scores)
        
        # Optimal lambda is that which minimizes the cross-validation error
        self.lambda_star_index = np.argmin(self.cv_scores)
        self.lambda_star = self.lambdas[self.lambda_star_index]
        print("\n\n**BEST LAMBDA: {}**".format(self.lambda_star))

In [41]:
# specify lambdas values to search
lambdas = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]

cross_validator = CustomCrossValidator()
cross_validator.cross_validate(ManyTargetsModel, X, Y,  lambdas, custom_loss_function,  num_folds=5)

Lambda: 1
Fold: 1. Error: 1343315.7669230883
Fold: 2. Error: 10117286.405875206
Fold: 3. Error: 502567.5719595025
Fold: 4. Error: 6897773.432129745
Fold: 5. Error: 866656.001443986
** AVERAGE: 3945519.8356663054
Lambda: 0.1
Fold: 1. Error: 595807.8279506762
Fold: 2. Error: 117232.14174044305
Fold: 3. Error: 199046.06853327173
Fold: 4. Error: 136417.92202956617
Fold: 5. Error: 36356.05396702196
** AVERAGE: 216972.0028441958
Lambda: 0.01
Fold: 1. Error: 13928.831982322816
Fold: 2. Error: 12191.400491754674
Fold: 3. Error: 50647.65520196705
Fold: 4. Error: 20154.86605634329
Fold: 5. Error: 138511.81373967763
** AVERAGE: 47086.91349441309
Lambda: 0.001
Fold: 1. Error: 5346.244646863285
Fold: 2. Error: 1519.5868544844739
Fold: 3. Error: 1439.0510586443656
Fold: 4. Error: 889.627951195533
Fold: 5. Error: 1575.8220999228656
** AVERAGE: 2154.066522222104
Lambda: 0.0001
Fold: 1. Error: 804.3843824454036
Fold: 2. Error: 36.3264268232217
Fold: 3. Error: 57.44985774160144
Fold: 4. Error: 34.895358

In [42]:
lambda_star = cross_validator.lambda_star
print('with regularization: ', lambda_star)
final_model = ManyTargetsModel(custom_loss_function, regularization=lambda_star)
final_model.fit(X, Y)
final_model.estimated_weights
y_pred = final_model.predict(X)
train_data = pd.DataFrame(X)
train_data['y_truth'] = Y
train_data['y_pred'] = y_pred
train_data

with regularization:  1e-06


Unnamed: 0,0,1,2,3,4,5,6,7,8,y_truth,y_pred
0,0.463340,0.605805,0.022953,-1.143801,0.235715,0.644254,1.276217,-1.254529,-1.377642,0.235711,0.235711
1,1.640901,-0.544786,-1.709071,0.282615,-0.169942,-0.611693,-0.131537,-1.024020,-0.953006,-22.225730,-22.225729
2,0.970912,0.187193,0.221589,-1.385247,-0.244116,0.242864,1.323801,1.097155,0.517549,6.528140,6.528139
3,0.102926,-0.695344,-0.291127,-0.486043,0.042624,0.714035,1.176323,-0.320013,0.941989,2.053385,2.053385
4,-0.091495,-0.200845,1.579307,-1.185153,-1.450891,1.515346,0.734429,-0.988471,-1.072902,3.930470,3.930470
5,-1.617708,-1.643191,-1.058965,-0.515723,0.322674,-0.080178,0.069471,-0.409094,1.209383,-16.913551,-16.913551
6,-0.003564,1.601169,0.676719,1.484848,1.340410,-0.745118,-0.771643,1.473302,-0.689814,19.693391,19.693391
7,0.260826,-0.173039,-1.437188,1.023740,-1.353533,-0.130372,0.651006,-1.704830,1.556819,-11.717325,-11.717324
8,-0.691403,1.387041,-0.710818,0.725680,1.220434,1.577667,1.119616,-1.504898,1.521724,25.483311,25.483310
9,-0.054124,-1.452695,1.669528,0.316676,-0.136198,-1.354090,1.262854,-1.366396,-1.273295,-12.910427,-12.910426


In [43]:
round(custom_loss_function(np.matmul(X,final_model.estimated_weights), Y),4)

0.0

In [44]:
test_data = np.random.random((10,9))
test_result = final_model.predict(test_data)

In [45]:
data = pd.DataFrame(test_data)
data['y_pred'] = test_result
data


Unnamed: 0,0,1,2,3,4,5,6,7,8,y_pred
0,0.376174,0.098168,0.749752,0.773878,0.931769,0.436162,0.867291,0.341407,0.7998,21.372531
1,0.123409,0.236409,0.007843,0.325244,0.330626,0.585656,0.994374,0.614705,0.881945,14.200184
2,0.623453,0.997036,0.714577,0.016341,0.78911,0.004986,0.528634,0.660836,0.923247,21.803928
3,0.591259,0.221597,0.386482,0.116665,0.031293,0.629575,0.829792,0.66327,0.074777,12.586394
4,0.899467,0.46464,0.694096,0.527885,0.327508,0.384824,0.62872,0.918305,0.382895,19.357311
5,0.855617,0.564766,0.865399,0.239339,0.591516,0.268935,0.268407,0.119821,0.129832,17.743136
6,0.787594,0.144437,0.402183,0.465212,0.11225,0.669448,0.364317,0.725385,0.076757,14.022224
7,0.646415,0.073898,0.016171,0.446606,0.654247,0.874867,0.026137,0.803386,0.521216,16.302307
8,0.254905,0.640424,0.522446,0.381834,0.699115,0.312262,0.408142,0.118649,0.604684,17.900562
9,0.273383,0.887628,0.399006,0.141607,0.019918,0.215078,0.03277,0.120582,0.433281,12.702592


In [18]:
import numpy as np
from sklearn.metrics import make_scorer
def my_custom_loss_func(ground_truth, predictions):
    diff = np.abs(ground_truth - predictions).max()
    return np.log(1 + diff)

# loss_func will negate the return value of my_custom_loss_func,
#  which will be np.log(2), 0.693, given the values for ground_truth
#  and predictions defined below.
loss  = make_scorer(my_custom_loss_func, greater_is_better=False)
score = make_scorer(my_custom_loss_func, greater_is_better=True)
ground_truth = [[1, 1],[2,1]]
predictions  = [0, 1]
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent', random_state=0)
clf = clf.fit(ground_truth, predictions)
loss(clf,ground_truth, predictions) 

score(clf,ground_truth, predictions)

0.6931471805599453

In [19]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load dataset
data = load_breast_cancer()

# Organize our data
label_names = data['target_names']
labels = data['target']
feature_names = data['feature_names']
features = data['data']

# Look at our data
print(label_names)
print('Class label = ', labels[0])
print(feature_names)
print(features[0])

# Split our data
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.33,
                                                          random_state=42)

# Initialize our classifier
gnb = GaussianNB()

# Train our classifier
model = gnb.fit(train, train_labels)

# Make predictions
preds = gnb.predict(test)
print(preds)

# Evaluate accuracy
print(accuracy_score(test_labels, preds))

['malignant' 'benign']
Class label =  0
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
[1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1

In [20]:
from functools import reduce
y_true = [[1,2],[0,0],[2,6,7,0]]
y_pred = np.array([2,4,7])


#print(y_pred[y_pred != 0])
y_true_excluding_zeros = [np.array(v)[np.array(v)!=0] for v in y_true]

y_idx = []
for idx, v in enumerate(y_true):
    if v.__contains__(2):
        print("Remove zeros from set...")
        y_idx.append(idx)

print(y_true)
print(y_pred)
print('Idx to delete ', y_idx)
y_true = y_true_excluding_zeros

#y_true = np.delete(y_true,y_idx,axis=0)
#y_pred = np.delete(y_pred,y_idx)
print(y_true)
print(y_pred)
matched_index = [t.__contains__(p) for (t,p) in zip(y_true, y_pred)]
print(matched_index)
print(sum(matched_index)/len(matched_index))


#matched_values = [reduce(np.intersect1d, (p, a)) for (p,a) in zip(y_true, y_pred)]
#print(matched_values)



Remove zeros from set...
Remove zeros from set...
[[1, 2], [0, 0], [2, 6, 7, 0]]
[2 4 7]
Idx to delete  [0, 2]
[array([1, 2]), array([], dtype=int64), array([2, 6, 7])]
[2 4 7]
[True, False, True]
0.6666666666666666


In [21]:
from sklearn.metrics.scorer import make_scorer
def multi_targets_scorer_function(y_true, y_pred):
    y_true_excluding_zeros = [np.array(v)[np.array(v)!=0] for v in y_true]
    matched_index = [t.__contains__(p) for (t,p) in zip(y_true_excluding_zeros, y_pred)]
    return sum(matched_index)/len(y_true_excluding_zeros)

multi_targets_scorer = make_scorer(multi_targets_scorer_function, greater_is_better=True)

In [22]:
#!conda install -n mldds -c anaconda joblib
import os
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='once')

import multiprocessing
num_cores = multiprocessing.cpu_count()

print("Cores: ", num_cores)

import time
import keras
# import tensorflow as tf
# config = tf.ConfigProto( device_count = {'GPU': 0 , 'CPU': num_cores} )
# sess = tf.Session(config=config) 
# keras.backend.set_session(sess)

# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from MyTotoResearchv4 import *

Cores:  12


  return f(*args, **kwds)
Using TensorFlow backend.
  return f(*args, **kwds)
  return _inspect.getargspec(target)
  _config = json.load(open(_config_path))


Done.


In [23]:
#Install autograd
#!conda install -c omnia autograd


In [24]:
%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler



In [25]:
def wTx(w, x):
    return np.dot(x, w)

def sigmoid_range(z,bottom,top):
#    return 1./(1+np.exp(-z))
    return bottom + (top - bottom) / (1 + np.exp(-z))

def sigmoid_range_inverse(y, bottom,top):
    return np.log((y - bottom) / (top - y))

def custom_predictions(w, x):
    predictions = sigmoid_range(wTx(w, x),1,49)
    return predictions
#     global i
#     if ( i < 10 ):
#         print(X)
#     print(predictions)
#     return predictions.clip(eps, 1-eps)

def custom_loss(y, y_predicted):
#    return -(y*np.log(y_predicted) - (1-y)*np.log(1-y_predicted)**2).mean()

    return -(y*np.log(y_predicted) - (1-y)*np.log(1-y_predicted)**2).mean()

i = 0
def custom_loss_given_weights(w):
#     global i
#     if ( i < 10 ):
#         i = i + 1
#         print(X)
    y_predicted = custom_predictions(w, X)
    y_matched = y_predicted[np.abs(y_predicted-y).argmin()]
    return custom_loss(y_matched, y_predicted)
    
gradient = grad(custom_loss_given_weights)

NameError: name 'grad' is not defined

In [None]:
%pylab inline

import pandas as pd

from autograd import grad
import autograd.numpy as np


def getAllData(df):
    drop_cols = ['T', 'D', 'N1','N2','N3','N4','N5','N6','N7','L','M','S','R','E','A','V' ,'J','U']
    X = df.drop(drop_cols, axis=1)
    return X



In [None]:
def wTx(w, x):
    return np.dot(x, w)

def sigmoid_range(z,bottom,top):
#    return 1./(1+np.exp(-z))
    return bottom + (top - bottom) / (1 + np.exp(-z))

def sigmoid_range_inverse(y, bottom,top):
    return np.log((y - bottom) / (top - y))

def custom_predictions(w, x):
    predictions = sigmoid_range(wTx(w, x),1,49)
    return predictions
#     global i
#     if ( i < 10 ):
#         print(X)
#     print(predictions)
#     return predictions.clip(eps, 1-eps)

def custom_loss(y, y_predicted):
    return -(y*np.log(y_predicted) - (1-y)*np.log(1-y_predicted)**2).mean()

i = 0
def custom_loss_given_weights(w):
#     global i
#     if ( i < 10 ):
#         i = i + 1
#         print(X)
    y_predicted = custom_predictions(w, X)
    y_matched = y_predicted[np.abs(y_predicted-y).argmin()]
    return custom_loss(y_matched, y_predicted)
    
gradient = grad(custom_loss_given_weights)

In [None]:
X = np.array([
    [ 0.3213,  0.4856,  0.2995,  2.5044],
    [ 0.3005,  0.4757,  0.2974,  2.4691],
    [ 0.5638,  0.8005,  0.3381,  2.3102],
    [ 0.5281,  0.6542,  0.3129,  2.1298],
    [ 0.3221,  0.5126,  0.3085,  2.6147],
    [ 0.3055,  0.4885,  0.289 ,  2.4957],
    [ 0.3276,  0.5185,  0.3218,  2.6013],
    [ 0.5313,  0.7028,  0.3266,  2.1543],
    [ 0.4728,  0.6399,  0.3062,  2.0597],
    [ 0.3221,  0.5126,  0.3085,  2.6147]
])
y = np.array([1., 1., 0., 0., 1., 1., 1., 1., 0., 0.])

weights = np.zeros(X.shape[1])
eps = 1e-15


In [None]:
df = pd.DataFrame(
    [(y_hat, custom_loss(False, y_hat)) for y_hat in np.linspace(0, 1, 101)],
    columns=['y_hat', 'loss']
).plot(x='y_hat', title='y_hat vs. Loss for y=0')

In [None]:
df = pd.DataFrame(
    [(y_hat, custom_loss(True, y_hat)) for y_hat in np.linspace(0, 1, 101)],
    columns=['y_hat', 'loss']
).plot(x='y_hat', title='y_hat vs. Loss for y=1')

In [None]:
for i in range(1000):
    if i % 100 == 0:
        print('Iteration %-4d | Loss: %.4f' % (i, custom_loss_given_weights(weights)))
    weights -= gradient(weights) * .05

In [None]:
def store_prediction(mrt, model, f, scaler=None, name='unnamed'):
    def getAllData(df):
        drop_cols = ['T', 'L','M','S','R','E','A','V' ,'J','U']
        X = df.drop(drop_cols, axis=1)
#        print(df.head())
        use_cols = ['Ph','il','age','dist','adia','sundist','sunadia']
        X = df[use_cols]
        return X

    test_data = mtr.get_test_data()
    X = mtr.modified_dataset(getAllData(test_data)) #
#    X = getAdjustedDataF(test_data,f)


    if ( scaler == None ):
        Z = X
    else:
        scaler.fit(X)
        Z = scaler.transform(X)

    predictions = model.predict(Z)

    dfResult= pd.DataFrame(predictions, columns=['N1', 'N2', 'N3', 'N4', 'N5','N6', 'N7'])
#    mtr.print_predictions(dfResult)

    global df_predictions
    global prev_r
    r = mtr.getAccuracyCount(np.array(dfResult)) ;
#    if ( r > prev_r ):
#        df_predictions = []
    df_predictions.append(dfResult)
    g_all_pred.update({name : dfResult})



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from keras.models import Input, Model
import keras
from keras.layers import Dense
import time
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
import json as simplejson
from keras import regularizers
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, RandomForestClassifier, ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVR

from sklearn.linear_model import SGDRegressor, SGDClassifier, LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge, RidgeClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC, SVR, LinearSVC
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

seed = 42

mtr = MyTotoResearch(algo_no=1)
lresult, df = mtr.load_totodata()

df_predictions = []


all_models = []

#all_models.append(('SVCpoly01', SVC(kernel='poly', coef0=0.05, probability=True, degree=2, random_state=seed)))
#all_models.append(('SVCrbf010', SVC(kernel='rbf', coef0=0.75, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf011', SVC(kernel='rbf', coef0=0.5, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf012', SVC(kernel='rbf', coef0=0.25, probability=True, degree=2, random_state=seed)))

# all_models.append(('SVCrbf0103', SVC(kernel='rbf', coef0=0.75, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0113', SVC(kernel='rbf', coef0=0.5, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0123', SVC(kernel='rbf', coef0=0.25, probability=True, degree=3, random_state=seed)))


#all_models.append(('SVCrbf020', SVC(kernel='sigmoid', coef0=0.75, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf021', SVC(kernel='sigmoid', coef0=0.5, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf022', SVC(kernel='sigmoid', coef0=0.25, probability=True, degree=2, random_state=seed)))

# all_models.append(('SVCrbf0203', SVC(kernel='sigmoid', coef0=0.75, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0213', SVC(kernel='sigmoid', coef0=0.5, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0223', SVC(kernel='sigmoid', coef0=0.25, probability=True, degree=3, random_state=seed)))


# all_models.append(('SVCrbf030', SVC(kernel='linear', coef0=0.75, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf031', SVC(kernel='linear', coef0=0.5, probability=True, degree=2, random_state=seed)))
# all_models.append(('SVCrbf032', SVC(kernel='linear', coef0=0.25, probability=True, degree=2, random_state=seed)))

# all_models.append(('SVCrbf0303', SVC(kernel='linear', coef0=0.75, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0313', SVC(kernel='linear', coef0=0.5, probability=True, degree=3, random_state=seed)))
# all_models.append(('SVCrbf0323', SVC(kernel='linear', coef0=0.25, probability=True, degree=3, random_state=seed)))



# all_models.append(('LR', (LogisticRegression(random_state=seed))))

#all_models.append(('KNNC', KNeighborsClassifier()))
#all_models.append(('KNNR', KNeighborsRegressor()))
#all_models.append(('RC', RidgeClassifier(random_state=seed)))
# all_models.append(('LR', LogisticRegression(random_state=seed)))
# all_models.append(('LDA', LinearDiscriminantAnalysis()))
# all_models.append(('DTR', DecisionTreeRegressor()))
# all_models.append(('ETR', ExtraTreesRegressor(n_estimators=5)))
#all_models.append(('ETC', ExtraTreesClassifier(n_estimators=5)))
# all_models.append(('EN', ElasticNet()))
#all_models.append(('CART', DecisionTreeClassifier()))
# all_models.append(('NB', GaussianNB()))
# all_models.append(('Lasso', Lasso()))
all_models.append(('GBR', GradientBoostingRegressor()))
#all_models.append(('RFR5', RandomForestClassifier(n_estimators=5, n_jobs=5, random_state=seed)))
# all_models.append(('RFR5', RandomForestClassifier(n_estimators=5, n_jobs=5, random_state=seed)))
# all_models.append(('RFR3', RandomForestRegressor(n_estimators=3, n_jobs=5, random_state=seed)))
# all_models.append(('SGDR', SGDRegressor(random_state=seed)))
#all_models.append(('AdaB', AdaBoostClassifier(RandomForestClassifier(n_estimators=3))))
#all_models.append(('MLPC', MLPClassifier(hidden_layer_sizes=(500,500,500), max_iter=2000, alpha=0.001, activation='tanh', learning_rate='adaptive', solver='sgd', verbose=0,  random_state=42,tol=0.000000001)))

#92.45 accuracy
#all_models.append(('MLPC', MLPClassifier(hidden_layer_sizes=(490,490,490,490,490,490,490), max_iter=500000, alpha=0.001, activation='relu', learning_rate='adaptive', solver='adam', verbose=10,  random_state=42,tol=0.000000001)))


all_models.append(('MLPC', MLPClassifier(hidden_layer_sizes=(780,490,780,490,780,490,280), max_iter=500000, alpha=0.001, activation='relu', learning_rate='adaptive', solver='adam', verbose=10,  random_state=42,tol=0.000000001)))





In [None]:
# evaluate each model in turn
from sklearn import model_selection
results = []
names = []
scoring = 'accuracy'

g_all_pred = {}

X = mtr.modified_dataset(getAllData(df)) #
f = 1.0 #365/27.58
#    X = getAdjustedDataF(df,f)

scaler = StandardScaler()
scaler = RobustScaler()
scaler.fit(X)
Z = scaler.transform(X)

for name, model in all_models:
    
    
#    scaler = None
#    Z = X

#     kfold = model_selection.KFold(n_splits=3, random_state=seed)
#     cv_results = model_selection.cross_val_score(model, Z, mtr.getTarget(3), cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
#     print(msg)
    
    oClassifier = MultiOutputClassifier(model, n_jobs=7)
    oClassifier.fit(Z, mtr.getTargets()) 
    print(oClassifier)
    s = oClassifier.score(Z, mtr.getTargets())
    if(oClassifier.score(Z, mtr.getTargets()) == 1.0):
        print( name, ' ', str(f), ' ', str(s))
    store_prediction(mtr, oClassifier, f, scaler=scaler, name=name)
    start = time.clock()
    print(str(f), " Time taken: ", (time.clock() - start),  " ")

# for n in range(len(df_predictions)):
#     print( mtr.getAccuracyCount(np.array(df_predictions[n])))
#     mtr.print_predictions(df_predictions[n])






# boxplot algorithm comparison
# fig = plt.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# plt.show()

print('Done')

In [None]:
def combine_prediction(arr, initial_pred=[]):
    global s
    if ( isinstance(arr, list) ):
        for a in arr:
            combine_prediction(a, initial_pred)
        return 
    if ( len(s) > 1 ):
        s += '_'
    s += arr
    initial_pred.append(g_all_pred[arr])
    return 



In [None]:
import itertools
from itertools import combinations
import operator 
from itertools import islice

name_ = []

lst = [name for name, model in all_models]
iBestIndex = -1
iBestN = []
#print("List ", lst)
top_n = 12


dict_accuracy = {}
for z in range(5, 0,-1):
    a = [list(x) for x in itertools.combinations(lst, z) if len(x) > 1 ] 
#    print(a)

    for xx in a:
        test_pred = []
        s = ''
        combine_prediction(xx, test_pred)
#        print(s)

        #print(len(test_pred))

        all_pred = [] ;
        for i in range(len(test_pred)):
            if ( i == 0 ):
                all_pred = test_pred[i]
            else:
                all_pred = np.column_stack((all_pred, test_pred[i]) )

        top_seven = []
        for i in range(len(all_pred)):
            unique, counts = np.unique(all_pred[i], return_counts=True)
            x = dict(zip(unique, counts))
            sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True) # sorted by value
            l = list(islice([int(x) for x,y in sorted_x],top_n))
            while ( len(l) < top_n ):
                l.append(-1)

            top_seven.append(l)
            

#        print(len(top_seven))
#         if(len(top_seven[0]) < top_n ):
#             print("*** Caught ", )
        columns = ['N'+str(i+1) for i in range(len(top_seven[0]))]
#        print(columns)
        df_top_seven = pd.DataFrame(top_seven, columns=columns)
        r = mtr.getAccuracyCount(np.array(df_top_seven)) ;
        matched, weighted_match = mtr.print_weighted_numbers(df_top_seven.values)
        r = sum(weighted_match)

        dict_accuracy.update({s: r})

t_accuracy = sorted(dict_accuracy.items(),key=operator.itemgetter(1), reverse=True)
print('Done')


In [None]:
matched, weighted_match = mtr.print_weighted_numbers(df_top_seven.values)
print(matched)
print(weighted_match)

In [None]:
n = 7
print(t_accuracy[:n])

a = [x[0].split('_') for x in t_accuracy[:n] ] 
print(a)
for xx in a:
    test_pred = []
    s = ''
    combine_prediction(xx, test_pred)
    all_pred = [] ;
    for i in range(len(test_pred)):
        if ( i == 0 ):
            all_pred = test_pred[i]
        else:
            all_pred = np.column_stack((all_pred, test_pred[i]) )

    top_seven = []
    for i in range(len(all_pred)):
        unique, counts = np.unique(all_pred[i], return_counts=True)
        x = dict(zip(unique, counts))
        sorted_x = sorted(x.items(), key=operator.itemgetter(1), reverse=True) # sorted by value
        l = list(islice([int(x) for x,y in sorted_x],top_n))
        while ( len(l) < top_n ):
          l.append(-1)
        top_seven.append(l)


    columns = ['N'+str(i+1) for i in range(len(top_seven[0]))]
    df_top_seven = pd.DataFrame(top_seven, columns=columns)
    r = mtr.getAccuracyCount(np.array(df_top_seven)) ;
    print ( "Accuracy: ",  r)
    dict_accuracy.update({s: r})
    mtr.plot_matched_counts(df_top_seven.values)




In [None]:

# Nov 26
# 16 22 28 31 38 46 33

In [None]:
#Keep track of all results
#df_predictions = []

#print(df_predictions)
#mtr = MyTotoResearch(algo_no=1)
def getAllData(df):
#     drop_cols = ['T', 'L','M','S','R','E','A','V' ,'J','U','K']
#     X = df.drop(drop_cols, axis=1)

    use_cols = ['Ph','il','age','dist','adia','sundist','sunadia']
    X = df[use_cols]
    return X

lresult, df = mtr.load_totodata()

test_data = mtr.get_test_data()
X = mtr.modified_dataset(getAllData(test_data)) #

print(len(df_predictions))
for n in range(len(df_predictions)):
    print( mtr.getAccuracyCount(np.array(df_predictions[n])))
    mtr.print_predictions(df_predictions[n])

