#### We have now more than one target value that we want to predict!!!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

data= fetch_openml('mnist_784', version=1, parser="auto")#Get data from https://www.openml.org/d/554
dfData = pd.DataFrame(np.c_[data["data"],data["target"]],columns = data["feature_names"]+["target"])
def numberOfLoops(num): #now: extra target numberOfLoops
    if num in ["0","6","9"]:
        return 1
    elif num == "8":
        return 2
    return 0
dfData["numLoops"] = dfData["target"].apply(numberOfLoops)
dfData["hasLoop"] = dfData["numLoops"].apply(lambda x: True if x>0 else False) #also an extra target: the bool hasLoops

In [2]:
dfData.columns

Index(['pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6', 'pixel7',
       'pixel8', 'pixel9', 'pixel10',
       ...
       'pixel778', 'pixel779', 'pixel780', 'pixel781', 'pixel782', 'pixel783',
       'pixel784', 'target', 'numLoops', 'hasLoop'],
      dtype='object', length=787)

In [3]:
img_pipeline = Pipeline([("mm_scaler", MinMaxScaler())])
y = dfData[["target","numLoops","hasLoop"]] #splitting now additionally into numLoops and hasLoop (= three different target values)
dfData = dfData.drop(["target","numLoops","hasLoop"],axis=1)
X = dfData.copy()
X_transf = img_pipeline.fit_transform(X)

In [4]:
stratSplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
for train_index, test_index in stratSplit.split(X_transf, y):
    X_train = X_transf[train_index]
    X_test = X_transf[test_index]
    
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

In [5]:
log_reg = LogisticRegression(C=1e5, max_iter=100)
log_reg.fit(X_train, y_train.values)
log_reg.predict(X_test)[0]
#result: error because the logistic regression isn't set up to do multiple logistic regression

ValueError: y should be a 1d array, got an array of shape (56000, 3) instead.

In [6]:
#solution: use the MultiOutputClassifier
from sklearn.multioutput import MultiOutputClassifier

#not necessary in new sklearn versions: transform targets to int to avoid a TypeError at the end
#y_train["target"] = y_train["target"].apply(lambda x: int(x))
#y_train["hasLoop"] = y_train["hasLoop"].apply(lambda x: int(x))

# y_test["target"] = y_test["target"].apply(lambda x: int(x))
log_reg = LogisticRegression(C=1e5)
log_reg_multi_label = MultiOutputClassifier(log_reg) #possible: initialize the LR within the MultiOutputClassifier
log_reg_multi_label.fit(X_train, y_train.values)
log_reg_multi_label.predict(X_test)[0]

#result: MOC will train a different classifier for bacically every label that we have (here: 3 labels)

TypeError: '<' not supported between instances of 'bool' and 'str'

In [7]:
#writing our own multilabel classifier class
from sklearn.base import clone

class customMultilabel():
    def __init__(self, clf): #initialize the input classifier
        self.clf = clf
        self.clfs = []
        
    def fit(self, X, y):
        if len(y.shape) == 2: #if shape is two-dimensional
            self.numOutputs = y.shape[-1] #final element tells us the number of output that we want
        elif len(y.shape) == 1:
            self.numOutputs = 1
        else:
            print("Unexpected target shape")
            raise(RuntimeError)
            
        for i in range(self.numOutputs): #iterate over numbers of output
            self.clfs.append(clone(self.clf)) #copy the self-created classifier clf independently (only clones target or number of loops or hasLoop)
            self.clfs[i].fit(X, y.iloc[:,i]) #fit each classifier to one specific label
            
    def predict(self, X):
        output = None
        for i in range(self.numOutputs): #loop over number of outputs
            r = self.clfs[i].predict(X) #
            if output is None:
                output = r.copy()
            else:
                output = np.c_[output, r] #join all features together
            
        return output

#now run all again that had been executed before the "from sklearn.multioutput import MultiOutputClassifier"

In [None]:
cml_log_reg = customMultilabel(LogisticRegression(C=1e5))
cml_log_reg.fit(X_train, y_train)
output = cml_log_reg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#show output
output[:5]

In [16]:
#show expcted test output
y_test[:5]

Unnamed: 0,target,numLoops,hasLoop
54755,0,1,True
6034,0,1,True
18245,4,0,False
41828,6,1,True
24289,1,0,False
