In [60]:
import pandas as pd

import importlib

import numpy as np

from os import listdir
from os.path import isfile, join

import time

class Primitive(object):
    """Record describing one profiled primitive.

    Only ``name`` is set at construction time. Instances are plain
    attribute holders: further attributes (for example ``id``) are
    attached to them after creation.
    """

    def __init__(self, name):
        # Short name of the primitive, e.g. 'SVC'
        self.name = name

def class_for_name(module_name, class_name):
    """Import ``module_name`` and return a new instance of ``class_name``.

    Despite the function name, the attribute looked up in the module is
    called with no arguments, so the result is an instance, not the class.

    Returns -1 (after printing a message) if the module cannot be imported,
    the attribute does not exist, or calling it raises an exception.
    """
    try:
        # import_module raises ImportError if the module cannot be loaded
        module = importlib.import_module(module_name)
        # getattr raises AttributeError if the class cannot be found
        return getattr(module, class_name)()
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print('Could not load module: '+module_name+'.'+class_name)
        return -1

#data: dataset to test (one of the test datasets that illustrate different requirements)
#primitive: primitive being tested. We assume it has the fit() method
#Returns two booleans: whether the test passed, and whether the primitive needs an array (a single column) rather than a matrix
def passTest(data, primitive):
    """Check whether ``primitive`` can be fitted and applied to ``data``.

    The last column of ``data`` is used as the target; the remaining
    columns are the training features. Three call patterns are tried in
    order, stopping at the first one that does not raise:

    1. ``fit(train, target).predict(train)``
    2. ``fit(train).transform(train)``
    3. ``fit(col).transform(col)`` for every feature column in turn

    Returns a pair ``(passed, needs_array)``. ``needs_array`` is True only
    when pattern 3 is the one that succeeded, i.e. the primitive could be
    applied to single columns but not to the whole feature matrix.
    """
    target = data.iloc[:, -1]
    # Drop the target column (the last one)
    train = data.drop(data.columns[[len(data.columns) - 1]], axis=1)

    # Exception (not a bare except) is caught below so that Ctrl-C can
    # still interrupt a long-running fit.
    try:
        # Not all primitives have a predict, but they should have fit
        primitive.fit(train, target).predict(train)
        return True, False
    except Exception:
        pass

    try:
        primitive.fit(train).transform(train)
        return True, False
    except Exception:
        pass

    # Some primitives can only be applied to arrays, not to a matrix
    try:
        for col in train.columns:
            # The transform is needed too, otherwise exceptions may not be raised
            primitive.fit(train[col]).transform(train[col])
    except Exception:
        return False, False
    return True, True
        


#path: string with the path to the folder holding the datasets. It is assumed to contain three subfolders: clean_data, requirement_data and performance_data
#primitiveModuleName: string with the module name. E.g., 'sklearn.svm'
#primitiveName: string with the name of the primitive to be loaded. E.g., 'SVC'
#testPerformance: boolean; set it to True to also run the performance tests (this requires more time)
def getPrimitiveRequirements(path, primitiveModuleName, primitiveName, testPerformance):
    """Profile a primitive against the test datasets found under ``path``.

    Parameters:
        path: folder containing the ``clean_data``, ``requirement_data``
            and ``performance_data`` subfolders.
        primitiveModuleName: module to import, e.g. 'sklearn.svm'.
        primitiveName: name of the primitive inside that module, e.g. 'SVC'.
        testPerformance: when true, also time the primitive on every file
            in ``performance_data`` and print the elapsed seconds.

    Returns a ``Primitive`` carrying ``name`` and ``id`` plus, when they
    apply:
        isRegression / isClassification: set to True (only for primitives
            with a ``predict`` method) depending on whether the predictions
            on the clean float dataset have a floating dtype.
        missing / categorical / unique / negative: set to False when a
            requirement dataset whose file name contains that word fails.
        isArray: set to True when a requirement dataset only passed
            column by column.

    Returns -1 (after printing a message) if the primitive cannot be
    loaded, has no ``fit`` method, or fails on either clean dataset.
    """
    clean_dir = join(path, "clean_data")
    req_dir = join(path, "requirement_data")
    perf_dir = join(path, "performance_data")

    primExec = class_for_name(primitiveModuleName, primitiveName)
    if primExec == -1:
        print("The primitive module could not be loaded.")
        return -1
    prim = Primitive(primitiveName)
    prim.id = primitiveModuleName + "." + primitiveName

    # Clean data files: all primitives should pass these tests
    data_clean_int = pd.read_csv(join(clean_dir, 'int_clean_data.csv'))
    data_clean_float = pd.read_csv(join(clean_dir, 'float_clean_data.csv'))
    data_clean_int.name = "CLEAN DATA INT"
    data_clean_float.name = "CLEAN DATA FLOAT"

    if not hasattr(primExec, 'fit'):
        print("Primitive does not have fit method. No requirements considered")
        return -1

    # passTest returns a (passed, needs_array) tuple. Combining the two
    # tuples directly with `and` would always yield the second one (a
    # non-empty tuple is truthy), so compare the `passed` flags explicitly.
    passed = passTest(data_clean_int, primExec)[0] and passTest(data_clean_float, primExec)[0]
    if not passed:
        print("The primitive "+primitiveName+" cannot execute the clean datasets. No further requirements addressed")
        return -1

    if hasattr(primExec, 'predict'):
        # Primitive is a classifier/regressor: decide which from the dtype
        # of its predictions on the clean float dataset.
        target = data_clean_float.iloc[:, -1]
        # Drop the target column (the last one)
        train = data_clean_float.drop(data_clean_float.columns[[len(data_clean_float.columns) - 1]], axis=1)
        y_pred = primExec.fit(train, target).predict(train)

        # asarray lets this work even if predict returns a plain sequence
        if issubclass(np.asarray(y_pred).dtype.type, np.floating):
            prim.isRegression = True
        else:
            prim.isClassification = True

    # Rest of the tests: the file name tells which requirement is exercised.
    # The keyword doubles as the attribute name set on the result.
    requirement_keywords = ("missing", "categorical", "unique", "negative")
    for fname in sorted(listdir(req_dir)):
        fpath = join(req_dir, fname)
        if not isfile(fpath):
            continue
        data = pd.read_csv(fpath)
        data.name = fname
        passed, needs_array = passTest(data, primExec)
        if not passed:
            for keyword in requirement_keywords:
                if keyword in fname:
                    # Attribute is only present when the requirement fails
                    setattr(prim, keyword, False)
        if needs_array:
            prim.isArray = True

    if testPerformance:
        for fname in sorted(listdir(perf_dir)):
            fpath = join(perf_dir, fname)
            if not isfile(fpath):
                continue
            data = pd.read_csv(fpath)
            data.name = fname
            # perf_counter is a monotonic, high-resolution clock meant for
            # measuring elapsed time
            start = time.perf_counter()
            passTest(data, primExec)
            end = time.perf_counter()
            print(fname + ": " + str(end - start))
    return prim


#assumes certain variables of the JSON have been initialized.
#NON-MISSING-VALUES: The primitive cannot handle missing values
#NUMERICAL: The primitive cannot handle string/categorical values
#NOT-UNIQUE: The primitive cannot handle columns with a single value
#NON-NEGATIVE: The primitive needs to have positive values
#ARRAY: The primitive needs to be an array, not a matrix

#Will produce a JSON file that looks like:
#{
#        "class": "sklearn.linear_model.LogisticRegression",
#        "name": "LogisticRegression", 
#        "requirements": ["NUMERICAL"]
#    }
def primitiveToJSON(primitive):
    """Serialize a profiled primitive to a JSON-formatted string.

    The output always contains "class" (``primitive.id``), "name"
    (``primitive.name``) and "requirements". "isArray" and "LearningType"
    are emitted only when the matching attribute exists on ``primitive``.
    A requirement label is listed whenever its attribute exists at all,
    regardless of the attribute's value:

        missing     -> NON-MISSING-VALUES
        categorical -> NUMERICAL
        unique      -> NOT-UNIQUE
        negative    -> NON-NEGATIVE

    Returns the string, or None (after printing a message) when the
    primitive cannot be serialized, e.g. because it has no ``id``.
    """
    requirement_labels = (
        ('missing', "NON-MISSING-VALUES"),
        ('categorical', "NUMERICAL"),
        ('unique', "NOT-UNIQUE"),
        ('negative', "NON-NEGATIVE"),
    )
    try:
        lines = [
            "{",
            "\"class\": \"" + primitive.id + "\",",
            "\"name\": \"" + primitive.name + "\",",
        ]
        if hasattr(primitive, 'isArray'):
            lines.append("\"isArray\": \"true\",")
        if hasattr(primitive, 'isClassification'):
            lines.append("\"LearningType\": \"Classification\",")
        if hasattr(primitive, 'isRegression'):
            lines.append("\"LearningType\": \"Regression\",")
        requirements = [
            "\"" + label + "\""
            for attr, label in requirement_labels
            if hasattr(primitive, attr)
        ]
        # Joining avoids having to strip a trailing comma afterwards
        lines.append("\"requirements\":[" + ",".join(requirements) + "]")
        lines.append("}")
        return "\n".join(lines) + "\n"
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        print("Cannot serialize primitive")
        return None
      
def allPrimitivesToText(jsonFile):
    """Profile every class primitive listed in ``jsonFile`` and return the JSON text.

    ``jsonFile`` is the already-parsed content (a mapping), not a file name.
    It must have a 'search_primitives' list whose entries carry an
    'is_class' flag and a dotted 'id' such as 'sklearn.svm.SVC'.

    Entries that are not classes are skipped. Primitives that cannot be
    profiled or serialized are skipped as well. The serialized primitives
    are concatenated in input order; performance tests are not run.
    """
    chunks = []
    # Read from the argument; the function must not depend on a global
    # variable that happens to hold the parsed JSON.
    for entry in jsonFile['search_primitives']:
        # Necessary because otherwise it would attempt to try many
        # primitives that may even download data
        if not entry["is_class"]:
            continue
        primitive = entry["id"]
        print(primitive)
        # Split 'package.module.Class' at the last dot
        module_name, _, class_name = primitive.rpartition('.')
        p = getPrimitiveRequirements(DATADIR, module_name, class_name, False)
        if p == -1:
            continue
        text = primitiveToJSON(p)
        # primitiveToJSON returns None when serialization fails
        if text is not None:
            chunks.append(text)
    return "".join(chunks)
        
#Main script        
DATADIR = "data_profiler/" #Dir with the profiling datasets
# Profile three sklearn primitives (performance tests included) and print
# the JSON description produced for each one.
for _module_name, _class_name in (
    ('sklearn.svm', 'SVC'),
    ('sklearn.linear_model', 'LinearRegression'),
    ('sklearn.preprocessing', 'LabelEncoder'),
):
    print(primitiveToJSON(getPrimitiveRequirements(DATADIR, _module_name, _class_name, True)))

#print (primitiveToJSON(getPrimitiveRequirements(DATADIR,'sklearn.feature_extraction.text','TfidfVectorizer',False)))
#sklearn.metrics.scorer.get_scorer
#print (primitiveToJSON(getPrimitiveRequirements(DATADIR,'sklearn.linear_model.sgd_fast','Regression',False)))
#print (primitiveToJSON(getPrimitiveRequirements(DATADIR,'sklearn.decomposition.dict_learning','DictionaryLearning',False)))



#from pprint import pprint
#import json

#with open('sklearn.json') as data_file:    
#    data = json.load(data_file)
#print(allPrimitivesToText(data))



float_100_350.csv: 0.015002012252807617
float_100_50.csv: 0.004004240036010742
float_300_350.csv: 0.1300971508026123
float_300_50.csv: 0.026957988739013672
float_600_350.csv: 0.5237033367156982
float_600_50.csv: 0.09051036834716797
{
"class": "sklearn.svm.SVC",
"name": "SVC",
"LearningType": "Classification",
"requirements":["NON-MISSING-VALUES","NUMERICAL"]
}

float_100_350.csv: 0.004458427429199219
float_100_50.csv: 0.0019996166229248047
float_300_350.csv: 0.01400136947631836
float_300_50.csv: 0.0025000572204589844
float_600_350.csv: 0.025495052337646484
float_600_50.csv: 0.0025010108947753906
{
"class": "sklearn.linear_model.LinearRegression",
"name": "LinearRegression",
"LearningType": "Regression",
"requirements":["NON-MISSING-VALUES","NUMERICAL"]
}

float_100_350.csv: 0.04850316047668457
float_100_50.csv: 0.009998798370361328
float_300_350.csv: 0.06453680992126465
float_300_50.csv: 0.010003805160522461
float_600_350.csv: 0.09606361389160156
float_600_50.csv: 0.014004230499267578
