In [187]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.io import arff
from sklearn.feature_selection import SelectFromModel, VarianceThreshold, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Perceptron, Lasso, LogisticRegression, LinearRegression, SGDRegressor, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [188]:
def readData(path):
    data = arff.loadarff(path)
    dff = pd.DataFrame(data[0])
    dff = dff.apply(LabelEncoder().fit_transform)
    df = dff.to_numpy()[:,:].astype('U13')
    df = df.astype(float)
    return df[:, :-1], df[:, -1], df
    

In [189]:
x, y, df = readData('./AdolescentASD/Autism-Adolescent-Data.arff')
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.30, random_state=1)
df

array([[0., 0., 0., ..., 1., 3., 0.],
       [0., 0., 0., ..., 1., 4., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 5., 1.],
       [1., 1., 0., ..., 1., 5., 0.],
       [1., 0., 1., ..., 1., 3., 0.]])

In [190]:
preproc=[("missing", SimpleImputer()),
        ("var", VarianceThreshold(0.01)),
        ("standardize", StandardScaler()),
        ("poly", PolynomialFeatures(2))]

pipe=Pipeline(preproc + [('model', SGDRegressor())])

#pipe=Pipeline(preproc + [('lr', LogisticRegression())])

In [191]:

params_grid=[
        {"model":[SGDRegressor(max_iter=500)],
               "model__loss":['huber', 'squared_loss', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
               "model__penalty":['l1','l2'],
               "model__alpha":np.logspace(-5,5,5),
               "poly__degree":[1,2]},
        {"model":[LinearRegression()],
               "poly__degree":[1,2]},
        {"model":[Ridge()],
               "poly__degree":[1,2],
               "model__alpha":np.logspace(-5,5,5)},
        {"model":[Lasso()],
               "poly__degree":[1,2],
               "model__alpha":np.logspace(-5,5,5)}]
"""
params_grid=[
        {"lr":[LogisticRegression(penalty='l1',max_iter=500)],
                "lr__C":np.logspace(-2,2,5),
                "lr__solver":['lbfgs']},
       {"lr": [RandomForestClassifier(random_state = 1,
                                       n_jobs = -1, criterion = 'entropy')],
         "lr__n_estimators": [100, 200],
         "lr__max_depth": [6, 8]},
        {"lr": [SVC(kernel='rbf', gamma='scale', max_iter=1000, degree=2)],
               "lr__C":np.logspace(-2,2,5)},
        {"lr": [MLPClassifier(random_state=1, max_iter=1000, hidden_layer_sizes=60)],
           "lr__solver":['sgd'],
           "lr__activation":['relu','logistic']}
            
]
"""

'\nparams_grid=[\n        {"lr":[LogisticRegression(penalty=\'l1\',max_iter=500)],\n                "lr__C":np.logspace(-2,2,5),\n                "lr__solver":[\'lbfgs\']},\n       {"lr": [RandomForestClassifier(random_state = 1,\n                                       n_jobs = -1, criterion = \'entropy\')],\n         "lr__n_estimators": [100, 200],\n         "lr__max_depth": [6, 8]},\n        {"lr": [SVC(kernel=\'rbf\', gamma=\'scale\', max_iter=1000, degree=2)],\n               "lr__C":np.logspace(-2,2,5)},\n        {"lr": [MLPClassifier(random_state=1, max_iter=1000, hidden_layer_sizes=60)],\n           "lr__solver":[\'sgd\'],\n           "lr__activation":[\'relu\',\'logistic\']}\n            \n]\n'

In [192]:
best_model=GridSearchCV(pipe,params_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1, verbose=1)
best_model.fit(x_train,y_train)
print("Parámetros del mejor clasificador:\n{}".format(best_model.best_params_))
print("Error en CV: {:0.3f}%".format(100.0 * best_model.best_score_))
print("Error en training: {:0.3f}%".format(
        100.0 * best_model.score(x_train, y_train)))
print("Error en test: {:0.3f}%".format(
        100.0 * best_model.score(x_test, y_test)))


Fitting 5 folds for each of 102 candidates, totalling 510 fits
Parámetros del mejor clasificador:
{'model': SGDRegressor(alpha=0.0031622776601683794, max_iter=500), 'model__alpha': 0.0031622776601683794, 'model__loss': 'squared_loss', 'model__penalty': 'l2', 'poly__degree': 1}
Error en CV: -10.367%
Error en training: -5.371%
Error en test: -7.282%
