## Importando bibliotecas 

In [1]:
#Importando as bibliotecas necessárias
import pandas as pd
import numpy as np
import math
import config
#import matplotlib.pyplot as plt

from scipy.io.arff import loadarff

from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold

#Importando as bibliotecas referentes ao XGBoost
import xgboost as xgb

#Importando as bibliotecas referentes ao ANN
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential

## A fazer




## Duvidas

## Funções

### Modelos

In [2]:
#DecisionTree
def model_dtree(x_train,y_train,x_test,booster,eta,max_depth,colsample_bytree,subsample,min_child_weight,reg_lambda,num_boost,my_num):
    bst = DecisionTreeRegressor().fit(x_train, y_train)
    predict = bst.predict(x_test)
    return predict

#RandomForestRegressor
def model_randomForest(x_train,y_train,x_test,my_num):
    #booster,eta,max_depth,colsample_bytree,subsample,min_child_weight,reg_lambda,num_boost):
    bst = RandomForestRegressor(n_estimators = my_num,n_jobs = -1).fit(x_train,y_train)
    predict = bst.predict(x_test)
    return predict

#SVR
def model_svr(x_train,y_train,x_test,ker,gam,c,epsi,shrink,verb,max_it):
    bst = SVR(kernel=ker, gamma=gam, C=c, epsilon=epsi, shrinking=shrink, verbose=verb, max_iter=max_it).fit(x_train,y_train)
    predict = bst.predict(x_test)
    return predict

def model_multi_dtree(x_train, y_train, x_test):
    model = DecisionTreeRegressor()
    bst = MultiOutputRegressor(model).fit(x_train, y_train)
    predict = bst.predict(x_test)
    return predict

def model_ann_global(x_train, y_train, x_test):
    model = Sequential()
    model.add(Input(shape = (x_train.shape[1])))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(y_train.shape[1], activation='linear'))

    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=10, batch_size=32, verbose=0, validation_split=0.2)
    
    predict = model.predict(x_test)
    return predict

def model_ann_local(x_train, y_train, x_test):
    predict_list = []
    for i in range(y_train.shape[1]):
        model = Sequential()
        model.add(Input(shape = x_train.shape[1]))
        model.add(Dense(x_train.shape[1]//2, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(x_train.shape[1]//6, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(x_train.shape[1]//18, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='linear'))

        model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
        model.fit(x_train, y_train[:,i], epochs=10, batch_size=32, verbose=0, validation_split=0.2)

        predict = model.predict(x_test)
        predict_list.append(predict)
    
    predictions = np.concatenate(predict_list, axis=1)

    return predictions


def model_xgboost(x_train, y_train, x_test):
    param = {'max_depth': 4,
             'eta': 0.05, 
             'objective': 'reg:squarederror',
             'booster': 'gbtree',
            }
    bst = xgb.XGBRegressor(**param, n_estimators=100)
    bst.fit(x_train, y_train) 
    predict = bst.predict(x_test)
    return predict


### Cálculo de erro 

In [3]:
def computer_RRMSE_list(real_test,result_p,real_train_mean):
    """RRMSE: Relative Root Mean Square Error
        input: real_test: real test data
              result_p: predicted data
              real_train_mean: mean of real train data
        output: RRMSE list """

    _list = []
    for i in range(result_p.shape[1]):
        fenzi = 0
        fenmu = 0
        for j in range(result_p.shape[0]):
            fenzi += (result_p[j,i] - real_test[j,i])**2
            fenmu += (real_train_mean[i] - real_test[j,i])**2
        _list.append(math.sqrt(fenzi/fenmu))    
    return _list

## Implementação

In [4]:
path = "../TCC/mtr-datasets/"

#text_file = input("dataset: ")
text_file = 'andro'
data, meta = loadarff(path + text_file + ".arff")
data = pd.DataFrame(data)

data.reset_index(inplace=True)
data.replace('?', np.nan, inplace=True)
data.replace('     ?', np.nan, inplace=True)
data = data.applymap(float)

#Normalização
scaler = StandardScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)


if config.all_config[text_file].get('sample_random') == True:
    data = data.sample(frac=1,random_state = config.all_config[text_file].get('sample_random_num')).reset_index(drop=True)
data = data.fillna(pd.Series(np.nanmean(data,axis=0),index=data.columns))
label = data.iloc[:,-config.all_config[text_file].get('targets_num'):].values
data = data.iloc[:,:-config.all_config[text_file].get('targets_num')].values

error_results = []

#### MultiOutput Dtree

In [5]:
# Teste Multi-dtree
loss_list = [] 
kf = KFold(n_splits=config.all_config[text_file].get('paper_kFold'), shuffle=config.all_config[text_file].get('kfold_random'))
for train_index , test_index in kf.split(data):
    x_train, x_test = data[train_index], data[test_index]
    y_train, y_test = label[train_index], label[test_index]
    
    result_p_train = model_multi_dtree(x_train, y_train, x_test)
    loss_list_RRMSE = computer_RRMSE_list(y_test,result_p_train,np.mean(y_train,axis=0))

error_results.append(('MultiOutput Decision Tree', loss_list_RRMSE))
# print("RRMSE: ", loss_list_RRMSE)

#### Random Forest Global 

In [6]:
loss_list_RRMSE = []

kf = KFold(n_splits=config.all_config[text_file].get('paper_kFold'),
            shuffle=config.all_config[text_file].get('kfold_random'),
            random_state=config.all_config[text_file].get('kfold_random_num'))

for train_index, test_index in kf.split(data):
    x_train, x_test = data[train_index], data[test_index]
    y_train, y_test = label[train_index], label[test_index]

    result_p_train = model_randomForest(x_train, y_train, x_test, my_num=100) #call model_randomForest once
    
    real_train_mean = y_train.mean(axis=0)    
    loss_list_RRMSE.append(computer_RRMSE_list(y_test, result_p_train, real_train_mean))

loss_list_RRMSE_np = np.array(loss_list_RRMSE)
error_results.append(('Random Forest Global', loss_list_RRMSE_np.mean(axis=0)))
# print('RRMSE: ', loss_list_RRMSE_np)
# print('RRMSE mean: ', loss_list_RRMSE_np.mean(axis=0))

#### Random Forest Local 

In [7]:
loss_list_RRMSE = []

kf = KFold(n_splits=config.all_config[text_file].get('paper_kFold'),
            shuffle=config.all_config[text_file].get('kfold_random'),
            random_state=config.all_config[text_file].get('kfold_random_num'))

for train_index, test_index in kf.split(data):
    x_train, x_test = data[train_index], data[test_index]
    y_train, y_test = label[train_index], label[test_index]

    result_p_train = []
    
    for i in range(label.shape[1]):

        predicted_test = None 

        if text_file in ['sf1', 'sf2']:
            predicted_test = model_svr(x_train, y_train[:,i], x_test,
                                    config.all_config[text_file].get('svr1_kernel'),
                                    config.all_config[text_file].get('svr1_gamma'),
                                    config.all_config[text_file].get('svr1_C'),
                                    config.all_config[text_file].get('svr1_epsilon'),
                                    config.all_config[text_file].get('svr1_shrinking'),
                                    config.all_config[text_file].get('svr1_verbose'),
                                    config.all_config[text_file].get('svr1_max_iter'))
        else:
            predicted_test = model_randomForest(x_train, y_train[:,i], x_test, my_num=100)
            
        result_p_train.append(predicted_test)

    result_p_train = pd.DataFrame(result_p_train).T.values
    real_train_mean = y_train.mean(axis=0)    
    loss_list_RRMSE.append(computer_RRMSE_list(y_test,result_p_train,real_train_mean))


loss_list_RRMSE_np = np.array(loss_list_RRMSE)
error_results.append(('Random Forest Local', loss_list_RRMSE_np.mean(axis=0)))
# print('RRMSE: ', loss_list_RRMSE_np)
# print('RRMSE mean: ',loss_list_RRMSE_np.mean(axis=0))


#### ANN Global 

In [8]:
loss_list_RRMSE = []

kf = KFold(n_splits=config.all_config[text_file].get('paper_kFold'),
            shuffle=config.all_config[text_file].get('kfold_random'),
            random_state=config.all_config[text_file].get('kfold_random_num'))

for train_index, test_index in kf.split(data):
    x_train, x_test = data[train_index], data[test_index]
    y_train, y_test = label[train_index], label[test_index]

    result_p_train = model_ann_global(x_train, y_train, x_test)  # Call model_ann_global once
    
    real_train_mean = y_train.mean(axis=0)    
    loss_list_RRMSE.append(computer_RRMSE_list(y_test, result_p_train, real_train_mean))

loss_list_RRMSE_np = np.array(loss_list_RRMSE)
error_results.append(('ANN Global', loss_list_RRMSE_np.mean(axis=0)))
# print('RRMSE: ', loss_list_RRMSE_np)
# print('RRMSE mean: ', loss_list_RRMSE_np.mean(axis=0))

2023-11-24 19:17:43.857152: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




#### ANN Local

In [9]:
loss_list_RRMSE = []

kf = KFold(n_splits=config.all_config[text_file].get('paper_kFold'),
            shuffle=config.all_config[text_file].get('kfold_random'),
            random_state=config.all_config[text_file].get('kfold_random_num'))

for train_index, test_index in kf.split(data):
    x_train, x_test = data[train_index], data[test_index]
    y_train, y_test = label[train_index], label[test_index]

    result_p_train = model_ann_local(x_train, y_train, x_test)
   
    real_train_mean = y_train.mean(axis=0)    

    result_p_train = pd.DataFrame(result_p_train).values
    loss_list_RRMSE.append(computer_RRMSE_list(y_test, result_p_train, real_train_mean))

    
loss_list_RRMSE_np = np.array(loss_list_RRMSE)
error_results.append(('ANN Local', loss_list_RRMSE_np.mean(axis=0)))
# print('RRMSE: ', loss_list_RRMSE_np)
# print('RRMSE mean: ',loss_list_RRMSE_np.mean(axis=0))



#### XGBoost Local

In [10]:
loss_list_RRMSE = []

kf = KFold(n_splits=config.all_config[text_file].get('paper_kFold'),
            shuffle=config.all_config[text_file].get('kfold_random'),
            random_state=config.all_config[text_file].get('kfold_random_num'))

for train_index, test_index in kf.split(data):
    x_train, x_test = data[train_index], data[test_index]
    y_train, y_test = label[train_index], label[test_index]

    result_p_train = model_xgboost(x_train, y_train, x_test)
    
    real_train_mean = y_train.mean(axis=0)    
    loss_list_RRMSE.append(computer_RRMSE_list(y_test, result_p_train, real_train_mean))

loss_list_RRMSE_np = np.array(loss_list_RRMSE)
error_results.append(('XGBoost Local', loss_list_RRMSE_np.mean(axis=0)))
# print('RRMSE: ', loss_list_RRMSE_np)
# print('RRMSE mean: ', loss_list_RRMSE_np.mean(axis=0))

In [11]:
# print the name and value of each model's error result
for model, error in error_results:
    print(f'{model} RRMSE: {error}')

MultiOutput Decision Tree RRMSE: [0.40023333002332506, 0.02044705362869795, 0.7193043548966468, 0.7086654929709483, 0.6964982862873742, 0.9877161312990153]
Random Forest Global RRMSE: [0.31629909 0.2966071  0.30712246 0.30812334 0.52160143 0.55041904]
Random Forest Local RRMSE: [0.39453565 0.27027068 0.33402029 0.34314087 0.6460256  0.59859992]
ANN Global RRMSE: [0.87905475 0.86252126 0.96865246 0.99411114 1.03802282 1.00851647]
ANN Local RRMSE: [1.05904199 0.98890408 0.98386921 1.01469382 0.97670647 0.94233438]
XGBoost Local RRMSE: [0.3899991  0.13958916 0.36069351 0.33912269 0.6791775  0.6680659 ]


#### Parte errada ainda 

In [12]:
# # Ta errado ainda, 
# # A funcao model_ann_local ta iterando sobre as colunas
# # e nesse código ta iterando de novo sobre as colunas

# loss_list_RRMSE = []

# kf = KFold(n_splits=config.all_config[text_file].get('paper_kFold'),
#             shuffle=config.all_config[text_file].get('kfold_random'),
#             random_state=config.all_config[text_file].get('kfold_random_num'))

# for train_index, test_index in kf.split(data):
#     x_train, x_test = data[train_index], data[test_index]
#     y_train, y_test = label[train_index], label[test_index]

#     result_p_test = []

#     for i in range(label.shape[1]):
        
#         predicted_test = None 

#         if text_file in ['sf1', 'sf2']:
#             predicted_test = model_svr(x_train, y_train[:,i], x_test, y_test[:,i],
#                                     config.all_config[text_file].get('svr1_kernel'),
#                                     config.all_config[text_file].get('svr1_gamma'),
#                                     config.all_config[text_file].get('svr1_C'),
#                                     config.all_config[text_file].get('svr1_epsilon'),
#                                     config.all_config[text_file].get('svr1_shrinking'),
#                                     config.all_config[text_file].get('svr1_verbose'),
#                                     config.all_config[text_file].get('svr1_max_iter'))
#         else:
#             predicted_test = model_ann_local(x_train, y_train, x_test)
        
#         result_p_test.append(predicted_test)

#     result_p_test = pd.DataFrame(result_p_test).T.values
#     real_train_mean = y_train.mean(axis=0)    
#     loss_list_RRMSE.append(computer_RRMSE_list(y_test,result_p_test,real_train_mean))


# loss_list_RRMSE_np = np.array(loss_list_RRMSE)         
# print('RRMSE: ',np.mean(loss_list_RRMSE_np))