In [1]:
# Basic Libraries
import os
import git

# Data manipulation
import pandas as pd
import numpy as np

# set the display option to show all columns
pd.set_option("display.max_columns", None)

# Machine Learning algorithmics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler


# Weka libraries
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.attribute_selection import ASEvaluation, ASSearch, AttributeSelection

In [2]:
# Função para encontrar diretório do repositório git
def get_git_root(path: str) -> str:
    git_repo = git.Repo(path, search_parent_directories=True)
    git_root = git_repo.git.rev_parse("--show-toplevel")
    return git_root


def encode_categorical(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converte atributos numéricos discretos de um DataFrame

    :param df: DataFrame do Pandas para conversão
    """
    # Instanciar um objeto LabelEncoder
    le = LabelEncoder()

    # Iterar sobre todas as colunas do DataFrame
    for col in df.columns:
        # Verificar se a coluna é do tipo "object", o que geralmente indica
        # uma coluna categórica com strings
        if df[col].dtype == "object":
            # Se for uma coluna categórica, use o LabelEncoder para ajustar
            # e transformar os dados, substituindo os valores originais
            # pelos valores codificados como números inteiros
            df[col] = le.fit_transform(df[col])

    # Retornar o DataFrame com as colunas categóricas codificadas
    return df


def pandas2arff(
    df, filename, wekaname="pandasdata", cleanstringdata=True, cleannan=True
):
    """
    converts the pandas dataframe to a weka compatible file
    df: dataframe in pandas format
    filename: the filename you want the weka compatible file to be in
    wekaname: the name you want to give to the weka dataset (this will be visible to you when you open it in Weka)
    cleanstringdata: clean up data which may have spaces and replace with "_", special characters etc which seem to annoy Weka.
                     To suppress this, set this to False
    cleannan: replaces all nan values with "?" which is Weka's standard for missing values.
              To suppress this, set this to False
    """
    import re

    def cleanstring(s):
        if s != "?":
            return re.sub("[^A-Za-z0-9]+", "_", str(s))
        else:
            return "?"

    dfcopy = df  # all cleaning operations get done on this copy

    if cleannan != False:
        dfcopy = dfcopy.fillna(
            -999999999
        )  # this is so that we can swap this out for "?"
        # this makes sure that certain numerical columns with missing values don't get stuck with "object" type

    # Diretório do repositório git
    git_dir = get_git_root(os.getcwd())

    # Diretório de arquivos de dados
    data_dir = os.path.join(git_dir, "data")

    f = open(os.path.join(data_dir, filename), "w")
    arffList = []
    arffList.append("@relation " + wekaname + "\n")
    # look at each column's dtype. If it's an "object", make it "nominal" under Weka for now (can be changed in source for dates.. etc)
    for i in range(df.shape[1]):
        if dfcopy.dtypes[i] == "O" or (df.columns[i] in ["Class", "CLASS", "class"]):
            if cleannan != False:
                dfcopy.iloc[:, i] = dfcopy.iloc[:, i].replace(
                    to_replace=-999999999, value="?"
                )
            if cleanstringdata != False:
                dfcopy.iloc[:, i] = dfcopy.iloc[:, i].apply(cleanstring)
            _uniqueNominalVals = [str(_i) for _i in np.unique(dfcopy.iloc[:, i])]
            _uniqueNominalVals = ",".join(_uniqueNominalVals)
            _uniqueNominalVals = _uniqueNominalVals.replace("[", "")
            _uniqueNominalVals = _uniqueNominalVals.replace("]", "")
            _uniqueValuesString = "{" + _uniqueNominalVals + "}"
            arffList.append("@attribute " + df.columns[i] + _uniqueValuesString + "\n")
        else:
            arffList.append("@attribute " + df.columns[i] + " real\n")
            # even if it is an integer, let's just deal with it as a real number for now
    arffList.append("@data\n")
    for i in range(dfcopy.shape[0]):  # instances
        _instanceString = ""
        for j in range(df.shape[1]):  # features
            if dfcopy.dtypes[j] == "O":
                _instanceString += '"' + str(dfcopy.iloc[i, j]) + '"'
            else:
                _instanceString += str(dfcopy.iloc[i, j])
            if j != dfcopy.shape[1] - 1:  # if it's not the last feature, add a comma
                _instanceString += ","
        _instanceString += "\n"
        if cleannan != False:
            _instanceString = _instanceString.replace(
                "-999999999.0", "?"
            )  # for numeric missing values
            _instanceString = _instanceString.replace(
                '"?"', "?"
            )  # for categorical missing values
        arffList.append(_instanceString)
    f.writelines(arffList)
    f.close()
    del dfcopy
    return True


def evaluation_to_dataframe(predictions: list) -> pd.DataFrame:
    # Criar lista de dicionários com informações de previsão e distribuição usando list comprehension
    prediction_dict_list = [
        {
            "predicted": nominal_prediction.predicted,
            "actual": nominal_prediction.actual,
            "distribution": nominal_prediction.distribution.tolist(),
            "weight": nominal_prediction.weight,
        }
        for nominal_prediction in predictions
    ]

    # Gerar dataframe pandas a partir da lista de dicionários
    predictions_df = pd.DataFrame(prediction_dict_list)

    # Adicionar coluna que indica se a previsão foi correta ou não
    predictions_df["correct"] = predictions_df["predicted"] == predictions_df["actual"]

    return predictions_df

In [3]:
# Diretório do repositório git
git_dir = get_git_root(os.getcwd())

# Diretório dos dados
data_dir = os.path.join(git_dir, "data")

# Leitura dos arquivos de treinamento e teste
df_train = pd.read_csv(os.path.join(data_dir, "train.csv"))

In [4]:
# carregando os datos de treinamento e validação
train = df_train.copy()

In [5]:
# Lista de colunas que precisam ser preenchidas com "NA"
cols_to_fillna = [
    "PoolQC",
    "MiscFeature",
    "Alley",
    "Fence",
    "MasVnrType",
    "FireplaceQu",
    "GarageType",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "BsmtExposure",
    "BsmtFinType2",
    "BsmtFinType1",
    "BsmtCond",
    "BsmtQual",
]

# Usando uma compreensão de lista para preencher os valores ausentes com "NA" em cada coluna
for col in cols_to_fillna:
    train[col].fillna("NA", inplace=True)

# Preenchendo os valores ausentes no atributo "LotFrontage" com o valor padrão "0"
train["LotFrontage"].fillna(0, inplace=True)

# Preenchendo os valores ausentes no atributo "GarageYrBlt" com o valor padrão "0"
train["GarageYrBlt"].fillna(0, inplace=True)

# Excluindo as instâncias com valores ausentes (conjunto pequeno em relação ao tamanho do conjunto de dados)
cols_to_dropna = ["MasVnrArea", "Electrical"]
train.dropna(subset=cols_to_dropna, inplace=True)

# retirar a coluna id
train.drop(columns=["Id"], inplace=True)

# Chamando a função encode_categorical() para aplicar LabelEncoder em variáveis categóricas
train = encode_categorical(train)

In [6]:
target_variable = "SalePrice"

# Ajustar e transformar os dados de treinamento e teste com MinMaxScaler
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train[[target_variable]])

# Instanciar a classe KBinsDiscretizer com 5 bins (faixas)
discretizer = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile")

# Aplicar o discretizer nos dados 'SalePrice' escalonados
train_preco_discretizado = discretizer.fit_transform(train_scaled)

# Adicionar a nova coluna de categorias ao dataframe original
train["SalePriceCategorical"] = pd.Series(train_preco_discretizado.reshape(-1))

target_variable = "SalePriceCategorical"

# Renomear as categorias
train[target_variable].replace(
    {
        0: "House Price 1",
        1: "House Price 2",
        2: "House Price 3",
        3: "House Price 4",
        4: "House Price 5",
    },
    inplace=True,
)

train.dropna(inplace=True)

In [7]:
# Padronizar as entradas do modelo

# Criar um objeto StandardScaler
scaler = StandardScaler()

columns_to_standardize = [
    "LotArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
    "LotFrontage",
    "MasVnrArea",
]

# Padronizando as colunas selecionadas
train[columns_to_standardize] = scaler.fit_transform(train[columns_to_standardize])

In [8]:
train.drop(columns="SalePrice", inplace=True)

In [9]:
train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePriceCategorical
0,60,3,0.21448,-0.205935,1,1,3,3,0,4,0,5,2,2,0,5,7,5,2003,2003,1,1,12,13,1,0.506248,2,4,2,2,4,4,2,0.576918,6,-0.288208,-0.943434,-0.455349,1,0,1,4,-0.791443,1.160387,-0.121001,0.371899,1,0,2,1,3,1,2,8,6,0,3,1,2003.0,2,2,0.35032,5,5,2,-0.757075,0.220059,-0.359613,-0.117073,-0.272014,-0.069121,3,4,1,-0.085485,2,2008,8,4,House Price 4
1,20,3,0.646515,-0.091124,1,1,3,3,0,2,0,24,1,2,0,2,6,8,1976,1976,1,1,8,8,2,-0.573827,3,4,1,2,4,1,0,1.173631,6,-0.288208,-0.63992,0.47064,1,0,1,4,0.262828,-0.796607,-0.121001,-0.479584,0,1,2,0,3,1,3,6,6,1,5,1,1976.0,2,2,-0.061007,5,5,2,1.643758,-0.701518,-0.359613,-0.117073,-0.272014,-0.069121,3,4,1,-0.085485,5,2007,8,4,House Price 4
2,60,3,0.300887,0.073603,1,1,0,3,0,4,0,5,2,2,0,5,7,5,2001,2002,1,1,12,13,1,0.318888,2,4,2,2,4,2,2,0.094282,6,-0.288208,-0.300165,-0.30938,1,0,1,4,-0.625253,1.187886,-0.121001,0.516347,1,0,2,1,3,1,2,6,6,1,5,1,2001.0,2,2,0.63077,5,5,2,-0.757075,-0.06699,-0.359613,-0.117073,-0.272014,-0.069121,3,4,1,-0.085485,9,2008,8,4,House Price 4
3,70,3,0.070468,-0.096116,1,1,0,3,0,0,0,6,2,2,0,5,7,5,1915,1970,1,1,13,15,2,-0.573827,3,4,0,4,1,4,0,-0.498044,6,-0.288208,-0.060072,-0.683425,1,2,1,4,-0.518787,0.935814,-0.121001,0.385204,1,0,1,0,3,1,2,7,6,1,2,5,1998.0,3,3,0.789692,5,5,2,-0.757075,-0.172745,4.084125,-0.117073,-0.272014,-0.069121,3,4,1,-0.085485,2,2006,8,0,House Price 2
4,60,3,0.761725,0.374106,1,1,0,3,0,2,0,15,2,2,0,5,8,5,2000,2000,1,1,12,13,1,1.354877,2,4,2,2,4,0,2,0.465034,6,-0.288208,-0.173324,0.203791,1,0,1,4,-0.040989,1.616408,-0.121001,1.299408,1,0,2,1,4,1,2,9,6,1,5,1,2000.0,2,3,1.696481,5,5,2,0.78977,0.567538,-0.359613,-0.117073,-0.272014,-0.069121,3,4,1,-0.085485,12,2008,8,4,House Price 5


In [10]:
# Diretório do repositório git
git_dir = get_git_root(os.getcwd())

# Diretório de arquivos de dados
data_dir = os.path.join(git_dir, "data")

# Definir o diretório onde os arquivos estão localizados
dir = data_dir

# Iterar sobre os arquivos no diretório e verificar se a extensão é .arff
arquivos_arff = [arquivo for arquivo in os.listdir(dir) if arquivo.endswith(".arff")]


# Exibir os arquivos encontrados
print(arquivos_arff)


if not arquivos_arff in ["train.arff"]:
    # converter dataframe pandas para arquivo arff
    pandas2arff(train, "train.arff", wekaname="train_data")

[]


In [11]:
# Inicializar JVM
jvm.start()

# Diretório do repositório git
git_dir = get_git_root(os.getcwd())

# Diretório de dados de entrada
data_dir = os.path.join(git_dir, "data")

# Diretório de arquivos de saída
output_dir = os.path.join(git_dir, "output")

# Carregar o conjunto de dados
loader = Loader("weka.core.converters.ArffLoader")
train_data = loader.load_file(os.path.join(data_dir, "train.arff"))

# usado para informar ao Weka que a classe é a última coluna do conjunto de dados
train_data.class_is_last()

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['C:\\Users\\vinic\\repositories\\house-prices-prediction-ppgia\\.env\\Lib\\site-packages\\javabridge\\jars\\rhino-1.7R4.jar', 'C:\\Users\\vinic\\repositories\\house-prices-prediction-ppgia\\.env\\Lib\\site-packages\\javabridge\\jars\\runnablequeue.jar', 'C:\\Users\\vinic\\repositories\\house-prices-prediction-ppgia\\.env\\Lib\\site-packages\\javabridge\\jars\\cpython.jar', 'c:\\Users\\vinic\\repositories\\house-prices-prediction-ppgia\\.env\\lib\\site-packages\\weka\\lib\\python-weka-wrapper.jar', 'c:\\Users\\vinic\\repositories\\house-prices-prediction-ppgia\\.env\\lib\\site-packages\\weka\\lib\\weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


In [12]:
# Configurar o avaliador de atributos ReliefF
evaluator = ASEvaluation(classname="weka.attributeSelection.ReliefFAttributeEval")

# Configurar o método de pesquisa de atributos (Ranker)
search = ASSearch(classname="weka.attributeSelection.Ranker")

# Criar uma instância de AttributeSelection
attribute_selection = AttributeSelection()
attribute_selection.evaluator = evaluator
attribute_selection.search = search
attribute_selection.select_attributes(train_data)

In [13]:
# Exibir os índices dos atributos selecionados
selected_attributes = attribute_selection.selected_attributes
print("Índices dos atributos selecionados:", selected_attributes)

# Exibir os nomes dos atributos selecionados
selected_attribute_names = [
    train_data.attribute(index).name for index in selected_attributes
]
print("Nomes dos atributos selecionados:", selected_attribute_names)

Índices dos atributos selecionados: [ 3 60 79]
Nomes dos atributos selecionados: ['LotArea', 'GarageCars', 'SalePriceCategorical']


In [14]:
jvm.stop()