In [1]:
def delete_vars(df, thresh, exception):
    """
    Drop the columns of a Pandas DataFrame whose share of zeros exceeds `thresh`.

    Only the value 0 is counted; other repeated values do not trigger deletion.

    Parameters:
    df (Pandas DataFrame): The input DataFrame. It is not modified.
    thresh (float): Fraction between 0 and 1 (not 0-100). A column is dropped
        when its fraction of zero-valued cells is strictly greater than this.
    exception (str, list, set or None): Column label, or collection of labels,
        that must never be dropped. Pass None to protect nothing.

    Returns:
    Pandas DataFrame: A new DataFrame without the dropped columns.
    """
    # Normalise `exception` into a set so one label and many labels are
    # handled by the same membership test below.
    if exception is None:
        protected = set()
    elif isinstance(exception, (list, set)):
        protected = set(exception)
    else:
        protected = {exception}

    # Mean of a boolean mask == fraction of True cells per column
    zero_fraction = (df == 0).mean()

    cols_to_delete = [
        col
        for col in zero_fraction.index[zero_fraction > thresh]
        if col not in protected
    ]

    return df.drop(columns=cols_to_delete)

In [2]:
def replace_outliers_with_nans(dataframe, excluded_columns=None):
    """
    Replace IQR outliers with NaN in the numeric columns of a DataFrame.

    A value is an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR of its own column. Columns are skipped when they are
    listed in `excluded_columns`, are not numeric, or contain only 0/1
    values (missing values are ignored for that check).

    Parameters:
    dataframe (Pandas DataFrame): The DataFrame to clean. It is modified in place.
    excluded_columns (list or None): Column labels to leave untouched.

    Returns:
    Pandas DataFrame: The same `dataframe` object, after modification.
    """
    if excluded_columns is None:
        excluded_columns = []

    # Quantiles are undefined for strings/objects, so restrict to numeric dtypes
    numeric_columns = dataframe.select_dtypes(include=[np.number]).columns

    for column in numeric_columns:
        if column in excluded_columns:
            continue

        values = dataframe[column]

        # Binary indicator columns are not candidates for outlier removal.
        # NaNs are dropped first so a 0/1 column with gaps is still recognised.
        if set(values.dropna().unique()) <= {0, 1}:
            continue

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        is_outlier = (values < lower_bound) | (values > upper_bound)
        dataframe[column] = np.where(is_outlier, np.nan, values)

    return dataframe


In [3]:
def get_nan_percentage(df):
    """
    Return the overall percentage of missing cells in a pandas DataFrame.

    The result is a single number for the whole frame (missing cells divided
    by total cells, times 100), not a per-column breakdown.

    Parameters:
    df (Pandas DataFrame): The input DataFrame.

    Returns:
    float: Percentage (0-100) of cells that are NaN/None. 0.0 when the
        DataFrame has no cells at all.
    """
    total_cells = df.size
    # An empty frame has nothing missing; also avoids dividing by zero
    if total_cells == 0:
        return 0.0

    nan_cells = df.isnull().sum().sum()
    return (nan_cells / total_cells) * 100
    

In [4]:
def get_dummy_columns(df):
    """
    One-hot encode every object-dtype column of a DataFrame.

    Each object column is replaced by its dummy columns (prefixed with the
    original column name, first level dropped), appended at the end of the
    frame. The input DataFrame is not modified.

    Parameters:
    df (Pandas DataFrame): The input DataFrame.

    Returns:
    Pandas DataFrame: The DataFrame with object columns replaced by dummies.
    """
    for name in df.select_dtypes(include=['object']).columns:
        encoded = pd.get_dummies(df[name], prefix=name, drop_first=True)
        remaining = df.drop(name, axis=1)
        df = pd.concat([remaining, encoded], axis=1)
    return df

In [5]:
def porcentaje_valores_nulos(dataframe):
    """
    Compute the percentage of null values in each column of a DataFrame.

    Parameters:
    dataframe (Pandas DataFrame): The input DataFrame.

    Returns:
    dict: Maps each column label to the percentage (0-100) of its rows
        that are null.
    """
    # Every column shares the same denominator: the number of rows
    total_filas = len(dataframe)

    return {
        nombre: (dataframe[nombre].isna().sum() / total_filas) * 100
        for nombre in dataframe.columns
    }


In [6]:
def eliminar_columnas_constantes(dataframe):
    """
    Drop the columns that hold exactly one distinct value.

    Distinct values are counted with `nunique()`, so a column qualifies only
    when that count is exactly 1.

    Parameters:
    dataframe (Pandas DataFrame): The input DataFrame. It is not modified.

    Returns:
    Pandas DataFrame: A new DataFrame without the constant columns.
    """
    constantes = [
        nombre
        for nombre in dataframe.columns
        if dataframe[nombre].nunique() == 1
    ]
    return dataframe.drop(columns=constantes)

In [7]:
def transformar_columnas_numericas(dataframe, threshold=0.1):
    """
    Turn zero-heavy columns into indicator (0/1) columns.

    For every column whose fraction of zeros is at least `threshold`:
    - if it contains negative values, it is replaced by three indicator
      columns `<col>_neg`, `<col>_cero` and `<col>_pos`, appended at the end;
    - otherwise it is overwritten in place with a single indicator that is
      1 where the value is greater than 0 and 0 elsewhere.
    Columns below the threshold are left unchanged.

    Parameters:
    dataframe (Pandas DataFrame): The input DataFrame. It is not modified.
    threshold (float): Minimum fraction (0-1) of zeros for a column to be
        transformed.

    Returns:
    Pandas DataFrame: A new DataFrame with the transformed columns.
    """
    # Work on a copy so the caller's DataFrame is not altered as a side effect
    resultado = dataframe.copy()
    columnas_transformadas = []

    # Iterate over the original labels; the indicator columns added below
    # must not be visited themselves.
    for columna in dataframe.columns:
        valores = resultado[columna]
        zero_count = (valores == 0).sum()
        total_count = len(valores)
        zero_percentage = zero_count / total_count

        if zero_percentage >= threshold:
            if valores.lt(0).any():  # Negative values present: three-way split
                resultado[f'{columna}_neg'] = (valores < 0).astype(int)
                resultado[f'{columna}_cero'] = (valores == 0).astype(int)
                resultado[f'{columna}_pos'] = (valores > 0).astype(int)
                columnas_transformadas.append(columna)
            else:  # Only zeros and positives: a single "is positive" flag
                resultado[columna] = (valores > 0).astype(int)

    # Remove the original columns that were expanded into three indicators
    return resultado.drop(columnas_transformadas, axis=1)

In [8]:
def eliminar_variables_correlacionadas(dataframe, threshold=0.95):
    """
    Drop columns that are highly correlated with an earlier column.

    The absolute correlation matrix is computed over the numeric and boolean
    columns. For every pair whose absolute correlation is strictly greater
    than `threshold`, the column that appears later in the frame is dropped.
    Other columns are never dropped.

    Parameters:
    dataframe (Pandas DataFrame): The input DataFrame. It is not modified.
    threshold (float): Absolute correlation above which the later column of
        a pair is removed.

    Returns:
    Pandas DataFrame: A new DataFrame without the redundant columns.
    """
    # Correlation is only defined for numeric data; selecting explicitly keeps
    # this working on frames that also hold text columns.
    numericas = dataframe.select_dtypes(include=["number", "bool"])
    corr_matrix = numericas.corr().abs()

    # Keep the strict upper triangle so each pair is inspected once and the
    # diagonal (always 1) is ignored. The builtin `bool` is used because the
    # `np.bool` alias no longer exists in current NumPy releases.
    mascara_superior = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(mascara_superior)

    columnas_eliminar = [
        columna
        for columna in upper_tri.columns
        if (upper_tri[columna] > threshold).any()
    ]

    return dataframe.drop(columnas_eliminar, axis=1)

In [9]:
def graficar_histogramas(dataframe):
    """
    Plot a histogram with a KDE curve for every numeric column.

    The plots are laid out on a grid four columns wide, one subplot per
    numeric column, and the figure is shown with `plt.show()`. Nothing is
    drawn when the DataFrame has no numeric columns.

    Parameters:
    dataframe (Pandas DataFrame): The input DataFrame.

    Returns:
    None
    """
    # Keep only the numeric columns
    columnas_numericas = dataframe.select_dtypes(include=[np.number]).columns

    # Without numeric columns the grid would have zero rows, which
    # `plt.subplots` rejects, so there is simply nothing to plot.
    if columnas_numericas.empty:
        return

    # Number of grid rows needed at four plots per row (ceiling division)
    num_filas = (len(columnas_numericas) + 3) // 4

    # Create the figure with a (num_filas x 4) grid of subplots
    fig, axes = plt.subplots(num_filas, 4, figsize=(20, 5 * num_filas))
    axes = axes.flatten()

    # Draw one histogram per numeric column
    for i, columna in enumerate(columnas_numericas):
        sns.histplot(data=dataframe, x=columna, kde=True, ax=axes[i])
        axes[i].set_title(columna)

    # Remove the unused subplots at the end of the grid
    for i in range(len(columnas_numericas), len(axes)):
        fig.delaxes(axes[i])

    # Display the figure
    plt.show()

In [11]:
def eliminar_columnas_quasiconstantes(dataframe, threshold=0.98):
    """
    Drop the columns dominated by a single value.

    A column is dropped when its most frequent value accounts for at least
    `threshold` of its non-null entries. Columns with no non-null entries
    have no dominant value and are kept.

    Parameters:
    dataframe (Pandas DataFrame): The input DataFrame. It is not modified.
    threshold (float): Minimum relative frequency (0-1) of the most common
        value for a column to be dropped.

    Returns:
    Pandas DataFrame: A new DataFrame without the quasi-constant columns.
    """
    columnas_quasiconstantes = []

    for columna in dataframe.columns:
        frecuencias = dataframe[columna].value_counts(normalize=True)

        # `value_counts` ignores nulls, so an all-null column yields an empty
        # result; indexing it would raise, so skip the column instead.
        if frecuencias.empty:
            continue

        # Results are sorted by frequency, so the first entry is the dominant one
        if frecuencias.iloc[0] >= threshold:
            columnas_quasiconstantes.append(columna)

    return dataframe.drop(columnas_quasiconstantes, axis=1)

