# Data Preparation
This notebook was created to prepare the training and test datasets for this product.

## Libraries and imports

In [None]:
import sys
sys.path.append('..')
from scripts.utils import initialize_bucket, plot_importance

from datetime import date

import pandas as pd
import numpy as np
import re
import ast

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold, cross_val_score,cross_validate
from sklearn.metrics import r2_score, mean_squared_error, make_scorer, explained_variance_score,mean_absolute_percentage_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.decomposition import PCA


from sklearn.preprocessing import StandardScaler, Normalizer

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import shap

import pickle
import json


## Setup

## Methods and functions

### Feature engineering

In [None]:
def transform_dataframe(df, setor_encoding):
    """
    Add the encoded sector column and the engineered ``power_*`` features.

    The DataFrame is modified in place and is also returned, so both
    ``transform_dataframe(df, enc)`` and ``df = transform_dataframe(df, enc)``
    work.

    Args:
        df (pandas.DataFrame): Input frame. It must contain the columns
            'Setor', 'dy_mean_std', 'dy_median_min', 'dy_median_max',
            'dy_median_last', 'dy_median_ref_min', 'roe_mean_last',
            'roe_mean_mean', 'roic_mean_last', 'pl_mean_last',
            'pl_mean_ref_mean', 'pa_mean_last', 'pa_mean_ref_mean',
            'mrgliq_mean_last' and 'mrgebit_mean_last'.
        setor_encoding (dict): A dictionary mapping sector names to encoded
            values. It is applied with ``Series.replace``, so sector names
            missing from the dictionary are left as they are.

    Returns:
        pandas.DataFrame: The same object that was passed in as ``df``, with
        'dy_mean_std' adjusted and the new columns added.

    Raises:
        KeyError: If one of the required columns is missing.

    Example:
        >>> sector_encoding = {'Tech': 1, 'Finance': 2, 'Healthcare': 3}
        >>> input_df = transform_dataframe(input_df, sector_encoding)
        # input_df now carries 'encoded_sectors' and the 'power_*' columns.
    """
    # 'dy_mean_std' is used as a denominator further down. Rows where both it
    # and 'dy_median_min' are at most 0.005 get the column maximum instead.
    # A single .loc assignment is used because chained indexing
    # (df[col][mask] = ...) is not guaranteed to write to the original frame.
    near_zero = (df['dy_mean_std'] <= 0.005) & (df['dy_median_min'] <= 0.005)
    df.loc[near_zero, 'dy_mean_std'] = df['dy_mean_std'].max()

    # Encode sectors using the provided setor_encoding dictionary
    df['encoded_sectors'] = df['Setor'].replace(setor_encoding)

    # NOTE(review): 'roe_mean_mean' is multiplied in twice; 'roic_mean_mean'
    # may have been intended. Left unchanged because it alters feature values.
    df['power_valuation'] = (
        df['roe_mean_last'] * df['roic_mean_last'] * df['roe_mean_mean'] * df['roe_mean_mean']
    )

    # Product of the latest 'pl' and 'pa' values relative to their reference means
    df['power_price'] = (
        (df['pl_mean_last'] * df['pa_mean_last'])
        / (df['pl_mean_ref_mean'] * df['pa_mean_ref_mean'])
    )

    # Product of both margins, each scaled down by one million
    df['power_margin'] = (df['mrgliq_mean_last'] / 1_000_000) * (df['mrgebit_mean_last'] / 1_000_000)

    df['power_buy_flag'] = df['pl_mean_last'] * df['dy_median_last'] * df['dy_median_min']

    # Both ratios divide by the adjusted 'dy_mean_std'
    df['power_stabil'] = df['dy_median_min'] / df['dy_mean_std']
    df['power_stabil_delta'] = (df['dy_median_max'] - df['dy_median_min']) / df['dy_mean_std']

    df['power_min'] = df['dy_median_min'] * df['dy_median_ref_min']

    return df

In [None]:
def create_trends(data, roots):
    """
    Add a '<root>_trend' column for every root whose source columns exist.

    For each root the trend is the percentage change of '<root>_last'
    relative to '<root>_mean': ``(last / mean - 1) * 100``. The input frame is
    not modified; a copy carrying the new columns is returned.

    Roots that cannot be processed are skipped, but a warning is emitted for
    each one so that a misspelled root does not go unnoticed.

    Args:
        data (pandas.DataFrame): The input DataFrame containing necessary columns.
        roots (list): Column-name prefixes. Both '<root>_last' and
            '<root>_mean' must exist for a trend to be computed.

    Returns:
        pandas.DataFrame: A copy of ``data`` with the added trend columns.

    Example:
        >>> roots_to_calculate = ['roe_mean', 'roic_mean', 'pl_mean']
        >>> modified_df = create_trends(input_df, roots_to_calculate)
        # modified_df contains 'roe_mean_trend', 'roic_mean_trend', ...
    """
    import warnings

    data_out = data.copy()
    for root in roots:
        last_col = f"{root}_last"
        mean_col = f"{root}_mean"

        missing = [col for col in (last_col, mean_col) if col not in data_out.columns]
        if missing:
            warnings.warn(
                f"create_trends: skipping root {root!r}, missing columns {missing}",
                stacklevel=2,
            )
            continue

        try:
            ratio = np.divide(data_out[last_col], data_out[mean_col])
            trend_values = (ratio - 1) * 100
        except (TypeError, ValueError) as exc:
            # Non-numeric source columns: keep the best-effort behaviour,
            # but report the problem instead of hiding it.
            warnings.warn(
                f"create_trends: skipping root {root!r}, cannot compute trend ({exc})",
                stacklevel=2,
            )
            continue

        data_out[f"{root}_trend"] = trend_values
    return data_out

# Usage of create_trends: adds a '<root>_trend' column to df for each root.
# NOTE(review): 'df' and 'roots' are only defined further down (the "Load Data"
# and "Feature Selection" cells), so this cell raises NameError when the
# notebook is run top to bottom. Run it after those cells, or move it there.
df = create_trends(df, roots)

In [None]:
def transform_dummy(data, features):
    """
    One-hot encode the given categorical columns of ``data`` in place.

    Every listed column is replaced by one indicator column per distinct
    value, named '<feature>_<value>'. The original column is removed.

    Parameters:
        data (DataFrame): Frame holding the categorical columns; it is mutated.
        features (list): Names of the categorical columns to expand.

    Returns:
        DataFrame: The same ``data`` object, with the indicator columns
        appended and the original categorical columns dropped.
    """
    for name in features:
        indicators = pd.get_dummies(data[name])

        # Attach each indicator column under its prefixed name
        for level in indicators.columns:
            target = '_'.join((str(name), str(level)))
            data[target] = indicators[level]

        # The source column is no longer needed once it has been expanded
        data.drop(columns=name, inplace=True)

    return data

In [None]:
def column_name_cleaner(col):
    """
    Round-trip a column name through UTF-8, replacing what cannot be encoded.

    Characters that cannot be encoded as UTF-8 (such as lone surrogates) are
    substituted by the codec's replacement marker; every other character is
    returned unchanged.

    Parameters:
        col (str): The column name to be cleaned.

    Returns:
        str: The cleaned column name.
    """
    encoded = col.encode(encoding='utf-8', errors='replace')
    return encoded.decode(encoding='utf-8')

In [None]:
def create_year_month_column(data: pd.DataFrame, year_column: str, month_column: str) -> pd.DataFrame:
    """
    Add a 'year_month' column built from the string forms of two columns.

    The values of ``year_column`` and ``month_column`` are converted with
    ``astype(str)`` and joined without a separator, so float inputs such as
    2022.0 and 4.0 produce '2022.04.0'. The frame is modified in place.

    Parameters:
        data (pd.DataFrame): The DataFrame containing both source columns.
        year_column (str): Name of the column containing the year values.
        month_column (str): Name of the column containing the month values.

    Returns:
        pd.DataFrame: The same ``data`` object with the new 'year_month' column.

    Raises:
        ValueError: If ``year_column`` or ``month_column`` is not a column of
            ``data``; the message lists only the columns that are missing.
    """
    # Report exactly which of the requested columns are absent
    missing = [col for col in (year_column, month_column) if col not in data.columns]
    if missing:
        raise ValueError(f"Columns not found in the DataFrame: {missing}")

    data['year_month'] = data[year_column].astype(str) + data[month_column].astype(str)

    return data

## Load Data

In [None]:
# Show up to 400 columns when a DataFrame is displayed
pd.options.display.max_columns = 400

# Credentials file handed to initialize_bucket in the cells below
credentials_path = '../datascience-capstone-project-05b1642f45c3.json'

In [None]:
# Obtain the storage client and the 'storage-barsianize' bucket handle
# (initialize_bucket is imported from scripts.utils).
client, bucket = initialize_bucket(credentials_path,'storage-barsianize')

# Load the windowed dataset directly from its gs:// location
path = "gs://storage-barsianize/03_refined/df_windowed_full.parquet"
base_dataset =  pd.read_parquet(path)

# Work on a copy so base_dataset stays as loaded
df = base_dataset.copy()

## Feature Selection

In [None]:
# Columns kept from the loaded dataset (applied via df[features] below).
# 'dy_label' is the column the later correlation ranking is computed against.
# The trailing "add relative ..." notes are carried over from the original
# author's comments.
features = [
                'Empresa', 'Papel',
                'year_last', 'month_last',
                'besst_1', 'besst_2',
                'Setor','Tipo',
                'Lucro_Liquido_12m_category',
                # 'Valor_de_mercado_category',
                # 'Patrim_Liq_category',
                'dy_label',
                'dy_median_last', 'dy_median_max','dy_median_min','dy_spread','dy_mean_std','dy_mean_last', 'dy_median_ref_min',# add relative dy
                'euro_last','euro_mean', # add relative euro
                'evebit_mean_ref_mean','evebit_mean_last','evebit_mean_mean', # add relative evebit
                'c5y_mean_last','c5y_mean_mean', # add relative c5y
                'cotacao_max_ref_max','cotacao_mean_ref_mean','cotacao_mean_ref_std', 'cotacao_mean_last', 'cotacao_mean_mean',# add relative cotacao
                'divbpatr_max_ref_max','divbpatr_mean_last','divbpatr_mean_mean', # add relative divbpatr
                'dolar_comercial_last','dolar_comercial_mean', # add relative dolar_comercial
                'ibovespa_spread','ibovespa_spread_relevance','ibovespa_last','ibovespa_mean',# add relative ibovespa
                'pib_dolarizado_last','pib_dolarizado_mean', # add relative pib_dolarizado
                'preco_do_petroleo_last','preco_do_petroleo_mean', # add relative preco_do_petroleo
                'igpm_last','igpm_mean',# add relative igpm
                'ipca_last','ipca_mean',# add relative ipca
                'selic_last','selic_mean',# add relative selic
                'liq2m_mean_ref_mean','liq2m_mean_last','liq2m_mean_mean', # add relative liq2m
                'liqc_mean_ref_mean','liqc_mean_last','liqc_mean_mean', # add relative liqc
                'mrgebit_mean_ref_mean','mrgebit_mean_last','mrgebit_mean_mean', # add relative mrgebit
                'mrgliq_mean_ref_mean','mrgliq_mean_last','mrgliq_mean_mean', # add relative mrgliq
                'pa_mean_ref_mean','pa_mean_last', 'pa_mean_mean',# add relative pa
                'pl_mean_ref_mean','pl_mean_last', 'pl_mean_mean',# add relative pl
                'pcg_mean_ref_mean','pcg_mean_last', 'pcg_mean_mean',# add relative pcg
                'pebit_mean_ref_mean','pebit_mean_last', 'pebit_mean_mean',# add relative pebit
                'pacl_mean_ref_mean','pacl_mean_last', 'pacl_mean_mean',# add relative pacl
                'psr_mean_ref_mean','psr_mean_last', 'psr_mean_mean',# add relative psr
                'pvp_mean_ref_mean','pvp_mean_last', 'pvp_mean_mean',# add relative pvp
                'roe_mean_ref_mean','roe_mean_last', 'roe_mean_mean',# add relative roe
                'roic_mean_ref_mean','roic_mean_last', 'roic_mean_mean',# add relative roic
                'patrliq_mean_ref_mean','patrliq_mean_last','patrliq_mean_mean', # add relative patrliq
            ]

# Column-name prefixes handed to create_trends, which reads '<root>_last' and
# '<root>_mean' and writes '<root>_trend'. Each entry must therefore be the
# full shared prefix of both columns, e.g. 'pl_mean' for 'pl_mean_last' and
# 'pl_mean_mean'.
# NOTE(review): 'dy_mean' needs a 'dy_mean_mean' column, which is not listed in
# `features`; confirm that it exists in the loaded dataset.
roots = [
            'euro',
            'dy_mean',
            'evebit_mean',
            'c5y_mean',
            'cotacao_mean',
            'divbpatr_mean',
            'dolar_comercial',
            'ibovespa',
            'pib_dolarizado',
            'preco_do_petroleo',
            'igpm',
            'ipca',
            'selic',
            # The four entries below used to lack the '_mean' part, which does
            # not match columns such as 'liq2m_mean_last' / 'liq2m_mean_mean'.
            'liq2m_mean',
            'liqc_mean',
            'mrgebit_mean',
            'mrgliq_mean',
            'pa_mean',
            'pl_mean',
            'pcg_mean',
            'pebit_mean',
            'pacl_mean',
            'psr_mean',
            'pvp_mean',
            'roe_mean',
            # Was 'roic_mean_last', which resolves to 'roic_mean_last_last'.
            'roic_mean',
            'patrliq_mean'
        ]

# Integer code for each value of the 'Setor' column. transform_dataframe
# applies this mapping with Series.replace to build 'encoded_sectors', so a
# sector name that is missing here is left as its original string.
setor_encoding = {
    'Computadores e Equipamentos':1,
    'Tecidos, Vestuário e Calçados':2,
    'Máquinas e Equipamentos':3,
    'Equipamentos':4,
    'Automóveis e Motocicletas':5,
    'Bebidas':6,
    'Alimentos Processados':7,
    'Produtos de Uso Pessoal e de Limpeza':8,
    'Utilidades Domésticas':9,
    'Embalagens':10,
    'Materiais Diversos':11,
    'Medicamentos e Outros Produtos':12,
    'Serv.Méd.Hospit. Análises e Diagnósticos':13,
    'Hoteis e Restaurantes':14,
    'Viagens e Lazer':15,
    'Transporte':16,
    'Material de Transporte':17,
    'Serviços Diversos':18,
    'Programas e Serviços':19,
    'Comércio e Distribuição':20,
    'Comércio':21,
    'Telecomunicações':22,
    'Mídia':23,
    'Holdings Diversificadas':24,
    'Serviços Financeiros Diversos':25,
    'Intermediários Financeiros':26,
    'Previdência e Seguros':27,
    'Exploração de Imóveis':28,
    'Químicos':29,
    'Construção e Engenharia':30,
    'Construção Civil':31,
    'Siderurgia e Metalurgia':32,
    'Energia Elétrica':33,
    'Água e Saneamento':34,
    'Gás':35,
    'Petróleo, Gás e Biocombustíveis':36,
    'Mineração':37,
    'Madeira e Papel':38,
    'Agropecuária':39,
    'Diversos':40,
    'Outros':41,
}

In [None]:
# Take an explicit copy of the selected columns: the helpers below add and
# overwrite columns in place, and doing that on a bare column selection
# triggers pandas' SettingWithCopyWarning.
df = df[features].copy()

df = transform_dataframe(df, setor_encoding)

df.columns = [column_name_cleaner(col) for col in df.columns]

df = create_year_month_column(df, 'year_last', 'month_last')

df = transform_dummy(df, ['Tipo'])

# Show which columns are still object-typed and which are boolean
object_cols = df.columns[df.dtypes == 'object']
print(object_cols)
bool_cols = df.columns[df.dtypes == 'bool']
print(bool_cols)

# Drop float64 columns whose maximum exceeds 1 trillion (1e12)
float_cols = df.columns[df.dtypes == 'float64']
float_max = df[float_cols].max()
to_drop = float_max[float_max > 1_000_000_000_000].index
print(df[to_drop].max())
df = df.drop(to_drop, axis=1)

# Cast the boolean columns to float64
df[bool_cols] = df[bool_cols].astype('float64')

In [None]:
# Share of missing values per column, showing the 20 columns with the most
# gaps first. Nothing is imputed in this cell.
df.isna().mean().sort_values(ascending=False).head(20)

In [None]:
# Impute 0 for every missing value, in place and across all columns
df.fillna(0, inplace=True)

In [None]:
# Correlate numeric columns only. df still holds string columns (for example
# 'Empresa', 'Papel', 'year_month'), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
numeric_df = df.select_dtypes(include='number')
cor = numeric_df.corr()['dy_label']

# Rank columns by absolute correlation with 'dy_label'. 'dy_label' itself
# normally ranks first (self-correlation of 1), so it is part of training_cols.
abs_cor = np.abs(cor).sort_values(ascending=False)
training_cols = abs_cor.iloc[0:150].index

# Plot positions 1-19, i.e. skip the top entry
plt.figure(figsize=(17,7))
abs_cor.iloc[1:20].plot(kind='bar')

In [None]:
# Identifier columns kept alongside the correlation-ranked columns
id_cols = ['Empresa', 'Papel', 'year_month']
df_to_train = df[id_cols + list(training_cols)].copy()

# Rows whose 'year_month' equals '2022.04.0' form the prediction set; every
# other row stays in df.
in_pred_period = df_to_train['year_month'] == '2022.04.0'
df_to_pred = df_to_train[in_pred_period].copy()
df = df_to_train[~in_pred_period]

## Store Data

In [None]:
client, bucket = initialize_bucket(credentials_path,'storage-barsianize')

path_local = '../data/04_datasets/'
blob_name = '05_datasets/'

# Each dataset is written to its own parquet file and then uploaded.
# The second file used to be written from `df` as well, so 'df_to_pred.parquet'
# contained the training rows instead of the prediction rows.
datasets_to_store = (
    (df, 'df_base_dataset.parquet'),
    (df_to_pred, 'df_to_pred.parquet'),
)

for frame, filename in datasets_to_store:
    # save the DataFrame as a parquet file
    frame.to_parquet(path_local + filename)

    # upload the parquet file to Google Cloud Storage in 8 MiB chunks
    blob = bucket.blob(blob_name + filename)
    blob.chunk_size = 8 * 1024 * 1024
    blob.upload_from_filename(path_local + filename, num_retries=10)