1 - Concat files, create datasets

In [None]:
import glob
import pandas as pd
import ast
import re
import numpy as np

# --- Concatenate the partitioned CSV files into single DataFrames ---

def _concat_csv_files(paths, description):
    """Read every CSV in ``paths`` and stack them into one DataFrame.

    Args:
        paths: list of CSV file paths to read, in the order they should be stacked.
        description: human-readable glob pattern, used only in the error message.

    Returns:
        A single DataFrame with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if ``paths`` is empty (otherwise ``pd.concat`` would
            fail with the much less helpful "No objects to concatenate").
    """
    if not paths:
        raise FileNotFoundError(f"No CSV files found for pattern: {description}")
    return pd.concat([pd.read_csv(path) for path in paths], ignore_index=True)


# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted to make the row order of the concatenated frames reproducible.
train_files = sorted(glob.glob('./training/treino_parte*.csv'))
train_df = _concat_csv_files(train_files, './training/treino_parte*.csv')

item_files = sorted(glob.glob('./itens/itens-parte*.csv'))
item_df = _concat_csv_files(item_files, './itens/itens-parte*.csv')

validation_df = pd.read_csv('./validacao.csv')

# Show the DataFrame dimensions as a quick sanity check
print(f"Train DataFrame shape: {train_df.shape}")
print(f"Item DataFrame shape: {item_df.shape}")
print(f"Validation DataFrame shape: {validation_df.shape}")

# Show the first rows of each DataFrame
print("\nTrain DataFrame Head:")
print(train_df.head())
print("\nItem DataFrame Head:")
print(item_df.head())
print("\nValidation DataFrame Head")
print(validation_df.head())

# --- Count missing values per column before any cleaning ---
print("Missing values in train_df BEFORE cleaning:")
print(train_df.isnull().sum())

print("\nMissing values in item_df BEFORE cleaning:")
print(item_df.isnull().sum())

print("\nMissing values in validation_df BEFORE cleaning:")
print(validation_df.isnull().sum())

Train DataFrame shape: (577942, 10)
Item DataFrame shape: (255603, 7)
Validation DataFrame shape: (112184, 4)

Train DataFrame Head:
                                              userId    userType  historySize  \
0  f98d1132f60d46883ce49583257104d15ce723b3bbda21...  Non-Logged            3   
1  2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...  Non-Logged           60   
2  0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...      Logged          107   
3  c1e8d644329a78ea1f994292db624c57980b2886cfbc2d...  Non-Logged           56   
4  e777d1f31d4d955b63d60acc13df336d3903f52ab8f8f4...  Non-Logged            4   

                                             history  \
0  c8aab885-433d-4e46-8066-479f40ba7fb2, 68d2039c...   
1  3325b5a1-979a-4cb3-82b6-63905c9edbe8, fe856057...   
2  04756569-593e-4133-a95a-83d35d43dbbd, 29b6b142...   
3  1f2b9c2f-a2d2-4192-b009-09065da8ec23, 04756569...   
4  bebdeb3e-1699-43e0-a1b8-989f5a6ab679, f4b484a7...   

                                    timesta

2 - Clean up datasets

In [4]:
# --- Tratando o campo History dos datasets train_df e validation_df --- #

def clean_history_string(history_str):
    """Convert a raw history string (train OR validation) into a list of page ids.

    Line breaks, single quotes and square brackets are removed, then the
    remaining text is split on any run of commas and/or whitespace.

    Args:
        history_str: raw string holding the page ids.

    Returns:
        List of non-empty page-id strings, in their original order.
    """
    # Delete the four unwanted characters in a single pass.
    unwanted_chars = str.maketrans('', '', "\n'[]")
    stripped = history_str.translate(unwanted_chars).strip()
    tokens = (piece.strip() for piece in re.split(r'[,\s]+', stripped))
    # Splitting can leave empty strings at the edges; drop them.
    return [token for token in tokens if token]

# Build cleaned copies of the raw frames (the originals stay untouched), with
# the `history` column converted from a raw string to a list of page ids.
train_df_cleaned = train_df.assign(
    history=train_df['history'].apply(clean_history_string)
)

validation_df_cleaned = validation_df.assign(
    history=validation_df['history'].apply(clean_history_string)
)

# Tratamento das outras colunas do dataset de treinamento, convertendo-as para arrays do NumPy

def parse_list_of_numbers_train(list_str):
    """Parse the string form of a list of numbers into a NumPy array.

    The string is evaluated with ``ast.literal_eval``. A lone int or float is
    wrapped so the result is a one-element array.

    Args:
        list_str: string holding a Python literal (e.g. ``"1, 2, 3"`` or ``"7"``).

    Returns:
        ``np.ndarray`` built from the parsed literal, or an empty array when
        parsing or array construction raises ``ValueError``/``SyntaxError``.
    """
    try:
        literal = ast.literal_eval(list_str)
        # A single number must be wrapped, otherwise np.array would be 0-d.
        values = [literal] if isinstance(literal, (int, float)) else literal
        return np.array(values)
    except (ValueError, SyntaxError):
        return np.array([])

# Training columns whose cells hold a stringified list of numbers.
list_cols = [
    'timestampHistory',
    'numberOfClicksHistory',
    'timeOnPageHistory',
    'scrollPercentageHistory',
    'pageVisitsCountHistory',
]

# Parse every list column, then drop the duplicated timestampHistory_new
# column (ignored when it is not present).
parsed_train_columns = {
    col: train_df_cleaned[col].apply(parse_list_of_numbers_train)
    for col in list_cols
}
train_df_cleaned = (
    train_df_cleaned
    .assign(**parsed_train_columns)
    .drop(columns=['timestampHistory_new'], errors='ignore')
)

# Tratamento das outras colunas do dataset de validação

def parse_list_of_numbers_validation(list_str):
    """Parse a bracketed, whitespace-separated list of integers into a NumPy array.

    Square brackets are removed and the rest is split on any run of whitespace
    (spaces, tabs, line breaks) and/or commas; every token is converted with
    ``int``.

    Args:
        list_str: raw string such as ``"[1660532401657 1660532405000]"``.

    Returns:
        ``np.ndarray`` of the parsed integers. An empty array is returned when
        the input is not a string (e.g. NaN for a missing cell), contains no
        tokens, or contains a token that is not a valid integer.
    """
    # Missing cells arrive as float NaN; they have no .replace(), so handle
    # them explicitly instead of letting an AttributeError escape.
    if not isinstance(list_str, str):
        return np.array([])

    cleaned_str = list_str.replace("[", "").replace("]", "")
    # Splitting on every kind of whitespace covers the single-number case and
    # values separated by tabs or line breaks without a special branch.
    tokens = [token for token in re.split(r'[,\s]+', cleaned_str) if token]
    try:
        return np.array([int(token) for token in tokens])
    except ValueError:
        # At least one token is not an integer: signal failure with an empty array.
        return np.array([])

# The validation set has a single stringified-list column to parse.
list_cols = ['timestampHistory']

parsed_validation_columns = {
    col: validation_df_cleaned[col].apply(parse_list_of_numbers_validation)
    for col in list_cols
}
validation_df_cleaned = validation_df_cleaned.assign(**parsed_validation_columns)

# --- Load and clean the item data ---
# Built from the raw frame without modifying it: the unused text columns are
# dropped and the date columns are parsed (unparseable values become NaT).
item_df_cleaned = (
    item_df
    .drop(columns=['body', 'caption'], errors='ignore')  # not used in this example
    .assign(
        issued=lambda frame: pd.to_datetime(frame['issued'], errors='coerce'),
        modified=lambda frame: pd.to_datetime(frame['modified'], errors='coerce'),
    )
)

def _timestamp_range(frame, label):
    """Return the overall (min, max) of the per-row ``timestampHistory`` arrays.

    Rows whose array is empty are skipped, because ``np.nanmin``/``np.nanmax``
    raise ``ValueError`` on zero-size input; the number of skipped rows is
    printed so parse failures stay visible.

    Args:
        frame: DataFrame with a ``timestampHistory`` column of NumPy arrays.
        label: name of the frame, used only in the warning message.

    Returns:
        Tuple ``(minimum, maximum)``; both are NaN when no row has any value.
    """
    stamps = frame['timestampHistory']
    has_values = stamps.apply(lambda arr: np.size(arr) > 0).astype(bool)
    skipped = int((~has_values).sum())
    if skipped:
        print(f"Warning: {skipped} rows in {label} have an empty timestampHistory and were ignored.")
    usable = stamps[has_values]
    lowest = usable.apply(lambda arr: np.nanmin(arr)).min()
    highest = usable.apply(lambda arr: np.nanmax(arr)).max()
    return lowest, highest


# Check that no history ended up empty (the raw data had no empty cells).
print("Number of empty histories in train_df_cleaned:", train_df_cleaned['history'].apply(lambda x: len(x) == 0).sum())
print("Number of empty histories in validation_df_cleaned:", validation_df_cleaned['history'].apply(lambda x: len(x) == 0).sum())

# Time span covered by each dataset, as raw timestamps
min_train_timestamp, max_train_timestamp = _timestamp_range(train_df_cleaned, 'train_df_cleaned')
print(f"\nTrain Data Timestamp Range: Min = {min_train_timestamp}, Max = {max_train_timestamp}")

min_val_timestamp, max_val_timestamp = _timestamp_range(validation_df_cleaned, 'validation_df_cleaned')
print(f"Validation Data Timestamp Range: Min = {min_val_timestamp}, Max = {max_val_timestamp}")

# Same ranges converted with unit='ms'
print("\nIn Datetime format:")
min_train_timestamp_dt = pd.to_datetime(min_train_timestamp, unit='ms')
max_train_timestamp_dt = pd.to_datetime(max_train_timestamp, unit='ms')
print(f"Train Data Timestamp Range: Min = {min_train_timestamp_dt}, Max = {max_train_timestamp_dt}")

min_val_timestamp_dt = pd.to_datetime(min_val_timestamp, unit='ms')
max_val_timestamp_dt = pd.to_datetime(max_val_timestamp, unit='ms')
print(f"Validation Data Timestamp Range: Min = {min_val_timestamp_dt}, Max = {max_val_timestamp_dt}")

Number of empty histories in train_df_cleaned: 0
Number of empty histories in validation_df_cleaned: 0

Train Data Timestamp Range: Min = 1656644400247, Max = 1660532387472
Validation Data Timestamp Range: Min = 1660532401657, Max = 1660705198494

In Datetime format:
Train Data Timestamp Range: Min = 2022-07-01 03:00:00.247000, Max = 2022-08-15 02:59:47.472000
Validation Data Timestamp Range: Min = 2022-08-15 03:00:01.657000, Max = 2022-08-17 02:59:58.494000


In [None]:
# Save each cleaned dataset as a Parquet file in the working directory.
parquet_targets = (
    (train_df_cleaned, 'train_df_cleaned.parquet'),
    (validation_df_cleaned, 'validation_df_cleaned.parquet'),
    (item_df_cleaned, 'item_df_cleaned.parquet'),
)
for cleaned_frame, parquet_path in parquet_targets:
    cleaned_frame.to_parquet(parquet_path)

print("Cleaned dataframes saved to Parquet files.")

Cleaned dataframes saved to Parquet files.


In [5]:
def _count_empty_arrays(frame):
    """Count, per object-dtype column, the cells that hold a zero-length list/array.

    The parsing helpers in this notebook signal a failed parse with an empty
    array, which ``isnull()`` does not report; this count makes those failures
    visible.

    Args:
        frame: DataFrame to inspect.

    Returns:
        Series indexed by column name with the number of empty list/array cells.
    """
    def is_empty_array(value):
        return isinstance(value, (list, np.ndarray)) and np.size(value) == 0

    counts = {
        col: int(frame[col].map(is_empty_array).sum())
        for col in frame.columns
        if frame[col].dtype == object
    }
    return pd.Series(counts, dtype='int64')


# Show the head of the cleaned datasets to compare with the originals
print("\nTrain DataFrame Head:")
print(train_df_cleaned.head())
print("\nItem DataFrame Head:")
print(item_df_cleaned.head())
print("\nValidation DataFrame Head")
print(validation_df_cleaned.head())

# Check again for missing values after cleaning.

print("Missing values in train_df AFTER cleaning:")
print(train_df_cleaned.isnull().sum())

print("\nMissing values in item_df AFTER cleaning:")
print(item_df_cleaned.isnull().sum())

print("\nMissing values in validation_df AFTER cleaning:")
print(validation_df_cleaned.isnull().sum())

# A failed parse is stored as an empty array, which isnull() cannot see, so
# count empty arrays explicitly: any non-zero value means a parse went wrong.
print("\nEmpty parsed arrays in train_df AFTER cleaning:")
print(_count_empty_arrays(train_df_cleaned))

print("\nEmpty parsed arrays in validation_df AFTER cleaning:")
print(_count_empty_arrays(validation_df_cleaned))


Train DataFrame Head:
                                              userId    userType  historySize  \
0  f98d1132f60d46883ce49583257104d15ce723b3bbda21...  Non-Logged            3   
1  2c1080975e257ed630e26679edbe4d5c850c65f3e09f65...  Non-Logged           60   
2  0adffd7450d3b9840d8c6215f0569ad942e782fb19b805...      Logged          107   
3  c1e8d644329a78ea1f994292db624c57980b2886cfbc2d...  Non-Logged           56   
4  e777d1f31d4d955b63d60acc13df336d3903f52ab8f8f4...  Non-Logged            4   

                                             history  \
0  [c8aab885-433d-4e46-8066-479f40ba7fb2, 68d2039...   
1  [3325b5a1-979a-4cb3-82b6-63905c9edbe8, fe85605...   
2  [04756569-593e-4133-a95a-83d35d43dbbd, 29b6b14...   
3  [1f2b9c2f-a2d2-4192-b009-09065da8ec23, 0475656...   
4  [bebdeb3e-1699-43e0-a1b8-989f5a6ab679, f4b484a...   

                                    timestampHistory  \
0      [1657146417045, 1657146605778, 1657146698738]   
1  [1656684240278, 1656761266729, 1656761