In [None]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import chi2_contingency
import os

Importing data

In [None]:
import os
# Walk the directory tree rooted at the placeholder path and print the full
# path of every file found, so the available file names can be checked
# before loading anything.
for dirname, _, filenames in os.walk('/main-path...'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
def load_datasets(dataset_names, base_path):
    """Read ``<base_path>/<name>.csv`` for every name and collect the results.

    Parameters
    ----------
    dataset_names : iterable of str
        File names without the ``.csv`` extension.
    base_path : str
        Directory that contains the CSV files.

    Returns
    -------
    dict
        Maps each name to the DataFrame read from its file, or to ``None``
        when the file does not exist (a message is printed in that case).
    """
    loaded = {}
    for dataset in dataset_names:
        csv_path = os.path.join(base_path, dataset + '.csv')
        try:
            frame = pd.read_csv(csv_path)
        except FileNotFoundError:
            # A missing file is reported and recorded as None, not raised
            print(f"File not found: {csv_path}")
            frame = None
        loaded[dataset] = frame
    return loaded

In [None]:
base_path = '/base-path...'

# Names of the CSV files to load (without extension); list several names
# here when multiple datasets have to be merged later
dataset_names = ['datasets-names-list']

# load_datasets returns a {name: DataFrame-or-None} mapping
datasets = load_datasets(dataset_names, base_path)

# Preview the first rows of every dataset that was loaded successfully
for name, df in datasets.items():
    if df is None:
        print(f"Failed to load {name}")
        continue
    print(f"\nFirst few rows of {name}:")
    print(df.head())

Use the code below only if you need to merge multiple datasets.

In [None]:
import pandas as pd

def merge_datasets(datasets, merge_keys=None):
    """Left-join the orders table with customers, then with further tables.

    Parameters
    ----------
    datasets : dict
        Maps dataset name -> DataFrame. Must contain
        'olist_orders_dataset' and 'olist_customers_dataset', which are
        joined on 'customer_id'. Other values may be ``None`` (as stored by
        ``load_datasets`` for a missing file); those entries are skipped.
    merge_keys : dict, optional
        Maps dataset name -> column to join on, e.g.
        ``{'order_items': 'order_id', 'order_payments': 'order_id'}``.
        Tables are merged in the mapping's iteration order. When omitted,
        the template placeholder ``{'name': 'key'}`` is used.

    Returns
    -------
    pandas.DataFrame
        The result of the successive left merges.

    Raises
    ------
    KeyError
        If one of the two base datasets is absent from ``datasets``.
    """
    if merge_keys is None:
        # Template placeholder: replace it or pass merge_keys explicitly
        merge_keys = {'name': 'key'}

    # Start by merging orders with customers
    merged = pd.merge(
        datasets['olist_orders_dataset'],
        datasets['olist_customers_dataset'],
        on='customer_id',
        how='left',
    )

    # Add the other datasets with their merge keys
    for name, key in merge_keys.items():
        extra = datasets.get(name)
        if extra is None:
            # Either not provided or failed to load; pd.merge would raise on None
            continue
        merged = pd.merge(merged, extra, on=key, how='left')

    return merged

import pandas as pd

# Load individual datasets
# e.g. olist_customers_df = pd.read_csv('/kaggle/input/brazilian-ecommerce/olist_customers_dataset.csv')


# Define the datasets dictionary: associate each dataset name with its DataFrame
# e.g. 'olist_customers_dataset': olist_customers_df,
# NOTE: the value below is a template placeholder. merge_datasets looks up
# 'olist_orders_dataset' and 'olist_customers_dataset', so this dictionary
# must be filled with real entries before the call can succeed.
datasets = { 'name':'key'}

# Now, you can call the merge_datasets function with the loaded datasets
merged_df = merge_datasets(datasets)

# Clean and preprocess data

In [None]:
# Count the rows of the merged data that are flagged as duplicates
merged_df.duplicated().sum()

In [None]:
# Percentage of missing values per column:
# null count of each column divided by the number of rows, times 100
merged_df.isnull().sum() / len(merged_df) * 100

In [None]:
# Drop columns with more than 50% missing values: `thresh` is the minimum
# number of non-null values a column must have in order to be kept
merged_df = merged_df.dropna(thresh=len(merged_df) * 0.5, axis=1)

# Then drop every row that still contains a missing value
merged_df = merged_df.dropna()

In [None]:
# Inspect the remaining columns, their dtypes and non-null counts
merged_df.info()

In [None]:
# Clean and preprocess data
def preprocess_data(df, datetime_cols=None, min_non_null_frac=0.5):
    """Clean a raw DataFrame and return the result as a new object.

    The input frame is never modified, so a failure part-way through (for
    example an unknown column name) cannot leave it half-cleaned.

    Steps, in order:
      1. Drop columns whose number of non-null values is below
         ``len(df) * min_non_null_frac``.
      2. Parse every column in ``datetime_cols`` with ``pd.to_datetime``;
         values that cannot be parsed become ``NaT``.
      3. Drop every row that still contains a missing value (this includes
         ``NaT`` produced by step 2).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to clean.
    datetime_cols : str or iterable of str, optional
        Column(s) holding date/time values to convert. ``None`` (default)
        skips the conversion step.
    min_non_null_frac : float, default 0.5
        Minimum fraction of non-null values a column needs to be kept.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy; the original row labels of surviving rows are kept.

    Raises
    ------
    KeyError
        If a name in ``datetime_cols`` is not a column after step 1.
    """
    if isinstance(datetime_cols, str):
        # A bare string would otherwise be iterated character by character
        datetime_cols = [datetime_cols]

    # Drop sparse columns. The explicit copy makes the column assignments
    # below safe and guarantees the caller's frame is left untouched.
    cleaned = df.dropna(thresh=len(df) * min_non_null_frac, axis=1).copy()

    # Convert datetime columns: only if you have columns with date and time values
    for col in datetime_cols or ():
        cleaned[col] = pd.to_datetime(cleaned[col], errors='coerce')

    # Calculate new features
    # e.g. cleaned['time_to_delivery'] = (cleaned['order_delivered_customer_date'] - cleaned['order_approved_at']).dt.days

    # Drop rows with missing values
    cleaned = cleaned.dropna()

    # Create seasonal features from order_purchase_timestamp
    # e.g. cleaned['order_month'] = cleaned['order_purchase_timestamp'].dt.month

    return cleaned

# Run the preprocessing and rebind merged_df to the frame it returns
merged_df = preprocess_data(merged_df)

In [None]:
# Drop unnecessary columns in place
# (the list holds a placeholder: replace it with the real column names)
merged_df.drop(['columns names'], axis=1, inplace=True) 
# Save the cleaned dataset to CSV, without writing the index
merged_df.to_csv('ds_merged_data_clean.csv', index=False)

In [None]:
# Summary statistics of the cleaned dataset
merged_df.describe()

# Correlation between variables

Correlation for numerical values (Pearson)

In [None]:
# Calculate the Pearson correlation matrix on numeric (and boolean) columns only.
# Since pandas 2.0, DataFrame.corr no longer drops non-numeric columns silently
# and raises on them, so they are filtered out explicitly here.
# NOTE(review): `df` is not assigned in this cell; confirm it refers to the
# frame you intend to analyse (e.g. the cleaned merged data).
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr(method='pearson')

# Plot the heatmap using seaborn; the colour scale is fixed to [-1, 1]
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5)

# Show the plot
plt.title('Correlation Matrix (Pearson)')
plt.show()

Correlation for categorical values (Cramer's V)

In [None]:
# Function to calculate Cramér's V
def cramers_v(x, y):
    """Return Cramér's V, a 0..1 measure of association between two categoricals.

    V = sqrt(chi2 / (n * min(r - 1, k - 1))), where chi2 is the Pearson
    chi-square statistic of the r x k contingency table of ``x`` and ``y``
    and n is the total number of observations.

    Parameters
    ----------
    x, y : array-like
        Categorical values of equal length (anything ``pd.crosstab`` accepts).

    Returns
    -------
    float
        Cramér's V, or NaN when it is undefined because either variable has
        fewer than two observed categories (this includes empty input).
    """
    # Create a contingency table
    contingency_table = pd.crosstab(x, y)
    r, k = contingency_table.shape

    min_dim = min(r - 1, k - 1)
    if min_dim <= 0:
        # Degenerate table: the denominator would be zero (or the table is empty)
        return np.nan

    # Perform the Chi-Square test. Yates' continuity correction is disabled:
    # it only applies to 2x2 tables and would bias V downwards there
    # (e.g. V(x, x) would come out below 1 for a binary variable).
    chi2 = chi2_contingency(contingency_table, correction=False)[0]

    # Calculate Cramér's V
    n = contingency_table.sum().sum()
    return np.sqrt(chi2 / (n * min_dim))

# NOTE(review): `data` is not defined in the visible cells; supply the
# dataset to analyse before running this cell.
df = pd.DataFrame(data)

# List of categorical columns (placeholder: replace with the real column names)
categorical_columns = ['list of categorical column']

# Create a zero-filled square matrix, labelled by column name on both axes,
# to store Cramér's V values
n = len(categorical_columns)
cramers_v_matrix = pd.DataFrame(np.zeros((n, n)), index=categorical_columns, columns=categorical_columns)

# Calculate Cramér's V for each ordered pair of variables
# (every column is also paired with itself, filling the diagonal)
for col1 in categorical_columns:
    for col2 in categorical_columns:
        cramers_v_matrix.loc[col1, col2] = cramers_v(df[col1], df[col2])

# Plot the heatmap; the colour scale is fixed to [0, 1]
plt.figure(figsize=(8, 6))
sns.heatmap(cramers_v_matrix, annot=True, cmap='coolwarm', vmin=0, vmax=1, linewidths=0.5)
plt.title("Cramér's V Correlation Matrix")
plt.show()

# Min-Max Data Normalization

In [None]:
def data_scaler(df, columns_to_scale):
    """Min-max normalize the selected columns of a DataFrame.

    Only ``columns_to_scale`` are passed to the scaler, so ``df`` may hold
    other columns (including non-numeric ones) and the output labels always
    match the data that was actually scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    columns_to_scale : list of str
        Names of the numeric columns to normalize.

    Returns
    -------
    pandas.DataFrame
        The scaled values of ``columns_to_scale`` only, carrying the same row
        index as ``df`` so the result can be aligned with the original rows.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    # Initialize the MinMaxScaler
    scaler = MinMaxScaler()

    # Fit and apply the scaling on the requested columns only
    scaled_values = scaler.fit_transform(df[columns_to_scale])

    return pd.DataFrame(scaled_values, columns=columns_to_scale, index=df.index)

In [None]:
# Load DataFrame
# NOTE(review): `data` is not defined in the visible cells; supply the
# dataset to scale before running this cell.
df = pd.DataFrame(data)

# Select columns to scale (placeholder: replace with the real column names)
columns_to_scale = ['list of columns']

# Scale Data
df_scaled = data_scaler(df, columns_to_scale)

# Display the original and the normalized dataset (prints the full frames)
print("Original Data:\n", df)
print("\nNormalized Data:\n", df_scaled)

# Data Preprocessing of Categorical Variables

Extract classification features from the feature sets, and
save these feature sets in a new table file for further
analysis. Many feature labels about date and classification
that have no obvious correlation with the predicted labels
are excluded from the total dataset.

# INITIAL SCREENING

# SECONDARY SCREENING