# Introduction
## Data Analysis Approach
The objective of this analysis is ...


# Data Preparation
## Import CSV files
### Import Libraries

In [None]:
# Install third-party packages into the environment of the running kernel.
# NOTE(review): versions are unpinned, and the other packages this notebook
# imports (pandas, numpy, psycopg2, sqlalchemy, matplotlib) are not installed
# here — confirm they are provisioned elsewhere.
%pip install python-dotenv
%pip install seaborn


import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from dotenv import load_dotenv
from psycopg2 import sql
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL

### Load environment variables from .env

In [None]:
# Load variables from the local .env file. override=True makes values from
# .env take precedence over variables already set in the process environment.
load_dotenv(override=True)

# Database connection settings.
db_host = os.getenv('DB_HOST')
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_port = os.getenv('DB_PORT')

# Additional settings read from the environment.
database_url = os.getenv("DATABASE_URL")
secret_key = os.getenv("SECRET_KEY")
debug_mode = os.getenv("DEBUG")

# Input file locations.
# NOTE(review): the import cells further down read 'resources/...' literals
# rather than these '../resources/...' variables — confirm which location is
# correct and use a single definition.
supply_chain_file_path = "../resources/DataCoSupplyChainDataset.csv"
access_log_file_path = "../resources/tokenized_access_logs.csv"

# Fail fast when a connection setting is absent: os.getenv returns None for
# unset variables, which would otherwise only surface later as a confusing
# connection error. DB_PASSWORD is not enforced because it may legitimately
# be empty depending on the server's authentication method.
required_settings = {
    'DB_HOST': db_host,
    'DB_NAME': db_name,
    'DB_USER': db_user,
    'DB_PORT': db_port,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variables: "
        + ", ".join(missing_settings)
        + ". Check your .env file."
    )

# Report variable names only — never print credential values into notebook
# output, since saved outputs are shared along with the notebook.
print("✓ Environment variables loaded: " + ", ".join(required_settings))

## Create Tables and Import Data Using Python

In [None]:
import psycopg2
import pandas as pd
from psycopg2 import sql

# Connection settings shared by the psycopg2 cells in this notebook.
conn_params = {
    'host':     db_host,
    'database': db_name,
    'user':     db_user,
    'password': db_password,
    'port':     db_port
}

# Pre-initialise the handles so the cleanup in `finally` is safe even when
# psycopg2.connect() itself fails (otherwise `cursor`/`conn` would be unbound
# and the resulting NameError would hide the real connection error).
conn = None
cursor = None

try:
    conn = psycopg2.connect(**conn_params)
    # CREATE DATABASE cannot run inside a transaction block, hence autocommit.
    conn.autocommit = True
    cursor = conn.cursor()
    # NOTE(review): the database name is hard-coded here, while every later
    # cell connects to DB_NAME — confirm the two are meant to be the same.
    cursor.execute("CREATE DATABASE final_project;")
    print("Database created successfully!")

except psycopg2.errors.DuplicateDatabase:
    # Re-running the notebook is expected; an existing database is not an error.
    print("Database already exists")

except Exception as e:
    print(f"Error: {e}")

finally:
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()

### Create Tables from Your CSV Files

In [None]:
# Connect to the project database named by DB_NAME.
conn_params['database'] = os.getenv('DB_NAME')

# Pre-initialise the handles so the error handling below never touches an
# unbound name when psycopg2.connect() itself raises.
conn = None
cursor = None

try:
    conn = psycopg2.connect(**conn_params)
    cursor = conn.cursor()

    # Create the table with explicit column types.
    # NOTE(review): the import cell below calls
    # to_sql('supply_chain_df', ..., if_exists='replace'), which drops and
    # recreates this table, so the types and PRIMARY KEY declared here do not
    # survive the import — confirm whether 'append' was intended.
    create_table_query = """
    CREATE TABLE IF NOT EXISTS supply_chain_df (
        type VARCHAR(50),
        days_for_shipping_real INTEGER,
        days_for_shipment_scheduled INTEGER,
        benefit_per_order NUMERIC(10,2),
        sales_per_customer NUMERIC(10,2),
        delivery_status VARCHAR(50),
        late_delivery_risk INTEGER,
        category_id INTEGER,
        category_name VARCHAR(100),
        customer_city VARCHAR(100),
        customer_country VARCHAR(100),
        customer_email VARCHAR(150),
        customer_fname VARCHAR(100),
        customer_id INTEGER,
        customer_lname VARCHAR(100),
        customer_password VARCHAR(100),
        customer_segment VARCHAR(50),
        customer_state VARCHAR(100),
        customer_street VARCHAR(200),
        customer_zipcode VARCHAR(20),
        department_id INTEGER,
        department_name VARCHAR(100),
        latitude NUMERIC(10,6),
        longitude NUMERIC(10,6),
        market VARCHAR(50),
        order_city VARCHAR(100),
        order_country VARCHAR(100),
        order_customer_id INTEGER,
        order_date DATE,
        order_id INTEGER PRIMARY KEY,
        order_item_cardprod_id INTEGER,
        order_item_discount NUMERIC(10,2),
        order_item_discount_rate NUMERIC(5,4),
        order_item_id INTEGER,
        order_item_product_price NUMERIC(10,2),
        order_item_profit_ratio NUMERIC(5,4),
        order_item_quantity INTEGER,
        sales NUMERIC(10,2),
        order_item_total NUMERIC(10,2),
        order_profit_per_order NUMERIC(10,2),
        order_region VARCHAR(50),
        order_state VARCHAR(100),
        order_status VARCHAR(50),
        order_zipcode VARCHAR(20),
        product_card_id INTEGER,
        product_category_id INTEGER,
        product_description TEXT,
        product_image VARCHAR(200),
        product_name VARCHAR(200),
        product_price NUMERIC(10,2),
        product_status INTEGER,
        shipping_date DATE,
        shipping_mode VARCHAR(50)
    );
    """

    cursor.execute(create_table_query)
    conn.commit()
    print("✓ Table created successfully!")

except Exception as e:
    print(f"Error: {e}")
    # Only roll back when a connection was actually established.
    if conn is not None:
        conn.rollback()

finally:
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()


### Import DataCoSupplyChainDataset


In [None]:

# Build the connection URL from its components instead of string formatting:
# URL.create escapes special characters (e.g. '@', '/', ':') in the user name
# and password, which would corrupt a hand-assembled URL string.
db_url = URL.create(
    drivername="postgresql",
    username=db_user,
    password=db_password,
    host=db_host,
    port=int(db_port) if db_port else None,  # env values are strings
    database=db_name,
)
engine = create_engine(db_url)

# NOTE(review): the config cell defines supply_chain_file_path as
# '../resources/...', but this cell reads 'resources/...' — confirm which
# location is correct.
# NOTE(review): if this read fails with UnicodeDecodeError, the file is likely
# not UTF-8 and needs an explicit `encoding=` argument — verify against the data.
supply_chain_df = pd.read_csv('resources/DataCoSupplyChainDataset.csv')

# Normalise headers to snake_case. regex=False makes the replacements literal
# on every pandas version; '(' and ')' are regex metacharacters, and the
# default of `regex` has changed between pandas releases.
supply_chain_df.columns = (
    supply_chain_df.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# if_exists='replace' drops any existing table of this name and recreates it
# with column types inferred from the DataFrame.
supply_chain_df.to_sql('supply_chain_df', engine, if_exists='replace', index=False)

print(f"✓ Successfully imported {len(supply_chain_df)} rows!")

### Import tokenized_access_logs

In [None]:
# Create the access-log table, reusing the connection settings in conn_params.
# The handles are pre-initialised so the error handling below never touches an
# unbound name when psycopg2.connect() itself raises.
conn = None
cursor = None

try:
    conn = psycopg2.connect(**conn_params)
    cursor = conn.cursor()

    # CREATE TABLE SQL statement.
    # NOTE(review): the next cell calls
    # to_sql('access_log_df', ..., if_exists='replace'), which drops and
    # recreates this table, so the column types declared here do not survive
    # the import — confirm whether 'append' was intended.
    create_table_sql = """
    CREATE TABLE IF NOT EXISTS access_log_df (
        product VARCHAR(200),
        category VARCHAR(100),
        date DATE,
        month VARCHAR(20),
        hour TIME,
        department VARCHAR(100),
        ip VARCHAR(50),
        url TEXT
    );
    """

    cursor.execute(create_table_sql)
    conn.commit()
    print("✓ Table created successfully!")

except Exception as e:
    print(f"Error: {e}")
    # Only roll back when a connection was actually established.
    if conn is not None:
        conn.rollback()
finally:
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()


In [None]:

# NOTE(review): the config cell defines access_log_file_path as
# '../resources/...', but this cell reads 'resources/...' — confirm which
# location is correct.
access_log_df = pd.read_csv('resources/tokenized_access_logs.csv')

# Normalise headers to snake_case. regex=False makes the replacements literal
# on every pandas version; '(' and ')' are regex metacharacters, and the
# default of `regex` has changed between pandas releases.
access_log_df.columns = (
    access_log_df.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Uses the SQLAlchemy `engine` created in the supply-chain import cell.
# if_exists='replace' drops any existing table of this name and recreates it
# with column types inferred from the DataFrame.
access_log_df.to_sql('access_log_df', engine, if_exists='replace', index=False)

print(f"✓ Successfully imported {len(access_log_df)} rows!")

In [None]:
# Read the full supply_chain_df table back from PostgreSQL into a DataFrame.
# The connection is deliberately left open: the next cell reads through `conn`.
# NOTE(review): `cur` is not used in this cell — confirm whether a later cell
# needs it.
conn = psycopg2.connect(**conn_params)
cur = conn.cursor()
query = "SELECT * FROM supply_chain_df;"
supply_chain_df = pd.read_sql_query(query, conn)

# Lower-case every object-dtype (text) column so string values compare
# case-insensitively downstream.
text_columns = supply_chain_df.select_dtypes(include=['object']).columns
for col in text_columns:
    lowered = supply_chain_df[col].str.lower()
    supply_chain_df[col] = lowered

supply_chain_df.head()

In [None]:
# Read the full access_log_df table back into a DataFrame, reusing the `conn`
# opened in the previous cell.
access_log_df = pd.read_sql_query("SELECT * FROM access_log_df", conn)

# Lower-case every object-dtype (text) column.
text_columns = access_log_df.select_dtypes(include=['object']).columns
for col in text_columns:
    lowered = access_log_df[col].str.lower()
    access_log_df[col] = lowered

access_log_df.head()

In [None]:
# Display settings: no limit on rendered rows or columns, and cell text is
# truncated at 200 characters.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)