In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [None]:
# Read connection settings from the .env file / process environment so that no
# credentials are hardcoded in the notebook.
load_dotenv()
DB_CONFIG = {
    'dbname': os.getenv('DB_NAME'),
    'user': os.getenv('DB_USER'),
    'password': os.getenv('DB_PASSWORD'),
    'host': os.getenv('DB_HOST'),
    'port': os.getenv('DB_PORT')
}

# Fail early with a readable message: an unset variable would otherwise be
# passed along as None and only surface later as an obscure connection error.
required_env_vars = {'dbname': 'DB_NAME', 'user': 'DB_USER', 'host': 'DB_HOST'}
missing_env_vars = [
    env_name for key, env_name in required_env_vars.items() if not DB_CONFIG[key]
]
if missing_env_vars:
    raise RuntimeError(
        "Missing database settings in environment/.env: "
        + ", ".join(missing_env_vars)
    )

# Build the URL from its components instead of string interpolation so that
# special characters in the user name or password (e.g. '@', ':', '/') are
# escaped correctly. An unset DB_PORT falls back to the driver's default port.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=DB_CONFIG['user'],
        password=DB_CONFIG['password'],
        host=DB_CONFIG['host'],
        port=int(DB_CONFIG['port']) if DB_CONFIG['port'] else None,
        database=DB_CONFIG['dbname'],
    )
)

In [None]:
# Pull both feature tables into DataFrames through the shared SQLAlchemy engine.
user_features = pd.read_sql(sql="SELECT * FROM user_features", con=engine)
item_features = pd.read_sql(sql="SELECT * FROM item_features", con=engine)

# Report the row count of each table.
print(f"Users: {len(user_features):,}")
print(f"Items: {len(item_features):,}")

# Preview the first rows of each frame (users first, then items).
display(user_features.head(), item_features.head())

In [None]:
# USER FEATURE EXPLORATION

In [None]:
# Bar chart of the number of users in each segment.
segment_counts = user_features['user_segment'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
segment_counts.plot(kind='bar', ax=ax)
ax.set_title('User Segment Distribution')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Per-segment averages of the activity counters (events, views, transactions).
# Note: despite the variable name, no conversion rate is computed here.
activity_columns = ['total_events', 'total_views', 'total_transactions']
segment_conversion = user_features.groupby('user_segment')[activity_columns].mean()

print("Average Behavior by Segment:")
print(segment_conversion)

In [None]:
# ITEM FEATURE EXPLORATION

In [None]:
# Side-by-side histograms of two item-level columns:
# conversion rate on the left, popularity score on the right.
fig, (conversion_ax, popularity_ax) = plt.subplots(1, 2, figsize=(12, 6))

conversion_ax.hist(item_features['conversion_rate'], bins=50, edgecolor='black')
conversion_ax.set_title('Item Conversion Rate Distribution')
conversion_ax.set_xlabel('Conversion Rate')
conversion_ax.set_ylabel('Count')

popularity_ax.hist(item_features['popularity_score'], bins=50, edgecolor='black')
popularity_ax.set_title('Popularity Score Distribution')
popularity_ax.set_xlabel('Popularity Score')
popularity_ax.set_ylabel('Count')

# Keep the titles and axis labels of the two panels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
# Ten items with the highest view counts.
print("Top 10 by views:")
view_columns = ['itemid', 'total_views', 'total_transactions', 'conversion_rate']
most_viewed = item_features.nlargest(10, 'total_views')
display(most_viewed[view_columns].reset_index(drop=True))

# Ten best-converting items, restricted to items with at least 100 views.
print("\nTop 10 by conversion (min 100 views):")
conversion_columns = ['itemid', 'total_views', 'conversion_rate']
has_enough_views = item_features['total_views'] >= 100
best_converting = item_features[has_enough_views].nlargest(10, 'conversion_rate')
display(best_converting[conversion_columns].reset_index(drop=True))

In [None]:
# FEATURE CORRELATIONS

In [None]:
# Annotated heatmap of pairwise correlations between the user activity counters.
user_count_columns = ['total_events', 'total_views', 'total_addtocarts', 'total_transactions']
user_numeric = user_features[user_count_columns]

fig, ax = plt.subplots()
sns.heatmap(user_numeric.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('User Feature Correlations')
plt.show()

In [None]:
# Annotated heatmap of pairwise correlations between the item counters and
# the item conversion rate.
item_metric_columns = ['total_views', 'total_addtocarts', 'total_transactions', 'conversion_rate']
item_numeric = item_features[item_metric_columns]

fig, ax = plt.subplots()
sns.heatmap(item_numeric.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Item Feature Correlations')
plt.show()