# Visualizations

Visualize session patterns, types, and revenue distribution.

In [None]:
# 📦 Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline


## Graphs and Charts

In [None]:
#############################
# Importing Essential Libraries for Data Analysis, Visualization, Statistical Modeling, and Machine Learning
#############################

# -----------------------------
# Basic Libraries and Environment Setup
# -----------------------------
import warnings                          # For handling warnings
warnings.filterwarnings("ignore")        # Suppress warnings for cleaner output

import numpy as np                       # Numerical operations and basic array handling
import pandas as pd                      # Data manipulation and analysis
import matplotlib.pyplot as plt          # Basic plotting functions
import seaborn as sns                    # High-level statistical data visualization
from IPython.display import display, HTML  # Display enhancements in IPython notebooks
import ipywidgets as widgets

# -----------------------------
# Data Manipulation and Statistical Analysis
# -----------------------------
# SciPy and Statsmodels for advanced statistical analysis and estimation.
from scipy.stats import gaussian_kde, pearsonr, spearmanr, chi2_contingency  # Statistical functions and tests
import scipy.stats as st                 # Additional statistical functions
from scipy.spatial.distance import cdist, pdist  # Distance calculations for clustering

import statsmodels.api as sm             # Comprehensive statistical analysis
import statsmodels.formula.api as smf      # Formula-based statistical models
from statsmodels.graphics.mosaicplot import mosaic  # Mosaic plots for categorical data visualization
import statsmodels.stats.api as sms        # For hypothesis testing and statistical details
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Multicollinearity diagnostics
from statsmodels.tools.tools import add_constant  # Add intercept term in regression models

# -----------------------------
# Machine Learning and Clustering Libraries (scikit-learn and imbalanced-learn)
# -----------------------------
# Datasets, Preprocessing, and Model Selection
from sklearn import datasets             # Access built-in datasets like Iris
from sklearn.decomposition import PCA    # Principal Component Analysis for dimensionality reduction
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer  # Preprocessing tools: scaling, encoding, custom transformations
from sklearn.impute import SimpleImputer   # Imputation for missing values
from sklearn.model_selection import (
    train_test_split,                    # Split the dataset into training and testing subsets
    GridSearchCV,                        # Exhaustive search over specified parameter values
    RandomizedSearchCV,                  # Randomized search for hyperparameters
    learning_curve                       # Generate learning curves to diagnose model performance
)
from sklearn.compose import ColumnTransformer  # Combine multiple preprocessing steps
from sklearn.pipeline import Pipeline      # Pipeline for sequential data processing and modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

# Clustering and Classification Algorithms
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering  # Common clustering algorithms
from sklearn.neighbors import KNeighborsClassifier  # K-Nearest Neighbors classifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor  # Decision trees for classification and regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor  # Ensemble methods: Random Forests
from sklearn.linear_model import LinearRegression, LogisticRegression  # Linear and logistic regression models

# Evaluation Metrics
from sklearn.metrics import (
    f1_score,                           # F1 score, harmonic mean of precision and recall
    accuracy_score,                     # Accuracy metric
    recall_score,                       # Recall metric
    precision_score,                    # Precision metric
    confusion_matrix,                   # Confusion matrix to summarize model predictions
    roc_auc_score,                      # Area Under the ROC Curve
    classification_report,              # Detailed classification report
    precision_recall_curve,             # Precision-Recall curve data
    roc_curve,                          # ROC curve data
    make_scorer,                        # Create custom scoring functions
    silhouette_score                    # Evaluate clustering quality
)
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score

# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE  # Synthetic Minority Over-sampling Technique for balancing classes

# -----------------------------
# Miscellaneous Utilities and Settings
# -----------------------------
import math  # For mathematical functions like ceiling and floor
from pprint import pprint

# Notebook-wide presentation settings.
# Seaborn: use the 'darkgrid' theme for every plot drawn after this cell.
sns.set_theme(style='darkgrid')

# Pandas: never truncate columns (None = no limit) and print up to 200 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [None]:
# Histogram of the 'Revenue' column of user_revenue_df:
# 400 bins drawn on a 10x6 inch figure.
revenue_hist_options = {'bins': 400, 'figsize': (10, 6)}
user_revenue_df['Revenue'].hist(**revenue_hist_options)

# Figure-level title, placed slightly above the axes area (y=1.02).
plt.suptitle('Distribution of Revenue by User', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Clustering inputs: the 'Revenue' and 'Num_Transactions' columns of user_data.
features = user_data.loc[:, ['Revenue', 'Num_Transactions']]

# Standardize both columns so neither dominates the distance computation.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

# Fit K-Means with 3 clusters (per the original note, this matches the number
# of existing segments) and store each row's cluster label on user_data.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(scaled_features)
user_data['KMeans_Cluster'] = kmeans.labels_

# Scatter plot of the unscaled values, colored by assigned cluster.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=user_data,
    x='Num_Transactions',
    y='Revenue',
    hue='KMeans_Cluster',
    palette='Set2',
)
plt.gca().set(
    title='K-Means Clustering of Users (Based on Revenue and Transactions)',
    xlabel='Number of Transactions',
    ylabel='Revenue ($)',
)
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()

# Number of rows assigned to each cluster, ordered by cluster label.
# Kept as the final expression so the notebook displays it.
user_data['KMeans_Cluster'].value_counts().sort_index()

In [None]:
# Count rows of df per 'Full Name' (assuming each row is one session) and
# expose the count as a 'Num_Sessions' column.
user_freq = (
    df.groupby('Full Name')
    .size()
    .rename('Num_Sessions')
    .reset_index()
)

# Histogram (30 bins) with a KDE overlay of the per-user counts.
plt.figure(figsize=(8, 4))
sns.histplot(data=user_freq, x='Num_Sessions', bins=30, kde=True)
plt.gca().set(
    title='Distribution of Sessions per User',
    xlabel='Number of Sessions',
    ylabel='Number of Users',
)
plt.tight_layout()
plt.show()

In [None]:
# Order rows by user and then by session date, and derive two columns:
#   'Prev Session Date'     - the same user's preceding 'Session Date'
#                             (missing for each user's first row);
#   'Days Between Sessions' - whole days between the two dates.
# NOTE(review): `.dt.days` requires 'Session Date' to be a datetime column -
# confirm it is parsed as such where df is loaded.
df_sorted = df.sort_values(by=['Full Name', 'Session Date']).assign(
    **{
        'Prev Session Date': lambda frame: (
            frame.groupby('Full Name')['Session Date'].shift(1)
        ),
        'Days Between Sessions': lambda frame: (
            (frame['Session Date'] - frame['Prev Session Date']).dt.days
        ),
    }
)

# Histogram (30 bins) with a KDE overlay of the gaps; rows without a previous
# session have no gap and are excluded.
plt.figure(figsize=(8, 4))
sns.histplot(
    data=df_sorted.dropna(subset=['Days Between Sessions']),
    x='Days Between Sessions',
    bins=30,
    kde=True,
)
plt.gca().set(
    title='Days Between Sessions',
    xlabel='Days',
    ylabel='Session Count',
)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise relationship grid for DF_play (seaborn's pairplot draws scatter
# plots off the diagonal and per-variable distributions on it).
sns.pairplot(DF_play)
# Render explicitly, like every other plotting cell in this notebook; this also
# keeps the PairGrid object's repr from being echoed as the cell output.
plt.show()

In [None]:

# The ten most frequent 'Full Name' values in DF_play with their row counts
# (value_counts sorts by count, largest first).
top_users = DF_play['Full Name'].value_counts().iloc[:10]

# Horizontal bar chart: one bar per name, bar length = number of rows.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_users.to_numpy(), y=top_users.index, orient='h')
plt.gca().set(
    title='Top 10 Users by Entry Count',
    xlabel='Number of Entries',
    ylabel='User (Full Name)',
)
plt.tight_layout()
plt.show()

In [None]:
# First value_counts: rows per 'Full Name'. Second value_counts: how many names
# share each row count. The result is ordered by row count.
entry_distribution = (
    DF_play['Full Name']
    .value_counts()
    .value_counts()
    .sort_index()
)

# Bar chart: x = entries per user, y = number of users with that many entries.
plt.figure(figsize=(10, 6))
sns.barplot(x=entry_distribution.index, y=entry_distribution.to_numpy())
plt.gca().set(
    title='Distribution of Entry Counts Across Users',
    xlabel='Number of Entries per User',
    ylabel='Number of Users',
)
plt.tight_layout()
plt.show()