In [20]:
import json
import pandas as pd

In [None]:
with open("datapin.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

In [22]:
records = json_data[0]["dataset_pl"]

In [23]:
df = pd.DataFrame(records)
df

Unnamed: 0,created_user_id,action,desc,created,data_date_key,session_id,new_value,old_value,pkg_order,duration,result_status
0,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1233875325,75,FAIL
1,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1754763006,18898,FAIL
2,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1830114930,697,FAIL
3,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1165733924,571,FAIL
4,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1558571191,163,FAIL
...,...,...,...,...,...,...,...,...,...,...,...
9067,801070,confirmTmpPickedPackageStatus,t·ª´ <b>ƒê√£ ƒëi·ªÅu ph·ªëi l·∫•y h√†ng/ƒêang l·∫•y h√†ng</b> ...,2025-06-11 17:58:48,20250611,20250611,3,12,1279898806,0,SUCCESS
9068,801070,PickupBill,H√≥a ƒë∆°n nh·∫≠p h√†ng v√†o kho <b>BLH2149097803.T72...,2025-06-11 17:58:48,20250611,20250611,,,1935754352,0,FAIL
9069,801070,PickupBill,H√≥a ƒë∆°n nh·∫≠p h√†ng v√†o kho <b>BLH2149097803.T72...,2025-06-11 17:58:48,20250611,20250611,,,1279898806,0,FAIL
9070,801070,updatePickedByCod,C·∫≠p nh·∫≠t ƒë√£ l·∫•y h√†ng b·ªüi COD,2025-06-11 17:58:48,20250611,20250611,1,,1935754352,0,SUCCESS


In [27]:
"""
Optimized Sequence Analysis Code with Start/End Point Analysis
==============================================================
T·ªëi ∆∞u h√≥a code ph√¢n t√≠ch sequence v·ªõi:
- C·∫£i thi·ªán hi·ªáu su·∫•t v√† memory usage
- C·∫•u tr√∫c code r√µ r√†ng h∆°n
- X·ª≠ l√Ω l·ªói v√† validation t·ªët h∆°n
- S·ª≠ d·ª•ng vectorization v√† parallel processing
- Ph√¢n t√≠ch start point v√† end point c·ªßa sequence
"""

import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter, defaultdict
from itertools import combinations
import json
from typing import Dict, List, Tuple, Any, Optional
from dataclasses import dataclass
from functools import lru_cache
import warnings
warnings.filterwarnings('ignore')

# ================================
# 1. DATA CLASSES & CONFIGURATION
# ================================

@dataclass
class AnalysisConfig:
    """Configuration for sequence analysis"""
    min_pattern_support: float = 0.05
    min_subsequence_length: int = 2
    max_subsequence_length: int = 5
    n_clusters: int = 5
    anomaly_threshold: float = 0.95
    required_fields: List[str] = None
    
    def __post_init__(self):
        if self.required_fields is None:
            self.required_fields = ['created_user_id', 'session_id', 'action', 'desc', 'created', 'pkg_order']

@dataclass
class SequenceMetrics:
    """Container for sequence metrics"""
    session_id: str
    sequence: str
    action_count: int
    unique_action_count: int
    action_diversity: float
    start_point: str = None
    end_point: str = None
    total_time: float = 0.0
    success_rate: float = 0.0
    efficiency_score: float = 0.0
    performance_category: str = 'Unknown'
    actions: List[Dict] = None

@dataclass
class StartEndPointAnalysis:
    """Container for start/end point analysis results"""
    start_points: Dict[str, Dict] = None
    end_points: Dict[str, Dict] = None
    start_end_patterns: Dict[str, Dict] = None
    journey_flows: List[Dict] = None
    
    def __post_init__(self):
        if self.start_points is None:
            self.start_points = {}
        if self.end_points is None:
            self.end_points = {}
        if self.start_end_patterns is None:
            self.start_end_patterns = {}
        if self.journey_flows is None:
            self.journey_flows = []

class SequenceAnalyzer:
    """Main analyzer class with optimized methods"""
    
    def __init__(self, config: Optional[AnalysisConfig] = None):
        self.config = config or AnalysisConfig()
        self.df_clean = None
        self.cycles = []
        self.analysis_results = {}
        self.start_end_analysis = StartEndPointAnalysis()
        
    def validate_and_prepare_data(self, input_data: List[Dict]) -> pd.DataFrame:
        """
        Stage 1: Optimized data validation and preparation
        """
        # Convert to DataFrame efficiently
        #
        # df = pd.DataFrame(input_data)

        # Validate required fields
        missing_fields = [field for field in self.config.required_fields if field not in df.columns]
        if missing_fields:
            raise ValueError(f"Missing required fields: {missing_fields}")
        
        # Remove records with null values in required fields
        total_records = len(df)
        df_clean = df.dropna(subset=self.config.required_fields).copy()
        
        # Optimize data types
        df_clean['timestamp'] = pd.to_datetime(df_clean['created'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
        df_clean['group'] = df_clean['created_user_id'].astype(str) + '_' + df_clean['session_id'].astype(str)
                
        # Store validation metrics
        validation_rate = len(df_clean) / total_records if total_records > 0 else 0
        self.analysis_results['validation'] = {
            'total_records': total_records,
            'valid_records': len(df_clean),
            'validation_rate': f"{validation_rate * 100:.2f}%",
            'null_records_removed': total_records - len(df_clean)
        }
        
        self.df_clean = df_clean
        return df_clean
    
    def create_sessions_and_cycles(self) -> List[SequenceMetrics]:
        """
        Stage 2 & 3: Optimized session grouping and cycle creation with start/end points
        """
        if self.df_clean is None:
            raise ValueError("Data must be validated first")
        
        # Group by session efficiently using groupby
        grouped = self.df_clean.groupby(['session_id'])
        
        cycles = []
        for (session_id), group in grouped:
            # Sort by timestamp
            actions_df = group.sort_values('timestamp')
            actions_list = actions_df.to_dict('records')
            
            # Create action sequence
            action_sequence = actions_df['action'].tolist()
            sequence_string = ' -> '.join(action_sequence)
            
            # Extract start and end points
            start_point = action_sequence[0] if action_sequence else None
            end_point = action_sequence[-1] if action_sequence else None
            
            # Calculate metrics efficiently
            total_time = actions_df.get('duration', pd.Series([0] * len(actions_df))).sum()
            success_count = (actions_df.get('result_status') == 'SUCCESS').sum()
            success_rate = success_count / len(actions_list) if len(actions_list) > 0 else 0
            
            # Create cycle metrics
            unique_actions = len(set(action_sequence))
            action_diversity = unique_actions / len(action_sequence) if len(action_sequence) > 0 else 0
            
            cycle = SequenceMetrics(
                session_id=str(session_id),
                sequence=sequence_string,
                action_count=len(action_sequence),
                unique_action_count=unique_actions,
                action_diversity=action_diversity,
                start_point=start_point,
                end_point=end_point,
                total_time=float(total_time),
                success_rate=success_rate,
                actions=actions_list
            )
            
            cycles.append(cycle)
        
        self.cycles = cycles
        
        # Calculate session statistics
        self._calculate_session_statistics()
        return cycles
    
    def _calculate_session_statistics(self):
        """Calculate and store session-level statistics"""
        if not self.cycles:
            return
        
        cycles_df = pd.DataFrame([
            {
                'action_count': c.action_count,
                'unique_action_count': c.unique_action_count,
                'success_rate': c.success_rate,
                'total_time': c.total_time
            }
            for c in self.cycles
        ])
        
        self.analysis_results['session_stats'] = {
            'session_count': len(self.cycles),
            'avg_actions_per_session': cycles_df['action_count'].mean(),
            'avg_unique_actions_per_session': cycles_df['unique_action_count'].mean(),
            'avg_success_rate': cycles_df['success_rate'].mean(),
            'sequence_diversity': len(set(c.sequence for c in self.cycles)) / len(self.cycles)
        }
    
    def analyze_start_end_points(self) -> StartEndPointAnalysis:
        """
        Comprehensive analysis of start points and end points in sequences
        """
        if not self.cycles:
            raise ValueError("Cycles must be created first")
        
        # Initialize counters
        start_point_counter = Counter()
        end_point_counter = Counter()
        start_end_patterns = Counter()
        journey_flows = []
        
        # Collect metrics for each start/end point
        start_point_metrics = defaultdict(lambda: {'success_rates': [], 'durations': [], 'counts': 0})
        end_point_metrics = defaultdict(lambda: {'success_rates': [], 'durations': [], 'counts': 0})
        
        # Process all cycles
        for cycle in self.cycles:
            if cycle.start_point and cycle.end_point:
                # Count occurrences
                start_point_counter[cycle.start_point] += 1
                end_point_counter[cycle.end_point] += 1
                
                # Count start-end patterns
                pattern = f"{cycle.start_point} ‚Üí {cycle.end_point}"
                start_end_patterns[pattern] += 1
                
                # Collect metrics
                start_point_metrics[cycle.start_point]['success_rates'].append(cycle.success_rate)
                start_point_metrics[cycle.start_point]['durations'].append(cycle.total_time)
                start_point_metrics[cycle.start_point]['counts'] += 1
                
                end_point_metrics[cycle.end_point]['success_rates'].append(cycle.success_rate)
                end_point_metrics[cycle.end_point]['durations'].append(cycle.total_time)
                end_point_metrics[cycle.end_point]['counts'] += 1
                
                # Create journey flow
                journey_flows.append({
                    'session_id': cycle.session_id,
                    'start_point': cycle.start_point,
                    'end_point': cycle.end_point,
                    'journey_length': cycle.action_count,
                    'success_rate': cycle.success_rate,
                    'total_time': cycle.total_time,
                    'sequence': cycle.sequence
                })
        
        total_sequences = len(self.cycles)
        
        # Process start points analysis
        start_points_analysis = {}
        for start_point, count in start_point_counter.items():
            metrics = start_point_metrics[start_point]
            start_points_analysis[start_point] = {
                'frequency': count,
                'percentage': f"{(count / total_sequences) * 100:.2f}%",
                'avg_success_rate': f"{np.mean(metrics['success_rates']) * 100:.2f}%" if metrics['success_rates'] else "0%",
                'avg_duration': round(np.mean(metrics['durations']), 2) if metrics['durations'] else 0,
                'sequences_started': count
            }
        
        # Process end points analysis
        end_points_analysis = {}
        for end_point, count in end_point_counter.items():
            metrics = end_point_metrics[end_point]
            end_points_analysis[end_point] = {
                'frequency': count,
                'percentage': f"{(count / total_sequences) * 100:.2f}%",
                'avg_success_rate': f"{np.mean(metrics['success_rates']) * 100:.2f}%" if metrics['success_rates'] else "0%",
                'avg_duration': round(np.mean(metrics['durations']), 2) if metrics['durations'] else 0,
                'sequences_ended': count
            }
        
        # Process start-end patterns
        start_end_patterns_analysis = {}
        for pattern, count in start_end_patterns.items():
            start_end_patterns_analysis[pattern] = {
                'frequency': count,
                'percentage': f"{(count / total_sequences) * 100:.2f}%",
                'pattern_strength': count / total_sequences
            }
        
        # Sort by frequency
        start_points_sorted = dict(sorted(start_points_analysis.items(), 
                                        key=lambda x: x[1]['frequency'], reverse=True))
        end_points_sorted = dict(sorted(end_points_analysis.items(), 
                                      key=lambda x: x[1]['frequency'], reverse=True))
        patterns_sorted = dict(sorted(start_end_patterns_analysis.items(), 
                                    key=lambda x: x[1]['frequency'], reverse=True))
        
        # Update analysis object
        self.start_end_analysis = StartEndPointAnalysis(
            start_points=start_points_sorted,
            end_points=end_points_sorted,
            start_end_patterns=patterns_sorted,
            journey_flows=journey_flows
        )
        
        # Store in analysis results
        self.analysis_results['start_end_analysis'] = {
            'start_points': start_points_sorted,
            'end_points': end_points_sorted,
            'start_end_patterns': patterns_sorted,
            'journey_summary': {
                'total_unique_start_points': len(start_points_sorted),
                'total_unique_end_points': len(end_points_sorted),
                'total_unique_patterns': len(patterns_sorted),
                'most_common_start': max(start_points_sorted.items(), key=lambda x: x[1]['frequency'])[0] if start_points_sorted else None,
                'most_common_end': max(end_points_sorted.items(), key=lambda x: x[1]['frequency'])[0] if end_points_sorted else None,
                'most_common_pattern': max(patterns_sorted.items(), key=lambda x: x[1]['frequency'])[0] if patterns_sorted else None
            }
        }
        
        return self.start_end_analysis
    
    def analyze_journey_paths(self) -> Dict:
        """
        Analyze complete journey paths from start to end
        """
        if not self.start_end_analysis.journey_flows:
            raise ValueError("Start-end analysis must be performed first")
        
        # Group journeys by start-end pattern
        pattern_groups = defaultdict(list)
        for journey in self.start_end_analysis.journey_flows:
            pattern = f"{journey['start_point']} ‚Üí {journey['end_point']}"
            pattern_groups[pattern].append(journey)
        
        # Analyze each pattern group
        pattern_analysis = {}
        for pattern, journeys in pattern_groups.items():
            # Calculate statistics
            journey_lengths = [j['journey_length'] for j in journeys]
            success_rates = [j['success_rate'] for j in journeys]
            durations = [j['total_time'] for j in journeys]
            
            pattern_analysis[pattern] = {
                'total_journeys': len(journeys),
                'avg_journey_length': round(np.mean(journey_lengths), 2),
                'min_journey_length': min(journey_lengths),
                'max_journey_length': max(journey_lengths),
                'avg_success_rate': f"{np.mean(success_rates) * 100:.2f}%",
                'avg_duration': round(np.mean(durations), 2),
                'efficiency_score': round(np.mean(success_rates) / np.mean(journey_lengths) * 100, 2),
                'sample_sequences': [j['sequence'] for j in journeys[:3]]  # Show top 3 examples
            }
        
        # Sort by efficiency score
        pattern_analysis_sorted = dict(sorted(pattern_analysis.items(), 
                                            key=lambda x: x[1]['efficiency_score'], reverse=True))
        
        self.analysis_results['journey_path_analysis'] = pattern_analysis_sorted
        return pattern_analysis_sorted
    
    def identify_optimal_paths(self) -> Dict:
        """
        Identify optimal paths based on success rate and efficiency
        """
        if 'journey_path_analysis' not in self.analysis_results:
            self.analyze_journey_paths()
        
        journey_analysis = self.analysis_results['journey_path_analysis']
        
        # Find optimal paths
        optimal_paths = {}
        
        # Top performing paths by efficiency
        efficiency_sorted = sorted(journey_analysis.items(), 
                                 key=lambda x: x[1]['efficiency_score'], reverse=True)
        
        # Top performing paths by success rate
        success_sorted = sorted(journey_analysis.items(), 
                              key=lambda x: float(x[1]['avg_success_rate'].rstrip('%')), reverse=True)
        
        # Shortest paths
        length_sorted = sorted(journey_analysis.items(), 
                             key=lambda x: x[1]['avg_journey_length'])
        
        optimal_paths = {
            'most_efficient_paths': dict(efficiency_sorted[:5]),
            'highest_success_paths': dict(success_sorted[:5]),
            'shortest_paths': dict(length_sorted[:5]),
            'recommendations': {
                'best_overall_path': efficiency_sorted[0][0] if efficiency_sorted else None,
                'most_reliable_path': success_sorted[0][0] if success_sorted else None,
                'quickest_path': length_sorted[0][0] if length_sorted else None
            }
        }
        
        self.analysis_results['optimal_paths'] = optimal_paths
        return optimal_paths
    
    def calculate_performance_metrics(self):
        """
        Stage 4: Optimized performance metrics calculation
        """
        if not self.cycles:
            raise ValueError("Cycles must be created first")
        
        # Convert to DataFrame for vectorized operations
        metrics_data = []
        for cycle in self.cycles:
            metrics_data.append({
                'total_time': cycle.total_time,
                'success_rate': cycle.success_rate,
                'action_count': cycle.action_count
            })
        
        df_metrics = pd.DataFrame(metrics_data)
        
        # Calculate percentiles efficiently
        time_percentiles = df_metrics['total_time'].quantile([0.1, 0.25, 0.5, 0.75, 0.9])
        success_percentiles = df_metrics['success_rate'].quantile([0.1, 0.25, 0.5, 0.75, 0.9])
        
        # Calculate efficiency scores and performance categories vectorized
        efficiency_scores = (df_metrics['success_rate'] / np.maximum(df_metrics['total_time'], 1)) * 1000
        
        # Update cycles with performance metrics
        for i, cycle in enumerate(self.cycles):
            cycle.efficiency_score = efficiency_scores.iloc[i]
            
            # Determine performance category
            success_rate = cycle.success_rate
            time_percentile = self._get_percentile_rank(cycle.total_time, df_metrics['total_time'])
            
            if success_rate >= 0.9 and time_percentile <= 30:
                cycle.performance_category = 'Excellent'
            elif success_rate >= 0.7 and time_percentile <= 50:
                cycle.performance_category = 'Good'
            elif success_rate >= 0.5:
                cycle.performance_category = 'Average'
            else:
                cycle.performance_category = 'Poor'
        
        # Store performance results
        self.analysis_results['performance'] = {
            'avg_cycle_time': df_metrics['total_time'].mean(),
            'avg_success_rate': df_metrics['success_rate'].mean(),
            'time_percentiles': time_percentiles.to_dict(),
            'success_percentiles': success_percentiles.to_dict(),
            'performance_distribution': Counter([c.performance_category for c in self.cycles])
        }
    
    @staticmethod
    def _get_percentile_rank(value: float, series: pd.Series) -> float:
        """Calculate percentile rank efficiently"""
        return (series < value).sum() / len(series) * 100
    
    @lru_cache(maxsize=1000)
    def _extract_subsequences_cached(self, sequence: str) -> Tuple[str, ...]:
        """Cached subsequence extraction for better performance"""
        actions = sequence.split(' -> ')
        subsequences = []
        
        for length in range(self.config.min_subsequence_length, 
                          min(len(actions) + 1, self.config.max_subsequence_length + 1)):
            for i in range(len(actions) - length + 1):
                subseq = ' -> '.join(actions[i:i + length])
                subsequences.append(subseq)
        
        return tuple(subsequences)
    
    def find_frequent_patterns(self) -> List[Dict]:
        """
        Optimized frequent pattern mining with caching
        """
        if not self.cycles:
            raise ValueError("Cycles must be created first")
        
        # Use counter for efficient counting
        all_subsequences = []
        for cycle in self.cycles:
            subsequences = self._extract_subsequences_cached(cycle.sequence)
            all_subsequences.extend(subsequences)
        
        # Count patterns efficiently
        pattern_counts = Counter(all_subsequences)
        total_cycles = len(self.cycles)
        
        # Filter and format results
        frequent_patterns = []
        for pattern, count in pattern_counts.items():
            support = count / total_cycles
            if support >= self.config.min_pattern_support:
                frequent_patterns.append({
                    'pattern': pattern,
                    'count': count,
                    'support': support,
                    'frequency_percentage': f"{support * 100:.2f}%"
                })
        
        # Sort by support
        frequent_patterns.sort(key=lambda x: x['support'], reverse=True)
        
        self.analysis_results['frequent_patterns'] = frequent_patterns
        return frequent_patterns
    
    def analyze_transitions(self) -> Tuple[Dict, Dict]:
        """
        Optimized transition analysis
        """
        if not self.cycles:
            raise ValueError("Cycles must be created first")
        
        transition_counts = Counter()
        action_counts = Counter()
        
        # Process all cycles in one pass
        for cycle in self.cycles:
            actions = cycle.sequence.split(' -> ')
            
            # Count actions
            action_counts.update(actions)
            
            # Count transitions
            for i in range(len(actions) - 1):
                transition = f"{actions[i]} -> {actions[i+1]}"
                transition_counts[transition] += 1
        
        # Calculate probabilities efficiently
        transition_probabilities = {}
        for transition, count in transition_counts.items():
            from_action = transition.split(' -> ')[0]
            probability = count / action_counts[from_action]
            transition_probabilities[transition] = {
                'count': count,
                'probability': probability,
                'percentage': f"{probability * 100:.2f}%"
            }
        
        # Sort by count
        sorted_transitions = dict(
            sorted(transition_probabilities.items(), 
                  key=lambda x: x[1]['count'], reverse=True)
        )
        
        self.analysis_results['transitions'] = sorted_transitions
        self.analysis_results['action_frequencies'] = dict(action_counts)
        
        return sorted_transitions, dict(action_counts)
    
    def detect_anomalies(self) -> List[Dict]:
        """
        Optimized anomaly detection including start/end point anomalies
        """
        if not self.cycles:
            raise ValueError("Cycles must be created first")
        
        # Prepare data for vectorized operations  
        metrics_df = pd.DataFrame([
            {
                'action_count': c.action_count,
                'success_rate': c.success_rate,
                'efficiency_score': c.efficiency_score,
                'sequence': c.sequence,
                'start_point': c.start_point,
                'end_point': c.end_point
            }
            for c in self.cycles
        ])
        
        # Calculate percentiles
        length_percentiles = metrics_df['action_count'].quantile([0.05, 0.95])
        success_percentiles = metrics_df['success_rate'].quantile([0.05, 0.95])
        efficiency_percentiles = metrics_df['efficiency_score'].quantile([0.05, 0.95])
        
        # Count sequence and start/end point occurrences
        sequence_counts = metrics_df['sequence'].value_counts()
        start_point_counts = metrics_df['start_point'].value_counts()
        end_point_counts = metrics_df['end_point'].value_counts()
        
        anomalies = []
        for i, cycle in enumerate(self.cycles):
            anomaly_reasons = []
            
            # Check length anomaly
            if (cycle.action_count < length_percentiles[0.05] or 
                cycle.action_count > length_percentiles[0.95]):
                percentile = self._get_percentile_rank(cycle.action_count, metrics_df['action_count'])
                anomaly_reasons.append(f"Unusual length (percentile: {percentile:.1f}%)")
            
            # Check success rate anomaly
            if cycle.success_rate < success_percentiles[0.05]:
                percentile = self._get_percentile_rank(cycle.success_rate, metrics_df['success_rate'])
                anomaly_reasons.append(f"Very low success rate (percentile: {percentile:.1f}%)")
            
            # Check unique sequence
            if sequence_counts[cycle.sequence] == 1 and len(self.cycles) > 10:
                anomaly_reasons.append("Unique sequence pattern")
            
            # Check efficiency
            if cycle.efficiency_score < efficiency_percentiles[0.05]:
                anomaly_reasons.append("Very low efficiency")
            
            # Check rare start/end points
            if cycle.start_point and start_point_counts[cycle.start_point] == 1 and len(self.cycles) > 10:
                anomaly_reasons.append("Rare start point")
            
            if cycle.end_point and end_point_counts[cycle.end_point] == 1 and len(self.cycles) > 10:
                anomaly_reasons.append("Rare end point")
            
            if anomaly_reasons:
                anomalies.append({
                    'cycle_index': i,
                    'session_id': cycle.session_id,
                    'sequence': cycle.sequence,
                    'start_point': cycle.start_point,
                    'end_point': cycle.end_point,
                    'anomaly_reasons': anomaly_reasons,
                    'metrics': {
                        'action_count': cycle.action_count,
                        'success_rate': f"{cycle.success_rate * 100:.2f}%",
                        'efficiency_score': round(cycle.efficiency_score, 2)
                    }
                })
        
        self.analysis_results['anomalies'] = anomalies
        return anomalies
    
    def create_comprehensive_summary(self) -> Dict:
        """
        Create comprehensive analysis summary including start/end point analysis
        """
        if not self.analysis_results:
            raise ValueError("Analysis must be performed first")
        
        summary = {
            'data_quality': self.analysis_results.get('validation', {}),
            'session_overview': self.analysis_results.get('session_stats', {}),
            'performance_metrics': self.analysis_results.get('performance', {}),
            'frequent_patterns': self.analysis_results.get('frequent_patterns', []),          
            'pattern_analysis': {
                'total_patterns_found': len(self.analysis_results.get('frequent_patterns', [])),
                'top_patterns': self.analysis_results.get('frequent_patterns', [])[:5]
            },
            'transition_analysis': {
                'total_transitions': len(self.analysis_results.get('transitions', {})),
                'top_transitions': dict(list(self.analysis_results.get('transitions', {}).items())[:5])
            },
            'start_end_point_analysis': self.analysis_results.get('start_end_analysis', {}),
            'journey_path_analysis': self.analysis_results.get('journey_path_analysis', {}),
            'optimal_paths': self.analysis_results.get('optimal_paths', {}),
            'anomaly_detection': {
                'total_anomalies': len(self.analysis_results.get('anomalies', [])),
                'anomaly_rate': f"{len(self.analysis_results.get('anomalies', [])) / len(self.cycles) * 100:.2f}%" if self.cycles else "0%",
                'sample_anomalies': self.analysis_results.get('anomalies', [])[:3]
            }
        }
        
        return summary
    
    def run_full_analysis(self, input_data: List[Dict]) -> Dict:
        """
        Run complete optimized analysis pipeline with start/end point analysis
        """
        try:
            # Stage 1: Data validation and preparation
            print("Stage 1: Validating and preparing data...")
            self.validate_and_prepare_data(input_data)
            
            # Stage 2-3: Session grouping and cycle creation
            print("Stage 2-3: Creating sessions and cycles...")
            self.create_sessions_and_cycles()
            
            # Stage 4: Performance metrics
            print("Stage 4: Calculating performance metrics...")
            self.calculate_performance_metrics()
            
            # Stage 5: Start/End point analysis
            print("Stage 5: Analyzing start and end points...")
            self.analyze_start_end_points()
            
            # Stage 6: Journey path analysis
            print("Stage 6: Analyzing journey paths...")
            self.analyze_journey_paths()
            
            # Stage 7: Identify optimal paths
            print("Stage 7: Identifying optimal paths...")
            self.identify_optimal_paths()
            
            # Stage 8: Pattern analysis
            print("Stage 8: Analyzing patterns...")
            self.find_frequent_patterns()
            
            # Stage 9: Transition analysis
            print("Stage 9: Analyzing transitions...")
            self.analyze_transitions()
            
            # Stage 10: Anomaly detection
            print("Stage 10: Detecting anomalies...")
            self.detect_anomalies()
            
            # Create final summary
            print("Creating comprehensive summary...")
            summary = self.create_comprehensive_summary()
            
            return {
                'sequence_analysis_summary': summary,
                'cycles_count': len(self.cycles)
            }
            
        except Exception as e:
            return {
                'error': str(e),
                'stage': 'analysis_pipeline'
            }

# ================================
# Main execution function
# ================================

def run_optimized_analysis(input_data: List[Dict], config: Optional[AnalysisConfig] = None) -> Dict:
    """
    Main function to run optimized sequence analysis with start/end point analysis
    
    Args:
        input_data: List of dictionaries containing sequence data
        config: Optional configuration object
    
    Returns:
        Dictionary containing analysis results including start/end point insights
    """
    analyzer = SequenceAnalyzer(config)
    return analyzer.run_full_analysis(input_data)

# ================================
# USAGE 
#input_data = _input.all()[0].json.dataset_pl
input_data = df

# Configure analysis
config = AnalysisConfig(
    min_pattern_support=0.05,
    min_subsequence_length=2,
    max_subsequence_length=6,
    n_clusters=10
)

# Example usage function
def analyze_sequences_with_start_end_points(input_data):
    """
    Example function showing how to use the enhanced sequence analyzer
    """
    try:
        # Run the complete analysis
        results = run_optimized_analysis(input_data, config)
        
        # Extract specific insights
        if 'sequence_analysis_summary' in results:
            summary = results['sequence_analysis_summary']
            
            # Print key insights about start/end points
            print("=== START/END POINT ANALYSIS INSIGHTS ===")
            
            if 'start_end_point_analysis' in summary:
                start_end_analysis = summary['start_end_point_analysis']
                
                # Most common entry points
                print("\nüöÄ TOP ENTRY POINTS:")
                if 'start_points' in start_end_analysis:
                    for i, (start_point, data) in enumerate(list(start_end_analysis['start_points'].items())[:5]):
                        print(f"  {i+1}. {start_point}: {data['percentage']} (Success: {data['avg_success_rate']})")
                
                # Most common exit points
                print("\nüèÅ TOP EXIT POINTS:")
                if 'end_points' in start_end_analysis:
                    for i, (end_point, data) in enumerate(list(start_end_analysis['end_points'].items())[:5]):
                        print(f"  {i+1}. {end_point}: {data['percentage']} (Success: {data['avg_success_rate']})")
                
                # Most common journey patterns
                print("\nüõ§Ô∏è  TOP JOURNEY PATTERNS:")
                if 'start_end_patterns' in start_end_analysis:
                    for i, (pattern, data) in enumerate(list(start_end_analysis['start_end_patterns'].items())[:5]):
                        print(f"  {i+1}. {pattern}: {data['percentage']}")
            
            # Optimal paths recommendations
            if 'optimal_paths' in summary:
                optimal_paths = summary['optimal_paths']
                print("\nüéØ OPTIMAL PATH RECOMMENDATIONS:")
                
                if 'recommendations' in optimal_paths:
                    recs = optimal_paths['recommendations']
                    if recs.get('best_overall_path'):
                        print(f"  ‚Ä¢ Best Overall: {recs['best_overall_path']}")
                    if recs.get('most_reliable_path'):
                        print(f"  ‚Ä¢ Most Reliable: {recs['most_reliable_path']}")
                    if recs.get('quickest_path'):
                        print(f"  ‚Ä¢ Quickest: {recs['quickest_path']}")
        
        return results
        
    except Exception as e:
        print(f"Error in analysis: {str(e)}")
        return {'error': str(e)}

# Run analysis (uncomment when you have actual data)
results = analyze_sequences_with_start_end_points(input_data)

return results

Stage 1: Validating and preparing data...
Stage 2-3: Creating sessions and cycles...
Stage 4: Calculating performance metrics...
Stage 5: Analyzing start and end points...
Stage 6: Analyzing journey paths...
Stage 7: Identifying optimal paths...
Stage 8: Analyzing patterns...
Stage 9: Analyzing transitions...
Stage 10: Detecting anomalies...
Creating comprehensive summary...
=== START/END POINT ANALYSIS INSIGHTS ===

üöÄ TOP ENTRY POINTS:
  1. verifySessionPackage: 50.00% (Success: 56.64%)
  2. distributorCfmDeliver: 25.00% (Success: 50.00%)
  3. updateDeliverWorkshiftByCod: 25.00% (Success: 44.53%)

üèÅ TOP EXIT POINTS:
  1. smartScaleScanningPackage: 75.00% (Success: 54.43%)
  2. verifySessionPackage: 25.00% (Success: 44.53%)

üõ§Ô∏è  TOP JOURNEY PATTERNS:
  1. verifySessionPackage ‚Üí smartScaleScanningPackage: 50.00%
  2. distributorCfmDeliver ‚Üí smartScaleScanningPackage: 25.00%
  3. updateDeliverWorkshiftByCod ‚Üí verifySessionPackage: 25.00%

üéØ OPTIMAL PATH RECOMMENDATION

SyntaxError: 'return' outside function (1928346954.py, line 795)

In [None]:
results

{'sequence_analysis_summary': {'data_quality': {'total_records': 9072,
   'valid_records': 9063,
   'validation_rate': '99.90%',
   'null_records_removed': 9},
  'session_overview': {'session_count': 4,
   'avg_actions_per_session': np.float64(2265.75),
   'avg_unique_actions_per_session': np.float64(41.0),
   'avg_success_rate': np.float64(0.5195345174962163),
   'sequence_diversity': 1.0},
  'performance_metrics': {'avg_cycle_time': np.float64(18428582.0),
   'avg_success_rate': np.float64(0.5195345174962163),
   'time_percentiles': {0.1: 1138692.5000000002,
    0.25: 2819521.25,
    0.5: 16824384.0,
    0.75: 32433444.75,
    0.9: 37001829.9},
   'success_percentiles': {0.1: 0.46173490924440697,
    0.25: 0.48633389615871675,
    0.5: 0.5271940667490729,
    0.75: 0.5603946880865723,
    0.9: 0.5712064863457401},
   'performance_distribution': Counter({'Average': 3, 'Poor': 1})},
  'frequent_patterns': [{'pattern': 'distributorCfmDeliver -> distributorCfmDeliver',
    'count': 2388,

In [None]:
with open("datapin.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

In [30]:
import json

# L·∫•y d·ªØ li·ªáu t·ª´ input
input = _input.all()
input = input[0].json

result = input.get('result')
content = result.get('content')
text = content[0].get('text')

pkg_orders = text
pkg_orders_json = json.loads(pkg_orders)

# Tr√≠ch xu·∫•t pkg_orders
pkg_orders = list(set([str(pkg_order['pkg_order']) for pkg_order in pkg_orders_json]))

return {'pkg_orders': pkg_orders}

NameError: name '_input' is not defined

In [31]:
df

Unnamed: 0,created_user_id,action,desc,created,data_date_key,session_id,new_value,old_value,pkg_order,duration,result_status
0,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1233875325,75,FAIL
1,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1754763006,18898,FAIL
2,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1830114930,697,FAIL
3,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1165733924,571,FAIL
4,800822,distributorCfmDeliver,X√°c nh·∫≠n giao h√†ng cho COD: vudt51 - D∆Ø∆†NG TU·∫§...,2025-06-09 15:52:26,20250609,20250609,,,1558571191,163,FAIL
...,...,...,...,...,...,...,...,...,...,...,...
9067,801070,confirmTmpPickedPackageStatus,t·ª´ <b>ƒê√£ ƒëi·ªÅu ph·ªëi l·∫•y h√†ng/ƒêang l·∫•y h√†ng</b> ...,2025-06-11 17:58:48,20250611,20250611,3,12,1279898806,0,SUCCESS
9068,801070,PickupBill,H√≥a ƒë∆°n nh·∫≠p h√†ng v√†o kho <b>BLH2149097803.T72...,2025-06-11 17:58:48,20250611,20250611,,,1935754352,0,FAIL
9069,801070,PickupBill,H√≥a ƒë∆°n nh·∫≠p h√†ng v√†o kho <b>BLH2149097803.T72...,2025-06-11 17:58:48,20250611,20250611,,,1279898806,0,FAIL
9070,801070,updatePickedByCod,C·∫≠p nh·∫≠t ƒë√£ l·∫•y h√†ng b·ªüi COD,2025-06-11 17:58:48,20250611,20250611,1,,1935754352,0,SUCCESS


1. Logic l·∫•y data t·ªõi t·ª´ng ƒë·ªëi t∆∞·ª£ng
- L·∫•y data action c·ªßa Nh√¢n vi√™n b∆∞u c·ª•c
- L·ªçc data ƒë∆°n h√†ng
- L·∫•y data id theo t·ª´ng ƒë·ªëi t∆∞·ª£ng d·ª±a v√†o ƒë∆°n h√†ng (pkg_order)
- L·∫•y data c·ª• th·ªÉ theo t·ª´ng ƒë·ªëi t∆∞·ª£ng d·ª±a v√†o id 

Ph√¢n t√≠ch theo ƒë·ªëi t∆∞·ª£ng
1. Shop

In [35]:
with open("datapin_shop.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

In [37]:
records = json_data[0]["dataset_shop"]

In [38]:
df_shops = pd.DataFrame(records)
df_shops

Unnamed: 0,shop_id,shop_order,shop_name,shop_type
0,60c8878b-91e8-48dd-9c44-47760a784c68,19617420,nguy·ªÖn l∆∞·ª£ng,1
1,60cc2bd7-0a54-43a3-94de-40b10a6e4c24,19642397,Athena,1
2,60dfdb97-049c-438e-90f7-401d0a6e4d59,19759339,ƒê√†o Th·ªã H√†,0
3,5fbccb28-2288-4667-8364-4a3b0a6e4c25,18297104,Doscom,0
4,5fc3c57b-39ec-4a1e-865a-4a280a784c65,18327588,Ti·ªán √≠ch Xanh - Chi Nh√°nh Linh ƒê√†m 2,1
...,...,...,...,...
922,f75e8fc3-df0a-4b94-bdf9-682ec7b365a2,22700122,X∆∞·ªüng Chuy√™n H√†ng Da,1
923,fab42c05-75d6-47bd-b9e5-50409fb4b7e4,22671998,Apple No1 - 178 Th√°i H√†,0
924,fc41a485-9206-4670-a155-14f260e85587,22907179,Shop H∆∞∆°ng Tr√¢ÃÄn 17,1
925,fe7dcefe-a0a3-49be-814a-92652f756d81,22449901,T·ªïng kho lk VP,1
