In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from datetime import datetime, timedelta
import networkx as nx
from typing import Dict, List, Tuple
import warnings
import os # Import the os module for path manipulation

warnings.filterwarnings('ignore')


In [14]:
class IndovestDKGAnalyzer:
    """Comprehensive analysis of IndovestDKG dataset characteristics.

    The analyzer reads a CSV of temporal knowledge-graph facts with the columns
    ``subject``, ``subject_type``, ``relation``, ``object``, ``object_type`` and
    ``date`` and exposes coverage, sparsity, temporal, structural and
    relation-pattern statistics over it.

    Attributes:
        df: The loaded facts; an empty DataFrame when loading failed.
        all_entities: Set of every value seen in ``subject`` or ``object``.
        all_relations: Set of every value seen in ``relation``.
        entity_types: Maps each entity to the type of its last occurrence.
    """

    # Columns coerced to ``str`` on load so mixed/NaN cells cannot break sorting.
    # NOTE: ``astype(str)`` turns missing cells into the literal string 'nan',
    # which is then treated like any other entity/relation name.
    _TEXT_COLUMNS = ('subject', 'object', 'relation', 'subject_type', 'object_type')

    # Structure metrics compared against benchmarks when the benchmark has them.
    _STRUCTURE_METRICS = (
        'graph_density',
        'average_degree',
        'clustering_coefficient',
        'num_connected_components',
        'largest_component_ratio',
    )

    # Recommendations per hypothesis category, in the order they are reported.
    _RECOMMENDATIONS = (
        ('Coverage', (
            "Consider entity consolidation or filtering to reduce single-occurrence entities",
            "Implement entity linking to merge similar entities",
        )),
        ('Sparsity', (
            "Add more data sources to increase fact density",
            "Consider data augmentation techniques for knowledge graphs",
            "Use pre-trained embeddings to handle sparse entities",
        )),
        ('Temporal', (
            "Implement temporal smoothing or interpolation",
            "Consider sliding window approaches for temporal modeling",
        )),
        ('Structure', (
            "Focus on largest connected component for initial experiments",
            "Consider graph construction preprocessing to improve connectivity",
        )),
    )

    def __init__(self, csv_path: str):
        """
        Initializes the analyzer by loading the CSV data and converting the 'date' column.

        Loading errors are reported on stdout instead of being raised; in that
        case the analyzer is left in a consistent empty state.

        Args:
            csv_path (str): The full or relative path to the IndovestDKG CSV file.
        """
        # Define every attribute up front so a failed load never leaves the
        # instance half-initialised.
        self._reset_state()
        try:
            df = pd.read_csv(csv_path)
            for column in self._TEXT_COLUMNS:
                df[column] = df[column].astype(str)
            df['date'] = pd.to_datetime(df['date'])
            self.df = df
            self.prepare_analysis()
            print(f"Successfully loaded data from: {csv_path}")
        except FileNotFoundError:
            print(f"Error: The file was not found at {csv_path}. Please check the path.")
            self._reset_state()
        except Exception as e:
            # Best-effort loader: any other failure also yields an empty analyzer.
            print(f"An error occurred during initialization: {e}")
            self._reset_state()

    def _reset_state(self) -> None:
        """Puts the analyzer into the empty state used when no data is loaded."""
        self.df = pd.DataFrame()
        self.all_entities = set()
        self.all_relations = set()
        self.entity_types = {}

    def prepare_analysis(self):
        """
        Prepares data for analysis by creating sets of all entities and relations,
        and mapping entities to their types.
        """
        if self.df.empty:
            print("DataFrame is empty. Skipping analysis preparation.")
            self.all_entities = set()
            self.all_relations = set()
            self.entity_types = {}
            return

        subjects = self.df['subject'].tolist()
        objects = self.df['object'].tolist()
        self.all_entities = set(subjects) | set(objects)
        self.all_relations = set(self.df['relation'].tolist())

        # Later rows overwrite earlier ones, and within a row the object role
        # is written after the subject role.
        self.entity_types = {}
        rows = zip(
            subjects,
            self.df['subject_type'].tolist(),
            objects,
            self.df['object_type'].tolist(),
        )
        for subject, subject_type, obj, object_type in rows:
            self.entity_types[subject] = subject_type
            self.entity_types[obj] = object_type

    @staticmethod
    def _frequency_stats(freq: Counter, label: str) -> Dict:
        """Summarises an occurrence counter; ``label`` prefixes the single-occurrence keys."""
        counts = list(freq.values())
        singles = sum(1 for count in counts if count == 1)
        return {
            'mean': np.mean(counts) if counts else 0,
            'std': np.std(counts) if counts else 0,
            'min': min(counts) if counts else 0,
            'max': max(counts) if counts else 0,
            f'{label}_single_occurrence': singles,
            f'{label}_single_occurrence_pct': (singles / len(counts) * 100) if counts else 0,
        }

    def _calendar_day_count(self) -> int:
        """Returns the number of calendar days from the first to the last dated fact, inclusive."""
        days = self.df['date'].dropna().dt.normalize()
        if days.empty:
            return 0
        # Normalising to midnight first keeps the time of day from shortening
        # the span (e.g. 21:00 one day to 08:00 the next covers two calendar days).
        return (days.max() - days.min()).days + 1

    def coverage_analysis(self) -> Dict:
        """
        Analyzes dataset coverage characteristics, including entity, relation,
        and temporal coverage, as well as entity type distribution.

        Returns:
            Dict: A dictionary containing various coverage statistics
            (empty when no data is loaded).
        """
        if self.df.empty:
            print("DataFrame is empty. Cannot perform coverage analysis.")
            return {}

        entity_freq = Counter(self.df['subject'].tolist() + self.df['object'].tolist())
        relation_freq = Counter(self.df['relation'].tolist())

        # Span between the earliest and latest timestamp, and distinct calendar dates
        date_range = self.df['date'].max() - self.df['date'].min()
        unique_dates = self.df['date'].dt.date.nunique()

        type_distribution = Counter(self.entity_types.values())

        return {
            'total_entities': len(self.all_entities),
            'total_relations': len(self.all_relations),
            'total_quadruplets': len(self.df),
            'unique_dates': unique_dates,
            'temporal_span_days': date_range.days,
            'entity_frequency_stats': self._frequency_stats(entity_freq, 'entities'),
            'relation_frequency_stats': self._frequency_stats(relation_freq, 'relations'),
            'entity_type_distribution': dict(type_distribution),
            'temporal_density': len(self.df) / unique_dates if unique_dates > 0 else 0
        }

    def sparsity_analysis(self) -> Dict:
        """
        Analyzes dataset sparsity characteristics, including basic sparsity,
        entity-relation sparsity, temporal sparsity, and entity co-occurrence.

        Returns:
            Dict: A dictionary containing various sparsity statistics
            (empty when no data is loaded).
        """
        if self.df.empty:
            print("DataFrame is empty. Cannot perform sparsity analysis.")
            return {}

        num_entities = len(self.all_entities)
        num_relations = len(self.all_relations)
        num_facts = len(self.df)

        subjects = self.df['subject'].tolist()
        objects = self.df['object'].tolist()
        relations = self.df['relation'].tolist()

        # Theoretical maximum: every entity linked to every entity by every relation
        max_possible_facts = num_entities * num_relations * num_entities
        basic_sparsity = 1 - (num_facts / max_possible_facts) if max_possible_facts > 0 else 1.0

        # Distinct (entity, relation) combinations, counting both roles of a fact
        entity_relation_pairs = set(zip(subjects, relations)) | set(zip(objects, relations))
        max_entity_relation_pairs = num_entities * num_relations
        entity_relation_sparsity = (
            1 - (len(entity_relation_pairs) / max_entity_relation_pairs)
            if max_entity_relation_pairs > 0 else 1.0
        )

        # Share of calendar days in the covered period that carry no fact.
        # Both sides are counted in whole calendar days so the ratio stays in [0, 1].
        unique_dates = self.df['date'].dt.date.nunique()
        date_range = self._calendar_day_count()
        temporal_sparsity = 1 - (unique_dates / date_range) if date_range > 0 else 1.0

        # Unordered entity pairs that co-occur in at least one fact
        entity_pairs = {tuple(sorted(pair)) for pair in zip(subjects, objects)}
        max_entity_pairs = (num_entities * (num_entities - 1)) // 2
        entity_pair_coverage = len(entity_pairs) / max_entity_pairs if max_entity_pairs > 0 else 0

        return {
            'basic_sparsity': basic_sparsity,
            'entity_relation_sparsity': entity_relation_sparsity,
            'temporal_sparsity': temporal_sparsity,
            'entity_pair_coverage': entity_pair_coverage,
            'facts_per_entity': num_facts / num_entities if num_entities > 0 else 0,
            'facts_per_relation': num_facts / num_relations if num_relations > 0 else 0,
            'facts_per_day': num_facts / unique_dates if unique_dates > 0 else 0,
            'avg_relations_per_entity': len(entity_relation_pairs) / num_entities if num_entities > 0 else 0,
            'density_ratio': num_facts / max_possible_facts if max_possible_facts > 0 else 0
        }

    @staticmethod
    def _span_days_by_key(keys, dates) -> Dict:
        """Returns ``{key: days between first and last date}`` for keys with more than one row."""
        grouped = dates.groupby(keys.to_numpy())
        counts = grouped.size()
        span_days = (grouped.max() - grouped.min()).dt.days
        return span_days[counts > 1].to_dict()

    def temporal_pattern_analysis(self) -> Dict:
        """
        Analyzes temporal patterns within the dataset, including daily, weekly,
        and monthly distributions, as well as entity and relation temporal spans.

        Returns:
            Dict: A dictionary containing various temporal pattern statistics
            (empty when no data is loaded). ``weekly_distribution`` is keyed by
            day of week (0 = Monday) and ``monthly_distribution`` by month number.
        """
        if self.df.empty:
            print("DataFrame is empty. Cannot perform temporal pattern analysis.")
            return {}

        dates = self.df['date']
        daily_counts = self.df.groupby(dates.dt.date).size()
        weekly_counts = self.df.groupby(dates.dt.dayofweek).size()
        monthly_counts = self.df.groupby(dates.dt.month).size()

        # One (entity, date) row per fact an entity takes part in. A fact whose
        # subject and object are the same entity must count only once for it.
        subject_rows = self.df[['subject', 'date']].rename(columns={'subject': 'entity'})
        distinct_object = self.df['object'] != self.df['subject']
        object_rows = self.df.loc[distinct_object, ['object', 'date']].rename(columns={'object': 'entity'})
        entity_rows = pd.concat([subject_rows, object_rows], ignore_index=True)

        entity_temporal_spans = self._span_days_by_key(entity_rows['entity'], entity_rows['date'])
        relation_temporal_spans = self._span_days_by_key(self.df['relation'], dates)

        entity_spans = list(entity_temporal_spans.values())
        relation_spans = list(relation_temporal_spans.values())

        # ``daily_counts`` only lists days that have facts, so empty days are the
        # calendar days of the covered period that are missing from it.
        days_with_zero_events = max(self._calendar_day_count() - len(daily_counts), 0)

        return {
            'daily_variance': daily_counts.var() if not daily_counts.empty else 0,
            'daily_mean': daily_counts.mean() if not daily_counts.empty else 0,
            'days_with_zero_events': days_with_zero_events,
            'max_events_per_day': daily_counts.max() if not daily_counts.empty else 0,
            'weekly_distribution': {int(day): int(count) for day, count in weekly_counts.items()},
            'monthly_distribution': {int(month): int(count) for month, count in monthly_counts.items()},
            'entity_temporal_spans': {
                'mean': np.mean(entity_spans) if entity_spans else 0,
                'std': np.std(entity_spans) if entity_spans else 0,
                'entities_with_temporal_span': len(entity_temporal_spans)
            },
            'relation_temporal_spans': {
                'mean': np.mean(relation_spans) if relation_spans else 0,
                'std': np.std(relation_spans) if relation_spans else 0,
                'relations_with_temporal_span': len(relation_temporal_spans)
            },
            'temporal_concentration': self._calculate_temporal_concentration(daily_counts)
        }

    def _calculate_temporal_concentration(self, daily_counts) -> float:
        """
        Calculates how concentrated events are in time (Gini coefficient-like measure).

        Args:
            daily_counts (pd.Series): A series of daily event counts.

        Returns:
            float: The temporal concentration score (Gini coefficient);
            0.0 for an empty series or one that sums to zero.
        """
        counts = np.sort(np.asarray(daily_counts.values, dtype=float))
        n = counts.size
        total = counts.sum()
        if n == 0 or total == 0:
            return 0.0

        # Gini coefficient from rank-weighted sorted counts
        ranks = np.arange(1, n + 1)
        gini = (2 * np.sum(ranks * counts)) / (n * total) - (n + 1) / n
        return float(gini)

    def knowledge_graph_structure_analysis(self) -> Dict:
        """
        Analyzes the structural characteristics of the knowledge graph,
        including nodes, edges, connected components, degree distribution,
        clustering coefficient, and density.

        The graph is undirected and simple: repeated facts between the same two
        entities collapse into a single edge.

        Returns:
            Dict: A dictionary containing various graph structure statistics
            (empty when no data is loaded).
        """
        if self.df.empty:
            print("DataFrame is empty. Cannot perform graph structure analysis.")
            return {}

        G = nx.Graph()
        facts = zip(
            self.df['subject'].tolist(),
            self.df['object'].tolist(),
            self.df['relation'].tolist(),
            self.df['date'].tolist(),
        )
        for subject, obj, relation, date in facts:
            G.add_edge(subject, obj, relation=relation, date=date)

        num_nodes = G.number_of_nodes()
        if num_nodes > 0:
            connected_components = list(nx.connected_components(G))
            largest_component_size = max(len(cc) for cc in connected_components) if connected_components else 0

            degrees = [degree for _, degree in G.degree()]
            avg_degree = np.mean(degrees) if degrees else 0
            degree_std = np.std(degrees) if degrees else 0

            clustering_coeff = nx.average_clustering(G) if num_nodes > 2 else 0
            density = nx.density(G)
        else:
            connected_components = []
            largest_component_size = 0
            avg_degree = 0
            degree_std = 0
            clustering_coeff = 0
            density = 0

        return {
            'num_nodes': num_nodes,
            'num_edges': G.number_of_edges(),
            'num_connected_components': len(connected_components),
            'largest_component_size': largest_component_size,
            'largest_component_ratio': largest_component_size / num_nodes if num_nodes > 0 else 0,
            'average_degree': avg_degree,
            'degree_std': degree_std,
            'clustering_coefficient': clustering_coeff,
            'graph_density': density,
            'isolated_nodes': sum(1 for node in G.nodes() if G.degree(node) == 0)
        }

    def relation_pattern_analysis(self) -> Dict:
        """
        Analyzes relation patterns and semantic consistency within the dataset.

        Returns:
            Dict: A dictionary containing various relation pattern statistics
            (empty when no data is loaded).
        """
        if self.df.empty:
            print("DataFrame is empty. Cannot perform relation pattern analysis.")
            return {}

        # Which (subject_type, object_type) combinations each relation connects
        relation_type_pairs = defaultdict(set)
        typed_facts = zip(
            self.df['relation'].tolist(),
            self.df['subject_type'].tolist(),
            self.df['object_type'].tolist(),
        )
        for relation, subject_type, object_type in typed_facts:
            relation_type_pairs[relation].add((subject_type, object_type))

        # More type pairs for a relation means it is used less consistently
        relation_consistency = {
            relation: {
                'num_type_combinations': len(type_pairs),
                'consistency_score': 1.0 / len(type_pairs),
                'type_pairs': list(type_pairs)
            }
            for relation, type_pairs in relation_type_pairs.items()
        }

        def num_combinations(relation):
            return relation_consistency[relation]['num_type_combinations']

        most_consistent = min(relation_consistency, key=num_combinations) if relation_consistency else None
        least_consistent = max(relation_consistency, key=num_combinations) if relation_consistency else None

        return {
            'relation_consistency': relation_consistency,
            'most_consistent_relation': most_consistent,
            'least_consistent_relation': least_consistent,
            'avg_type_combinations_per_relation': np.mean([
                info['num_type_combinations'] for info in relation_consistency.values()
            ]) if relation_consistency else 0,
            'relations_with_single_type_pair': sum(
                1 for info in relation_consistency.values()
                if info['num_type_combinations'] == 1
            )
        }

    @staticmethod
    def _compare_metrics(current: Dict, benchmark_stats: Dict, metrics) -> Dict:
        """Compares the named metrics that exist on both sides; returns one entry per metric."""
        compared = {}
        if not current:
            return compared
        for metric in metrics:
            if metric not in benchmark_stats or metric not in current:
                continue
            value = current[metric]
            benchmark = benchmark_stats[metric]
            ratio = value / benchmark if benchmark != 0 else float('inf')
            # Status comes from the values themselves so that equal values and
            # zero benchmarks are labelled correctly.
            if value > benchmark:
                status = 'higher'
            elif value < benchmark:
                status = 'lower'
            else:
                status = 'equal'
            compared[metric] = {
                'indovest_value': value,
                'benchmark_value': benchmark,
                'ratio': ratio,
                'status': status
            }
        return compared

    def compare_with_benchmarks(self, benchmark_stats: Dict) -> Dict:
        """
        Compares the current IndovestDKG dataset with provided benchmark statistics
        across coverage, sparsity, and structure metrics.

        Args:
            benchmark_stats (Dict): A flat dictionary of benchmark values keyed by
                the same metric names the analysis methods return. Metrics missing
                from it are skipped.

        Returns:
            Dict: A dictionary showing the comparison results. Each compared
            metric holds the IndovestDKG value, the benchmark value, their ratio
            (``inf`` for a zero benchmark) and a 'higher'/'lower'/'equal' status.
        """
        if self.df.empty:
            print("DataFrame is empty. Cannot perform benchmark comparison.")
            return {}

        return {
            'coverage_comparison': self._compare_metrics(
                self.coverage_analysis(), benchmark_stats,
                ('total_entities', 'total_relations', 'total_quadruplets'),
            ),
            'sparsity_comparison': self._compare_metrics(
                self.sparsity_analysis(), benchmark_stats,
                ('basic_sparsity', 'temporal_sparsity', 'facts_per_entity'),
            ),
            'structure_comparison': self._compare_metrics(
                self.knowledge_graph_structure_analysis(), benchmark_stats,
                self._STRUCTURE_METRICS,
            ),
        }

    def generate_comprehensive_report(self) -> Dict:
        """
        Generates a comprehensive analysis report including coverage, sparsity,
        temporal patterns, graph structure, and relation patterns, along with
        performance hypotheses and recommendations.

        Returns:
            Dict: The complete analysis report, or ``{'error': ...}`` when no
            data is loaded.
        """
        if self.df.empty:
            print("DataFrame is empty. Cannot generate comprehensive report.")
            return {'error': 'DataFrame is empty'}

        report = {
            'coverage_analysis': self.coverage_analysis(),
            'sparsity_analysis': self.sparsity_analysis(),
            'temporal_analysis': self.temporal_pattern_analysis(),
            'structure_analysis': self.knowledge_graph_structure_analysis(),
            'relation_analysis': self.relation_pattern_analysis()
        }
        report['performance_hypothesis'] = self._generate_performance_hypothesis(report)
        return report

    def _generate_performance_hypothesis(self, analysis_results: Dict) -> Dict:
        """
        Generates hypotheses about potential poor model performance based on
        the dataset analysis results.

        Args:
            analysis_results (Dict): The full analysis report dictionary. Missing
                or empty sections are skipped.

        Returns:
            Dict: A dictionary containing identified hypotheses, primary issues, and recommendations.
        """
        hypotheses = []

        coverage = analysis_results.get('coverage_analysis')
        if coverage and coverage['entity_frequency_stats']['entities_single_occurrence_pct'] > 50:
            hypotheses.append({
                'category': 'Coverage',
                'issue': 'High percentage of entities with single occurrence',
                'impact': 'Limited learning opportunities for entity embeddings',
                'value': f"{coverage['entity_frequency_stats']['entities_single_occurrence_pct']:.1f}%",
                'severity': 'High'
            })

        sparsity = analysis_results.get('sparsity_analysis')
        if sparsity:
            if sparsity['basic_sparsity'] > 0.99:
                hypotheses.append({
                    'category': 'Sparsity',
                    'issue': 'Extremely high sparsity',
                    'impact': 'Insufficient training signal for link prediction',
                    'value': f"{sparsity['basic_sparsity']:.6f}",
                    'severity': 'Critical'
                })

            if sparsity['facts_per_entity'] < 2:
                hypotheses.append({
                    'category': 'Sparsity',
                    'issue': 'Low facts per entity ratio',
                    'impact': 'Insufficient entity-level patterns for learning',
                    'value': f"{sparsity['facts_per_entity']:.2f}",
                    'severity': 'High'
                })

        temporal = analysis_results.get('temporal_analysis')
        if temporal and temporal['temporal_concentration'] > 0.7:
            hypotheses.append({
                'category': 'Temporal',
                'issue': 'High temporal concentration',
                'impact': 'Uneven temporal distribution limits temporal learning',
                'value': f"{temporal['temporal_concentration']:.3f}",
                'severity': 'Medium'
            })

        structure = analysis_results.get('structure_analysis')
        if structure:
            # Fragmented: more than one component per ten nodes
            if structure['num_nodes'] > 0 and structure['num_connected_components'] > structure['num_nodes'] * 0.1:
                hypotheses.append({
                    'category': 'Structure',
                    'issue': 'Highly fragmented graph structure',
                    'impact': 'Limited global structure learning',
                    'value': f"{structure['num_connected_components']} components",
                    'severity': 'High'
                })

            if structure['graph_density'] < 0.01:
                hypotheses.append({
                    'category': 'Structure',
                    'issue': 'Very low graph density',
                    'impact': 'Sparse connectivity limits message passing effectiveness',
                    'value': f"{structure['graph_density']:.6f}",
                    'severity': 'High'
                })

        return {
            'hypotheses': hypotheses,
            'primary_issues': [h for h in hypotheses if h['severity'] in ['Critical', 'High']],
            'recommendations': self._generate_recommendations(hypotheses)
        }

    def _generate_recommendations(self, hypotheses: List[Dict]) -> List[str]:
        """
        Generates recommendations based on identified issues from the hypotheses.

        Args:
            hypotheses (List[Dict]): A list of identified hypotheses; only each
                entry's 'category' is read.

        Returns:
            List[str]: A list of recommended actions, grouped by category in the
            fixed order Coverage, Sparsity, Temporal, Structure.
        """
        issue_categories = {h['category'] for h in hypotheses}

        recommendations = []
        for category, actions in self._RECOMMENDATIONS:
            if category in issue_categories:
                recommendations.extend(actions)
        return recommendations

# Usage example
def analyze_indovest_dataset(csv_path: str):
    """
    Runs the full IndovestDKG analysis for one CSV file and prints a summary.

    The summary covers coverage, sparsity, graph structure, the primary
    performance issues and the resulting recommendations.

    Args:
        csv_path (str): The path to the CSV dataset.

    Returns:
        Dict: The comprehensive analysis report, or a dictionary with a single
        'status' message when the dataset could not be loaded.
    """
    analyzer = IndovestDKGAnalyzer(csv_path)
    # An empty DataFrame is how the analyzer signals a failed load.
    if analyzer.df.empty:
        return {"status": "Analysis skipped due to file loading error."}

    report = analyzer.generate_comprehensive_report()

    # Pull every section out once; missing sections fall back to defaults.
    coverage = report.get('coverage_analysis', {})
    entity_stats = coverage.get('entity_frequency_stats', {})
    relation_stats = coverage.get('relation_frequency_stats', {})
    sparsity = report.get('sparsity_analysis', {})
    structure = report.get('structure_analysis', {})
    hypothesis = report.get('performance_hypothesis', {'primary_issues': [], 'recommendations': []})

    print("=== INDOVESTDKG COMPREHENSIVE ANALYSIS ===\n")

    # --- Coverage ---
    print("📊 COVERAGE ANALYSIS:")
    print(f"  • Total Entities: {coverage.get('total_entities', 0):,}")
    print(f"  • Total Relations: {coverage.get('total_relations', 0):,}")
    print(f"  • Total Quadruplets: {coverage.get('total_quadruplets', 0):,}")
    print(f"  • Entities with single occurrence: {entity_stats.get('entities_single_occurrence_pct', 0):.1f}%")
    print(f"  • Relations with single occurrence: {relation_stats.get('relations_single_occurrence_pct', 0):.1f}%")
    print()

    # --- Sparsity ---
    basic_sparsity = sparsity.get('basic_sparsity', 1.0)
    print("🕳️ SPARSITY ANALYSIS:")
    print(f"  • Basic Sparsity: {basic_sparsity:.6f} ({(1-basic_sparsity)*100:.4f}% filled)")
    print(f"  • Facts per Entity: {sparsity.get('facts_per_entity', 0):.2f}")
    print(f"  • Facts per Relation: {sparsity.get('facts_per_relation', 0):.2f}")
    print(f"  • Entity Pair Coverage: {sparsity.get('entity_pair_coverage', 0):.6f}")
    print()

    # --- Graph structure ---
    print("🔗 GRAPH STRUCTURE ANALYSIS:")
    print(f"  • Graph Density: {structure.get('graph_density', 0):.6f}")
    print(f"  • Connected Components: {structure.get('num_connected_components', 0)}")
    print(f"  • Largest Component Ratio: {structure.get('largest_component_ratio', 0):.3f}")
    print(f"  • Average Degree: {structure.get('average_degree', 0):.2f}")
    print(f"  • Clustering Coefficient: {structure.get('clustering_coefficient', 0):.4f}")
    print()

    # --- Performance hypothesis ---
    primary_issues = hypothesis['primary_issues']
    print("🎯 PERFORMANCE HYPOTHESIS:")
    print(f"  Primary Issues Identified: {len(primary_issues)}")

    detail_fields = (("Impact", 'impact'), ("Value", 'value'), ("Severity", 'severity'))
    for issue in primary_issues:
        print(f"  • {issue['category']}: {issue['issue']}")
        for label, field in detail_fields:
            print(f"    - {label}: {issue[field]}")
        print()

    # --- Recommendations ---
    print("💡 RECOMMENDATIONS:")
    for number, recommendation in enumerate(hypothesis['recommendations'], 1):
        print(f"  {number}. {recommendation}")

    return report

In [15]:
if __name__ == "__main__":
    # The CSV is located relative to the current working directory, which in a
    # Jupyter/IPython session is typically the folder the notebook was started in.
    #
    # Assumed layout (CWD = KG_CONSTRUCTION/notebook):
    #   KG_CONSTRUCTION/
    #   ├── notebook/   <- evaluate-construction-llm.ipynb
    #   └── data/dataset/KOMPAS/2-FULL/IndovestDKG_FULL.csv
    current_working_dir = os.getcwd()

    # Step up from 'notebook' to 'KG_CONSTRUCTION', then down into the dataset folder.
    dataset_parts = ('data', 'dataset', 'KOMPAS', '2-FULL', 'IndovestDKG_FULL.csv')
    relative_csv_path = os.path.join(current_working_dir, os.pardir, *dataset_parts)

    # abspath collapses the parent-directory step into a clean absolute path.
    csv_file_path = os.path.abspath(relative_csv_path)

    print(f"Attempting to load CSV from: {csv_file_path}")
    report = analyze_indovest_dataset(csv_file_path)


Attempting to load CSV from: d:\Yaffa\IndovestDKG\KG_CONSTRUCTION\data\dataset\KOMPAS\2-FULL\IndovestDKG_FULL.csv
Successfully loaded data from: d:\Yaffa\IndovestDKG\KG_CONSTRUCTION\data\dataset\KOMPAS\2-FULL\IndovestDKG_FULL.csv
=== INDOVESTDKG COMPREHENSIVE ANALYSIS ===

📊 COVERAGE ANALYSIS:
  • Total Entities: 53,522
  • Total Relations: 3,314
  • Total Quadruplets: 69,336
  • Entities with single occurrence: 74.0%
  • Relations with single occurrence: 58.5%

🕳️ SPARSITY ANALYSIS:
  • Basic Sparsity: 1.000000 (0.0000% filled)
  • Facts per Entity: 1.30
  • Facts per Relation: 20.92
  • Entity Pair Coverage: 0.000044

🔗 GRAPH STRUCTURE ANALYSIS:
  • Graph Density: 0.000044
  • Connected Components: 5428
  • Largest Component Ratio: 0.752
  • Average Degree: 2.36
  • Clustering Coefficient: 0.0171

🎯 PERFORMANCE HYPOTHESIS:
  Primary Issues Identified: 5
  • Coverage: High percentage of entities with single occurrence
    - Impact: Limited learning opportunities for entity embeddings


In [11]:
# Preview the first five rows of the raw CSV (re-read from disk, without the analyzer's type coercions).
pd.read_csv(csv_file_path).head(5)

Unnamed: 0,subject,subject_type,relation,object,object_type,date
0,pt manulife aset manajemen indonesia,PERUSAHAAN,Mengumumkan,diversifikasi investasi,KONSEP,2025-03-07 21:18:00
1,ihsg,INDIKATOR_EKONOMI,Menghasilkan,pertumbuhan tahunan tertinggi,INDIKATOR_EKONOMI,2025-03-07 21:18:00
2,strategi diversifikasi,KONSEP,Meningkatkan,peluang return,KONSEP,2025-03-07 21:18:00
3,investor,ORANG,Mengendalikan,berbagai jenis aset investasi,KONSEP,2025-03-07 21:18:00
4,dimas ardinugraha,ORANG,Mempengaruhi,strategi diversifikasi,KONSEP,2025-03-07 21:18:00


### Analisis di ICEWS14

In [None]:
import pandas as pd
import os
from datetime import datetime, timedelta
from typing import Dict, List, Tuple

In [16]:
def load_id_to_name_map(filepath: str) -> Dict[int, str]:
    """
    Loads entity or relation ID to name mappings from a text file.

    Expected format: one mapping per line, with the name and the integer ID
    separated by a single TAB character (e.g., "China<TAB>0").

    Lines that do not split into exactly two tab-separated fields, or whose
    ID field is not an integer, are reported with a warning and skipped;
    the remaining lines are still loaded.

    Args:
        filepath (str): Path to the mapping file.

    Returns:
        Dict[int, str]: Mapping from integer ID to name. Empty if the file
        is missing or could not be read.
    """
    mapping: Dict[int, str] = {}
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('\t')
                if len(parts) != 2:
                    print(f"Warning: Skipping malformed line in {filepath}: {line.strip()}")
                    continue

                name = parts[0].strip()
                # Convert the ID per line so that a single bad ID only drops
                # that line instead of aborting the whole file (the same
                # policy process_icews_triplets applies to bad triplet lines).
                try:
                    id_val = int(parts[1].strip())
                except ValueError as ve:
                    print(f"Warning: Skipping line with invalid ID in {filepath}: {line.strip()} - {ve}")
                    continue
                mapping[id_val] = name
    except FileNotFoundError:
        print(f"Error: Mapping file not found at {filepath}")
    except Exception as e:
        # Best-effort loader: report the problem and return what was read so far.
        print(f"An error occurred while reading {filepath}: {e}")
    return mapping

def process_icews_triplets(
    filepath: str,
    entity_map: Dict[int, str],
    relation_map: Dict[int, str],
    base_date: datetime
) -> List[Dict]:
    """
    Reads one ICEWS triplet file (train, test, or valid) and turns every
    well-formed line into a record with resolved names and a formatted date.

    A line is accepted only if it splits on whitespace into exactly five
    fields: head_id relation_id tail_id timestamp_id <trailing field>.
    The first four fields must be integers; the fifth is ignored.

    Args:
        filepath (str): Path to the triplet file.
        entity_map (Dict[int, str]): Entity ID -> name lookup.
        relation_map (Dict[int, str]): Relation ID -> name lookup.
        base_date (datetime): Date that timestamp_id 0 maps to.

    Returns:
        List[Dict]: One dict per accepted line with the keys 'subject',
        'subject_type', 'relation', 'object', 'object_type' and 'date'.
        Empty if the file is missing or nothing could be parsed.
    """
    records = []
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                fields = stripped.split()

                # Guard: anything that is not exactly five fields is rejected.
                if len(fields) != 5:
                    print(f"Warning: Skipping malformed triplet line in {filepath}: {stripped}")
                    continue

                try:
                    # Conversion happens left to right, so the first
                    # non-integer field is the one reported in the warning.
                    head_id, relation_id, tail_id, timestamp_id = map(int, fields[:4])

                    # IDs missing from the lookup tables get a descriptive placeholder.
                    subject_name = entity_map.get(head_id, f"UNKNOWN_ENTITY_ID_{head_id}")
                    relation_name = relation_map.get(relation_id, f"UNKNOWN_RELATION_ID_{relation_id}")
                    object_name = entity_map.get(tail_id, f"UNKNOWN_ENTITY_ID_{tail_id}")

                    # timestamp_id is interpreted as a whole-day offset from base_date.
                    # NOTE(review): confirm the source files really store day
                    # indices and not another unit (e.g. hour offsets) - TODO verify.
                    event_date = base_date + timedelta(days=timestamp_id)

                    records.append({
                        'subject': subject_name,
                        'subject_type': 'ACTOR',  # fixed placeholder type
                        'relation': relation_name,
                        'object': object_name,
                        'object_type': 'ACTOR',  # fixed placeholder type
                        'date': event_date.strftime('%Y-%m-%d %H:%M:%S'),
                    })
                except ValueError as ve:
                    print(f"Warning: Skipping line with invalid ID/timestamp format in {filepath}: {stripped} - {ve}")
    except FileNotFoundError:
        print(f"Error: Triplet file not found at {filepath}")
    except Exception as e:
        # Any other failure stops reading; records collected so far are returned.
        print(f"An error occurred while reading {filepath}: {e}")
    return records

def convert_icews_to_csv(icews_data_dir: str, output_csv_filename: str = 'ICEWS14_Combined_Dataset.csv', current_working_dir: str = None):
    """
    Merges the ICEWS14 train/test/valid text files into one date-sorted CSV.

    The ID-to-name mappings are read from 'entity2id.txt' and
    'relation2id.txt' inside icews_data_dir. The CSV is written to
    <parent of current_working_dir>/data/benchmark/ICEWS14/, which is
    created if it does not exist.

    Args:
        icews_data_dir (str): The path to the ICEWS14 data directory.
        output_csv_filename (str): The name of the output CSV file.
        current_working_dir (str): Directory the output location is derived
            from; defaults to os.getcwd() when None.

    Returns:
        The sorted pandas DataFrame that was written, or None when the
        mappings could not be loaded or no quadruplets were parsed (no file
        is written in those cases).
    """
    print(f"Starting conversion for ICEWS14 dataset from: {icews_data_dir}")

    # Both lookup tables are required; an empty one aborts the conversion.
    entity_map = load_id_to_name_map(os.path.join(icews_data_dir, 'entity2id.txt'))
    relation_map = load_id_to_name_map(os.path.join(icews_data_dir, 'relation2id.txt'))
    if not (entity_map and relation_map):
        print("Error: Entity or relation mappings could not be loaded. Aborting conversion.")
        return

    # Timestamp 0 is mapped to midnight on 1 January 2014.
    day_zero = datetime(2014, 1, 1)

    # Collect the records of every split, in train -> test -> valid order.
    records = []
    for split_file in ('train.txt', 'test.txt', 'valid.txt'):
        print(f"Processing {split_file}...")
        split_path = os.path.join(icews_data_dir, split_file)
        records.extend(process_icews_triplets(split_path, entity_map, relation_map, day_zero))

    if not records:
        print("No data was processed. Output CSV will be empty.")
        return

    # Parse the date strings back into datetimes so the sort is chronological.
    frame = pd.DataFrame(records)
    frame['date'] = pd.to_datetime(frame['date'])
    frame = frame.sort_values(by='date').reset_index(drop=True)

    # The output lives one level above the working directory, under
    # data/benchmark/ICEWS14 (the working directory is expected to be the
    # notebook folder inside KG_CONSTRUCTION).
    anchor_dir = os.getcwd() if current_working_dir is None else current_working_dir
    parent_dir = os.path.abspath(os.path.join(anchor_dir, '..'))
    dataset_output_dir = os.path.join(parent_dir, 'data', 'benchmark', 'ICEWS14')
    os.makedirs(dataset_output_dir, exist_ok=True)

    output_csv_path = os.path.join(dataset_output_dir, output_csv_filename)
    frame.to_csv(output_csv_path, index=False)
    print(f"Successfully converted ICEWS14 data to CSV: {output_csv_path}")
    print(f"Total quadruplets in CSV: {len(frame):,}")

    # Hand the frame back so the caller can inspect or reuse it.
    return frame

In [17]:
if __name__ == "__main__":
    # Driver cell: locate the raw ICEWS14 files and convert them to a CSV.
    # The working directory is expected to be KG_CONSTRUCTION/notebook/, so
    # the data directory is found two levels up, under
    # DKG/GNN-based/EvoKG/data/ICEWS14.
    current_working_dir = os.getcwd()

    icews14_data_base_path = os.path.join(
        current_working_dir, os.pardir, os.pardir,
        'DKG', 'GNN-based', 'EvoKG', 'data', 'ICEWS14'
    )
    # Collapse the parent-directory hops into a clean absolute path.
    icews14_data_abs_path = os.path.abspath(icews14_data_base_path)
    print(f"Resolved ICEWS14 data directory: {icews14_data_abs_path}")

    # The working directory is forwarded so the converter can derive its
    # output location from it.
    converted_df = convert_icews_to_csv(
        icews14_data_abs_path,
        current_working_dir=current_working_dir,
    )

    # A None or empty result means the conversion produced nothing to show.
    if converted_df is None or converted_df.empty:
        print("Konversi gagal atau tidak ada data yang diproses. Haha.")
    else:
        print("\nSample of the converted DataFrame:")
        print(converted_df.head())
        print("\nDataFrame Info:")
        converted_df.info()
        print("Alhamdulillah! Konversi berhasil.")

Resolved ICEWS14 data directory: d:\Yaffa\IndovestDKG\DKG\GNN-based\EvoKG\data\ICEWS14
Starting conversion for ICEWS14 dataset from: d:\Yaffa\IndovestDKG\DKG\GNN-based\EvoKG\data\ICEWS14
Processing train.txt...
Processing test.txt...
Processing valid.txt...
Successfully converted ICEWS14 data to CSV: d:\Yaffa\IndovestDKG\KG_CONSTRUCTION\data\benchmark\ICEWS14\ICEWS14_Combined_Dataset.csv
Total quadruplets in CSV: 665,304

Sample of the converted DataFrame:
             subject subject_type  \
0  Citizen_(Nigeria)        ACTOR   
1          Hezbollah        ACTOR   
2          Hezbollah        ACTOR   
3   Ministry_(Egypt)        ACTOR   
4      Media_(India)        ACTOR   

                                            relation             object  \
0                              Criticize_or_denounce   Catherine_Ashton   
1  Express_intent_to_engage_in_diplomatic_coopera...              Japan   
2                            Engage_in_mass_killings  Citizen_(Nigeria)   
3               

In [18]:
if __name__ == "__main__":
    # Driver cell: run the dataset analysis on the converted ICEWS14 CSV.
    #
    # The path is built from the current working directory, which is
    # expected to be the notebook folder in a layout like:
    #   KG_CONSTRUCTION/
    #   ├── notebook/                      <- working directory
    #   └── data/
    #       └── benchmark/
    #           └── ICEWS14/
    #               └── ICEWS14_Combined_Dataset.csv
    current_working_dir = os.getcwd()

    # One level up from the working directory, then into data/benchmark/ICEWS14/.
    relative_csv_path = os.path.join(
        current_working_dir, os.pardir,
        'data', 'benchmark', 'ICEWS14', 'ICEWS14_Combined_Dataset.csv'
    )

    # Resolve the parent-directory hop and make the path absolute.
    csv_file_path = os.path.abspath(relative_csv_path)
    print(f"Attempting to load CSV from: {csv_file_path}")

    report = analyze_indovest_dataset(csv_file_path)


Attempting to load CSV from: d:\Yaffa\IndovestDKG\KG_CONSTRUCTION\data\benchmark\ICEWS14\ICEWS14_Combined_Dataset.csv
Successfully loaded data from: d:\Yaffa\IndovestDKG\KG_CONSTRUCTION\data\benchmark\ICEWS14\ICEWS14_Combined_Dataset.csv
=== INDOVESTDKG COMPREHENSIVE ANALYSIS ===

📊 COVERAGE ANALYSIS:
  • Total Entities: 12,498
  • Total Relations: 260
  • Total Quadruplets: 665,304
  • Entities with single occurrence: 17.7%
  • Relations with single occurrence: 3.8%

🕳️ SPARSITY ANALYSIS:
  • Basic Sparsity: 0.999984 (0.0016% filled)
  • Facts per Entity: 53.23
  • Facts per Relation: 2558.86
  • Entity Pair Coverage: 0.001534

🔗 GRAPH STRUCTURE ANALYSIS:
  • Graph Density: 0.001534
  • Connected Components: 21
  • Largest Component Ratio: 0.997
  • Average Degree: 19.17
  • Clustering Coefficient: 0.4714

🎯 PERFORMANCE HYPOTHESIS:
  Primary Issues Identified: 2
  • Sparsity: Extremely high sparsity
    - Impact: Insufficient training signal for link prediction
    - Value: 0.999984
 