In [1]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import json

def load_all_csv_reports(base_dir):
    """
    Load all CSV files under `base_dir` and concatenate
    them into a single pandas DataFrame.
    
    The CSVs contain:
        - Category (Company name)
        - Year
        - Statement
        - TaxonomyA_Categories
        - TaxonomyA_Explanations
        - TaxonomyB_Categories
        - TaxonomyB_Explanations
    """
    csv_files = glob.glob(os.path.join(base_dir, "*.csv"))
    all_dfs = []
    for fpath in csv_files:
        try:
            df_temp = pd.read_csv(fpath)
            # Add source file info
            df_temp["SourceCSV"] = os.path.basename(fpath)
            all_dfs.append(df_temp)
        except Exception as e:
            print(f"[WARNING] Skipping {fpath}, error loading: {e}")
    
    if all_dfs:
        df = pd.concat(all_dfs, ignore_index=True)
        return df
    else:
        print("[INFO] No CSV files found in directory.")
        return pd.DataFrame()

def analyze_taxonomy_distribution(df):
    """
    Shows how often each taxonomy category appears in the dataset.
    Analyzes both TaxonomyA (Forward-Looking) and TaxonomyB (Past Commitments).
    """
    # For Taxonomy A (Forward-Looking)
    catA_series = df["TaxonomyA_Categories"].dropna().astype(str).apply(lambda x: [s.strip() for s in str(x).split(",")])
    catA_flat = []
    for row in catA_series:
        catA_flat.extend([c for c in row if c and c != 'nan'])

    catA_counts = pd.Series(catA_flat).value_counts()
    
    # For Taxonomy B (Past Commitments)
    catB_series = df["TaxonomyB_Categories"].dropna().astype(str).apply(lambda x: [s.strip() for s in str(x).split(",")])
    catB_flat = []
    for row in catB_series:
        catB_flat.extend([c for c in row if c and c != 'nan'])

    catB_counts = pd.Series(catB_flat).value_counts()
    
    print("\n=== Taxonomy A Distribution (Forward-Looking) ===")
    print("Category counts:")
    print(catA_counts)
    print(f"\nTotal statements with forward-looking commitments: {len(catA_flat)}")
    
    print("\n=== Taxonomy B Distribution (Past Commitments) ===")
    print("Category counts:")
    print(catB_counts)
    print(f"\nTotal statements with past achievements: {len(catB_flat)}")
    
    # Company-level analysis
    print("\n=== Company-Level Analysis ===")
    company_stats = df.groupby("Category").agg({
        "Statement": "count",
        "TaxonomyA_Categories": lambda x: x.notna().sum(),
        "TaxonomyB_Categories": lambda x: x.notna().sum()
    }).rename(columns={
        "Statement": "Total Statements",
        "TaxonomyA_Categories": "Forward-Looking",
        "TaxonomyB_Categories": "Past Achievements"
    })
    print("\nCommitments by company:")
    print(company_stats)

def cluster_statements(df, n_clusters=6):
    """
    Performs KMeans clustering of statement text
    to group statements with similar themes.
    Returns the clustered dataframe and cluster descriptions.
    """
    # Some CSV rows might have missing statements
    df_nonempty = df.dropna(subset=["Statement"])
    statements = df_nonempty["Statement"].astype(str).tolist()

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(
        stop_words='english', 
        max_features=2000,
        ngram_range=(1, 2)  # Include bigrams for better context
    )
    X = vectorizer.fit_transform(statements)
    
    # KMeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)

    # Add cluster labels back to the dataframe
    df_nonempty["Cluster"] = kmeans.labels_

    # Analyze top features per cluster and create descriptions
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    
    cluster_descriptions = []
    print(f"\n=== Top terms per cluster (k={n_clusters}) ===")
    for i in range(n_clusters):
        top_indices = order_centroids[i, :10]
        top_terms = [terms[idx] for idx in top_indices]
        cluster_size = int(sum(kmeans.labels_ == i))  # Convert np.int64 to Python int
        
        # Get example statements
        cluster_examples = df_nonempty[df_nonempty["Cluster"] == i]["Statement"].head(2)
        example_texts = [stmt[:150] + "..." for stmt in cluster_examples]
        
        # Create cluster description
        description = {
            "cluster": int(i),  # Convert np.int64 to Python int
            "size": cluster_size,
            "top_terms": list(top_terms),  # Convert np.ndarray to Python list
            "examples": list(example_texts)  # Convert pd.Series to Python list
        }
        cluster_descriptions.append(description)
        
        print(f"\nCluster #{i} ({cluster_size} statements):")
        print(f"Top terms: {', '.join(top_terms)}")
        print("\nExample statements:")
        for stmt in example_texts:
            print(f"- {stmt}")

    return df_nonempty, cluster_descriptions

def visualize_normalized_commitments(df, output_dir):
    """
    Creates a normalized visualization of commitments per report for each company
    with detailed taxonomy descriptions
    """
    # Count number of unique reports per company
    reports_per_company = df.groupby("Category")["Year"].nunique()
    
    # Get the existing commitment counts
    company_stats = df.groupby("Category").agg({
        "TaxonomyA_Categories": lambda x: x.notna().sum(),
        "TaxonomyB_Categories": lambda x: x.notna().sum()
    })
    
    # Normalize by number of reports
    normalized_stats = company_stats.div(reports_per_company, axis=0)
    
    # Create figure with extra space for descriptions
    fig = plt.figure(figsize=(15, 12))
    
    # Create gridspec for bar plot and text
    gs = fig.add_gridspec(2, 1, height_ratios=[2, 1])
    
    # Plot normalized commitments in top subplot
    ax = fig.add_subplot(gs[0])
    normalized_stats.plot(kind='bar', stacked=True, ax=ax, 
                         color=['skyblue', 'lightgreen'])
    plt.title('Average Commitments per Report by Company', pad=20)
    plt.xlabel('Company')
    plt.ylabel('Average Number of Commitments per Report')
    plt.legend(['Forward-Looking (Taxonomy A)', 'Past Achievements (Taxonomy B)'],
              title='Commitment Type')
    plt.xticks(rotation=45, ha='right')
    
    # Add taxonomy descriptions in bottom subplot
    taxonomy_text = """
Taxonomy A: Present/Future Commitments
1. No Mention - General sustainability statements without specific emissions targets
2. Generic/High-Level - Non-specific intent to reduce emissions
3. Specific Numeric Target - Quantitative goals with timeline
4. Net-Zero/Carbon-Neutral - Explicit net-zero commitments
5. Detailed Plan/Roadmap - Specific implementation steps
6. Science-Based/Framework - SBTi, TCFD alignment

Taxonomy B: References to Past Commitments
1. No Past Commitment - No prior targets mentioned
2. Status Unknown - Referenced but unclear
3. Progress/Achieved - Met or partially met targets
4. Shortfall - Missed targets
5. Updated/Extended - Revised goals
6. Discontinued - Replaced targets
"""
    
    # Add text box with taxonomy descriptions
    plt.figtext(0.1, 0.02, taxonomy_text, fontsize=9, 
                bbox=dict(facecolor='white', edgecolor='gray', alpha=0.8))
    
    # Adjust layout
    plt.subplots_adjust(bottom=0.4)  # Make room for text
    
    # Save the plot
    fig_dir = os.path.join(output_dir, "figures")
    os.makedirs(fig_dir, exist_ok=True)
    plt.savefig(os.path.join(fig_dir, 'company_commitments.png'), 
                dpi=300, bbox_inches='tight', pad_inches=0.5)
    plt.close()

def visualize_taxonomy_distribution(df, output_dir):
    """
    Creates visualizations for taxonomy distributions:
    1. Bar plots of taxonomy categories with descriptions
    2. Company-wise normalized commitments
    """
    # Set style to a default matplotlib style
    plt.style.use('default')
    
    # Create figures directory
    fig_dir = os.path.join(output_dir, "figures")
    os.makedirs(fig_dir, exist_ok=True)
    
    try:
        # Define taxonomy descriptions
        taxonomy_A_desc = {
            "1": "No Mention\n(General sustainability statements)",
            "2": "Generic/High-Level Commitment\n(Non-specific intent to reduce emissions)",
            "3": "Specific Numeric Target\n(Quantitative goals with timeline)",
            "4": "Net-Zero/Carbon-Neutral Target\n(Explicit net-zero commitments)",
            "5": "Detailed Plan/Roadmap\n(Specific implementation steps)",
            "6": "Science-Based/Framework\n(SBTi, TCFD alignment)"
        }
        
        taxonomy_B_desc = {
            "1": "No Past Commitment\n(No prior targets mentioned)",
            "2": "Past Commitment Status Unknown\n(Referenced but unclear)",
            "3": "Acknowledged Progress/Achieved\n(Met or partially met)",
            "4": "Acknowledged Shortfall\n(Missed targets)",
            "5": "Updated/Extended Commitments\n(Revised goals)",
            "6": "Discontinued/Superseded\n(Replaced targets)"
        }
        
        # Create figure with two subplots and extra space for text
        fig = plt.figure(figsize=(15, 16))
        gs = fig.add_gridspec(3, 1, height_ratios=[1, 1, 0.2])
        
        # Process and plot Taxonomy A
        ax1 = fig.add_subplot(gs[0])
        catA_series = df["TaxonomyA_Categories"].dropna().astype(str).apply(
            lambda x: [s.strip() for s in str(x).split(",")]
        )
        catA_counts = pd.Series([c for row in catA_series for c in row if c and c != 'nan']).value_counts()
        
        # Rename index with descriptions
        catA_counts.index = [taxonomy_A_desc.get(str(idx), str(idx)) for idx in catA_counts.index]
        
        # Plot Taxonomy A
        bars1 = catA_counts.plot(kind='bar', ax=ax1, color='skyblue')
        ax1.set_title('Taxonomy A: Present/Future Commitments\nDistribution of Forward-Looking Statements', pad=20)
        ax1.set_xlabel('')
        ax1.set_ylabel('Count')
        plt.setp(ax1.xaxis.get_majorticklabels(), rotation=45, ha='right')
        
        # Add value labels on the bars
        for i, v in enumerate(catA_counts):
            ax1.text(i, v, str(v), ha='center', va='bottom')
        
        # Process and plot Taxonomy B
        ax2 = fig.add_subplot(gs[1])
        catB_series = df["TaxonomyB_Categories"].dropna().astype(str).apply(
            lambda x: [s.strip() for s in str(x).split(",")]
        )
        catB_counts = pd.Series([c for row in catB_series for c in row if c and c != 'nan']).value_counts()
        
        # Rename index with descriptions
        catB_counts.index = [taxonomy_B_desc.get(str(idx), str(idx)) for idx in catB_counts.index]
        
        # Plot Taxonomy B
        bars2 = catB_counts.plot(kind='bar', ax=ax2, color='lightgreen')
        ax2.set_title('Taxonomy B: References to Past Commitments\nDistribution of Historical Achievement Statements', pad=20)
        ax2.set_xlabel('')
        ax2.set_ylabel('Count')
        plt.setp(ax2.xaxis.get_majorticklabels(), rotation=45, ha='right')
        
        # Add value labels on the bars
        for i, v in enumerate(catB_counts):
            ax2.text(i, v, str(v), ha='center', va='bottom')
        
        # Add summary text at the bottom
        summary_text = (
            f"Total Forward-Looking Commitments: {len(catA_flat)}\n"
            f"Total Past Achievement References: {len(catB_flat)}\n"
            f"Number of Companies: {df['Category'].nunique()}"
        )
        fig.text(0.1, 0.02, summary_text, fontsize=10, 
                bbox=dict(facecolor='white', edgecolor='gray', alpha=0.8))
        
        plt.tight_layout()
        plt.savefig(os.path.join(fig_dir, 'taxonomy_distributions.png'), 
                   dpi=300, bbox_inches='tight', pad_inches=0.5)
        plt.close()
        
        # Create normalized commitments visualization
        reports_per_company = df.groupby("Category")["Year"].nunique()
        company_stats = df.groupby("Category").agg({
            "TaxonomyA_Categories": lambda x: x.notna().sum(),
            "TaxonomyB_Categories": lambda x: x.notna().sum()
        })
        
        # Normalize by number of reports
        normalized_stats = company_stats.div(reports_per_company, axis=0)
        
        # Plot normalized commitments
        fig, ax = plt.subplots(figsize=(12, 6))
        normalized_stats.plot(kind='bar', stacked=True, ax=ax, 
                         color=['skyblue', 'lightgreen'])
        plt.title('Average Commitments per Report by Company', pad=20)
        plt.xlabel('Company')
        plt.ylabel('Average Number of Commitments per Report')
        plt.legend(['Forward-Looking (Taxonomy A)', 'Past Achievements (Taxonomy B)'],
                  title='Commitment Type')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig(os.path.join(fig_dir, 'company_commitments.png'), 
                   dpi=300, bbox_inches='tight')
        plt.close()
        
        print("[INFO] Successfully generated taxonomy distribution visualizations")
        
    except Exception as e:
        print(f"[WARNING] Error generating taxonomy visualizations: {str(e)}")

def visualize_clusters(df_clustered, cluster_descriptions, output_dir):
    """
    Creates visualizations for clustering results:
    1. Cluster size distribution with descriptions
    2. Heatmap of company distribution across clusters (in percentages)
    """
    fig_dir = os.path.join(output_dir, "figures")
    os.makedirs(fig_dir, exist_ok=True)
    
    try:
        # 1. Cluster Size Distribution with Descriptions
        plt.figure(figsize=(15, 12))  # Increased height for more description space
        
        # Create the bar plot
        cluster_sizes = df_clustered['Cluster'].value_counts().sort_index()
        ax = cluster_sizes.plot(kind='bar', color='skyblue')
        
        plt.title('Distribution of Statements Across Clusters', pad=20)
        plt.xlabel('Cluster')
        plt.ylabel('Number of Statements')
        plt.xticks(rotation=0)
        
        # Define cluster interpretations based on terms and examples
        cluster_interpretations = {
            0: {
                "title": "General Sustainability Statements",
                "description": "Statements that discuss general sustainability practices and environmental responsibility without specific emissions targets.",
                "typical_content": "References to sustainability reports, environmental management, and general green initiatives."
            },
            1: {
                "title": "Specific Emissions Targets",
                "description": "Quantitative commitments to reduce carbon emissions with specific percentage targets and deadlines.",
                "typical_content": "Numeric reduction targets, timeline commitments, and scope-specific goals (Scope 1, 2, or 3)."
            },
            2: {
                "title": "Net-Zero Commitments",
                "description": "Explicit commitments to achieve net-zero or carbon neutrality by a specific date.",
                "typical_content": "Net-zero targets, carbon neutrality goals, and long-term climate commitments."
            },
            3: {
                "title": "Implementation Strategies",
                "description": "Detailed plans and specific measures for achieving emissions reduction targets.",
                "typical_content": "Energy efficiency initiatives, renewable energy adoption, and specific technological solutions."
            },
            4: {
                "title": "Progress Reporting",
                "description": "Statements reporting on progress towards previously set targets and achievements.",
                "typical_content": "Achievement updates, progress metrics, and performance against targets."
            },
            5: {
                "title": "Framework Alignment",
                "description": "References to external frameworks and standards for emissions reduction.",
                "typical_content": "SBTi alignment, TCFD reporting, and industry standard compliance."
            }
        }
        
        # Add cluster descriptions as text below the plot
        description_text = []
        for cluster_id, interp in cluster_interpretations.items():
            desc = cluster_descriptions[cluster_id]
            cluster_text = (
                f"Cluster {cluster_id}: {interp['title']} ({desc['size']} statements)\n"
                f"Description: {interp['description']}\n"
                f"Typical Content: {interp['typical_content']}\n"
                f"Example: {desc['examples'][0]}\n"
            )
            description_text.append(cluster_text)
        
        # Add descriptions as text
        plt.figtext(0.1, -0.6, '\n\n'.join(description_text), 
                   fontsize=9, wrap=True, 
                   bbox=dict(facecolor='white', alpha=0.8))
        
        plt.tight_layout()
        # Adjust bottom margin to fit descriptions
        plt.subplots_adjust(bottom=0.5)
        plt.savefig(os.path.join(fig_dir, 'cluster_distribution.png'), 
                    dpi=300, bbox_inches='tight', pad_inches=1.5)  # Increased padding
        plt.close()
        
        # 2. Company distribution across clusters (as percentages)
        company_cluster_dist = pd.crosstab(df_clustered['Category'], df_clustered['Cluster'])
        
        # Convert to percentages (rows sum to 100%)
        company_cluster_pct = company_cluster_dist.div(company_cluster_dist.sum(axis=1), axis=0) * 100
        
        # Reset all matplotlib parameters to default
        plt.rcdefaults()
        plt.style.use('default')
        
        # Create a new figure with specified size
        plt.figure(figsize=(15, 10))
        
        # Create heatmap with adjusted parameters
        ax = plt.gca()
        
        # Create heatmap
        heatmap = sns.heatmap(company_cluster_pct, 
                            cmap='YlOrRd',
                            annot=True,
                            fmt='.1f',
                            annot_kws={'size': 9},
                            cbar_kws={'label': 'Percentage of Company Statements'},
                            square=True,
                            vmin=0,
                            vmax=100,
                            ax=ax)
        
        # Ensure all annotations are visible by adjusting their properties
        for t in ax.texts:
            t.set_text(f"{float(t.get_text()):.1f}")
            t.set_fontweight('bold')
        
        # Customize title and labels
        ax.set_title('Company Distribution Across Clusters (Percentage)', pad=20, fontsize=12)
        ax.set_xlabel('Cluster', fontsize=10)
        ax.set_ylabel('Company', fontsize=10)
        
        # Rotate x-axis labels for better readability
        plt.xticks(rotation=0)
        plt.yticks(rotation=0)
        
        # Adjust layout to prevent label cutoff
        plt.tight_layout()
        
        # Save with extra padding and high DPI
        plt.savefig(os.path.join(fig_dir, 'company_cluster_distribution.png'), 
                   dpi=300, 
                   bbox_inches='tight',
                   pad_inches=0.5)
        
        # Clear the current figure
        plt.close('all')
        
        print("[INFO] Successfully generated clustering visualizations")
        
    except Exception as e:
        print(f"[WARNING] Error generating cluster visualizations: {str(e)}")

def main():
    # Path to classification results
    base_dir = "Classification_Results"
    
    # 1. Load all CSVs into a single DataFrame
    df_all = load_all_csv_reports(base_dir)
    print(f"[INFO] Loaded {len(df_all)} statements from {df_all['Category'].nunique()} companies")

    if df_all.empty:
        return  # nothing to analyze

    # Create output directory
    output_dir = "3_report_commitments"
    os.makedirs(output_dir, exist_ok=True)

    # Generate visualizations for taxonomy distributions
    visualize_taxonomy_distribution(df_all, output_dir)
    
    # Generate normalized commitments with taxonomy descriptions
    visualize_normalized_commitments(df_all, output_dir)
    
    # 2. Analyze taxonomy distributions and company-level stats
    analyze_taxonomy_distribution(df_all)
    
    # 3. Perform clustering and visualize results
    df_clustered, cluster_descriptions = cluster_statements(df_all, n_clusters=6)
    visualize_clusters(df_clustered, cluster_descriptions, output_dir)
    
    # Print number of reports per company
    reports_per_company = df_all.groupby("Category")["Year"].nunique()
    print("\nNumber of reports per company:")
    print(reports_per_company)
    
    # 4. Save results
    output_csv = os.path.join(output_dir, "sustainability_commitments_analysis.csv")
    df_clustered.to_csv(output_csv, index=False)
    print(f"\n[INFO] Analysis results saved to {output_csv}")
    
    # Save cluster descriptions
    cluster_desc_file = os.path.join(output_dir, "cluster_descriptions.json")
    with open(cluster_desc_file, 'w', encoding='utf-8') as f:
        json.dump(cluster_descriptions, f, indent=2, ensure_ascii=False)
    print(f"[INFO] Cluster descriptions saved to {cluster_desc_file}")
    
    print("\n[INFO] Visualizations have been saved to 3_report_commitments/figures/")
    
    # Save summary statistics
    summary_stats = pd.DataFrame({
        'Metric': [
            'Total Companies',
            'Total Statements',
            'Forward-Looking Commitments',
            'Past Achievements',
            'Statements with Both'
        ],
        'Value': [
            df_all['Category'].nunique(),
            len(df_all),
            df_all['TaxonomyA_Categories'].notna().sum(),
            df_all['TaxonomyB_Categories'].notna().sum(),
            (df_all['TaxonomyA_Categories'].notna() & df_all['TaxonomyB_Categories'].notna()).sum()
        ]
    })
    summary_csv = os.path.join(output_dir, "analysis_summary.csv")
    summary_stats.to_csv(summary_csv, index=False)
    print(f"[INFO] Summary statistics saved to {summary_csv}")

if __name__ == "__main__":
    main()


[INFO] Loaded 1447 statements from 11 companies

=== Taxonomy A Distribution (Forward-Looking) ===
Category counts:
2      449
3      273
5      235
4      107
6       60
2.0     56
3.0     15
5.0     10
1.0      3
4.0      1
6.0      1
Name: count, dtype: int64

Total statements with forward-looking commitments: 1210

=== Taxonomy B Distribution (Past Commitments) ===
Category counts:
3.0    342
3      148
2.0     30
5.0     23
2       15
5       14
4.0      4
1.0      3
4        2
6.0      2
1        1
Name: count, dtype: int64

Total statements with past achievements: 584

=== Company-Level Analysis ===

Commitments by company:
                                          Total Statements  Forward-Looking  \
Category                                                                      
AG_Real_Estate                                          62               50   
Brookfield_Asset_Management                            166              133   
China_Overseas_Property_Holdings_Limited     



[INFO] Successfully generated clustering visualizations

Number of reports per company:
Category
AG_Real_Estate                              2
Brookfield_Asset_Management                 1
China_Overseas_Property_Holdings_Limited    1
China_Vanke                                 1
Country_Garden_Holdings                     1
Hines                                       1
Kilroy_Reality                              9
New_World_Development_Company_Limited       2
Prologis__Inc.                              1
Savills_UK                                  2
Seazen_Group                                1
Name: Year, dtype: int64

[INFO] Analysis results saved to 3_report_commitments/sustainability_commitments_analysis.csv
[INFO] Cluster descriptions saved to 3_report_commitments/cluster_descriptions.json

[INFO] Visualizations have been saved to 3_report_commitments/figures/
[INFO] Summary statistics saved to 3_report_commitments/analysis_summary.csv
