# Comprehensive Ontology Analysis Overview

This document provides a concise overview of the ontology analysis process, explaining each major step in the analysis pipeline.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

# Download necessary NLTK data
# word_tokenize needs the Punkt tokenizer models and the stop-word filter needs
# the 'stopwords' corpus. Newer NLTK releases look the tokenizer up under
# 'punkt_tab' instead of 'punkt', so both are requested; with quiet=True a
# resource that cannot be fetched is skipped rather than raising.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Set plot style
# Applied globally, so every figure created by the cells below uses it.
plt.style.use('ggplot')
# IPython magic (not plain Python): render figures inline in the notebook output.
%matplotlib inline


## 1. Data Loading and Preprocessing

In this initial step, we load the ontology data from a JSON file and perform necessary preprocessing. This includes:
- Extracting institution and class year information from the filepath
- Handling null values in key metrics by filling them with zeros
- Ensuring the data is in a suitable format for further analysis

This preprocessing helps prevent errors due to missing or inconsistent data and sets the stage for meaningful analysis.














In [None]:
def load_and_preprocess_data(file_path):
    """
    Read the ontology records from a JSON file and prepare them for analysis.

    Two columns, 'institution' and 'class_year', are derived from each record's
    'filepath' via extract_info, and missing values in the six key metric
    columns are replaced by 0.

    Args:
        file_path: Location of the JSON file, passed straight to pandas.read_json.

    Returns:
        The preprocessed DataFrame.
    """
    frame = pd.read_json(file_path)

    # extract_info yields one (institution, class_year) pair per path; transpose
    # the pairs into two parallel tuples, one per new column.
    path_info = [extract_info(path) for path in frame['filepath']]
    institutions, class_years = zip(*path_info)
    frame['institution'] = institutions
    frame['class_year'] = class_years

    # A missing metric is treated as "none present" so later sums/means work.
    metric_columns = (
        'class_count',
        'object_property_count',
        'data_property_count',
        'individual_count',
        'axioms',
        'logical_axioms',
    )
    for column in metric_columns:
        frame[column] = frame[column].fillna(0)

    return frame

def extract_info(filepath):
    """
    Extract institution and class year from filepath.

    The path is split into components and the second and third components are
    returned as (institution, class_year). Paths that contain a backslash are
    split on backslashes, exactly as before; paths without any backslash are
    split on forward slashes so POSIX-style paths work too.

    Args:
        filepath: Path string such as 'data/TU/2021/onto.owl'.

    Returns:
        A (institution, class_year) tuple of strings.

    Raises:
        IndexError: If the path has fewer than three components.
    """
    # Prefer the backslash so every path the old implementation accepted still
    # yields the same result; fall back to '/' only when no backslash exists.
    separator = '\\' if '\\' in filepath else '/'
    parts = filepath.split(separator)
    return parts[1], parts[2]

# Load the data
# `df` is the module-level DataFrame every analysis cell below operates on.
df = load_and_preprocess_data('data/honest_benchmark_v4.json')
# Bare expression: as the last statement of a notebook cell it renders the DataFrame.
df

## 2. Basic Data Analysis


After loading and preprocessing the data, we perform some basic analysis to get an overview of our dataset. This includes:
- Counting the total number of ontologies
- Identifying the number of unique institutions and class-year combinations
- Examining the distribution of ontologies across institutions and class years

These basic statistics give us a high-level understanding of the dataset's composition and help identify any initial patterns or imbalances in the data.

In [None]:
def display_basic_stats(df):
    """
    Print an overview of the dataset and show its composition tables.

    Reports the number of rows and of distinct 'institution' / 'class_year'
    values, the value counts of both columns, and an institution x class-year
    count table together with its row totals.
    """
    institutions = df['institution']
    class_years = df['class_year']

    print("Data Overview:")
    print(f"Total number of ontologies: {len(df)}")
    print(f"Number of institutions: {institutions.nunique()}")
    print(f"Number of class-year combinations: {class_years.nunique()}")

    # Same report for both grouping columns: heading, then value counts.
    distributions = (
        ("\nDistribution of ontologies by institution:", institutions),
        ("\nDistribution of ontologies by class year:", class_years),
    )
    for heading, column in distributions:
        print(heading)
        display(column.value_counts())

    print("\nDistribution of class years grouped by institution:")
    pair_sizes = df.groupby(['institution', 'class_year']).size()
    # Pairs that never occur become 0 rather than NaN.
    grouped = pair_sizes.unstack(fill_value=0)
    display(grouped)

    # Optional: row totals of the table above, largest institution first.
    print("\nTotal ontologies per institution:")
    per_institution = grouped.sum(axis=1)
    display(per_institution.sort_values(ascending=False))

# Print the overview tables for the loaded dataset.
display_basic_stats(df)

## 3. Metrics Analysis

Next, we dive deeper into analyzing key metrics of the ontologies. This analysis includes:
- Generating summary statistics (mean, median, standard deviation, etc.) for key metrics
- Performing correlation analysis between different metrics
- Creating distribution plots for each key metric

Understanding these metrics and their relationships helps us gauge the complexity and characteristics of the ontologies in our dataset.

In [None]:
def analyze_metrics(df, key_metrics):
    """
    Analyze key metrics of the ontologies.

    Displays summary statistics, draws a correlation heatmap, and plots one
    histogram (with a KDE curve) per metric.

    Args:
        df: DataFrame holding one column per entry of key_metrics.
        key_metrics: Names of the metric columns. Any number of metrics is
            accepted; histograms are laid out three per row.
    """
    # Summary statistics
    summary = df[key_metrics].describe()
    print("\nSummary Statistics of Key Metrics:")
    display(summary)

    # Correlation analysis
    correlation = df[key_metrics].corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
    plt.title('Correlation Heatmap of Key Metrics')
    plt.tight_layout()
    plt.show()

    # Distribution plots
    # Size the grid from the number of metrics instead of assuming exactly six;
    # six metrics still give the original 2x3 layout on a 20x12 figure.
    n_cols = 3
    n_rows = max(1, (len(key_metrics) + n_cols - 1) // n_cols)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)
    fig.suptitle('Distribution of Key Metrics', fontsize=16)
    panels = axes.ravel()  # row-major, i.e. panel i is axes[i // 3, i % 3]
    for ax, metric in zip(panels, key_metrics):
        sns.histplot(df[metric], kde=True, ax=ax)
        ax.set_title(metric)
        ax.set_xlabel('')
    # Hide grid cells that have no metric to show.
    for ax in panels[len(key_metrics):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()

# Module-level list of metric columns; later cells pass this same list to their analysis functions.
key_metrics = ['class_count', 'object_property_count', 'data_property_count', 
               'individual_count', 'axioms', 'logical_axioms']
analyze_metrics(df, key_metrics)

## 4. Institution and Class Year Analysis

We examine how key metrics vary across different institutions and class years. This analysis provides:
- Average metrics by institution
- Average metrics by class year
- Visualizations of key metrics across institutions and class years

This helps identify trends or differences in ontology development practices across different educational settings and over time.

In [None]:

def analyze_by_institution_and_class_year(df, key_metrics):
    """
    Analyze metrics by institution and class year.

    Displays the mean of every metric per institution and per class year, then
    draws one bar chart per metric with class year on the x-axis and one bar
    colour per institution.

    Args:
        df: DataFrame with 'institution' and 'class_year' columns plus one
            column per entry of key_metrics.
        key_metrics: Names of the metric columns. Any number of metrics is
            accepted; charts are laid out three per row.
    """
    # Analysis by institution
    institution_summary = df.groupby('institution')[key_metrics].mean()
    print("\nAverage Metrics by Institution:")
    display(institution_summary)

    # Analysis by class year
    class_year_summary = df.groupby('class_year')[key_metrics].mean().sort_index()
    print("\nAverage Metrics by Class-Year:")
    display(class_year_summary)

    # Visualize key metrics by institution and class year
    # Size the grid from the number of metrics instead of assuming exactly six;
    # six metrics still give the original 2x3 layout on a 20x12 figure.
    n_cols = 3
    n_rows = max(1, (len(key_metrics) + n_cols - 1) // n_cols)
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)
    fig.suptitle("Key Metrics by Institution and Class-Year", fontsize=16)
    panels = axes.ravel()  # row-major, i.e. panel i is axes[i // 3, i % 3]
    for ax, metric in zip(panels, key_metrics):
        sns.barplot(x='class_year', y=metric, hue='institution', data=df, ax=ax)
        ax.set_title(metric, fontsize=14)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    # Hide grid cells that have no metric to show.
    for ax in panels[len(key_metrics):]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()

# Compare the key metrics across institutions and class years.
analyze_by_institution_and_class_year(df, key_metrics)

## 5. Complexity Analysis

We assess the complexity of ontologies using a composite score: the unweighted sum of the key metrics for each ontology. This analysis includes:
- Calculation of a complexity score for each ontology
- Visualization of average complexity scores by institution and class year
- Identification of the most and least complex ontologies

Understanding ontology complexity is crucial for assessing the sophistication of the developed ontologies and identifying potential areas for improvement in ontology design.

In [None]:

def analyze_complexity(df, key_metrics):
    """
    Score each ontology's complexity and report the extremes.

    The score is the row-wise sum of the key_metrics columns. It is written to
    df['complexity_score'] in place, shown as a heatmap of mean score per
    institution and class year, and used to list the five highest- and
    lowest-scoring ontologies.
    """
    # Stored on the caller's DataFrame, so the column outlives this call.
    df['complexity_score'] = df[key_metrics].sum(axis=1)

    # Mean score for every (institution, class year) cell.
    mean_scores = df.pivot_table(
        index='institution',
        columns='class_year',
        values='complexity_score',
        aggfunc='mean',
    )
    plt.figure(figsize=(14, 10))
    sns.heatmap(mean_scores, annot=True, cmap='YlOrRd', fmt='.0f')
    plt.title('Average Complexity Score by Institution and Class Year')
    plt.xlabel('Class Year')
    plt.ylabel('Institution')
    plt.tight_layout()
    plt.show()

    # Identify each ontology by institution and class year, then score and inputs.
    report_columns = ['institution', 'class_year', 'complexity_score'] + key_metrics

    print("\nTop 5 most complex ontologies:")
    most_complex = df.nlargest(5, 'complexity_score')
    display(most_complex[report_columns])

    print("\nBottom 5 least complex ontologies:")
    least_complex = df.nsmallest(5, 'complexity_score')
    display(least_complex[report_columns])

# Adds the 'complexity_score' column to df; draw_conclusions reads it later.
analyze_complexity(df, key_metrics)

## 6. Error Analysis

We analyze errors detected in the ontologies using two different tools: Prock and OOPS. This analysis covers:
- Distribution of error counts
- Most common types of errors
- Average error counts by institution and class year
- Correlation between error counts and other metrics

Identifying common errors and their patterns helps in improving ontology development practices and enhancing the quality of future ontologies.

In [None]:

def analyze_errors(df):
    """
    Summarise the errors reported for each ontology by Prock and OOPS.

    Adds 'prock_error_count' and 'oops_error_count' to df (the length of the
    'errors_prock' / 'errors_oops' list, or 0 when the value is not a list),
    plots both distributions, lists the most frequent error names, averages the
    counts per institution and class year, reports error-free ontologies, and
    shows how the counts correlate with the size metrics.
    """
    def count_entries(value):
        # Anything that is not a list (e.g. a missing value) counts as no errors.
        if isinstance(value, list):
            return len(value)
        return 0

    def collect_names(column):
        # Flatten the 'name' of every well-formed error dict in the column.
        names = []
        for entries in df[column]:
            if not isinstance(entries, list):
                continue
            for entry in entries:
                if isinstance(entry, dict) and 'name' in entry:
                    names.append(entry['name'])
        return names

    # Per-ontology error counts
    df['prock_error_count'] = df['errors_prock'].apply(count_entries)
    df['oops_error_count'] = df['errors_oops'].apply(count_entries)

    # Side-by-side histograms, one per tool
    fig, panels = plt.subplots(1, 2, figsize=(20, 6))
    histogram_specs = (
        ('prock_error_count', "Distribution of Prock Errors"),
        ('oops_error_count', "Distribution of OOPS Errors"),
    )
    for panel, (column, title) in zip(panels, histogram_specs):
        sns.histplot(df[column], ax=panel, kde=True)
        panel.set_title(title, fontsize=14)
        panel.set_xlabel("Number of Errors", fontsize=12)
    plt.tight_layout()
    plt.show()

    # Most frequent error names per tool
    prock_names = collect_names('errors_prock')
    oops_names = collect_names('errors_oops')

    print("\nMost common Prock error types:")
    display(pd.Series(prock_names).value_counts().head())
    print("\nMost common OOPS error types:")
    display(pd.Series(oops_names).value_counts().head(10))

    # Mean error counts per institution and per class year
    count_columns = ['prock_error_count', 'oops_error_count']
    by_institution = df.groupby('institution')[count_columns].mean()
    by_class_year = df.groupby('class_year')[count_columns].mean().sort_index()

    print("\nAverage Error Counts by Institution:")
    display(by_institution)

    print("\nAverage Error Counts by Class-Year:")
    display(by_class_year)

    # Ontologies for which neither tool reported anything
    is_clean = (df['prock_error_count'] == 0) & (df['oops_error_count'] == 0)
    clean_total = len(df[is_clean])
    print(f"\nNumber of ontologies with no errors: {clean_total}")
    print(f"Percentage of ontologies with no errors: {(clean_total / len(df)) * 100:.2f}%")

    # Correlate the two counts with the size metrics; only the two count
    # columns of the resulting matrix are shown.
    correlated_columns = count_columns + [
        'class_count',
        'object_property_count',
        'data_property_count',
        'individual_count',
        'axioms',
        'logical_axioms',
    ]
    error_correlation = df[correlated_columns].corr()
    print("\nCorrelation between error counts and other metrics:")
    display(error_correlation[['prock_error_count', 'oops_error_count']])

# Run the analysis
# Adds 'prock_error_count' and 'oops_error_count' to df; assess_quality and draw_conclusions read them later.
analyze_errors(df)

## 7. DL Expressivity Analysis

We examine the Description Logic (DL) expressivity of the ontologies. This analysis includes:
- Presence and count of different DL features across ontologies
- Visualization of feature presence and counts
- Analysis of feature usage by institution and class year

Understanding DL expressivity provides insights into the complexity and capabilities of the developed ontologies.

In [None]:

def analyze_dl_expressivity(df):
    """
    Analyze DL expressivity of ontologies.

    Reads df['dl_expressivity_details'] (a list of dicts per ontology, each with
    a 'feature' key and optional 'present' / 'feature_count' entries), stores a
    per-ontology feature lookup in df['expressivity_details'], plots in how many
    ontologies each feature is present, and displays a table with the presence
    count, the summed 'feature_count', and a description per feature.
    """
    # Define OWL feature descriptions
    # NOTE(review): the letter meanings are taken as given from this mapping;
    # some do not look like the conventional DL letter names - confirm them
    # against the tool that produced 'dl_expressivity_details'.
    feature_descriptions = {
        'A': 'Asymmetric properties',
        'C': 'Complex class constructors',
        'D': 'Datatype properties',
        'E': 'Existential restrictions',
        'F': 'Functional properties',
        'H': 'Role hierarchies',
        'I': 'Inverse properties',
        'N': 'Cardinality restrictions',
        'O': 'Nominals',
        'Q': 'Qualified cardinality restrictions',
        'R': 'Role constructors',
        'X': 'Reflexive properties',
        'Y': 'Irreflexive properties',
        'S': 'Symmetric properties',
        'T': 'Transitive properties',
        'U': 'Union of concepts'
    }

    # Extract DL expressivity details
    # Index each ontology's entries by feature letter; non-list values become {}.
    df['expressivity_details'] = df['dl_expressivity_details'].apply(
        lambda x: {item['feature']: item for item in x} if isinstance(x, list) else {}
    )

    # Analyze feature presence and count
    # Presence: number of ontologies whose entry for the feature has present == 1.
    feature_presence = {
        feature: sum(
            1 for details in df['expressivity_details']
            if details.get(feature, {}).get('present') == 1
        )
        for feature in feature_descriptions
    }
    # Count: 'feature_count' summed over all ontologies (missing entries add 0).
    feature_count = {
        feature: sum(
            details.get(feature, {}).get('feature_count', 0)
            for details in df['expressivity_details']
        )
        for feature in feature_descriptions
    }

    # Plot overall feature presence
    plt.figure(figsize=(14, 6))
    sorted_features = sorted(feature_presence.items(), key=lambda x: x[1], reverse=True)
    presence_data = [x[1] for x in sorted_features]
    feature_labels = [x[0] for x in sorted_features]
    plt.bar(feature_labels, presence_data)
    plt.title("Overall Presence of DL Expressivity Features Across Ontologies", fontsize=16)
    plt.xlabel("DL Expressivity Feature", fontsize=12)
    plt.ylabel("Number of Ontologies", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Display tabular view for feature presence
    # The summed counts were previously computed and then dropped; they are
    # reported here next to the presence figures.
    presence_table = pd.DataFrame({
        'Feature': feature_labels,
        'Presence': presence_data,
        'Total Count': [feature_count[f] for f in feature_labels],
        'Description': [feature_descriptions[f] for f in feature_labels]
    })
    print("Overall Presence of DL Expressivity Features:")
    display(presence_table)

# Report which DL expressivity features the ontologies use.
analyze_dl_expressivity(df)

## 8. Competency Questions Analysis

We analyze the competency questions associated with each ontology. This analysis covers:
- Number of competency questions per ontology
- Average question length
- Most common words in competency questions
- Analysis of questions by institution and class year

Competency questions are crucial for understanding the intended use and coverage of ontologies, making this analysis valuable for assessing ontology quality and relevance.

In [None]:
def analyze_competency_questions(df):
    """
    Analyze competency questions in ontologies.

    Normalises df['competency_questions'] to a list of strings per ontology,
    derives per-ontology word frequencies (alphanumeric, non-stopword tokens),
    the average question length in whitespace-separated words and the number of
    questions, then reports them per institution / class year and overall.

    Side effects: overwrites df['competency_questions'] and adds the columns
    'word_freq', 'avg_question_length' and 'question_count' to df.
    """
    # Clean the competency_questions column
    # list -> list of str, single string -> one-element list, anything else -> [].
    df['competency_questions'] = df['competency_questions'].apply(
        lambda x: [str(q) for q in x] if isinstance(x, list) else [str(x)] if isinstance(x, str) else []
    )

    # Load the stop-word list once and keep it as a set: the previous code
    # re-read the corpus list and scanned it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))

    def summarize_questions(questions):
        """Return (word Counter, average words per question, question count)."""
        word_counts = Counter(
            word.lower()
            for question in questions
            for word in word_tokenize(question)
            if word.isalnum() and word.lower() not in english_stopwords
        )
        # An ontology without questions gets an average length of 0.
        avg_length = sum(len(question.split()) for question in questions) / len(questions) if questions else 0
        return word_counts, avg_length, len(questions)

    # Analyze competency questions
    df['word_freq'], df['avg_question_length'], df['question_count'] = zip(
        *df['competency_questions'].apply(summarize_questions)
    )

    # Analysis by institution and class year
    institution_class_year_analysis = df.groupby(['institution', 'class_year']).agg({
        'question_count': ['sum', 'mean', 'size'],
        'avg_question_length': 'mean',
        # Merge the group's Counters and keep its five most frequent words.
        'word_freq': lambda x: ', '.join([f"{word} ({count})" for word, count in sum(x, Counter()).most_common(5)])
    }).reset_index()

    # Flatten the two-level column index produced by agg into plain names.
    institution_class_year_analysis.columns = ['institution', 'class_year', 'total_questions', 'avg_questions_per_entry', 'num_ontologies', 'avg_question_length', 'top_words']
    institution_class_year_analysis['norm_question_count'] = institution_class_year_analysis['total_questions'] / institution_class_year_analysis['num_ontologies']

    print("\nAnalysis of Competency Questions by Institution and Class Year:")
    display(institution_class_year_analysis)

    # Updated table: Institution statistics including average, total questions, and number of ontologies
    institution_stats = df.groupby('institution').agg({
        'question_count': ['mean', 'sum', 'size']
    }).reset_index()
    institution_stats.columns = ['Institution', 'Average Competency Questions', 'Total Competency Questions', 'Number of Ontologies']
    institution_stats = institution_stats.sort_values('Average Competency Questions', ascending=False)
    institution_stats['Average Competency Questions'] = institution_stats['Average Competency Questions'].round(2)

    print("\nInstitution Statistics for Competency Questions:")
    display(institution_stats)

    # Visualizations
    # One line per institution: questions per ontology for each class year.
    plt.figure(figsize=(15, 8))
    for institution in df['institution'].unique():
        data = institution_class_year_analysis[institution_class_year_analysis['institution'] == institution]
        plt.plot(data['class_year'], data['norm_question_count'], marker='o', label=institution)
    plt.title('Normalized Number of Competency Questions by Institution and Class Year')
    plt.xlabel('Class Year')
    plt.ylabel('Average Number of Questions per Ontology')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Print overall statistics
    total_questions = df['question_count'].sum()
    total_ontologies = len(df)
    avg_questions_per_ontology = total_questions / total_ontologies
    overall_avg_length = df['avg_question_length'].mean()

    print("\nOverall Statistics:")
    print(f"Total number of competency questions: {total_questions}")
    print(f"Total number of ontologies: {total_ontologies}")
    print(f"Average number of questions per ontology: {avg_questions_per_ontology:.2f}")
    print(f"Overall average question length: {overall_avg_length:.2f} words")

    # Find most common words across all competency questions
    all_word_freq = Counter()
    for freq in df['word_freq']:
        all_word_freq.update(freq)

    print("\nTop 10 most common words in competency questions:")
    for word, count in all_word_freq.most_common(10):
        print(f"{word}: {count}")

# Adds 'question_count' (among other columns) to df; the outlier analysis and draw_conclusions read it.
analyze_competency_questions(df)

## 9. Outlier Analysis

We identify and analyze outliers in the dataset, focusing on ontologies with an unusually large number of competency questions (more than 1.5 × IQR above the third quartile within their institution). This includes:
- Identification of outliers based on the number of competency questions
- Detailed examination of outlier ontologies
- Visualization of outlier distribution

Analyzing outliers can reveal exceptional cases that might provide insights into best practices or areas needing improvement in ontology development.

In [None]:

def find_outliers(group):
    """
    Return the rows of `group` whose 'question_count' is unusually high.

    A row is kept when its count exceeds Q3 + 1.5 * IQR, where the quartiles
    are taken over the group's own 'question_count' column. The result is
    ordered from the largest count to the smallest.
    """
    counts = group['question_count']
    first_quartile, third_quartile = counts.quantile(0.25), counts.quantile(0.75)
    # Only the upper fence is used: low counts are never flagged.
    threshold = third_quartile + 1.5 * (third_quartile - first_quartile)
    flagged = group.loc[counts > threshold]
    return flagged.sort_values('question_count', ascending=False)

def analyze_outliers(df):
    """
    Analyze outliers in the dataset.

    Runs find_outliers separately for each institution, prints the class year,
    domain, question count and first five competency questions of every
    outlier, summarises them per institution, and plots the question-count
    distribution per institution plus a scatter plot of the outliers alone.

    Expects df to contain 'institution', 'class_year', 'domain',
    'question_count' and 'competency_questions'.
    """
    # Find outliers for each institution
    # Iterate the groups explicitly instead of using groupby(...).apply: newer
    # pandas versions deprecate passing the grouping column into the applied
    # function, which would drop 'institution' from the result used below.
    per_institution = [find_outliers(group) for _, group in df.groupby('institution')]
    if per_institution:
        outliers = pd.concat(per_institution).reset_index(drop=True)
    else:
        # No groups at all: keep the columns so the reporting below still works.
        outliers = df.iloc[0:0].reset_index(drop=True)

    # Display outliers for each institution
    for institution in outliers['institution'].unique():
        print(f"\nOutliers for {institution}:")
        inst_outliers = outliers[outliers['institution'] == institution]
        for _, row in inst_outliers.iterrows():
            print(f"Class Year: {row['class_year']}")
            print(f"Domain: {row['domain']}")
            print(f"Number of Competency Questions: {row['question_count']}")
            print("Sample of Competency Questions:")
            for q in row['competency_questions'][:5]:  # Display first 5 questions
                print(f"- {q}")
            print()

    # Calculate and display statistics
    print("\nOutlier Statistics:")
    print(f"Total number of outliers: {len(outliers)}")
    print("\nNumber of outliers per institution:")
    display(outliers['institution'].value_counts())
    print("\nAverage number of competency questions in outliers:")
    display(outliers.groupby('institution')['question_count'].mean())

    # Visualize outliers
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='institution', y='question_count', data=df)
    plt.title('Distribution of Competency Question Counts by Institution')
    plt.xticks(rotation=45)
    plt.show()

    # Visualize outliers (only outliers)
    plt.figure(figsize=(12, 6))
    sns.scatterplot(x='institution', y='question_count', data=outliers)
    plt.title('Outliers: Number of Competency Questions by Institution')
    plt.xticks(rotation=45)
    plt.show()

# Needs the 'question_count' column added by analyze_competency_questions.
analyze_outliers(df)




## 10. Trend Analysis

We examine trends in ontology development over time. This analysis covers:
- Changes in key metrics over years
- Trends by institution
- Visualization of metric trends

Understanding these trends helps in identifying improvements or changes in ontology development practices over time.

In [None]:

def analyze_trends(df):
    """
    Analyze trends in the dataset over time.

    Derives an integer 'year' from the first four-digit run in each
    'class_year' value, draws one line per institution and key metric over the
    years, and displays the mean of each metric per year and institution.

    Side effect: the 'year' column is added to the DataFrame passed in. The
    sorting by year only affects a local copy.

    Raises:
        ValueError: If some 'class_year' contains no four-digit number, because
            the missing match cannot be converted to int.
    """
    # Extract year from class_year and create a new 'year' column
    # Raw string so the regex escape is not treated as a (deprecated) string
    # escape; expand=False yields a Series rather than a one-column DataFrame.
    df['year'] = df['class_year'].str.extract(r'(\d{4})', expand=False).astype(int)

    # Sort the DataFrame by year in ascending order
    # Rebinds the local name only; the caller's row order is left untouched.
    df = df.sort_values('year')

    # List of key metrics
    key_metrics = ['class_count', 'object_property_count', 'data_property_count', 'individual_count', 'axioms', 'logical_axioms']

    # Trend Analysis
    # Every institution/metric pair is drawn on the same axes.
    plt.figure(figsize=(14, 8))
    for institution in df['institution'].unique():
        inst_data = df[df['institution'] == institution]
        for metric in key_metrics:
            sns.lineplot(x='year', y=metric, data=inst_data, marker='o', label=f"{institution} - {metric}")

    plt.title("Trends in Metrics Over Years by Institution", fontsize=16)
    plt.xlabel("Year", fontsize=12)
    plt.ylabel("Average Value", fontsize=12)
    plt.legend(title="Institution - Metric", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Tabular view for trend analysis
    trend_table = df.groupby(['year', 'institution'])[key_metrics].mean().reset_index()
    print("Trend Analysis - Average Metrics by Year and Institution:")
    display(trend_table)

# Adds the 'year' column to df; perform_statistical_analysis and draw_conclusions read it.
analyze_trends(df)



## 11. Statistical Analysis

We perform more advanced statistical analyses to uncover significant patterns or differences in the data. This includes:
- Correlation analysis between numerical metrics
- T-tests to compare metrics between the two institutions (TU and WU)
- ANOVA to compare metrics across years

These statistical tests help in identifying significant differences or relationships that might not be apparent from descriptive statistics alone.

In [None]:


def perform_statistical_analysis(df):
    """
    Perform statistical analysis on the dataset.

    Shows the correlation matrix of the key metrics (heatmap and table), runs
    an independent two-sample t-test per metric between the institutions
    labelled 'TU' and 'WU', and a one-way ANOVA per metric across the values of
    the 'year' column. Results are printed; nothing is returned.

    Expects df to contain 'institution' and 'year' in addition to the metric
    columns ('year' is the column analyze_trends adds).
    """
    key_metrics = ['class_count', 'object_property_count', 'data_property_count', 'individual_count', 'axioms', 'logical_axioms']

    # Correlation analysis
    correlation_matrix = df[key_metrics].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
    plt.title("Correlation Matrix of Numerical Metrics", fontsize=16)
    plt.tight_layout()
    plt.show()

    print("\nCorrelation Matrix:")
    display(correlation_matrix)

    # T-tests to compare metrics between institutions
    # The two institution labels are fixed; rows of any other institution are ignored.
    print("\nT-test Results - Comparing Metrics Between Institutions:")
    tu_rows = df[df['institution'] == 'TU']
    wu_rows = df[df['institution'] == 'WU']
    for metric in key_metrics:
        t_stat, p_value = stats.ttest_ind(tu_rows[metric], wu_rows[metric])
        print(f"{metric}: t-statistic = {t_stat:.4f}, p-value = {p_value:.4f}")

    # ANOVA to compare metrics across years
    # The split by year is the same for every metric, so build it once.
    print("\nANOVA Results - Comparing Metrics Across Years:")
    rows_by_year = [df[df['year'] == y] for y in df['year'].unique()]
    for metric in key_metrics:
        year_groups = [rows[metric] for rows in rows_by_year]
        f_statistic, p_value = stats.f_oneway(*year_groups)
        print(f"{metric}: F-statistic = {f_statistic:.4f}, p-value = {p_value:.4f}")

# Needs the 'year' column added by analyze_trends.
perform_statistical_analysis(df)



## 12. Quality Assessment

We assess the overall quality of ontologies based on various metrics and error counts. This assessment includes:
- Calculation of a quality score for each ontology
- Identification of top and bottom ontologies by quality
- Analysis of quality scores by institution and class year

This quality assessment provides a holistic view of ontology development outcomes and helps identify areas for improvement.

In [None]:

def assess_quality(df):
    """
    Compute a quality score per ontology and report how it is distributed.

    The score is (class_count + object_property_count + data_property_count +
    individual_count + axioms / 10) divided by
    (1 + log1p(prock_error_count + oops_error_count)). It is written to
    df['quality_score'], plotted as a histogram, used to list the ten best and
    worst ontologies, and averaged per institution and class year in a heatmap.
    """
    # Numerator: entity counts, with axioms scaled down so they weigh less.
    size_term = (
        df['class_count']
        + df['object_property_count']
        + df['data_property_count']
        + df['individual_count']
        + df['axioms'] / 10
    )
    # Denominator: log-damped total error count; equals 1 when there are no errors.
    error_total = df['prock_error_count'] + df['oops_error_count']
    df['quality_score'] = size_term / (1 + np.log1p(error_total))

    # Distribution of the score
    plt.figure(figsize=(12, 6))
    sns.histplot(df['quality_score'], kde=True)
    plt.title('Distribution of Quality Scores')
    plt.xlabel('Quality Score')
    plt.ylabel('Count')
    plt.show()

    # Best and worst ten ontologies
    report_columns = ['institution', 'class_year', 'domain', 'quality_score']

    print("\nTop 10 ontologies by quality score:")
    best = df.nlargest(10, 'quality_score')
    display(best[report_columns])

    print("\nBottom 10 ontologies by quality score:")
    worst = df.nsmallest(10, 'quality_score')
    display(worst[report_columns])

    # Mean score per institution (rows) and class year (columns)
    mean_scores = df.groupby(['institution', 'class_year'])['quality_score'].mean()
    quality_by_inst_year = mean_scores.unstack()

    plt.figure(figsize=(12, 6))
    sns.heatmap(quality_by_inst_year, annot=True, cmap='YlGnBu', fmt='.2f')
    plt.title('Average Quality Score by Institution and Class Year')
    plt.show()

# Needs the error-count columns added by analyze_errors.
assess_quality(df)



## 13. Domain Analysis

We analyze the different domains covered by the ontologies in our dataset. This analysis includes:
- Identification of most common domains
- Analysis of domain distribution across institutions
- Visualization of domain diversity

Understanding the range and distribution of domains helps in assessing the breadth of ontology development efforts and identifying potential gaps or areas of focus.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

def analyze_domains(df):
    """
    Analyze the domains of ontologies with wordcloud and detailed statistics.

    Prints lower-cased domain frequencies overall, per institution and per
    class year, draws a word cloud for each of those groupings, plots the 20
    most frequent domains, and shows a stacked bar chart of domains per
    institution.

    Expects df to contain 'domain', 'institution' and 'class_year'.
    """
    # Function to get domain frequency
    def get_domain_frequency(data):
        return Counter(data['domain'].str.lower().dropna())

    # Preprocess function to make text case-insensitive
    def preprocess_text(text):
        return ' '.join(text.str.lower().dropna())

    # Overall Domain Frequency
    overall_freq = get_domain_frequency(df)
    print("\nOverall Domain Frequency:")
    print(pd.DataFrame.from_dict(overall_freq, orient='index', columns=['Count']).sort_values('Count', ascending=False).head(20))

    # Plot overall wordcloud
    plt.figure(figsize=(14, 8))
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(preprocess_text(df['domain']))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title('Overall Wordcloud of Ontology Domains')
    plt.axis('off')
    plt.show()

    # Analysis by Institution
    institutions = sorted(df['institution'].unique())
    # squeeze=False always returns a 2-D array, so a single institution needs
    # no special case when picking its axes.
    fig_institution, axes_institution = plt.subplots(1, len(institutions), figsize=(20, 8), squeeze=False)
    fig_institution.suptitle('Word Clouds of Ontology Domains by Institution', fontsize=16)
    print("\nDomain Frequencies by Institution:")
    for ax, institution in zip(axes_institution[0], institutions):
        institution_data = df[df['institution'] == institution]
        text = preprocess_text(institution_data['domain'])

        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'Institution: {institution}')
        ax.axis('off')

        # Display tabular data
        domain_freq = get_domain_frequency(institution_data)
        print(f"\nInstitution: {institution}")
        print(pd.DataFrame.from_dict(domain_freq, orient='index', columns=['Count']).sort_values('Count', ascending=False).head(10))
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

    # Analysis by Class-Year
    class_years = sorted(df['class_year'].unique())
    n_class_years = len(class_years)
    # Two rows; at least one column so the grid can always be created.
    n_cols = max(1, (n_class_years + 1) // 2)
    # squeeze=False keeps the axes 2-D even with a single column (one or two
    # class years), where 2-D indexing into the squeezed array used to fail.
    fig_class_year, axes_class_year = plt.subplots(2, n_cols, figsize=(20, 12), squeeze=False)
    fig_class_year.suptitle('Word Clouds of Ontology Domains by Class-Year', fontsize=16)
    cells = axes_class_year.ravel()  # row-major: cell i is row i // n_cols, column i % n_cols
    print("\nDomain Frequencies by Class-Year:")
    for ax, class_year in zip(cells, class_years):
        class_year_data = df[df['class_year'] == class_year]
        text = preprocess_text(class_year_data['domain'])

        wordcloud = WordCloud(width=400, height=200, background_color='white').generate(text)

        ax.imshow(wordcloud, interpolation='bilinear')
        ax.set_title(f'Class-Year: {class_year}')
        ax.axis('off')

        # Display tabular data
        domain_freq = get_domain_frequency(class_year_data)
        print(f"\nClass-Year: {class_year}")
        print(pd.DataFrame.from_dict(domain_freq, orient='index', columns=['Count']).sort_values('Count', ascending=False).head(10))

    # Remove any unused subplots for class-years
    # Sliced by the number of class years rather than by the loop variable, so
    # this also works when the loop above never ran.
    for ax in cells[n_class_years:]:
        fig_class_year.delaxes(ax)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

    # Additional bar plots from the original code
    # Count occurrences of each domain
    # Unlike the frequency tables above, these counts are case-sensitive.
    domain_counts = df['domain'].value_counts()

    # Plot top 20 domains
    plt.figure(figsize=(14, 8))
    domain_counts.head(20).plot(kind='bar')
    plt.title('Top 20 Ontology Domains')
    plt.xlabel('Domain')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Print domain statistics
    print(f"\nTotal number of unique domains: {len(domain_counts)}")
    print("\nTop 10 domains:")
    display(domain_counts.head(10))

    # Analyze domains by institution
    domains_by_institution = df.groupby('institution')['domain'].value_counts().unstack().fillna(0)

    # DataFrame.plot opens a figure of its own unless it is given an axes, which
    # left the prepared 14x8 figure empty; draw on that figure's axes instead.
    _, ax_stacked = plt.subplots(figsize=(14, 8))
    domains_by_institution.plot(kind='bar', stacked=True, ax=ax_stacked)
    plt.title('Domain Distribution by Institution')
    plt.xlabel('Institution')
    plt.ylabel('Count')
    plt.legend(title='Domain', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Call the function
# Reads the 'domain', 'institution' and 'class_year' columns of df.
analyze_domains(df)


## 14. Conclusions and Recommendations

Finally, we draw conclusions from our analysis and provide recommendations for improving ontology development practices. This section summarizes key findings from the analysis and offers actionable recommendations for enhancing ontology development processes, improving quality, and addressing identified issues.


In [None]:

def draw_conclusions(df):
    """
    Draw conclusions and make recommendations based on the analysis.
    """
    print("Conclusions and Recommendations:")
    
    # Overall statistics
    print("\n1. Overall Statistics:")
    print(f"   - Total number of ontologies: {len(df)}")
    print(f"   - Number of institutions: {df['institution'].nunique()}")
    print(f"   - Number of class years: {df['class_year'].nunique()}")
    
    # Complexity
    print("\n2. Ontology Complexity:")
    avg_complexity = df['complexity_score'].mean()
    print(f"   - Average complexity score: {avg_complexity:.2f}")
    print("   - Recommendation: Focus on reducing complexity in future ontologies while maintaining expressiveness.")
    
    # Error analysis
    print("\n3. Error Analysis:")
    avg_prock_errors = df['prock_error_count'].mean()
    avg_oops_errors = df['oops_error_count'].mean()
    print(f"   - Average Prock errors per ontology: {avg_prock_errors:.2f}")
    print(f"   - Average OOPS errors per ontology: {avg_oops_errors:.2f}")
    print("   - Recommendation: Implement stricter quality control measures to reduce the number of errors.")
    
    # Competency questions
    print("\n4. Competency Questions:")
    avg_questions = df['question_count'].mean()
    print(f"   - Average number of competency questions per ontology: {avg_questions:.2f}")
    print("   - Recommendation: Encourage the creation of more comprehensive sets of competency questions to guide ontology development.")
    
    # Domain diversity
    print("\n5. Domain Diversity:")
    num_domains = df['domain'].nunique()
    print(f"   - Number of unique domains: {num_domains}")
    print("   - Recommendation: Explore opportunities to develop ontologies in underrepresented domains.")
    
    # Trends
    print("\n6. Trends:")
    recent_year = df['year'].max()
    recent_complexity = df[df['year'] == recent_year]['complexity_score'].mean()
    overall_complexity = df['complexity_score'].mean()
    if recent_complexity > overall_complexity:
        trend = "increasing"
    else:
        trend = "decreasing"
    print(f"   - Ontology complexity is {trend} over time.")
    print("   - Recommendation: Continue monitoring trends and adjust teaching methods accordingly.")
    
# Needs the columns added by the complexity, error, competency-question and trend cells above.
draw_conclusions(df)

In [None]:

# Run the entire analysis
# NOTE: when this file is executed as a notebook or script, __name__ is
# "__main__", so this block repeats every step above on a freshly loaded df.
# The call order matters: later steps read columns that earlier ones add to df
# ('complexity_score', 'prock_error_count'/'oops_error_count', 'question_count', 'year').
# `key_metrics` is the module-level list defined in the metrics-analysis cell.
if __name__ == "__main__":
    df = load_and_preprocess_data('data/honest_benchmark_v4.json')
    display_basic_stats(df)
    analyze_metrics(df, key_metrics)
    analyze_by_institution_and_class_year(df, key_metrics)
    analyze_complexity(df, key_metrics)
    analyze_errors(df)
    analyze_dl_expressivity(df)
    analyze_competency_questions(df)
    analyze_outliers(df)
    analyze_trends(df)
    perform_statistical_analysis(df)
    assess_quality(df)
    analyze_domains(df)
    draw_conclusions(df)