# 文献数据清洗

In [12]:
import re
import json
import os
import glob

def clean_document(input_data):
    """
    Cleans the input document by removing references, acknowledgements,
    and in-text citations while preserving measurements.

    Parameters:
    - input_data (str or dict): The input document content as a string (plain text)
      or a dictionary (JSON-like structure).

    Returns:
    - str or dict: The cleaned document in the same format as the input.
    """
    def clean_text(text):
        # Define patterns for non-content sections
        non_content_sections = [
            'References',
            'NOTES AND REFERENCES',
            'Notes and references',
            'REFERENCES',
            '参考文献',
            '引用',
            'Bibliography',
            'Cited By',
            'Acknowledgment',
            'Acknowledgement',
            'Acknowledgments',
            'Acknowledgements',
            'Supporting Information',
            'Author Information',
            'Terms & Conditions',
            'This article is cited by',
            'Conflicts of interest',
            # Add any other headings that mark the end of the main content
        ]
        # Build a combined pattern for non-content sections
        combined_non_content_pattern = '|'.join([re.escape(section) for section in non_content_sections])

        # Reverse the text to search from the end
        reversed_text = text[::-1]
        # Reverse the patterns as well
        reversed_patterns = [section[::-1] for section in non_content_sections]
        combined_reversed_pattern = '|'.join([re.escape(pattern) for pattern in reversed_patterns])

        # Find the earliest occurrence from the end (which is the last occurrence in the original text)
        match = re.search(combined_reversed_pattern, reversed_text, flags=re.IGNORECASE)
        if match:
            # Truncate the text from the position where the non-content section starts
            cut_off_position = len(text) - match.end()
            text = text[:cut_off_position]

        # Now remove in-text citations like [12], (12), or superscript numbers
        # First, protect measurements to prevent accidental removal
        units = [
            '°C', 'h', 'mV', 'V', 'wt%', 'μm', 'nm', 'g', 'mg', 'kg', 'cm−2',
            'cm²', 'mA', 'wt%', 'kPa', 'MPa', 'kW', 's', 'min', 'hours', 'days',
            'weeks', 'months', 'years', 'bar', 'mol', 'atm', 'rpm', 'wt. %', '%',
            'mm', 'A', 'K', 'J', 'Pa', 'mbar', 'mTorr', 'μA', 'nA', 'pA', 'Torr',
            # Add any additional units as needed
        ]
        units_pattern = r'(?<!\w)(\d+(?:\.\d+)?\s*(?:' + '|'.join(units) + r'))(?!\w)'

        # Placeholder for measurements to protect them during cleaning
        measurement_placeholder = 'MEASUREMENTPLACEHOLDER'

        def protect_measurements(text):
            # Find all measurements to protect them
            measurements = re.findall(units_pattern, text)
            # Replace measurements with placeholders
            for measurement in measurements:
                escaped_measurement = re.escape(measurement)
                text = re.sub(escaped_measurement, measurement_placeholder, text, count=1)
            return text, measurements

        def restore_measurements(text, measurements):
            # Restore measurements from placeholders
            for measurement in measurements:
                text = text.replace(measurement_placeholder, measurement, 1)
            return text

        # Protect measurements
        text_protected, measurements = protect_measurements(text)

        # Remove in-text citations
        # Patterns:
        # - [12], [12,13], [12–15]
        # - (12), (12,13), (12–15)
        # - Superscript numbers (if any)
        # - Numbers at the end of sentences, possibly preceded by a comma

        # Pattern for in-text citations in square brackets
        square_bracket_citation_pattern = r'\[\s*\d+(?:–\d+)?(?:,\s*\d+(?:–\d+)?)*\s*\]'

        # Pattern for in-text citations in parentheses
        parenthesis_citation_pattern = r'\(\s*\d+(?:–\d+)?(?:,\s*\d+(?:–\d+)?)*\s*\)'

        # Pattern for superscript numbers (e.g., citation markers)
        superscript_citation_pattern = r'\^\s*\d+(?:,\s*\d+)*'

        # Remove the in-text citations
        text_no_citations = re.sub(square_bracket_citation_pattern, '', text_protected)
        text_no_citations = re.sub(parenthesis_citation_pattern, '', text_no_citations)
        text_no_citations = re.sub(superscript_citation_pattern, '', text_no_citations)

        # Remove numbers at the end of sentences that are likely citations
        end_of_sentence_citation_pattern = r',?\s*\d+(?:,\s*\d+)*(?=\s|\.|,|;|:|\))'
        text_no_citations = re.sub(end_of_sentence_citation_pattern, '', text_no_citations)

        # Restore measurements
        text_cleaned = restore_measurements(text_no_citations, measurements)

        # Clean up extra spaces and punctuation
        text_cleaned = re.sub(r'\s{2,}', ' ', text_cleaned)
        text_cleaned = re.sub(r'\s+([.,;:])', r'\1', text_cleaned)
        text_cleaned = re.sub(r'\(\s*\)', '', text_cleaned)  # Remove empty parentheses
        text_cleaned = re.sub(r'\[\s*\]', '', text_cleaned)  # Remove empty brackets

        return text_cleaned.strip()

    if isinstance(input_data, dict):
        # Input is a JSON-like dictionary
        # Remove non-content sections (case-insensitive)
        non_content_keys = [
            'Notes and references',
            'REFERENCES',
            'References',
            #'Supporting Information',
            'Acknowledgements',
            'Acknowledgments',
            'Conflicts of interest',
            'Terms & Conditions',
            # Add any other keys that are non-content
        ]
        # Create a set for faster lookup
        non_content_keys_lower = set(key.lower() for key in non_content_keys)
        keys_to_remove = [key for key in input_data.keys() if key.strip().lower() in non_content_keys_lower]
        for key in keys_to_remove:
            del input_data[key]

        # Clean each section
        for key in list(input_data.keys()):
            content = input_data[key]
            if isinstance(content, str):
                cleaned_content = clean_text(content)
                if not cleaned_content.strip():
                    # Remove the section if it's empty after cleaning
                    del input_data[key]
                else:
                    input_data[key] = cleaned_content
            elif isinstance(content, list):
                # If the content is a list of paragraphs
                cleaned_paragraphs = [clean_text(paragraph) for paragraph in content]
                # Remove empty paragraphs
                cleaned_paragraphs = [p for p in cleaned_paragraphs if p.strip()]
                if not cleaned_paragraphs:
                    # Remove the section if all paragraphs are empty
                    del input_data[key]
                else:
                    input_data[key] = cleaned_paragraphs
            # Add more conditions if needed for other data types

        return input_data

    elif isinstance(input_data, str):
        # Input is plain text
        cleaned_text = clean_text(input_data)
        return cleaned_text

    else:
        raise TypeError("Input data must be either a string or a dictionary.")

def process_file(file_path):
    """
    Processes a file to clean it by removing references, acknowledgements,
    and in-text citations while preserving measurements.

    Parameters:
    - file_path (str): The path to the input file.

    The function saves the cleaned content to a new file with '_cleaned' appended to the original filename.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    try:
        # Try to parse the content as JSON
        data = json.loads(content)
        # Input is JSON-like
        cleaned_data = clean_document(data)
        # Save the cleaned JSON
        base, ext = os.path.splitext(file_path)
        output_dir = os.path.join(os.path.dirname(file_path), 'output')
        os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
        new_file_path = os.path.join(output_dir, f"{os.path.basename(base)}_cleaned{ext}")
        with open(new_file_path, 'w', encoding='utf-8') as file:
            json.dump(cleaned_data, file, ensure_ascii=False, indent=4)
        print(f"Cleaned JSON document saved as {new_file_path}")
    except json.JSONDecodeError:
        # Input is plain text
        cleaned_text = clean_document(content)
        # Save the cleaned text
        base, ext = os.path.splitext(file_path)
        output_dir = os.path.join(os.path.dirname(file_path), 'output')
        os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
        new_file_path = os.path.join(output_dir, f"{os.path.basename(base)}_cleaned{ext}")
        with open(new_file_path, 'w', encoding='utf-8') as file:
            file.write(cleaned_text)
        print(f"Cleaned text document saved as {new_file_path}")

def batch_process_files(file_pattern):
    """
    Batch processes multiple files matching the given pattern.

    Parameters:
    - file_pattern (str): The glob pattern to match files, e.g., 'documents/*.txt'
    """
    for file_path in glob.glob(file_pattern):
        process_file(file_path)

# To process multiple files in a directory:
batch_process_files('maintext_clean_test/*.txt')  # Adjust the pattern as needed


Cleaned text document saved as maintext_clean_test/output/A_Facile_and_General_Approach_for_the_Direct_Fabri_maintext_cleaned.txt
Cleaned text document saved as maintext_clean_test/output/Atomically_Dispersed_Transition_Metals_on_Carbon_N_maintext_cleaned.txt
Cleaned text document saved as maintext_clean_test/output/Enhanced_oxygen_reduction_reaction_activity_of_nit_maintext_cleaned.txt
Cleaned text document saved as maintext_clean_test/output/A_KClassisted_pyrolysis_strategy_to_fabricate_nitr_maintext_cleaned.txt
Cleaned text document saved as maintext_clean_test/output/Electrocatalytic_activity_of_nitrogen_doped_carbon_maintext_cleaned.txt


# ~


In [8]:
import json
from collections import defaultdict

def process_spectroscopy_data(json_data, spectroscopy_types):
    materials = json_data['materials']
    new_materials = []

    for material in materials:
        # 用于存储各类谱图数据的字典
        combined_spectra = {spec_type: defaultdict(list) for spec_type in spectroscopy_types}

        # 遍历所有属性并收集指定类型的谱图数据
        for prop in material['properties']:
            if prop['type'] in spectroscopy_types:
                combined_spectra[prop['type']]['values'].append(prop['value'])
                combined_spectra[prop['type']]['units'].append(prop['unit'])
                combined_spectra[prop['type']]['conditions'].append(prop['conditions'])
                combined_spectra[prop['type']]['evidence'].append(prop['evidence'])

        # 将收集到的谱图数据转换为合适的向量表示
        for spec_type, data in combined_spectra.items():
            if data['values']:
                combined_vector = {
                    "type": spec_type,
                    "values": data['values'],
                    "units": list(set(data['units'])),
                    "conditions": list(set(data['conditions'])),
                    "evidence": sum(data['evidence']) / len(data['evidence']) if data['evidence'] else None
                }

                # 将新生成的谱图数据添加回原材料属性中
                material['properties'] = [prop for prop in material['properties'] if prop['type'] != spec_type]
                material['properties'].append(combined_vector)

        new_materials.append(material)

    json_data['materials'] = new_materials
    return json_data

# 示例用法
spectroscopy_types = ["XPS", "XRD", "FTIR", "Raman"]  # 添加你想要处理的其他谱图类型

input_file = '/Users/yangz/Documents/projects/llm4catalyst/results/test.json'
with open(input_file, 'r') as f:
    json_data = json.load(f)

processed_data = process_spectroscopy_data(json_data, spectroscopy_types)

# 将处理后的数据保存回 JSON 文件
with open(input_file, 'w') as f:
    json.dump(processed_data, f, indent=4)


KeyError: 'value'

In [9]:
import json
import os
from collections import defaultdict

def process_spectroscopy_data(json_data, spectroscopy_types):
    materials = json_data['materials']
    new_materials = []

    for material in materials:
        combined_spectra = {spec_type: defaultdict(list) for spec_type in spectroscopy_types}

        for prop in material['properties']:
            if prop['type'] in spectroscopy_types:
                # Check if the necessary keys exist before accessing them
                if 'value' in prop and 'unit' in prop and 'conditions' in prop and 'evidence' in prop:
                    combined_spectra[prop['type']]['values'].append(prop['value'])
                    combined_spectra[prop['type']]['units'].append(prop['unit'])
                    combined_spectra[prop['type']]['conditions'].append(prop['conditions'])
                    combined_spectra[prop['type']]['evidence'].append(prop['evidence'])

        for spec_type, data in combined_spectra.items():
            if data['values']:
                combined_vector = {
                    "type": spec_type,
                    "values": data['values'],
                    "units": list(set(data['units'])),
                    "conditions": list(set(data['conditions'])),
                    "evidence": sum(data['evidence']) / len(data['evidence']) if data['evidence'] else None
                }

                material['properties'] = [prop for prop in material['properties'] if prop['type'] != spec_type]
                material['properties'].append(combined_vector)

        new_materials.append(material)

    json_data['materials'] = new_materials
    return json_data

def process_all_files_in_directory(directory, spectroscopy_types):
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r') as f:
                        json_data = json.load(f)

                    processed_data = process_spectroscopy_data(json_data, spectroscopy_types)

                    with open(file_path, 'w') as f:
                        json.dump(processed_data, f, indent=4)
                    print(f"Processed file: {file_path}")
                except json.JSONDecodeError:
                    print(f"Error decoding JSON in file: {file_path}")
                except KeyError as e:
                    print(f"Missing expected key {e} in file: {file_path}")
                except Exception as e:
                    print(f"An error occurred while processing file {file_path}: {e}")

# 示例用法
spectroscopy_types = ["XPS", "XRD", "FTIR", "Raman"]  # 添加你想要处理的其他谱图类型
directory = '/Users/yangz/Documents/projects/llm4catalyst/results/'

process_all_files_in_directory(directory, spectroscopy_types)


Processed file: /Users/yangz/Documents/projects/llm4catalyst/results/Edge-sited_Fe-N4_atomic_species_Fe-N4_result_20240812062353.json
Processed file: /Users/yangz/Documents/projects/llm4catalyst/results/Carbon_nanotubes_with_rich_pyridinic_nitrogen_CNT_result_20240812042632.json
Processed file: /Users/yangz/Documents/projects/llm4catalyst/results/3D_porous_Fe_N_C_electrocatalyst_Fe_N_C_result_20240811175645.json
Processed file: /Users/yangz/Documents/projects/llm4catalyst/results/Graphitic-phase_C3N4_g-C3N4_result_20240812061308.json
Processed file: /Users/yangz/Documents/projects/llm4catalyst/results/Nitrogen-doped_carbons_NCs_result_20240812075901.json
Processed file: /Users/yangz/Documents/projects/llm4catalyst/results/hetero-single-atom_ORR_electrocatalyst_Fe_Ni_h-SA_result_20240812054017.json
Processed file: /Users/yangz/Documents/projects/llm4catalyst/results/carbon_nanotubes_doped_with_nitrogen_CNT-N_result_20240811183729.json
Processed file: /Users/yangz/Documents/projects/llm4

In [7]:
import os
import json
import csv

def json_to_csv(directory, output_csv_file):
    all_keys = set()
    rows = []

    # First pass: collect all possible property types
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        json_data = json.load(f)
                        title = json_data.get('meta', {}).get('title', '')

                        for material in json_data.get('materials', []):
                            material_name = material.get('material_name', '')
                            row = {
                                'title': title,
                                'material_name': material_name
                            }

                            for prop in material.get('properties', []):
                                prop_type = prop.get('type', '')
                                value = prop.get('value', '')
                                conditions = prop.get('conditions', '')
                                combined_value = f"{value} ({conditions})"
                                row[prop_type] = combined_value
                                all_keys.add(prop_type)

                            rows.append(row)
                except json.JSONDecodeError as e:
                    print(f"Skipping file {file_path} due to JSON decoding error: {e}")
                except Exception as e:
                    print(f"An error occurred while processing {file_path}: {e}")

    # Ensure 'title' and 'material_name' are in the columns
    all_keys = sorted(all_keys)
    all_keys.insert(0, 'material_name')
    all_keys.insert(0, 'title')

    # Second pass: write the data to the CSV file
    with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=all_keys)
        writer.writeheader()

        for row in rows:
            writer.writerow(row)

    print(f"All data has been written to {output_csv_file}")

# Example usage
directory = '/Users/yangz/Documents/projects/llm4catalyst/results/'
output_csv_file = '/Users/yangz/Documents/projects/llm4catalyst/results/combined_data.csv'

json_to_csv(directory, output_csv_file)


Skipping file /Users/yangz/Documents/projects/llm4catalyst/results/test/Nitrogen-doped_graphene_N-graphene_result_20240626142827.json due to JSON decoding error: Extra data: line 58 column 1 (char 1873)
Skipping file /Users/yangz/Documents/projects/llm4catalyst/results/test/nitrogen-doped_graphene_quantum_dots_N-GQDs_result_20240626212537.json due to JSON decoding error: Extra data: line 163 column 1 (char 4654)
Skipping file /Users/yangz/Documents/projects/llm4catalyst/results/test/nitrogen-doped_graphene_quantum_dots_N-GQDs_result_20240626202829.json due to JSON decoding error: Extra data: line 127 column 1 (char 3506)
Skipping file /Users/yangz/Documents/projects/llm4catalyst/results/test/nitrogen-doped_graphene_quantum_dots_N-GQDs_result_20240626213909.json due to JSON decoding error: Extra data: line 187 column 1 (char 5349)
Skipping file /Users/yangz/Documents/projects/llm4catalyst/results/test/plasma-engraved_Co3O4_nanosheets_Co3O4_result_20240626143558.json due to JSON decoding