# Cleaning CSVs and merging them 

In [5]:
import os 
import pandas as pd 

def display_csv(input_folder, rows_to_display):

    for filename in os.listdir(input_folder):
        if filename.endswith('.csv'):
            file_path = os.path.join(input_folder, filename )
            print(f"\nLoading file: {file_path}")

            try:
                df = pd.read_csv(file_path)

                print(df.head(rows_to_display))

            except Exception as e:
                print(f"Error loading {file_path}: {e}")    

                
                  

In [6]:
input_folder = "/home/yassine/Textra-edu/csv_outputs"
display_csv(input_folder, rows_to_display=5)


Loading file: /home/yassine/Textra-edu/csv_outputs/summarized_PIMM_JME_2024_HASCOET.csv
   Chunk Number                                Original Text Chunk  \
0             1  HAL open science Open-Loop Control System for ...   
1             2  Javier Arduengo, Nicolas Hascoet, Francisco Ch...   
2             3  HAL Id: hal-04675399 https/halsclence/hal-0467...   
3             4  archive for the deposit and dissemination of s...   
4             5  Journal ofMachine Engineering, 2024, Vol. 24, ...   

                                             Summary  
0  Chunk 1: This video is part of the HAL Open Sc...  
1  Chunk 2: Open-Loop Control System for High Pre...  
2  Chunk 3: The Historical Archive of France (HAL...  
3  Chunk 4: The aim of this project is to make av...  
4  Chunk 5: A new method for 3D bioprinting has b...  

Loading file: /home/yassine/Textra-edu/csv_outputs/summarized_Cours Stochastiques - M. Hadda.csv
   Chunk Number                                Original Text C

In [11]:

def clean_summary_column(input_folder, output_folder):
    """
    Removes the 'Chunk X: ' prefix from the 'Summary' column in all CSV files in a folder.

    Parameters:
    - input_folder: Path to the folder containing input CSV files.
    - output_folder: Path to save the cleaned CSV files.
    """
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through all CSV files in the folder
    for filename in os.listdir(input_folder):
        if filename.endswith('.csv'):
            input_csv_path = os.path.join(input_folder, filename)
            output_csv_path = os.path.join(output_folder, filename)

            print(f"Processing file: {input_csv_path}")

            # Load the CSV file
            df = pd.read_csv(input_csv_path)

            # Clean the 'Summary' column by removing the 'Chunk X: ' prefix
            if "Summary" in df.columns:
                df["Summary"] = df["Summary"].str.replace(r'^Chunk \d+: ', '', regex=True)

            # Save the cleaned DataFrame to a new CSV file
            df.to_csv(output_csv_path, index=False, encoding='utf-8')
            print(f"Cleaned file saved to: {output_csv_path}")
            print(df.head())




In [12]:
output_folder = "/home/yassine/Textra-edu/csv_outputs_cleaned"
clean_summary_column(input_folder, output_folder)

Processing file: /home/yassine/Textra-edu/csv_outputs/summarized_PIMM_JME_2024_HASCOET.csv
Cleaned file saved to: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_PIMM_JME_2024_HASCOET.csv
   Chunk Number                                Original Text Chunk  \
0             1  HAL open science Open-Loop Control System for ...   
1             2  Javier Arduengo, Nicolas Hascoet, Francisco Ch...   
2             3  HAL Id: hal-04675399 https/halsclence/hal-0467...   
3             4  archive for the deposit and dissemination of s...   
4             5  Journal ofMachine Engineering, 2024, Vol. 24, ...   

                                             Summary  
0  This video is part of the HAL Open Science pro...  
1  Open-Loop Control System for High Precision Ex...  
2        The Historical Archive of France (HAL) is a  
3  The aim of this project is to make available t...  
4  A new method for 3D bioprinting has been devel...  
Processing file: /home/yassine/Textra-edu/csv_outputs

In [14]:
import os
import pandas as pd

def merge_csv_files(input_folder, output_file):
    """
    Merges all CSV files in a specified folder into a single CSV file.

    Parameters:
    - input_folder: Path to the folder containing the CSV files to merge.
    - output_file: Path to save the consolidated CSV file.
    """
    # List to hold DataFrames
    data_frames = []

    # Iterate through all CSV files in the folder
    for filename in os.listdir(input_folder):
        if filename.endswith('.csv'):
            file_path = os.path.join(input_folder, filename)
            print(f"Processing file: {file_path}")

            # Load the CSV file into a DataFrame
            df = pd.read_csv(file_path)

            # Add a column for the source file name
            df["Source File"] = filename

            # Append the DataFrame to the list
            data_frames.append(df)

    # Concatenate all DataFrames
    if data_frames:
        merged_df = pd.concat(data_frames, ignore_index=True)

        # Save the merged DataFrame to a CSV file
        merged_df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"All files merged and saved to: {output_file}")
    else:
        print("No CSV files found in the input folder.")

In [16]:
merge_csv_files(output_folder,"/home/yassine/Textra-edu/merged.csv" )

Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_PIMM_JME_2024_HASCOET.csv
Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_Cours Stochastiques - M. Hadda.csv
Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_big-book-of-machine-learning-use-cases-2nd-edition.csv
Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_PIMM_CRM _ 2019_REILLE.csv
Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_I2M_AEM_Yadav_2021.csv
Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_applsci-10-05261-v3.csv
Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_gaudez_2023additive-manufacturing.csv
Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_Thermodynamics-based Artificial Neural Networks for constitutive modeling.csv
Processing file: /home/yassine/Textra-edu/csv_outputs_cleaned/summarized_output_15.csv
Processing file: /home/

In [17]:
data_merged = pd.read_csv("/home/yassine/Textra-edu/merged.csv")

In [18]:
data_merged.head()

Unnamed: 0,Chunk Number,Original Text Chunk,Summary,Source File
0,1,HAL open science Open-Loop Control System for ...,This video is part of the HAL Open Science pro...,summarized_PIMM_JME_2024_HASCOET.csv
1,2,"Javier Arduengo, Nicolas Hascoet, Francisco Ch...",Open-Loop Control System for High Precision Ex...,summarized_PIMM_JME_2024_HASCOET.csv
2,3,HAL Id: hal-04675399 https/halsclence/hal-0467...,The Historical Archive of France (HAL) is a,summarized_PIMM_JME_2024_HASCOET.csv
3,4,archive for the deposit and dissemination of s...,The aim of this project is to make available t...,summarized_PIMM_JME_2024_HASCOET.csv
4,5,"Journal ofMachine Engineering, 2024, Vol. 24, ...",A new method for 3D bioprinting has been devel...,summarized_PIMM_JME_2024_HASCOET.csv
