In [1]:
import pandas as pd
import os

def count_words_in_csv_files(folder_path):
    total_word_count = 0

    # Get all CSV file paths in the folder
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    for file_name in csv_files:
        file_path = os.path.join(folder_path, file_name)
        
        # Load the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        if 'text' in df.columns:
            # Count words in the 'text' column
            column = df['text']
            word_count = column.dropna().apply(lambda x: len(str(x).split())).sum()
            
            print(f"File: {file_name}, Total word count: {word_count}")
            total_word_count += word_count
        else:
            print(f"File: {file_name} does not contain a 'text' column.")
    
    return total_word_count

# Example usage
folder_path = '/home/waqar/Downloads/new'
total_words = count_words_in_csv_files(folder_path)
print(f"Total word count across all CSV files: {total_words}")


File: 1octhamza.csv, Total word count: 7763
File: 26Sephamza.csv, Total word count: 7877
File: 26Sepkhadija.csv, Total word count: 13017
File: 23Sephamza.csv, Total word count: 20419
File: 25Sepzia.csv, Total word count: 29223
Total word count across all CSV files: 78299


In [1]:
import pandas as pd
import os

def create_txt_files_from_csv(folder_path):
    # Path for the 'annotations' folder
    annotations_folder = folder_path
    os.makedirs(annotations_folder, exist_ok=True)
    
    # Get all CSV file paths in the folder
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    print(csv_files)
    for csv_file in csv_files:
        csv_path = os.path.join(folder_path, csv_file)
        print(csv_path)
        # Load the CSV file into a DataFrame
        df = pd.read_csv(csv_path)
        # print(csv_path)
        # print(df.columns)
        if 'text' in df.columns and 'filename' in df.columns:
            # Create a folder named after the CSV file (excluding '.csv') inside 'annotations'
            base_name = os.path.splitext(csv_file)[0]
            output_folder = os.path.join(annotations_folder, base_name)
            os.makedirs(output_folder, exist_ok=True)
            
            for index, row in df.iterrows():
                text_content = row['text']
                audio_file_name = row['filename']
                # audio_file_name=audio_file_name.lower()
                # print(audio_file_name)
                # print(audio_file_name)
                # Skip rows with no text content
                if pd.notna(text_content) and text_content.strip():
                    # Replace the .wav extension with .txt
                    txt_file_name = os.path.splitext(audio_file_name)[0] + '.txt'
                    txt_file_path = os.path.join(output_folder, txt_file_name)
                    
                    # Write the text content to the text file
                    with open(txt_file_path, 'w') as txt_file:
                        txt_file.write(text_content)
                    
                    # print(f"Created file: {txt_file_path}")
                # else:
                    # print(f"Skipping row with empty text content in file: {csv_file}")
        else:
            print(f"File: {csv_file} does not contain 'text' or 'filename' columns.")

# Example usage
folder_path = '/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2'
create_txt_files_from_csv(folder_path)


['9sephamza.csv', '7sephamza.csv', '10sephamza.csv', '12SepHamza.csv', '1octhamza.csv', '9sepzia.csv', '25Sepkhadija.csv', '23Sepzia.csv', '16Sepzia.csv', '13Sepzia.csv', '5sepzia.csv', '26Sephamza.csv', '30auguestheadlinezia.csv', '18Sephamza.csv', '7sephamza2.csv', '2sepzia.csv', '19Sepkhadija.csv', '10sepkhadija.csv', '26Sepkhadija.csv', '24Sepkhadija.csv', '23Sephamza.csv', '2sepkhadija.csv', '2sephamza.csv', '6sepzia.csv', '25Sepzia.csv', '11sepzia.csv', '5sepkhadija.csv', '16Sephamza.csv']
/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2/9sephamza.csv
/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2/7sephamza.csv
/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2/10sephamza.csv
/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2/12SepHamza.csv
/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2/1octhamza.csv
/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2/9sepzia.csv
/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2/25Sepkhadija.csv
/home/waqar/MWaqar/stt-api/

In [16]:
# import os

# def remove_text_files(folder_path):
#     """Removes all .txt files from each subfolder in the specified folder path and prints folder information."""
#     for root, dirs, files in os.walk(folder_path):
#         # Filter the list of files for .txt files
#         text_files = [file for file in files if file.endswith('.txt')]
        
#         if text_files:  # If there are .txt files in the current directory
#             for file in text_files:
#                 file_path = os.path.join(root, file)
#                 try:
#                     os.remove(file_path)
#                 except Exception as e:
#                     print(f"Error removing {file_path}: {e}")
#             # Print the folder information after processing all files in it
#             print(f"Processed folder: {root} - Removed {len(text_files)} text files.")

# if __name__ == "__main__":
#     # Example usage
#     folder_path = '/home/waqar/MWaqar/stt-api/Dataset/Custom_dataset/v2'
#     remove_text_files(folder_path)
