In [16]:
import os
import shutil
import fitz  # PyMuPDF for PDF
import docx
import pandas as pd
import json
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor

# Source tree that is scanned for convertible files.
root_folder = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf"
# Windows-style path to the "random" folder, kept for reference only (unused):
#root_directory=r"C:\Users\Zy565\Downloads\EXHD\dataNEOUROlift\random"

# Destination for the generated .txt files.
# NOTE: this folder is nested inside root_folder, so an os.walk() over
# root_folder also visits it (and anything written there by an earlier run).
output_folder = os.path.join(root_folder, "text_files")

# Create the output folder up front; exist_ok=True keeps re-runs from failing.
os.makedirs(output_folder, exist_ok=True)

# Convert a single source file into a .txt file under output_folder
def convert_to_text(file_path):
    """Write a plain-text counterpart of ``file_path`` below ``output_folder``.

    The output path mirrors the file's location relative to ``root_folder``,
    with the extension replaced by ``.txt``. Subtitle and plain-text files are
    copied as-is; pdf, docx, csv/xlsx, json and xml files go through the
    matching ``extract_*`` helper and the result is written as UTF-8. Files
    with any other extension are reported and skipped.

    Returns None. Any exception is caught and reported with ``print`` so one
    bad file does not stop the remaining conversions.
    """
    try:
        relative = os.path.relpath(file_path, root_folder)
        stem = os.path.splitext(relative)[0]
        txt_path = os.path.join(output_folder, stem + ".txt")
        # The target directory is created before the type check, so it exists
        # even when the file ends up being skipped.
        os.makedirs(os.path.dirname(txt_path), exist_ok=True)

        ext = file_path.lower().rsplit(".", 1)[-1]

        # Text-like formats need no extraction: copy them verbatim.
        if ext in ("srt", "vtt", "txt", "sub", "ass", "ssa"):
            shutil.copy(file_path, txt_path)
            print(f"Copied: {file_path} → {txt_path}")
            return

        # Pick the extractor for the remaining supported formats.
        if ext == "pdf":
            extractor = extract_pdf_text
        elif ext == "docx":
            extractor = extract_docx_text
        elif ext in ("csv", "xlsx"):
            extractor = extract_excel_text
        elif ext == "json":
            extractor = extract_json_text
        elif ext == "xml":
            extractor = extract_xml_text
        else:
            print(f"Skipping unsupported file: {file_path}")
            return

        text = extractor(file_path)

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"Converted: {file_path} → {txt_path}")

    except Exception as e:
        print(f"Error converting {file_path}: {e}")

# PDF to Text
def extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Pages are taken in document order and each page's text is followed by a
    newline. A document without pages yields an empty string.
    """
    with fitz.open(pdf_path) as pdf:
        return "".join(page.get_text() + "\n" for page in pdf)

# DOCX to Text
def extract_docx_text(docx_path):
    """Return the paragraphs of the .docx file at ``docx_path``, one per line.

    Only ``Document.paragraphs`` is read; the paragraph texts are joined with
    a single newline and no trailing newline is added.
    """
    document = docx.Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)

# Excel (CSV/XLSX) to Text
def extract_excel_text(excel_path):
    """Load a CSV or Excel file with pandas and return it rendered as text.

    Paths ending in ``.csv`` (in any letter case) are read with
    ``pd.read_csv``; everything else is handed to ``pd.read_excel``.
    The frame is rendered with ``DataFrame.to_string()``.
    """
    # The caller dispatches on a lower-cased extension, so "DATA.CSV" arrives
    # here too; compare case-insensitively or it would be sent to read_excel.
    if excel_path.lower().endswith(".csv"):
        df = pd.read_csv(excel_path)
    else:
        df = pd.read_excel(excel_path)
    return df.to_string()

# JSON to Text
def extract_json_text(json_path):
    """Return the JSON document at ``json_path`` pretty-printed with 4-space indent.

    The file is decoded as UTF-8 (a leading byte-order mark is tolerated) and
    non-ASCII characters are kept as-is in the result rather than being
    escaped, so the text stays readable once written to a UTF-8 file.

    Raises whatever ``open`` / ``json.load`` raise for unreadable or invalid input.
    """
    # "utf-8-sig" skips a BOM when present; json.load rejects a BOM otherwise.
    with open(json_path, "r", encoding="utf-8-sig") as f:
        data = json.load(f)
    # ensure_ascii=False: keep e.g. Arabic text instead of \uXXXX escapes.
    return json.dumps(data, indent=4, ensure_ascii=False)

# XML to Text
def extract_xml_text(xml_path):
    """Parse the XML file at ``xml_path`` and return its root element as a string.

    The result is the re-serialized markup (tags included), produced as UTF-8
    bytes by ``ET.tostring`` and decoded back to ``str``.
    """
    root_element = ET.parse(xml_path).getroot()
    serialized = ET.tostring(root_element, encoding="utf-8")
    return serialized.decode("utf-8")

# Get all file paths below root_folder.
# output_folder is nested inside root_folder, so it is pruned from the walk;
# otherwise a re-run would pick up the previously generated .txt files and
# copy them into text_files/text_files/... one level deeper each time.
file_paths = []
output_root = os.path.abspath(output_folder)
for root, dirs, files in os.walk(root_folder):
    # In-place edit of `dirs` (top-down walk) stops os.walk from descending.
    dirs[:] = [d for d in dirs if os.path.abspath(os.path.join(root, d)) != output_root]
    for file in files:
        file_paths.append(os.path.join(root, file))

# Process files using 30 threads.
# convert_to_text catches and prints its own errors, so the results of
# executor.map are not inspected; leaving the `with` block waits for all tasks.
# NOTE(review): PyMuPDF's documentation states it does not support use from
# multiple threads - confirm whether PDF extraction should run serially or in
# separate processes instead.
with ThreadPoolExecutor(max_workers=30) as executor:
    executor.map(convert_to_text, file_paths)


Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/001 data-preprocessing-homework.pdf → /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/text_files/001 data-preprocessing-homework.txt
Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/001 Course-Notes-Logistic-Regression.pdf → /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/text_files/001 Course-Notes-Logistic-Regression.txt
Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/004 Conditional-formatting-example.pdf → /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/text_files/004 Conditional-formatting-example.txt
Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/001 Shortcuts-for-Jupyter.pdf → /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/text_files/001 Shortcuts-for-Jupyter.txt
Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/002 Case-Study-2.pdf → /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/pdf/text_files/002 Case-Study-2.txt
Converted: /mnt/

: 

: 

: 

In [1]:
import os
import re

# Root directory
root_directory = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random"
#root_directory=r"C:\Users\Zy565\Downloads\EXHD\dataNEOUROlift\random"
# Allowed characters: letters (A-Z, a-z), digits (0-9), Arabic (ا-ي)
allowed_pattern = re.compile(r'^[a-zA-Z0-9\u0621-\u064A]')


def strip_leading_symbols(directory):
    """Rename files below ``directory`` so their names start with an allowed character.

    Leading characters that do not match ``allowed_pattern`` are dropped from
    each file name. A file is left untouched when its name is already clean,
    when nothing would remain after stripping, or when a file with the cleaned
    name already exists in the same folder (renaming would overwrite it).
    Prints one line per renamed or skipped-for-collision file.
    """
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            # Index of the first allowed character (len(filename) if none).
            start = 0
            while start < len(filename) and not allowed_pattern.match(filename[start]):
                start += 1
            new_filename = filename[start:]

            # Nothing to strip, or the whole name would disappear.
            if not new_filename or new_filename == filename:
                continue

            old_path = os.path.join(dirpath, filename)
            new_path = os.path.join(dirpath, new_filename)

            # os.rename can silently replace an existing file; never clobber.
            if os.path.exists(new_path):
                print(f"Skipped (target exists): {old_path} -> {new_path}")
                continue

            os.rename(old_path, new_path)
            print(f"Renamed: {old_path} -> {new_path}")


# Walk through all subdirectories and files
strip_leading_symbols(root_directory)


In [14]:
import os
import shutil

root_directory = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/pdf/text_files"
sa_folder = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random"

# Relocate every .txt file found below root_directory into the matching
# relative sub-folder of sa_folder.
for current_dir, _, names in os.walk(root_directory):
    txt_names = [name for name in names if name.endswith(".txt")]
    if not txt_names:
        # No .txt here: nothing is moved and no target folder is created.
        continue

    # Mirror this folder's position (relative to root_directory) under sa_folder.
    destination_dir = os.path.join(sa_folder, os.path.relpath(current_dir, root_directory))
    os.makedirs(destination_dir, exist_ok=True)

    for name in txt_names:
        shutil.move(os.path.join(current_dir, name), os.path.join(destination_dir, name))

print("All .txt files have been moved successfully.")


All .txt files have been moved successfully.


In [1]:
import os

def convert_to_txt(file_path):
    """Write a ``.txt`` copy of a subtitle file without cue numbers and timing lines.

    The output is placed next to ``file_path`` with the extension replaced by
    ``.txt``. Lines consisting only of digits and lines containing ``-->``
    are dropped; every other line (blank lines included) is copied unchanged.
    The input is decoded as UTF-8 with undecodable bytes replaced.
    """
    txt_file = os.path.splitext(file_path)[0] + ".txt"

    # "utf-8-sig" drops a leading byte-order mark when there is one. With plain
    # "utf-8" the first cue number is read as "\ufeff1", fails isdigit() and
    # leaks into the output.
    with open(file_path, "r", encoding="utf-8-sig", errors="replace") as infile, \
         open(txt_file, "w", encoding="utf-8") as outfile:
        for line in infile:
            if not line.strip().isdigit() and "-->" not in line:  # Skip cue numbers and timestamps
                outfile.write(line)

    print(f"Converted: {file_path} -> {txt_file}")

root_directory = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random"

# Subtitle extensions handled by this cell.
SUBTITLE_EXTENSIONS = (".srt", ".vtt", ".sub", ".ass", ".ssa")


def is_subtitle_file(filename):
    """Return True when ``filename`` ends with a subtitle extension, ignoring case."""
    # Lower-casing first means "Intro.SRT" is picked up as well as "intro.srt".
    return filename.lower().endswith(SUBTITLE_EXTENSIONS)


# Convert every subtitle file found below root_directory.
for dirpath, _, filenames in os.walk(root_directory):
    for filename in filenames:
        if is_subtitle_file(filename):
            file_path = os.path.join(dirpath, filename)
            convert_to_txt(file_path)

print("✅ Conversion completed.")


Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/01. Leverage generative AI for analytics and insights.srt -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/01. Leverage generative AI for analytics and insights.txt
Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/02. What you should know.srt -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/02. What you should know.txt
Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/03. How to use the challenge exercise files.srt -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/03. How to use the challenge exercise files.txt
Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/03-Connecting Goals with Financial Results/01-Improving on current financial results.srt -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/03-Connecting Goals with Financial Result

In [12]:
import os

root_directory = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random"

# Function to remove empty folders
def remove_empty_folders(directory):
    """Delete every empty sub-folder below ``directory``.

    The tree is walked bottom-up, so a folder that becomes empty once its own
    sub-folders are removed is found empty when its parent is processed.
    ``directory`` itself is never removed. Prints one line per deleted folder.
    """
    for parent, child_names, _ in os.walk(directory, topdown=False):
        for folder_path in (os.path.join(parent, child) for child in child_names):
            if os.listdir(folder_path):
                continue  # still has content, keep it
            os.rmdir(folder_path)
            print(f"Deleted empty folder: {folder_path}")

# Prune empty sub-folders below root_directory. root_directory itself stays:
# remove_empty_folders only deletes folders listed as children during the walk.
remove_empty_folders(root_directory)

# Printed unconditionally once the call above returns.
print("All empty folders have been removed.")


Deleted empty folder: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/30 Days of Python  Unlock your Python Potential/1. Welcome
Deleted empty folder: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/30 Days of Python  Unlock your Python Potential/10. Day 25 Web Scraping on Javascript Driven HTML
Deleted empty folder: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/30 Days of Python  Unlock your Python Potential/11. Day 26 Get Data with an API
Deleted empty folder: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/30 Days of Python  Unlock your Python Potential/12. Day 27 - 28 Text Messaging (SMSMMS) with Python & Twilio
Deleted empty folder: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/30 Days of Python  Unlock your Python Potential/13. Day 29 Twitter API & Python
Deleted empty folder: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/30 Days of Python  Unlock your Python Potential/14. Day 30 Read Email Inbox using Python & Gmail
Deleted e

In [7]:
import os
import chardet

# Function to detect file encoding
def detect_encoding(file_path):
    """Guess the text encoding of ``file_path`` from its first 10000 bytes.

    Returns the encoding name reported by ``chardet.detect``. Falls back to
    "utf-8" when chardet reports no encoding, and also when it reports plain
    ASCII (see the comment below).
    """
    with open(file_path, "rb") as f:
        raw_data = f.read(10000)  # Read a sample of the file

    encoding = chardet.detect(raw_data)["encoding"]

    # Only a sample is inspected, so an "ascii" verdict says nothing about the
    # rest of the file. UTF-8 decodes ASCII bytes identically, so use it as
    # the superset; later non-ASCII characters are then not lost to U+FFFD.
    if not encoding or encoding.lower() == "ascii":
        return "utf-8"
    return encoding

# Function to convert subtitles to text
def convert_to_txt(file_path):
    """Write a UTF-8 ``.txt`` copy of a subtitle file without cue numbers and timing lines.

    The source encoding comes from ``detect_encoding``; undecodable bytes are
    replaced. The output sits next to ``file_path`` with the extension swapped
    for ``.txt``. Lines made up only of digits and lines containing ``-->``
    are dropped; all other lines, blank ones included, are kept as they are.
    """
    source_encoding = detect_encoding(file_path)
    stem, _ = os.path.splitext(file_path)
    txt_file = stem + ".txt"

    with open(file_path, "r", encoding=source_encoding, errors="replace") as infile, \
         open(txt_file, "w", encoding="utf-8") as outfile:
        # Keep everything except cue numbers and timestamp lines.
        outfile.writelines(
            line
            for line in infile
            if not ("-->" in line or line.strip().isdigit())
        )

    print(f"✅ Converted: {file_path} -> {txt_file}")

# Root directory to scan
root_directory = "/mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random"


def is_srt_or_vtt(filename):
    """Return True when ``filename`` ends with .srt or .vtt, ignoring case."""
    # Lower-casing first means "Intro.SRT" is picked up as well as "intro.srt".
    return filename.lower().endswith((".srt", ".vtt"))


# Walk through directories and process .srt and .vtt files
for dirpath, _, filenames in os.walk(root_directory):
    for filename in filenames:
        if is_srt_or_vtt(filename):  # Only process .srt and .vtt
            file_path = os.path.join(dirpath, filename)
            convert_to_txt(file_path)

print("✅ All .srt and .vtt files converted to .txt")


✅ Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/01. Leverage generative AI for analytics and insights.srt -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/01. Leverage generative AI for analytics and insights.txt
✅ Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/02. What you should know.srt -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/02. What you should know.txt
✅ Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/03. How to use the challenge exercise files.srt -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/0. Introduction/03. How to use the challenge exercise files.txt
✅ Converted: /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/03-Connecting Goals with Financial Results/01-Improving on current financial results.srt -> /mnt/c/Users/Zy565/Downloads/EXHD/dataNEOUROlift/random/03-Connecting Goals with Financia