In [1]:
import zipfile
import os
import shutil
import pandas as pd
from Bio import SeqIO

def process_fasta_zip(zip_file_path, output_csv_path, extract_dir_base='extracted_files', label=0):
    """
    Extract the sequences of a FASTA file stored in a ZIP archive and save them to a labelled CSV file.

    The archive is unpacked into ``<extract_dir_base>_<ZIP file name up to its first dot>``;
    any existing content of that directory is removed first. Every record of the
    alphabetically first FASTA file found at the top level of that directory becomes
    one CSV row with the columns ``sequence`` and ``label``.

    Parameters:
    zip_file_path (str): Path to the ZIP file containing the FASTA file.
    output_csv_path (str): Path to save the output CSV file.
    extract_dir_base (str): Base directory to extract files to (default is 'extracted_files').
    label (int): Class label written next to every sequence (default is 0).

    Returns:
    None. If the archive holds no FASTA file, an error is printed and no CSV is written.
    """
    # Create a per-archive extraction directory named after the ZIP file
    zip_stem = os.path.basename(zip_file_path).split('.')[0]
    extract_dir = f"{extract_dir_base}_{zip_stem}"

    # Start from an empty directory so leftovers of an earlier run cannot be picked up
    if os.path.exists(extract_dir):
        shutil.rmtree(extract_dir)
    os.makedirs(extract_dir)

    # Extract all files in the zip archive to the extraction directory
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # os.listdir gives no ordering guarantee; sort so the log and the chosen file are stable
    extracted_files = sorted(os.listdir(extract_dir))
    print(f"Extracted files from {zip_file_path}: {extracted_files}")

    # Keep regular files with a FASTA extension, ignoring the case of the extension
    fasta_files = [
        f for f in extracted_files
        if f.lower().endswith(('.fasta', '.fa')) and os.path.isfile(os.path.join(extract_dir, f))
    ]
    if not fasta_files:
        print(f"Error: No FASTA file found in {zip_file_path}")
        return

    fasta_file_path = os.path.join(extract_dir, fasta_files[0])
    print(f"Processing FASTA file: {fasta_file_path}")

    # Parse the FASTA file and collect the sequences as plain strings
    with open(fasta_file_path, 'r') as fasta_file:
        sequences = [str(record.seq) for record in SeqIO.parse(fasta_file, 'fasta')]

    # One row per sequence, all carrying the same class label
    df = pd.DataFrame({'sequence': sequences, 'label': [label] * len(sequences)})

    # Save to CSV file
    df.to_csv(output_csv_path, index=False)
    print(f"CSV file created at {output_csv_path}")

# Convert the non-enhancer archives to CSV: training split first, then test split
zip_file_path_train, output_csv_path_train = (
    '/common/zhangz2lab/zhanh/enhancer_data/non_enhancers_train.zip',
    '/common/zhangz2lab/zhanh/enhancer_data/non_enhancers_train.csv',
)
process_fasta_zip(zip_file_path_train, output_csv_path_train)

zip_file_path_test, output_csv_path_test = (
    '/common/zhangz2lab/zhanh/enhancer_data/non_enhancers_test.zip',
    '/common/zhangz2lab/zhanh/enhancer_data/non_enhancers_test.csv',
)
process_fasta_zip(zip_file_path_test, output_csv_path_test)


Extracted files from /common/zhangz2lab/zhanh/enhancer_data/non_enhancers_train.zip: ['train-non-enhancers.fasta']
Processing FASTA file: extracted_files_non_enhancers_train/train-non-enhancers.fasta
CSV file created at /common/zhangz2lab/zhanh/enhancer_data/non_enhancers_train.csv
Extracted files from /common/zhangz2lab/zhanh/enhancer_data/non_enhancers_test.zip: ['test-non-enhancers.fasta']
Processing FASTA file: extracted_files_non_enhancers_test/test-non-enhancers.fasta
CSV file created at /common/zhangz2lab/zhanh/enhancer_data/non_enhancers_test.csv


In [2]:
def process_fasta_zip(zip_file_path, output_csv_path, extract_dir_base='extracted_files', label=1):
    """
    Extract the sequences of a FASTA file stored in a ZIP archive and save them to a labelled CSV file.

    The archive is unpacked into ``<extract_dir_base>_<ZIP file name up to its first dot>``;
    any existing content of that directory is removed first. Every record of the
    alphabetically first FASTA file found at the top level of that directory becomes
    one CSV row with the columns ``sequence`` and ``label``.

    Parameters:
    zip_file_path (str): Path to the ZIP file containing the FASTA file.
    output_csv_path (str): Path to save the output CSV file.
    extract_dir_base (str): Base directory to extract files to (default is 'extracted_files').
    label (int): Class label written next to every sequence (default is 1, the value
        this cell uses for the weak and strong enhancer archives).

    Returns:
    None. If the archive holds no FASTA file, an error is printed and no CSV is written.
    """
    # Create a per-archive extraction directory named after the ZIP file
    zip_stem = os.path.basename(zip_file_path).split('.')[0]
    extract_dir = f"{extract_dir_base}_{zip_stem}"

    # Start from an empty directory so leftovers of an earlier run cannot be picked up
    if os.path.exists(extract_dir):
        shutil.rmtree(extract_dir)
    os.makedirs(extract_dir)

    # Extract all files in the zip archive to the extraction directory
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # os.listdir gives no ordering guarantee; sort so the log and the chosen file are stable
    extracted_files = sorted(os.listdir(extract_dir))
    print(f"Extracted files from {zip_file_path}: {extracted_files}")

    # Keep regular files with a FASTA extension, ignoring the case of the extension
    fasta_files = [
        f for f in extracted_files
        if f.lower().endswith(('.fasta', '.fa')) and os.path.isfile(os.path.join(extract_dir, f))
    ]
    if not fasta_files:
        print(f"Error: No FASTA file found in {zip_file_path}")
        return

    fasta_file_path = os.path.join(extract_dir, fasta_files[0])
    print(f"Processing FASTA file: {fasta_file_path}")

    # Parse the FASTA file and collect the sequences as plain strings
    with open(fasta_file_path, 'r') as fasta_file:
        sequences = [str(record.seq) for record in SeqIO.parse(fasta_file, 'fasta')]

    # One row per sequence, all carrying the same class label
    df = pd.DataFrame({'sequence': sequences, 'label': [label] * len(sequences)})

    # Save to CSV file
    df.to_csv(output_csv_path, index=False)
    print(f"CSV file created at {output_csv_path}")

# Convert the weak and strong enhancer archives (train and test splits) to CSV.
# The loop variables keep the names zip_file_path / output_csv_path, so after the
# loop they hold the last pair.
for zip_file_path, output_csv_path in [
    ('/common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_train.zip',
     '/common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_train.csv'),
    ('/common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_test.zip',
     '/common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_test.csv'),
    ('/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train.zip',
     '/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train.csv'),
    ('/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test.zip',
     '/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test.csv'),
]:
    process_fasta_zip(zip_file_path, output_csv_path)


Extracted files from /common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_train.zip: ['train-weak enhancers.fasta']
Processing FASTA file: extracted_files_weak_enhancers_train/train-weak enhancers.fasta
CSV file created at /common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_train.csv
Extracted files from /common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_test.zip: ['test-weak enhancers.fasta']
Processing FASTA file: extracted_files_weak_enhancers_test/test-weak enhancers.fasta
CSV file created at /common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_test.csv
Extracted files from /common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train.zip: ['train-strong enhancers.fasta']
Processing FASTA file: extracted_files_strong_enhancers_train/train-strong enhancers.fasta
CSV file created at /common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train.csv
Extracted files from /common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test.zip: ['test-strong enhancers.fasta']
Processing F

In [3]:
import pandas as pd

def concatenate_and_shuffle(csv_files, output_csv_path, random_state=None):
    """
    Concatenate multiple CSV files, shuffle the rows and save the result as a new CSV file.

    Parameters:
    csv_files (list): List of paths to the CSV files to concatenate.
    output_csv_path (str): Path to save the concatenated and shuffled CSV file.
    random_state (int, optional): Seed forwarded to DataFrame.sample. Pass an integer to
        obtain the same row order on every run; the default None leaves the shuffle unseeded.

    Raises:
    ValueError: If csv_files is empty.
    """
    csv_files = list(csv_files)
    if not csv_files:
        # pd.concat would raise ValueError as well, but with a less helpful message
        raise ValueError("csv_files must contain at least one CSV path")

    # Read and concatenate all CSV files
    dataframes = [pd.read_csv(file) for file in csv_files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Shuffle all rows; frac=1 samples the whole frame without replacement
    shuffled_df = combined_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Save the shuffled DataFrame to a new CSV file
    shuffled_df.to_csv(output_csv_path, index=False)
    print(f"Shuffled CSV file created at {output_csv_path}")

# Merge the three per-class training CSVs into one shuffled training file
output_csv_path = '/common/zhangz2lab/zhanh/enhancer_data/enhancer_train_data.csv'
csv_files = [
    '/common/zhangz2lab/zhanh/enhancer_data/non_enhancers_train.csv',
    '/common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_train.csv',
    '/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train.csv',
]
concatenate_and_shuffle(csv_files, output_csv_path)


Shuffled CSV file created at /common/zhangz2lab/zhanh/enhancer_data/enhancer_train_data.csv


In [4]:
# Sanity check: reload the merged training CSV and report the number of rows per label
csv_file_path = '/common/zhangz2lab/zhanh/enhancer_data/enhancer_train_data.csv'
df = pd.read_csv(csv_file_path)

label_counts = df.label.value_counts()
print(label_counts)

1    7487
0    7484
Name: label, dtype: int64


In [5]:
# Merge the three per-class test CSVs into one shuffled test file
output_csv_path = '/common/zhangz2lab/zhanh/enhancer_data/enhancer_test_data.csv'
csv_files = [
    '/common/zhangz2lab/zhanh/enhancer_data/non_enhancers_test.csv',
    '/common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_test.csv',
    '/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test.csv',
]
concatenate_and_shuffle(csv_files, output_csv_path)

Shuffled CSV file created at /common/zhangz2lab/zhanh/enhancer_data/enhancer_test_data.csv


In [6]:
# Sanity check: reload the merged test CSV and report the number of rows per label
csv_file_path = '/common/zhangz2lab/zhanh/enhancer_data/enhancer_test_data.csv'
df = pd.read_csv(csv_file_path)

label_counts = df.label.value_counts()
print(label_counts)

0    200
1    200
Name: label, dtype: int64


In [7]:
def process_fasta_zip(zip_file_path, output_csv_path, extract_dir_base='extracted_files', label=2):
    """
    Extract the sequences of a FASTA file stored in a ZIP archive and save them to a labelled CSV file.

    The archive is unpacked into ``<extract_dir_base>_<ZIP file name up to its first dot>``;
    any existing content of that directory is removed first. Every record of the
    alphabetically first FASTA file found at the top level of that directory becomes
    one CSV row with the columns ``sequence`` and ``label``.

    Parameters:
    zip_file_path (str): Path to the ZIP file containing the FASTA file.
    output_csv_path (str): Path to save the output CSV file.
    extract_dir_base (str): Base directory to extract files to (default is 'extracted_files').
    label (int): Class label written next to every sequence (default is 2, the value
        this cell uses for the strong enhancer archives).

    Returns:
    None. If the archive holds no FASTA file, an error is printed and no CSV is written.
    """
    # Create a per-archive extraction directory named after the ZIP file
    zip_stem = os.path.basename(zip_file_path).split('.')[0]
    extract_dir = f"{extract_dir_base}_{zip_stem}"

    # Start from an empty directory so leftovers of an earlier run cannot be picked up
    if os.path.exists(extract_dir):
        shutil.rmtree(extract_dir)
    os.makedirs(extract_dir)

    # Extract all files in the zip archive to the extraction directory
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # os.listdir gives no ordering guarantee; sort so the log and the chosen file are stable
    extracted_files = sorted(os.listdir(extract_dir))
    print(f"Extracted files from {zip_file_path}: {extracted_files}")

    # Keep regular files with a FASTA extension, ignoring the case of the extension
    fasta_files = [
        f for f in extracted_files
        if f.lower().endswith(('.fasta', '.fa')) and os.path.isfile(os.path.join(extract_dir, f))
    ]
    if not fasta_files:
        print(f"Error: No FASTA file found in {zip_file_path}")
        return

    fasta_file_path = os.path.join(extract_dir, fasta_files[0])
    print(f"Processing FASTA file: {fasta_file_path}")

    # Parse the FASTA file and collect the sequences as plain strings
    with open(fasta_file_path, 'r') as fasta_file:
        sequences = [str(record.seq) for record in SeqIO.parse(fasta_file, 'fasta')]

    # One row per sequence, all carrying the same class label
    df = pd.DataFrame({'sequence': sequences, 'label': [label] * len(sequences)})

    # Save to CSV file
    df.to_csv(output_csv_path, index=False)
    print(f"CSV file created at {output_csv_path}")

# Re-export the strong enhancer archives to separate "_level" CSV files.
# The loop variables keep the names zip_file_path / output_csv_path, so after the
# loop they hold the last pair.
for zip_file_path, output_csv_path in [
    ('/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train.zip',
     '/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train_level.csv'),
    ('/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test.zip',
     '/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test_level.csv'),
]:
    process_fasta_zip(zip_file_path, output_csv_path)


Extracted files from /common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train.zip: ['train-strong enhancers.fasta']
Processing FASTA file: extracted_files_strong_enhancers_train/train-strong enhancers.fasta
CSV file created at /common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train_level.csv
Extracted files from /common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test.zip: ['test-strong enhancers.fasta']
Processing FASTA file: extracted_files_strong_enhancers_test/test-strong enhancers.fasta
CSV file created at /common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test_level.csv


In [8]:
# Merge the non-enhancer, weak and "_level" strong training CSVs into one shuffled file
output_csv_path = '/common/zhangz2lab/zhanh/enhancer_data/enhancer_level_train_data.csv'
csv_files = [
    '/common/zhangz2lab/zhanh/enhancer_data/non_enhancers_train.csv',
    '/common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_train.csv',
    '/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_train_level.csv',
]
concatenate_and_shuffle(csv_files, output_csv_path)

Shuffled CSV file created at /common/zhangz2lab/zhanh/enhancer_data/enhancer_level_train_data.csv


In [9]:
# Sanity check: reload the merged level training CSV and report the number of rows per label
csv_file_path = '/common/zhangz2lab/zhanh/enhancer_data/enhancer_level_train_data.csv'
df = pd.read_csv(csv_file_path)

label_counts = df.label.value_counts()
print(label_counts)

0    7484
2    3744
1    3743
Name: label, dtype: int64


In [10]:
# Merge the non-enhancer, weak and "_level" strong test CSVs into one shuffled file
output_csv_path = '/common/zhangz2lab/zhanh/enhancer_data/enhancer_level_test_data.csv'
csv_files = [
    '/common/zhangz2lab/zhanh/enhancer_data/non_enhancers_test.csv',
    '/common/zhangz2lab/zhanh/enhancer_data/weak_enhancers_test.csv',
    '/common/zhangz2lab/zhanh/enhancer_data/strong_enhancers_test_level.csv',
]
concatenate_and_shuffle(csv_files, output_csv_path)

Shuffled CSV file created at /common/zhangz2lab/zhanh/enhancer_data/enhancer_level_test_data.csv


In [11]:
# Sanity check: reload the merged level test CSV and report the number of rows per label
csv_file_path = '/common/zhangz2lab/zhanh/enhancer_data/enhancer_level_test_data.csv'
df = pd.read_csv(csv_file_path)

label_counts = df.label.value_counts()
print(label_counts)

0    200
1    100
2    100
Name: label, dtype: int64
