In [1]:
import pandas as pd
import requests
import os
from tqdm.notebook import tqdm
import math

In [2]:
def read_file(file_path):
    """Helper function to read files with proper column handling"""
    try:
        # Read file content
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        
        # Check for "No Available results."
        if "No Available results." in content:
            return None, True
        
        # Process file normally
        lines = [line.strip() for line in content.split('\n') if line.strip()]
        data_lines = [line for line in lines if not line.startswith('#')]
        
        if not data_lines:
            return pd.DataFrame(), False
        
        # Process lines and split by tabs
        processed_data = []
        for line in data_lines:
            row = [col.strip() for col in line.split('\t') if col.strip()]
            if row:
                processed_data.append(row)
        
        if processed_data:
            max_cols = max(len(row) for row in processed_data)
            padded_data = [row + [''] * (max_cols - len(row)) for row in processed_data]
            df = pd.DataFrame(padded_data[1:], columns=padded_data[0] if padded_data else [f'Column_{i}' for i in range(max_cols)])
            return df, False
            
        return pd.DataFrame(), False
    
    except Exception as e:
        raise Exception(f"Error reading file: {str(e)}")

In [3]:
def process_batch(files_batch, download_dir):
    """Process a batch of files and return extracted data and results"""
    no_results_files = []
    error_files = []
    extracted_data = pd.DataFrame(columns=['Column_B', 'Column_D'])
    
    for filename in files_batch:
        try:
            file_path = os.path.join(download_dir, filename)
            
            # Process file
            df, is_no_results = read_file(file_path)
            
            if is_no_results:
                no_results_files.append(filename)
            elif df is not None and not df.empty:
                if 'miRNAname' in df.columns and 'geneName' in df.columns and 'geneType' in df.columns:
                    lncrna_rows = df[df['geneType'] == "lncRNA"]
                    if not lncrna_rows.empty:
                        temp_df = lncrna_rows[['miRNAname', 'geneName']]
                        temp_df.columns = ['Column_B', 'Column_D']
                        extracted_data = pd.concat([extracted_data, temp_df], ignore_index=True)
            
            # Delete the processed file
            os.remove(file_path)
            
        except Exception as e:
            error_files.append(filename)
            print(f"Error processing {filename}: {str(e)}")
    
    return extracted_data, no_results_files, error_files

In [4]:
def download_and_process_mirna(excel_path, batch_size=50):
    """Main function to download and process miRNA files in batches"""
    # Read input Excel file
    df = pd.read_excel(excel_path)
    miRNAs = df['miRNA'].dropna().unique()
    
    # Create download directory
    download_dir = 'downloaded_files'
    os.makedirs(download_dir, exist_ok=True)
    
    # Create output directory
    output = 'output'
    os.makedirs(output, exist_ok=True)

    # Initialize result storage
    all_extracted_data = pd.DataFrame(columns=['Column_B', 'Column_D'])
    all_no_results = []
    all_errors = []
    
    # Calculate number of batches
    num_batches = math.ceil(len(miRNAs) / batch_size)
    
    # Process in batches
    for batch_num in tqdm(range(num_batches), desc='Processing batches'):
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(miRNAs))
        batch_miRNAs = miRNAs[start_idx:end_idx]
        
        # Download batch
        downloaded_files = []
        for miRNA in tqdm(batch_miRNAs, desc=f'Downloading batch {batch_num + 1}/{num_batches}', leave=False):
            try:
                url = f'https://rnasysu.com/encori/moduleDownload.php?source=agoClipRNA&type=xls&value=hg38;lncRNA;{miRNA};1;0;0;1;None;all'
                response = requests.get(url)
                response.raise_for_status()
                
                filename = f"{miRNA.replace('/', '_')}.xls"
                file_path = os.path.join(download_dir, filename)
                
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                downloaded_files.append(filename)
                
            except Exception as e:
                print(f"Error downloading {miRNA}: {str(e)}")
        
        # Process batch
        extracted_data, no_results, errors = process_batch(downloaded_files, download_dir)
        
        # Accumulate results
        if not extracted_data.empty:
            all_extracted_data = pd.concat([all_extracted_data, extracted_data], ignore_index=True)
        all_no_results.extend(no_results)
        all_errors.extend(errors)
    
    # Save final results
    if all_no_results:
        pd.DataFrame({'Filename': all_no_results}).to_excel('output/data_not_found.xlsx', index=False)
        print(f"\nSaved {len(all_no_results)} files to data_not_found.xlsx")
    
    if all_errors:
        pd.DataFrame({'Filename': all_errors}).to_excel('output/error_files.xlsx', index=False)
        print(f"\nSaved {len(all_errors)} problematic files to error_files.xlsx")
    
    if not all_extracted_data.empty:
        all_extracted_data.to_excel('output/extracted_data.xlsx', index=False)
        print(f"\nSaved {len(all_extracted_data)} rows to extracted_data.xlsx")
    
    # Clean up download directory if empty
    if not os.listdir(download_dir):
        os.rmdir(download_dir)
    
    return len(all_no_results), len(all_extracted_data), len(all_errors)

In [None]:
# Execute the combined process
no_results_count, extracted_rows, error_count = download_and_process_mirna('get_id.xlsx', batch_size=20)

print(f"\nProcessing complete!")
print(f"Files with no results: {no_results_count}")
print(f"Files with errors: {error_count}")
print(f"Total rows extracted: {extracted_rows}")

Processing batches:   0%|          | 0/11 [00:00<?, ?it/s]

Downloading batch 1/11:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading batch 2/11:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading batch 3/11:   0%|          | 0/20 [00:00<?, ?it/s]

Error downloading hsa-miR-3942-3p: HTTPSConnectionPool(host='rnasysu.com', port=443): Max retries exceeded with url: /encori/moduleDownload.php?source=agoClipRNA&type=xls&value=hg38;lncRNA;hsa-miR-3942-3p;1;0;0;1;None;all (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000248FEE66CF0>, 'Connection to rnasysu.com timed out. (connect timeout=None)'))


Downloading batch 4/11:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading batch 5/11:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading batch 6/11:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading batch 7/11:   0%|          | 0/20 [00:00<?, ?it/s]

Downloading batch 8/11:   0%|          | 0/20 [00:00<?, ?it/s]

Error downloading hsa-miR-7850-5p: HTTPSConnectionPool(host='rnasysu.com', port=443): Max retries exceeded with url: /encori/moduleDownload.php?source=agoClipRNA&type=xls&value=hg38;lncRNA;hsa-miR-7850-5p;1;0;0;1;None;all (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x00000248FEE75BB0>, 'Connection to rnasysu.com timed out. (connect timeout=None)'))
