In [1]:
import pandas as pd
import requests
import os
from tqdm.notebook import tqdm
import math
from requests.exceptions import Timeout

In [2]:
def read_file(file_path):
    """Parse a downloaded tab-separated result file into a DataFrame.

    Lines that are blank or that start with ``#`` (after stripping
    surrounding whitespace) are ignored. The first remaining line is used
    as the header and the following lines as data rows.

    Empty cells are kept in place so every value stays under the column it
    belongs to; only trailing empty cells are dropped, and short rows are
    right-padded with ``''``. Header positions that are missing or blank
    are named ``Column_<index>``.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A ``(df, is_no_results)`` tuple:

        * ``(None, True)`` when the file contains the text
          ``"No Available results."``;
        * ``(DataFrame, False)`` otherwise; the frame is empty when the
          file has no data lines or only a header line.

    Raises:
        Exception: If the file cannot be read or parsed. The original
            error is chained as ``__cause__``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # The server answers with this marker instead of a table when
        # there is nothing to report.
        if "No Available results." in content:
            return None, True

        rows = []
        for line in content.split('\n'):
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                continue

            # Split on the raw line (not the stripped one) and keep empty
            # cells: discarding them would shift later values to the left
            # and put them under the wrong header.
            cells = [cell.strip() for cell in line.split('\t')]

            # Trailing empties carry no positional information; dropping
            # them avoids phantom columns caused by trailing tabs.
            while cells and not cells[-1]:
                cells.pop()
            rows.append(cells)

        if not rows:
            return pd.DataFrame(), False

        width = max(len(row) for row in rows)
        padded = [row + [''] * (width - len(row)) for row in rows]

        # Give unnamed header positions a unique placeholder so the frame
        # never ends up with several columns all called ''.
        header = [name or f'Column_{i}' for i, name in enumerate(padded[0])]
        return pd.DataFrame(padded[1:], columns=header), False

    except Exception as e:
        raise Exception(f"Error reading file: {str(e)}") from e

def process_batch(files_batch, download_dir):
    """Parse a batch of downloaded files and collect their lncRNA rows.

    Each file is parsed with ``read_file``. For files that contain the
    columns ``miRNAname``, ``geneName`` and ``geneType``, the rows whose
    ``geneType`` equals ``"lncRNA"`` are kept and their ``miRNAname`` /
    ``geneName`` values are returned as ``Column_B`` / ``Column_D``.

    A file is deleted from ``download_dir`` once it has been handled
    without raising. If anything raises for a file, the error is printed,
    the filename is recorded, and the file is left on disk.

    Args:
        files_batch: Iterable of filenames located inside ``download_dir``.
        download_dir: Directory that holds the downloaded files.

    Returns:
        A tuple ``(extracted_data, no_results_files, error_files)``:

        * ``extracted_data`` - DataFrame with columns ``Column_B`` and
          ``Column_D`` (empty when nothing matched);
        * ``no_results_files`` - filenames ``read_file`` flagged as
          having no results;
        * ``error_files`` - filenames whose processing raised.
    """
    required_columns = ('miRNAname', 'geneName', 'geneType')
    no_results_files = []
    error_files = []
    # Collect per-file frames and concatenate once at the end: calling
    # pd.concat inside the loop re-copies all previous rows every time.
    extracted_frames = []

    for filename in files_batch:
        try:
            file_path = os.path.join(download_dir, filename)
            df, is_no_results = read_file(file_path)

            if is_no_results:
                no_results_files.append(filename)
            elif (
                df is not None
                and not df.empty
                and all(col in df.columns for col in required_columns)
            ):
                lncrna_rows = df.loc[
                    df['geneType'] == "lncRNA", ['miRNAname', 'geneName']
                ]
                if not lncrna_rows.empty:
                    # rename() returns a new frame instead of relabelling
                    # a slice of ``df`` in place.
                    extracted_frames.append(
                        lncrna_rows.rename(
                            columns={'miRNAname': 'Column_B', 'geneName': 'Column_D'}
                        )
                    )

            # Delete the processed file
            os.remove(file_path)

        except Exception as e:
            error_files.append(filename)
            print(f"Error processing {filename}: {str(e)}")

    if extracted_frames:
        extracted_data = pd.concat(extracted_frames, ignore_index=True)
    else:
        extracted_data = pd.DataFrame(columns=['Column_B', 'Column_D'])

    return extracted_data, no_results_files, error_files


In [3]:

def save_batch_results(batch_num, no_results, not_downloaded, errors, extracted_data):
    """Write the outcome of one batch to CSV files inside ``output/``.

    The ``output`` directory is created if needed. Each non-empty name
    list is written as a single-column (``Filename``) CSV, and the
    extracted rows are written when the frame is not empty. Empty inputs
    produce no file.

    Args:
        batch_num: Number embedded in every output filename.
        no_results: Names to write to ``data_not_found_batch_<n>.csv``.
        not_downloaded: Names to write to ``not_downloaded_batch_<n>.csv``.
        errors: Names to write to ``error_files_batch_<n>.csv``.
        extracted_data: DataFrame to write to
            ``extracted_data_batch_<n>.csv``.
    """
    os.makedirs('output', exist_ok=True)

    # (names, destination) pairs for the three single-column reports.
    name_reports = (
        (no_results, f'output/data_not_found_batch_{batch_num}.csv'),
        (not_downloaded, f'output/not_downloaded_batch_{batch_num}.csv'),
        (errors, f'output/error_files_batch_{batch_num}.csv'),
    )
    for names, destination in name_reports:
        if names:
            pd.DataFrame({'Filename': names}).to_csv(destination, index=False)

    # Nothing extracted in this batch: skip the data file entirely.
    if extracted_data.empty:
        return
    extracted_data.to_csv(f'output/extracted_data_batch_{batch_num}.csv', index=False)

def combine_batch_files():
    """Merge the per-batch CSV files in ``output/`` into final CSV files.

    Only files named ``<category>_batch_<number>.csv`` are read, where
    ``<category>`` is one of ``data_not_found``, ``not_downloaded``,
    ``error_files`` or ``extracted_data``. They are combined in ascending
    batch-number order and deleted afterwards. Every other file in
    ``output/`` (including ``*_final.csv`` files from an earlier run and
    unrelated CSVs) is neither read nor deleted.

    A ``<category>_final.csv`` file is written for each category that has
    at least one entry; an existing final file of that name is
    overwritten.

    Returns:
        dict with the keys ``data_not_found``, ``not_downloaded`` and
        ``error_files`` (lists of the ``Filename`` values) and
        ``extracted_data`` (a DataFrame, empty if nothing was found).
        If the ``output`` directory does not exist, the empty containers
        are returned and nothing is written.
    """
    output_dir = 'output'
    output_types = {
        'data_not_found': [],
        'not_downloaded': [],
        'error_files': [],
        'extracted_data': pd.DataFrame()
    }

    if not os.path.isdir(output_dir):
        return output_types

    # Select batch files by their exact name pattern so nothing else in
    # the directory is consumed or removed.
    batch_files = []
    for filename in os.listdir(output_dir):
        if not filename.endswith('.csv'):
            continue
        category, sep, batch_id = filename[:-len('.csv')].rpartition('_batch_')
        if sep and category in output_types and batch_id.isdecimal():
            batch_files.append((int(batch_id), category, filename))

    # os.listdir() order is arbitrary; sort numerically by batch so the
    # combined output is deterministic (batch 2 before batch 10).
    extracted_frames = []
    for _, category, filename in sorted(batch_files):
        file_path = os.path.join(output_dir, filename)
        df = pd.read_csv(file_path)
        if category == 'extracted_data':
            extracted_frames.append(df)
        else:
            output_types[category].extend(df['Filename'].tolist())

        # Delete batch file after combining
        os.remove(file_path)

    if extracted_frames:
        output_types['extracted_data'] = pd.concat(extracted_frames, ignore_index=True)

    # Save final combined files
    for category in ('data_not_found', 'not_downloaded', 'error_files'):
        if output_types[category]:
            pd.DataFrame({'Filename': output_types[category]}).to_csv(
                os.path.join(output_dir, f'{category}_final.csv'), index=False
            )
    if not output_types['extracted_data'].empty:
        output_types['extracted_data'].to_csv(
            os.path.join(output_dir, 'extracted_data_final.csv'), index=False
        )

    return output_types

In [4]:


def download_and_process_mirna(excel_path, batch_size=50):
    """Download the result file for every miRNA and process them in batches.

    The unique, non-null values of the ``miRNA`` column of the Excel file
    are split into batches of ``batch_size``. For each batch the files are
    downloaded into ``downloaded_files/``, parsed with ``process_batch``
    and the batch outcome is written with ``save_batch_results``. After
    the last batch, ``combine_batch_files`` merges the per-batch CSVs, and
    the download directory is removed if it is empty.

    Args:
        excel_path: Path of an Excel file that has a ``miRNA`` column.
        batch_size: Number of miRNAs handled per batch; must be >= 1.

    Returns:
        A 4-tuple of counts taken from the combined results, in this
        order: files with no results, extracted rows, files that failed
        during processing, miRNAs that could not be downloaded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Read input Excel file
    df = pd.read_excel(excel_path)
    miRNAs = df['miRNA'].dropna().unique()

    # Create download and output directories
    download_dir = 'downloaded_files'
    os.makedirs(download_dir, exist_ok=True)
    os.makedirs('output', exist_ok=True)

    num_batches = math.ceil(len(miRNAs) / batch_size)

    # One session for all requests so connections to the host are reused
    # instead of being re-established for every download.
    with requests.Session() as session:
        for batch_num in tqdm(range(num_batches), desc='Processing batches'):
            start_idx = batch_num * batch_size
            end_idx = min((batch_num + 1) * batch_size, len(miRNAs))
            batch_miRNAs = miRNAs[start_idx:end_idx]

            batch_downloaded_files = []
            batch_not_downloaded = []

            # Download batch
            for miRNA in tqdm(batch_miRNAs, desc=f'Downloading batch {batch_num + 1}/{num_batches}', leave=False):
                try:
                    url = f'https://rnasysu.com/encori/moduleDownload.php?source=agoClipRNA&type=xls&value=hg38;lncRNA;{miRNA};1;0;0;1;None;all'
                    response = session.get(url, timeout=30)
                    response.raise_for_status()

                    # str() guards against non-string cell values (e.g.
                    # numbers), which have no .replace() method.
                    filename = f"{str(miRNA).replace('/', '_')}.xls"
                    file_path = os.path.join(download_dir, filename)

                    with open(file_path, 'wb') as f:
                        f.write(response.content)
                    batch_downloaded_files.append(filename)

                except Timeout:
                    print(f"Timeout downloading {miRNA}")
                    batch_not_downloaded.append(miRNA)
                except Exception as e:
                    # Any other failure (HTTP error, connection error,
                    # write error) is also recorded as "not downloaded".
                    print(f"Error downloading {miRNA}: {str(e)}")
                    batch_not_downloaded.append(miRNA)

            # Process batch
            extracted_data, no_results, errors = process_batch(batch_downloaded_files, download_dir)

            # Persist this batch right away so finished work survives a
            # failure in a later batch.
            save_batch_results(
                batch_num + 1,
                no_results,
                batch_not_downloaded,
                errors,
                extracted_data
            )

    # Combine all batch files into final results
    final_results = combine_batch_files()

    # Clean up download directory if empty
    if not os.listdir(download_dir):
        os.rmdir(download_dir)

    return (
        len(final_results['data_not_found']),
        len(final_results['extracted_data']),
        len(final_results['error_files']),
        len(final_results['not_downloaded'])
    )

In [5]:



# Execute the combined process
no_results_count, extracted_rows, error_count, not_downloaded_count = download_and_process_mirna('get_id.xlsx', batch_size=50)

print("\nProcessing complete!")
print(f"Files with no results: {no_results_count}")
print(f"Files with errors: {error_count}")
# This count covers every failed download, not only timeouts: HTTP and
# connection errors are recorded in the same list.
print(f"Files not downloaded (timeout or other error): {not_downloaded_count}")
print(f"Total rows extracted: {extracted_rows}")

Processing batches:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading batch 1/5:   0%|          | 0/50 [00:00<?, ?it/s]

Timeout downloading hsa-miR-3167
Timeout downloading hsa-miR-6752-5p


Downloading batch 2/5:   0%|          | 0/50 [00:00<?, ?it/s]

Timeout downloading hsa-miR-4722-3p
Timeout downloading hsa-miR-3918


Downloading batch 3/5:   0%|          | 0/50 [00:00<?, ?it/s]

Downloading batch 4/5:   0%|          | 0/50 [00:00<?, ?it/s]

Downloading batch 5/5:   0%|          | 0/1 [00:00<?, ?it/s]


Processing complete!
Files with no results: 166
Files with errors: 0
Files not downloaded (timeout): 4
Total rows extracted: 1511
