In [None]:
import pandas as pd

# Read the miRNA name table and pull out the 'Interactor1' column.
data = pd.read_csv('./miRNA_name.csv')
interactor_names = data['Interactor1']

# Every name -- the first one included -- ends up preceded by ' OR '.
separator = ' OR '
query_text = separator + separator.join(interactor_names)

# Persist the assembled query string.
with open("output.txt", "w") as out_handle:
    out_handle.write(query_text)



In [None]:
import os
import pandas as pd
import re

# Load the miRNA table. The columns 'seq_id', 'start_idx' and 'end_idx' are
# read from it below.
table1 = pd.read_csv('./miRNA_nc.csv', index_col=0)

# Load FASTA sequence data
with open('./miRNA_sequence.fasta', 'r') as fasta_file:
    fasta_data = fasta_file.read()

# Split the FASTA text into (header line, raw sequence block) pairs.
p_sequences = re.findall(r">(.*?)(?=\n)([\s\S]*?)(?=>|\Z)", fasta_data)

# Header lines and the matching sequences with line breaks removed.
p_seqname_list = [seq[0] for seq in p_sequences]
p_sequences_list = [seq[1].replace('\n', '') for seq in p_sequences]

# Build a name -> sequence lookup once instead of scanning the header list for
# every table row. Full header lines are registered first so an exact header
# match always wins (this is what the previous list.index() lookup did); the
# first whitespace-delimited token of each header is registered afterwards so
# that headers of the form ">ID description" can be found by ID as well.
# setdefault keeps the first occurrence when a name appears more than once.
sequence_by_name = {}
for seq_name, sequence in zip(p_seqname_list, p_sequences_list):
    sequence_by_name.setdefault(seq_name, sequence)
for seq_name, sequence in zip(p_seqname_list, p_sequences_list):
    header_tokens = seq_name.split()
    if header_tokens:
        sequence_by_name.setdefault(header_tokens[0], sequence)

# Add a new column to store RNA sequences
table1['r_seq'] = ''

nc_pattern = re.compile(r'(NC_(\d+)\.(\d+))')

# Map sequences to table1 based on matching identifiers
for index, row in table1.iterrows():
    # A seq_id without an NC accession is treated like an unknown sequence
    # instead of raising AttributeError on `.group`.
    id_match = nc_pattern.search(str(row['seq_id']))
    full_sequence = sequence_by_name.get(id_match.group(1)) if id_match else None

    if full_sequence is None:
        table1.at[index, 'r_seq'] = None
        continue

    # NOTE(review): the slice uses start_idx/end_idx as 0-based, end-exclusive
    # Python offsets. If these values are 1-based inclusive coordinates the
    # slice should be [start - 1:end] -- confirm against the data source.
    start, end = int(row['start_idx']), int(row['end_idx'])
    table1.at[index, 'r_seq'] = full_sequence[start:end]

# Save the updated dataframe to a new CSV file
table1.to_csv('miRNA_sequence_nc.csv')

# Report whether any row was left without a sequence.
column_has_none = table1['r_seq'].isnull().any()
print(f"Column contains None values: {column_has_none}")


In [None]:
import os
import re
import pandas as pd
import numpy as np

# Read the text file that is searched for each miRNA's "NC_... (start..end" entry.
with open('gene_result_rna.txt', 'r') as file:
    text_data = file.read()

# Load data from a CSV file
data_table1 = pd.read_csv('miRNA_name.csv', index_col=0)

# Add new columns for sequence identifiers and indices
data_table1['seq_id'] = None
data_table1['start_idx'] = ''
data_table1['end_idx'] = ''

# Compiled once; searching with a start position avoids copying the tail of
# the text for every row.
location_pattern = re.compile(r'NC_\d+\.\d+ \((.*?)\)')
range_pattern = re.compile(r'NC_(\d+\.\d+) \((\d+)\.\.(\d+)')

# For each Interactor1 name, take the first "NC_x.y (...)" entry that follows
# the first occurrence of the name in the text.
for index, row in data_table1.iterrows():
    interactor = row["Interactor1"]
    match = re.search(re.escape(interactor), text_data)
    if not match:
        continue
    match_xp = location_pattern.search(text_data, match.end())
    if not match_xp:
        continue
    matches = range_pattern.search(match_xp.group(0))
    if not matches:
        # The parenthesised part is not of the form "start..end"; leave the
        # row without a seq_id instead of crashing on `.group`.
        continue
    data_table1.at[index, 'seq_id'] = "NC_" + matches.group(1)
    data_table1.at[index, 'start_idx'] = matches.group(2)
    data_table1.at[index, 'end_idx'] = matches.group(3)

# Remove duplicates based on Interactor1
data_table1 = data_table1.drop_duplicates(subset='Interactor1', keep='first')

# Positions (within the de-duplicated table) of rows that have no seq_id.
# This has to be computed BEFORE those rows are dropped; computing it on the
# already-filtered table always yields an empty list, so nothing would ever be
# removed from `ass`.
k_list = np.flatnonzero(data_table1['seq_id'].isna().to_numpy()).tolist()

# Keep only the rows for which a location was found.
data_table2 = data_table1.dropna(subset=['seq_id']).reset_index(drop=True)

# Prepare the array from another CSV for cleaning
ass = np.array(pd.read_csv('ass.csv', index_col=0))

# Remove the rows of `ass` that correspond to missing seq_id.
# NOTE(review): assumes the rows of ass.csv are in the same order as the
# de-duplicated miRNA table -- confirm against how ass.csv was built.
ass = np.delete(ass, k_list, axis=0)

# Ensure the dataset directory exists
os.makedirs('./dataset/', exist_ok=True)

# Save the cleaned array to CSV
ass_frame = pd.DataFrame(ass)
ass_frame.to_csv('./dataset/ass_del1.csv', index=False, header=False)
print(ass_frame.shape)

# data_table2 already contains only rows with a seq_id; save it.
data_table = data_table2
data_table.to_csv('miRNA_nc.csv')

# Save sequence identifiers to a separate CSV
data_table['seq_id'].to_csv('miRNA_id.csv', header=False, index=False)


In [None]:
import os
import pandas as pd
import re
from tqdm import tqdm

# Read CSV file containing data
data = pd.read_csv('./protein_name.csv', index_col=None)

# Save the last column of the data to a file
data.iloc[:, -1].to_csv('protein_id.csv', index=False, header=None)

# Read text data from multiple files and concatenate it
text_files = ['gene_result (6).txt', 'gene_result (7).txt']
text_data = ''
for filename in text_files:
    with open(filename, 'r') as file:
        text_data += file.read()

# Initialize a new column 'NP' with None values in the DataFrame
data['NP'] = None

# Compiled once; searching from a start position avoids slicing (copying) the
# remaining text for every row.
np_pattern = re.compile(r'"NP_(\d+)"')
version_pattern = re.compile(r'version (\d+)')

# For every 'Id2' value, find "geneid <Id2>" in the text, then the first quoted
# NP accession after it and the first "version N" after that accession.
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    interactor = row["Id2"]
    # (?!\w) stops an id from matching as a prefix of a longer one
    # (e.g. id 12 inside "geneid 123"); the id is escaped so it is always
    # matched literally.
    match = re.search(rf"geneid {re.escape(str(interactor))}(?!\w)", text_data)
    if not match:
        continue
    match_xp = np_pattern.search(text_data, match.end())
    if not match_xp:
        continue
    np_number = 'NP_' + match_xp.group(1)
    # Append the version number when one follows the accession.
    match_xp_v = version_pattern.search(text_data, match_xp.end())
    if match_xp_v:
        np_number += '.' + match_xp_v.group(1)
    data.at[index, 'NP'] = np_number

# Keep only the rows for which an NP accession was found. Rows without one
# still hold the initial None, so dropna removes them directly; no CSV
# round-trip or empty-string replacement is needed.
table_data = data.dropna(subset=['NP'])

# Save the cleaned data
table_data.to_csv('./protein_id_np.csv')
table_data['NP'].to_csv('./protein_np.csv', header=None, index=None)

# Print the shape of the cleaned data
print(f"Cleaned data shape: {table_data.shape}")



In [None]:
# Batch downloads from NCBI can be done with Batch Entrez: "https://www.ncbi.nlm.nih.gov/sites/batchentrez"


In [None]:
import os
import pandas as pd
import re

# Read the protein table; the 'NP' and 'Protein_id' columns are used below.
table1 = pd.read_csv('./protein_id_np.csv', index_col=0)

# Read and concatenate FASTA sequence data from multiple files
fasta_files = ['./sequence (10).fasta', './sequence (11).fasta']
fasta_data = ''
for fasta_file in fasta_files:
    with open(fasta_file, 'r') as file:
        fasta_data += file.read()

# Split the FASTA text into (header line, raw sequence block) pairs.
p_sequences = re.findall(r">(.*?)(?=\n)([\s\S]*?)(?=>|\Z)", fasta_data)

# Index the sequences by every complete NP accession found in their header.
# Matching on the complete accession prevents a short id such as NP_001234
# from being satisfied by a header that carries NP_001234567, which a plain
# substring test would accept. setdefault keeps the first header per accession.
accession_pattern = re.compile(r'NP_\d+')
sequence_by_accession = {}
for header, sequence in p_sequences:
    for accession in accession_pattern.findall(header):
        sequence_by_accession.setdefault(accession, sequence.replace('\n', ''))

# Add a new column for storing protein sequences
table1['protein_seq'] = None

# Map protein IDs from table to sequences extracted from FASTA data
for index, row in table1.iterrows():
    # The version suffix (".N") of the NP value is ignored for the lookup.
    # Rows whose NP value holds no accession simply stay without a sequence.
    id_match = accession_pattern.search(str(row['NP']))
    if not id_match:
        continue
    protein_sequence = sequence_by_accession.get(id_match.group(0))
    if protein_sequence is not None:
        table1.at[index, 'protein_seq'] = protein_sequence

# Check for and report any rows with missing protein sequences
column_has_none = table1['protein_seq'].isnull().any()
print(f"Column contains None values: {column_has_none}")

# Clean up the data by dropping rows with missing protein sequences
table1 = table1.dropna(subset=['protein_seq'])
table1 = table1.drop_duplicates(subset=['NP'])

# Save the cleaned and processed data
os.makedirs('./dataset/', exist_ok=True)
table1.to_csv('./dataset/protein_seq.csv')

# Keep only the columns of the association matrix whose positions are listed
# in 'Protein_id' of the remaining rows.
ass = pd.read_csv('./dataset/ass_del1.csv', index_col=None, header=None)
selected_columns = table1['Protein_id'].dropna().astype(int)  # Ensure indices are integers
ass = ass.iloc[:, selected_columns.to_numpy()]
ass.to_csv('./dataset/ass_del2.csv', header=None, index=None)

print(f"Processed and saved data with shape: {table1.shape}")


In [None]:
import os
import re
import pandas as pd
import numpy as np

# Load the text that is searched for each miRNA's "NC_... (start..end" entry.
with open('gene_result_rna.txt', 'r') as file:
    text_data = file.read()

# Load table data from a CSV file
data_table1 = pd.read_csv('miRNA_name.csv', index_col=0)

# Initialize new columns for extracted data
data_table1['seq_id'] = None
data_table1['start_idx'] = ''
data_table1['end_idx'] = ''

# Compiled once; searching from a start position avoids copying the tail of
# the text for every row.
location_pattern = re.compile(r'NC_\d+\.\d+ \((.*?)\)')
range_pattern = re.compile(r'NC_(\d+\.\d+) \((\d+)\.\.(\d+)')

# For each Interactor1 name, take the first "NC_x.y (...)" entry that follows
# the first occurrence of the name in the text.
for index, row in data_table1.iterrows():
    interactor = row["Interactor1"]
    match = re.search(re.escape(interactor), text_data)
    if not match:
        continue
    match_xp = location_pattern.search(text_data, match.end())
    if not match_xp:
        continue
    matches = range_pattern.search(match_xp.group(0))
    if not matches:
        continue
    data_table1.at[index, 'seq_id'] = "NC_" + matches.group(1)
    data_table1.at[index, 'start_idx'] = matches.group(2)
    data_table1.at[index, 'end_idx'] = matches.group(3)

# Row positions without a sequence ID. These must be collected BEFORE the rows
# are dropped; on the filtered table the list is always empty and nothing
# would be removed from `ass` further down.
none_idx = np.flatnonzero(data_table1['seq_id'].isna().to_numpy()).tolist()

# Remove rows without sequence IDs
data_table1.dropna(subset=['seq_id'], inplace=True)

# Flag repeated 'Interactor1' entries BEFORE removing them; after
# drop_duplicates no row is flagged and the drop below would do nothing.
duplicated_entries = data_table1.duplicated(subset='Interactor1', keep='first')
data_table1.drop_duplicates(subset='Interactor1', inplace=True)

# Read another table for operations
data_table = pd.read_csv('./human/m_ss.csv', index_col=None, header=None)

# Drop the flagged entries from the secondary data table.
# NOTE(review): rows are dropped by index label, which assumes the index of
# miRNA_name.csv matches the default row numbering of m_ss.csv -- confirm.
cleaned_data = data_table.drop(index=duplicated_entries[duplicated_entries].index)

# Save the cleaned data
cleaned_data.to_csv('./l_ss.csv', index=False, header=False)

# Load additional data
ass = np.array(pd.read_csv('ass.csv', index_col=0))

# Remove rows corresponding to None sequence IDs.
# NOTE(review): assumes the rows of ass.csv are in the same order as the rows
# of miRNA_name.csv -- confirm against how ass.csv was built.
cleaned_ass = np.delete(ass, none_idx, axis=0)

# Ensure the output directory exists
output_dir = './dataset/'
os.makedirs(output_dir, exist_ok=True)

# Save the further cleaned data
pd.DataFrame(cleaned_ass).to_csv(f'{output_dir}ass_del1.csv', index=False, header=False)


In [None]:
import numpy as np
import pandas as pd

def GIP_kernel(Asso_RNA_Dis):
    """Compute the Gaussian interaction profile (GIP) kernel between matrix rows.

    Each row of the input is treated as a profile vector. With ``r`` being the
    mean squared Euclidean norm of the rows, the result is::

        K[i, j] = exp(-||row_i - row_j||**2 / r)   for i != j
        K[i, i] = 1

    Args:
        Asso_RNA_Dis: 2-D array-like (NumPy array, nested list, DataFrame, ...)
            with one profile per row.

    Returns:
        A square ``float`` ndarray with one row/column per input row. When
        ``r`` is 0 (every entry of the input is zero) the whole matrix,
        diagonal included, is 0.

    Raises:
        ValueError: if the input is not 2-dimensional.
    """
    profiles = np.asarray(Asso_RNA_Dis, dtype=float)
    if profiles.ndim != 2:
        raise ValueError("Asso_RNA_Dis must be a 2-D matrix")

    # Number of entities (rows)
    nc = profiles.shape[0]
    if nc == 0:
        return np.zeros((0, 0))

    # Squared norm of every row; 'r' is their mean.
    row_sq_norms = np.sum(np.square(profiles), axis=1)
    r = row_sq_norms.sum() / nc
    if r == 0:
        # Degenerate bandwidth: the kernel is defined as all zeros here.
        return np.zeros((nc, nc))

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once
    # instead of one Python-level norm call per pair.
    squared_distance = (
        row_sq_norms[:, None] + row_sq_norms[None, :] - 2.0 * (profiles @ profiles.T)
    )
    # Rounding can leave tiny negative values; a squared distance is never negative.
    np.maximum(squared_distance, 0.0, out=squared_distance)

    matrix = np.exp(-squared_distance / r)
    # The diagonal is exactly 1 regardless of rounding (or NaNs) in the input.
    np.fill_diagonal(matrix, 1.0)
    return matrix

def main():
    """Build GIP kernel matrices for the rows and the columns of the stored matrix.

    Reads './dataset/ass_del2.csv' (no header row), applies GIP_kernel to the
    matrix and to its transpose, and writes the two results without header or
    index to './dataset/m_gs.csv' and './dataset/p_gs.csv' respectively.
    """
    association_frame = pd.read_csv('./dataset/ass_del2.csv', header=None)
    association = np.array(association_frame)

    # Both kernels are computed before anything is written to disk.
    row_kernel = GIP_kernel(association)
    column_kernel = GIP_kernel(association.T)

    outputs = (
        ('./dataset/m_gs.csv', row_kernel),
        ('./dataset/p_gs.csv', column_kernel),
    )
    for target_path, kernel in outputs:
        pd.DataFrame(kernel).to_csv(target_path, header=None, index=None)


if __name__ == '__main__':
    main()
