#### Deduplication for lncRNA nodes.

In [None]:
# Human
import pandas as pd
import os
import re

# Directory with the Ensembl BED files (consumed further below)
ensembl_dir = "../../reference_lncRNA/human/bed/ensembl/"

# lncRNA table; its gene_id, gene_name and identifier columns are used below
lncRNA = pd.read_csv('../../data/LPI/human/lncRNA.csv')

# NONCODE BED files are read as headerless, tab-separated tables with six named columns
noncode_columns = ['chr', 'start', 'end', 'gene_id', 'score', 'strand']
noncodev5_bed = pd.read_csv(
    '../../reference_lncRNA/human/bed/NONCODEv5_hg38.lncRNAGene.bed',
    sep='\t', header=None, names=noncode_columns,
)
noncodev6_bed = pd.read_csv(
    '../../reference_lncRNA/human/bed/NONCODEv6_hg38.lncRNAGene.bed',
    sep='\t', header=None, names=noncode_columns,
)

# lncRNAs still waiting for coordinates, and the list collecting the hits of every source
remained_lncRNA = lncRNA[['gene_id', 'gene_name', 'identifier']].copy()
results = []

# Columns taken from a NONCODE table when joining on its gene_id
position_columns = ['chr', 'start', 'end', 'strand', 'gene_id']

# Pass 1: NONCODE v6 (identifier == gene_id)
pos_lnc_noncodev6 = remained_lncRNA.merge(
    noncodev6_bed[position_columns],
    how='inner',
    left_on='identifier',
    right_on='gene_id',
)
results.append(pos_lnc_noncodev6)
# Later sources are only tried for lncRNAs that are still unplaced
placed_by_v6 = remained_lncRNA['identifier'].isin(pos_lnc_noncodev6['identifier'])
remained_lncRNA = remained_lncRNA[~placed_by_v6]

# Pass 2: NONCODE v5 (identifier == gene_id), restricted to what v6 did not place
pos_lnc_noncodev5 = remained_lncRNA.merge(
    noncodev5_bed[position_columns],
    how='inner',
    left_on='identifier',
    right_on='gene_id',
)
results.append(pos_lnc_noncodev5)
placed_by_v5 = remained_lncRNA['identifier'].isin(pos_lnc_noncodev5['identifier'])
remained_lncRNA = remained_lncRNA[~placed_by_v5]

# Get genomic position by ensembl_id & gene_name
# **STEP 1: Extract the version number from BED file**
def extract_version(filename):
    """Return the version number embedded in an Ensembl BED file name.

    The name is searched for ``GRCh38.<digits>.bed`` and the digits are
    returned as an int. Names that do not contain this pattern yield -1.
    """
    found = re.search(r'GRCh38\.(\d+)\.bed', filename)
    if found is None:
        return -1
    return int(found.group(1))

# **STEP 2: Get all BED files and sort them by version number**
# Highest version first; a name without a recognisable version gets -1 from extract_version
# and therefore ends up last.
bed_files = [f for f in os.listdir(ensembl_dir) if f.endswith(".bed")]
bed_files_sorted = sorted(bed_files, key=extract_version, reverse=True)

# **STEP 3: Iterate over sorted BED files**
# Every file is parsed exactly once and kept in this list, because the gene_name pass
# below needs the very same tables again (previously each file was read twice).
ensembl_beds = []

# Pass A: match the identifier against the Ensembl gene_id, highest version first
for bed_file in bed_files_sorted:
    bed_path = os.path.join(ensembl_dir, bed_file)

    # Read ensembl BED file (headerless, tab-separated)
    ensembl_bed = pd.read_csv(bed_path, sep='\t', header=None,
                              names=['chr', 'start', 'end', 'gene_name', 'gene_id', 'strand'])
    ensembl_beds.append(ensembl_bed)

    # Match by gene_id
    pos_lnc_ensembl = pd.merge(remained_lncRNA,
                               ensembl_bed[['chr', 'start', 'end', 'strand', 'gene_id']],
                               left_on='identifier',
                               right_on='gene_id', how='inner')
    results.append(pos_lnc_ensembl)
    # Only lncRNAs that are still unplaced are offered to the next file
    remained_lncRNA = remained_lncRNA[~remained_lncRNA['identifier'].isin(pos_lnc_ensembl['identifier'])]

# Pass B: runs only after every file has been tried by gene_id; now the identifier is
# compared with the Ensembl gene_name, again going through the files in the same order.
for ensembl_bed in ensembl_beds:
    # Match by gene_name
    pos_lnc_gene_name = pd.merge(remained_lncRNA,
                                 ensembl_bed[['chr', 'start', 'end', 'strand', 'gene_name']],
                                 left_on='identifier',
                                 right_on='gene_name', how='inner')
    results.append(pos_lnc_gene_name)
    remained_lncRNA = remained_lncRNA[~remained_lncRNA['identifier'].isin(pos_lnc_gene_name['identifier'])]

# Stack the hits of every source; for an identifier hit more than once the first row wins
pos_lnc = pd.concat(results, ignore_index=True)
pos_lnc = pos_lnc.drop_duplicates(subset=['identifier'])

# lncRNAs that none of the sources could place
unplaced_lncRNA = remained_lncRNA.drop_duplicates()
unplaced_lncRNA.to_csv('human_lnc_no_pos.csv', index=False)

# BED6 column layout: chr, start, end, name, score, strand (score is a constant 0)
bed6_columns = ['chr', 'start', 'end', 'identifier', 'score', 'strand']
pos_lnc_bed = pos_lnc.assign(score=0)[bed6_columns]

# Written tab-delimited, without header row and without the index
pos_lnc_bed.to_csv('human_lncRNA_0-based.bed', sep='\t', header=False, index=False)


In [None]:
# Mouse
import pandas as pd
import os
import re

# Directory with the Ensembl BED files (consumed further below)
ensembl_dir = "../../reference_lncRNA/mouse/bed/ensembl/"

# lncRNA table; its gene_id, gene_name and identifier columns are used below
lncRNA = pd.read_csv('../../data/LPI/mouse/lncRNA.csv')

# NONCODE BED files are read as headerless, tab-separated tables with six named columns
noncode_columns = ['chr', 'start', 'end', 'gene_id', 'score', 'strand']
noncodev5_bed = pd.read_csv(
    '../../reference_lncRNA/mouse/bed/NONCODEv5_mm10.lncRNAGene.bed',
    sep='\t', header=None, names=noncode_columns,
)
noncodev6_bed = pd.read_csv(
    '../../reference_lncRNA/mouse/bed/NONCODEv6_mm10.lncRNAGene.bed',
    sep='\t', header=None, names=noncode_columns,
)

# lncRNAs still waiting for coordinates, and the list collecting the hits of every source
remained_lncRNA = lncRNA[['gene_id', 'gene_name', 'identifier']].copy()
results = []

# Columns taken from a NONCODE table when joining on its gene_id
position_columns = ['chr', 'start', 'end', 'strand', 'gene_id']

# Pass 1: NONCODE v6 (identifier == gene_id)
pos_lnc_noncodev6 = remained_lncRNA.merge(
    noncodev6_bed[position_columns],
    how='inner',
    left_on='identifier',
    right_on='gene_id',
)
results.append(pos_lnc_noncodev6)
# Later sources are only tried for lncRNAs that are still unplaced
placed_by_v6 = remained_lncRNA['identifier'].isin(pos_lnc_noncodev6['identifier'])
remained_lncRNA = remained_lncRNA[~placed_by_v6]

# Pass 2: NONCODE v5 (identifier == gene_id), restricted to what v6 did not place
pos_lnc_noncodev5 = remained_lncRNA.merge(
    noncodev5_bed[position_columns],
    how='inner',
    left_on='identifier',
    right_on='gene_id',
)
results.append(pos_lnc_noncodev5)
placed_by_v5 = remained_lncRNA['identifier'].isin(pos_lnc_noncodev5['identifier'])
remained_lncRNA = remained_lncRNA[~placed_by_v5]

# Get genomic position by ensembl_id & gene_name
# **STEP 1: Extract the version number from BED file**
def extract_version(filename):
    """Return the version number embedded in an Ensembl BED file name.

    The name is searched for ``GRCm38.<digits>.bed`` and the digits are
    returned as an int. Names that do not contain this pattern yield -1.
    """
    found = re.search(r'GRCm38\.(\d+)\.bed', filename)
    if found is None:
        return -1
    return int(found.group(1))

# **STEP 2: Get all BED files and sort them by version number**
# Only names of the form Mus_musculus.GRCm38.<digits>.bed are considered; highest version first.
bed_files = [f for f in os.listdir(ensembl_dir) if re.match(r"Mus_musculus\.GRCm38\.\d+\.bed$", f)]
bed_files_sorted = sorted(bed_files, key=extract_version, reverse=True)

# **STEP 3: Iterate over sorted BED files**
# Every file is parsed exactly once and kept in this list, because the gene_name pass
# below needs the very same tables again (previously each file was read twice).
ensembl_beds = []

# Pass A: match the identifier against the Ensembl gene_id, highest version first
for bed_file in bed_files_sorted:
    bed_path = os.path.join(ensembl_dir, bed_file)

    # Read ensembl BED file (headerless, tab-separated)
    ensembl_bed = pd.read_csv(bed_path, sep='\t', header=None,
                              names=['chr', 'start', 'end', 'gene_name', 'gene_id', 'strand'])
    ensembl_beds.append(ensembl_bed)

    # Match by gene_id
    pos_lnc_ensembl = pd.merge(remained_lncRNA,
                               ensembl_bed[['chr', 'start', 'end', 'strand', 'gene_id']],
                               left_on='identifier',
                               right_on='gene_id', how='inner')
    results.append(pos_lnc_ensembl)
    # Only lncRNAs that are still unplaced are offered to the next file
    remained_lncRNA = remained_lncRNA[~remained_lncRNA['identifier'].isin(pos_lnc_ensembl['identifier'])]

# Pass B: runs only after every file has been tried by gene_id; now the identifier is
# compared with the Ensembl gene_name, again going through the files in the same order.
for ensembl_bed in ensembl_beds:
    # Match by gene_name
    pos_lnc_gene_name = pd.merge(remained_lncRNA,
                                 ensembl_bed[['chr', 'start', 'end', 'strand', 'gene_name']],
                                 left_on='identifier',
                                 right_on='gene_name', how='inner')
    results.append(pos_lnc_gene_name)
    remained_lncRNA = remained_lncRNA[~remained_lncRNA['identifier'].isin(pos_lnc_gene_name['identifier'])]

# Stack the hits of every source; for an identifier hit more than once the first row wins
pos_lnc = pd.concat(results, ignore_index=True)
pos_lnc = pos_lnc.drop_duplicates(subset=['identifier'])

# lncRNAs that none of the sources could place
unplaced_lncRNA = remained_lncRNA.drop_duplicates()
unplaced_lncRNA.to_csv('mouse_lnc_no_pos.csv', index=False)

# BED6 column layout: chr, start, end, name, score, strand (score is a constant 0)
bed6_columns = ['chr', 'start', 'end', 'identifier', 'score', 'strand']
pos_lnc_bed = pos_lnc.assign(score=0)[bed6_columns]

# Written tab-delimited, without header row and without the index
pos_lnc_bed.to_csv('mouse_lncRNA_0-based.bed', sep='\t', header=False, index=False)


- Before running the next cells, run `./get_overlap.sh` on each BED file written above to find duplicate (overlapping) lncRNA nodes:
- `./get_overlap.sh human_lncRNA_0-based.bed human_overlap.txt`
- `./get_overlap.sh mouse_lncRNA_0-based.bed mouse_overlap.txt`

In [None]:
import pandas as pd
from collections import defaultdict

# =========================
# Configuration
# =========================
species = "human"

# Every path below is derived from `species`, so switching species is a single edit
# and the inputs of one species cannot be mixed with the outputs of another.
bed_file = f"{species}_lncRNA_0-based.bed"         # Input BED6: chr start end identifier score strand
overlap_file = f"{species}_overlap.txt"            # Overlap pairs: A B (may contain AB/BA redundancy)
lpi_file = f"../../data/LPI/{species}/lpi.csv"     # LPI: lncRNA_id, protein_id (2 columns)

# lncRNAs without genomic positions (CSV with an `identifier` column)
no_pos_file = f"{species}_lnc_no_pos.csv"

# Output files
out_bed = f"../../data/LPI/{species}/lncRNA_dedup.bed"
out_mapping = f"../../data/LPI/{species}/lncRNA_mapping.csv"
out_lpi = f"../../data/LPI/{species}/lpi_dedup.csv"


# =========================
# Helper: chromosome sorting
# =========================
def chr_sort_key(c):
    """
    Build a sort key that puts chromosome names in their natural order.

    Every "chr" substring is stripped first, so 'chr1' and '1' get the same key.
    Purely numeric names come first, ordered by value; then X, then Y, then
    M/MT; every other name comes last, ordered by its remaining text.
    """
    name = str(c).replace("chr", "")
    if name.isdigit():
        return (0, int(name))
    named_keys = {"X": (1, 23), "Y": (1, 24), "M": (2, 25), "MT": (2, 25)}
    return named_keys.get(name, (3, name))


# =========================
# 0) Load lncRNAs without positions
# =========================
# Identifiers of lncRNAs without a genomic position, held as strings
df_no_pos = pd.read_csv(no_pos_file)
no_pos_ids = {*df_no_pos["identifier"].astype(str)}

print(f"[Info] Loaded {len(no_pos_ids)} lncRNAs without genomic positions from {no_pos_file}")


# =========================
# 1) Load BED
# =========================
# Headerless, tab-separated BED6 table
bed6_names = ["chr", "start", "end", "identifier", "score", "strand"]
df_bed = pd.read_csv(bed_file, sep="\t", names=bed6_names, header=None)

# The same table keyed by identifier, used for label-based lookups
bed_index = df_bed.set_index("identifier")


# =========================
# 2) Load overlap pairs
# =========================
# Read as two whitespace-separated columns named A and B
merge_pairs = pd.read_csv(overlap_file, sep=r"\s+", header=None, names=["A", "B"])
# A row pairing an identifier with itself adds nothing to the grouping
is_self_pair = merge_pairs["A"] == merge_pairs["B"]
merge_pairs = merge_pairs[~is_self_pair].copy()


# =========================
# 3) Union-Find to group overlaps
# =========================
# Union-find forest: maps every known item to its parent; a root is its own parent.
parent = {}

def find(x):
    """Return the root (representative) of the set that contains x.

    An item seen for the first time is registered as its own singleton set.
    The walk is iterative, so an arbitrarily long parent chain cannot hit the
    interpreter's recursion limit. Afterwards every item on the walked path
    points directly at the root (path compression).
    """
    parent.setdefault(x, x)

    # First walk: locate the root
    root = x
    while parent[root] != root:
        root = parent[root]

    # Second walk: re-point every item on the path to the root
    while parent[x] != root:
        parent[x], x = root, parent[x]

    return root

def union(x, y):
    """Merge the sets containing x and y; the root of x's set becomes the common root."""
    rx, ry = find(x), find(y)
    if rx != ry:
        parent[ry] = rx

# Join the two sides of every overlap pair; a pair listed as both A-B and B-A is harmless
for left_id, right_id in merge_pairs[["A", "B"]].itertuples(index=False, name=None):
    union(left_id, right_id)

# Collect every identifier that occurs in a pair under the root of its set
groups = defaultdict(set)
all_ids_in_pairs = set(merge_pairs["A"]) | set(merge_pairs["B"])
for member_id in all_ids_in_pairs:
    groups[find(member_id)].add(member_id)


# =========================
# 4) Build clusters using union coordinates (no representative needed)
# =========================
cluster_candidates = []  # each item: dict with chr/start/end/strand and member list

for group in groups.values():
    # Keep only members that have a row in the BED table. Sorting them up front fixes the
    # row order of group_df, so the chr/strand taken from its first row (and the warning
    # text) no longer depend on set iteration order when a group is inconsistent.
    members = sorted(m for m in group if m in bed_index.index)
    if not members:
        continue

    group_df = bed_index.loc[members]

    # The cluster spans from the smallest start to the largest end of its members
    u_chr = group_df["chr"].iloc[0]
    u_strand = group_df["strand"].iloc[0]
    u_start = int(group_df["start"].min())
    u_end = int(group_df["end"].max())

    # All members are expected to share chr and strand (the original note attributes this
    # to bedtools having been run with -s; not verifiable from here). Report the group
    # otherwise; it is still emitted with the chr/strand of its first member.
    if len(set(group_df["chr"])) != 1 or len(set(group_df["strand"])) != 1:
        print(f"[Warning] Inconsistent chr/strand in group: {members}")

    cluster_candidates.append({
        "chr": u_chr,
        "start": u_start,
        "end": u_end,
        "strand": u_strand,
        "members": members
    })


# =========================
# 5) Add singleton genes as clusters (not in any overlap group)
# =========================
# Identifiers present in the BED table that belong to no overlap group
merged_members = set().union(*groups.values())
all_ids = set(df_bed["identifier"])
singleton_ids = all_ids - merged_members

# Fetch the coordinates of all singletons in one pass over the table instead of doing
# several label lookups per identifier.
singleton_rows = bed_index.loc[
    bed_index.index.isin(singleton_ids), ["chr", "start", "end", "strand"]
]
singleton_info = {
    sid: (chrom, int(start), int(end), strand)
    for sid, chrom, start, end, strand in singleton_rows.itertuples(name=None)
}

# Order: chromosome (natural order), start, end, strand, and finally the identifier itself
singleton_ids_sorted = sorted(
    singleton_info,
    key=lambda sid: (
        chr_sort_key(singleton_info[sid][0]),
        singleton_info[sid][1],
        singleton_info[sid][2],
        str(singleton_info[sid][3]),
        sid
    )
)

# Every singleton becomes a one-member cluster with its own coordinates
for sid in singleton_ids_sorted:
    chrom, start, end, strand = singleton_info[sid]
    cluster_candidates.append({
        "chr": chrom,
        "start": start,
        "end": end,
        "strand": strand,
        "members": [sid]
    })


# =========================
# 6) Assign incremental lncRNA_id (stable ordering)
# =========================
# Fixed ordering: chromosome (natural order), start, end, strand, then the joined member list
cluster_candidates_sorted = sorted(
    cluster_candidates,
    key=lambda x: (
        chr_sort_key(x["chr"]),
        x["start"],
        x["end"],
        x["strand"],
        ";".join(x["members"])
    )
)

cluster_rows = []         # one BED6 row per cluster
mapping_long_rows = []    # one (lncRNA_id, member_id) row per cluster member
old_to_new = {}           # original identifier -> new lncRNA_id

for i, c in enumerate(cluster_candidates_sorted, start=1):
    # Ids are "l" followed by the 1-based position, zero-padded to at least six digits
    lncRNA_id = f"l{i:06d}"

    # Record BED row
    cluster_rows.append({
        "chr": c["chr"],
        "start": c["start"],
        "end": c["end"],
        "lncRNA_id": lncRNA_id,
        "score": 0,
        "strand": c["strand"]
    })

    # Record long mapping rows and dictionary mapping
    for m in c["members"]:
        mapping_long_rows.append({"lncRNA_id": lncRNA_id, "member_id": m})
        old_to_new[m] = lncRNA_id

# Columns are given explicitly so that both frames still have them when there are no
# clusters at all; a frame built from an empty list would otherwise have no columns.
df_cluster_bed = pd.DataFrame(
    cluster_rows, columns=["chr", "start", "end", "lncRNA_id", "score", "strand"]
)
df_mapping_long = pd.DataFrame(mapping_long_rows, columns=["lncRNA_id", "member_id"])


# =========================
# 7) Save BED6 and mapping table
# =========================
# Order rows by chromosome (natural order through chr_sort_key), then start, end, strand, id
bed_sort_columns = ["chr", "start", "end", "strand", "lncRNA_id"]
df_cluster_bed = df_cluster_bed.sort_values(
    by=bed_sort_columns,
    key=lambda column: column.map(chr_sort_key) if column.name == "chr" else column,
)
df_cluster_bed = df_cluster_bed.reset_index(drop=True)

# Mapping table: sorted, exact duplicate rows removed, index renumbered
df_mapping_long = df_mapping_long.sort_values(["lncRNA_id", "member_id"])
df_mapping_long = df_mapping_long.drop_duplicates().reset_index(drop=True)

# BED6 output: tab-separated, no header row, no index
bed6_out = df_cluster_bed[["chr", "start", "end", "lncRNA_id", "score", "strand"]]
bed6_out.to_csv(out_bed, sep="\t", header=False, index=False)

# Mapping output: CSV with header row
df_mapping_long.to_csv(out_mapping, index=False)


# =========================
# 8) Update LPI: remove no-pos lncRNAs and map identifiers -> lncRNA_id
# =========================
df_lpi = pd.read_csv(lpi_file)

# The first column is treated as the lncRNA identifier, the second as the protein
lnc_col = df_lpi.columns[0]
prot_col = df_lpi.columns[1]

# Step 8.1: drop interactions whose lncRNA has no genomic position (compared as strings)
has_no_pos = df_lpi[lnc_col].astype(str).isin(no_pos_ids)
df_lpi = df_lpi.loc[~has_no_pos].copy()

# Step 8.2: translate original identifiers to cluster ids; unknown identifiers stay as they are
df_lpi[lnc_col] = df_lpi[lnc_col].map(lambda old_id: old_to_new.get(old_id, old_id))

# Step 8.3: keep only rows whose lncRNA value is a cluster id of the deduplicated BED,
# which removes everything Step 8.2 could not translate
valid_cluster_ids = set(df_cluster_bed["lncRNA_id"])
is_valid = df_lpi[lnc_col].isin(valid_cluster_ids)
df_lpi = df_lpi.loc[is_valid].copy()

# Step 8.4: one row per (lncRNA, protein) pair
df_lpi = df_lpi.drop_duplicates(subset=[lnc_col, prot_col]).reset_index(drop=True)

# NOTE: assigning two names requires the LPI table to have exactly two columns
df_lpi.columns = ['lncRNA_id', 'protein']
df_lpi.to_csv(out_lpi, index=False)


print("✅ Done!")


In [None]:
import pandas as pd
from collections import defaultdict

# =========================
# Configuration
# =========================
species = "mouse"

# Every path below is derived from `species`, so switching species is a single edit
# and the inputs of one species cannot be mixed with the outputs of another.
bed_file = f"{species}_lncRNA_0-based.bed"         # Input BED6: chr start end identifier score strand
overlap_file = f"{species}_overlap.txt"            # Overlap pairs: A B (may contain AB/BA redundancy)
lpi_file = f"../../data/LPI/{species}/lpi.csv"     # LPI: lncRNA_id, protein_id (2 columns)

# lncRNAs without genomic positions (CSV with an `identifier` column)
no_pos_file = f"{species}_lnc_no_pos.csv"

# Output files
out_bed = f"../../data/LPI/{species}/lncRNA_dedup.bed"
out_mapping = f"../../data/LPI/{species}/lncRNA_mapping.csv"
out_lpi = f"../../data/LPI/{species}/lpi_dedup.csv"


# =========================
# Helper: chromosome sorting
# =========================
def chr_sort_key(c):
    """
    Build a sort key that puts chromosome names in their natural order.

    Every "chr" substring is stripped first, so 'chr1' and '1' get the same key.
    Purely numeric names come first, ordered by value; then X, then Y, then
    M/MT; every other name comes last, ordered by its remaining text.
    """
    name = str(c).replace("chr", "")
    if name.isdigit():
        return (0, int(name))
    named_keys = {"X": (1, 23), "Y": (1, 24), "M": (2, 25), "MT": (2, 25)}
    return named_keys.get(name, (3, name))


# =========================
# 0) Load lncRNAs without positions
# =========================
# Identifiers of lncRNAs without a genomic position, held as strings
df_no_pos = pd.read_csv(no_pos_file)
no_pos_ids = {*df_no_pos['identifier'].astype(str)}

print(f"[Info] Loaded {len(no_pos_ids)} lncRNAs without genomic positions from {no_pos_file}")


# =========================
# 1) Load BED
# =========================
# Headerless, tab-separated BED6 table
bed6_names = ["chr", "start", "end", "identifier", "score", "strand"]
df_bed = pd.read_csv(bed_file, sep="\t", names=bed6_names, header=None)

# The same table keyed by identifier, used for label-based lookups
bed_index = df_bed.set_index("identifier")


# =========================
# 2) Load overlap pairs
# =========================
# Read as two whitespace-separated columns named A and B
merge_pairs = pd.read_csv(overlap_file, sep=r"\s+", header=None, names=["A", "B"])
# A row pairing an identifier with itself adds nothing to the grouping
is_self_pair = merge_pairs["A"] == merge_pairs["B"]
merge_pairs = merge_pairs[~is_self_pair].copy()


# =========================
# 3) Union-Find to group overlaps
# =========================
# Union-find forest: maps every known item to its parent; a root is its own parent.
parent = {}

def find(x):
    """Return the root (representative) of the set that contains x.

    An item seen for the first time is registered as its own singleton set.
    The walk is iterative, so an arbitrarily long parent chain cannot hit the
    interpreter's recursion limit. Afterwards every item on the walked path
    points directly at the root (path compression).
    """
    parent.setdefault(x, x)

    # First walk: locate the root
    root = x
    while parent[root] != root:
        root = parent[root]

    # Second walk: re-point every item on the path to the root
    while parent[x] != root:
        parent[x], x = root, parent[x]

    return root

def union(x, y):
    """Merge the sets containing x and y; the root of x's set becomes the common root."""
    rx, ry = find(x), find(y)
    if rx != ry:
        parent[ry] = rx

# Join the two sides of every overlap pair; a pair listed as both A-B and B-A is harmless
for left_id, right_id in merge_pairs[["A", "B"]].itertuples(index=False, name=None):
    union(left_id, right_id)

# Collect every identifier that occurs in a pair under the root of its set
groups = defaultdict(set)
all_ids_in_pairs = set(merge_pairs["A"]) | set(merge_pairs["B"])
for member_id in all_ids_in_pairs:
    groups[find(member_id)].add(member_id)


# =========================
# 4) Build clusters using union coordinates
# =========================
cluster_candidates = []  # each item: dict with chr/start/end/strand and member list

for group in groups.values():
    # Keep only members that have a row in the BED table. Sorting them up front fixes the
    # row order of group_df, so the chr/strand taken from its first row (and the warning
    # text) no longer depend on set iteration order when a group is inconsistent.
    members = sorted(m for m in group if m in bed_index.index)
    if not members:
        continue

    group_df = bed_index.loc[members]

    # The cluster spans from the smallest start to the largest end of its members
    u_chr = group_df["chr"].iloc[0]
    u_strand = group_df["strand"].iloc[0]
    u_start = int(group_df["start"].min())
    u_end = int(group_df["end"].max())

    # All members are expected to share chr and strand (the original note attributes this
    # to bedtools having been run with -s; not verifiable from here). Report the group
    # otherwise; it is still emitted with the chr/strand of its first member.
    if len(set(group_df["chr"])) != 1 or len(set(group_df["strand"])) != 1:
        print(f"[Warning] Inconsistent chr/strand in group: {members}")

    cluster_candidates.append({
        "chr": u_chr,
        "start": u_start,
        "end": u_end,
        "strand": u_strand,
        "members": members
    })


# =========================
# 5) Add singleton genes as clusters (not in any overlap group)
# =========================
# Identifiers present in the BED table that belong to no overlap group
merged_members = set().union(*groups.values())
all_ids = set(df_bed["identifier"])
singleton_ids = all_ids - merged_members

# Fetch the coordinates of all singletons in one pass over the table instead of doing
# several label lookups per identifier.
singleton_rows = bed_index.loc[
    bed_index.index.isin(singleton_ids), ["chr", "start", "end", "strand"]
]
singleton_info = {
    sid: (chrom, int(start), int(end), strand)
    for sid, chrom, start, end, strand in singleton_rows.itertuples(name=None)
}

# Order: chromosome (natural order), start, end, strand, and finally the identifier itself
singleton_ids_sorted = sorted(
    singleton_info,
    key=lambda sid: (
        chr_sort_key(singleton_info[sid][0]),
        singleton_info[sid][1],
        singleton_info[sid][2],
        str(singleton_info[sid][3]),
        sid
    )
)

# Every singleton becomes a one-member cluster with its own coordinates
for sid in singleton_ids_sorted:
    chrom, start, end, strand = singleton_info[sid]
    cluster_candidates.append({
        "chr": chrom,
        "start": start,
        "end": end,
        "strand": strand,
        "members": [sid]
    })


# =========================
# 6) Assign incremental lncRNA_id (stable ordering)
# =========================
# Fixed ordering: chromosome (natural order), start, end, strand, then the joined member list
cluster_candidates_sorted = sorted(
    cluster_candidates,
    key=lambda x: (
        chr_sort_key(x["chr"]),
        x["start"],
        x["end"],
        x["strand"],
        ";".join(x["members"])
    )
)

cluster_rows = []         # one BED6 row per cluster
mapping_long_rows = []    # one (lncRNA_id, member_id) row per cluster member
old_to_new = {}           # original identifier -> new lncRNA_id

for i, c in enumerate(cluster_candidates_sorted, start=1):
    # Ids are "l" followed by the 1-based position, zero-padded to at least six digits
    lncRNA_id = f"l{i:06d}"

    # Record BED row
    cluster_rows.append({
        "chr": c["chr"],
        "start": c["start"],
        "end": c["end"],
        "lncRNA_id": lncRNA_id,
        "score": 0,
        "strand": c["strand"]
    })

    # Record long mapping rows and dictionary mapping
    for m in c["members"]:
        mapping_long_rows.append({"lncRNA_id": lncRNA_id, "member_id": m})
        old_to_new[m] = lncRNA_id

# Columns are given explicitly so that both frames still have them when there are no
# clusters at all; a frame built from an empty list would otherwise have no columns.
df_cluster_bed = pd.DataFrame(
    cluster_rows, columns=["chr", "start", "end", "lncRNA_id", "score", "strand"]
)
df_mapping_long = pd.DataFrame(mapping_long_rows, columns=["lncRNA_id", "member_id"])


# =========================
# 7) Save BED6 and mapping table
# =========================
# Order rows by chromosome (natural order through chr_sort_key), then start, end, strand, id
bed_sort_columns = ["chr", "start", "end", "strand", "lncRNA_id"]
df_cluster_bed = df_cluster_bed.sort_values(
    by=bed_sort_columns,
    key=lambda column: column.map(chr_sort_key) if column.name == "chr" else column,
)
df_cluster_bed = df_cluster_bed.reset_index(drop=True)

# Mapping table: sorted, exact duplicate rows removed, index renumbered
df_mapping_long = df_mapping_long.sort_values(["lncRNA_id", "member_id"])
df_mapping_long = df_mapping_long.drop_duplicates().reset_index(drop=True)

# BED6 output: tab-separated, no header row, no index
bed6_out = df_cluster_bed[["chr", "start", "end", "lncRNA_id", "score", "strand"]]
bed6_out.to_csv(out_bed, sep="\t", header=False, index=False)

# Mapping output: CSV with header row
df_mapping_long.to_csv(out_mapping, index=False)


# =========================
# 8) Update LPI: remove no-pos lncRNAs and map identifiers -> lncRNA_id
# =========================
df_lpi = pd.read_csv(lpi_file)

# The first column is treated as the lncRNA identifier, the second as the protein
lnc_col = df_lpi.columns[0]
prot_col = df_lpi.columns[1]

# Step 8.1: drop interactions whose lncRNA has no genomic position (compared as strings)
has_no_pos = df_lpi[lnc_col].astype(str).isin(no_pos_ids)
df_lpi = df_lpi.loc[~has_no_pos].copy()

# Step 8.2: translate original identifiers to cluster ids; unknown identifiers stay as they are
df_lpi[lnc_col] = df_lpi[lnc_col].map(lambda old_id: old_to_new.get(old_id, old_id))

# Step 8.3: keep only rows whose lncRNA value is a cluster id of the deduplicated BED,
# which removes everything Step 8.2 could not translate
valid_cluster_ids = set(df_cluster_bed["lncRNA_id"])
is_valid = df_lpi[lnc_col].isin(valid_cluster_ids)
df_lpi = df_lpi.loc[is_valid].copy()

# Step 8.4: one row per (lncRNA, protein) pair
df_lpi = df_lpi.drop_duplicates(subset=[lnc_col, prot_col]).reset_index(drop=True)

# NOTE: assigning two names requires the LPI table to have exactly two columns
df_lpi.columns = ['lncRNA_id', 'protein']
df_lpi.to_csv(out_lpi, index=False)


print("✅ Done!")