In [None]:
import pandas as pd
import numpy as np

# Load the BED file (local)
bed_file_path = "./129S1_flye_gcpp.RM.bed"
bed_df = pd.read_csv(bed_file_path, sep="\t", header=None, comment='#')

columns = [
    "chrom", "start", "end", "repeat_name", "repeat_class_family", "score",
    "divergence", "deletions", "insertions", "query_left", "strand",
    "repeat_start", "repeat_end", "repeat_left", "ID", "SW_score"
]

bed_df.columns = columns

# Display the first few rows and the number of columns to inspect structure
bed_df.head(), bed_df.shape

(               chrom  start   end    repeat_name repeat_class_family  score  \
 0  contig_1000|arrow      1   331     RLTR13D3A1            LTR/ERVK   1822   
 1  contig_1000|arrow    400  1287  ERVB2_1A-I_MM            LTR/ERVK   1065   
 2  contig_1000|arrow   1411  1504       (GTTTT)n       Simple_repeat     43   
 3  contig_1000|arrow   1505  1652        B1_Mus1            SINE/Alu    902   
 4  contig_1000|arrow   1539  1669     ORR1B1-int       LTR/ERVL-MaLR    602   
 
    divergence  deletions  insertions query_left strand repeat_start  \
 0        11.7        2.1         4.3    (19162)      C        (594)   
 1        30.6        4.6         2.9    (18206)      C        (126)   
 2        15.9        3.2         1.0    (17989)      +            1   
 3        13.0        0.0         1.4    (17841)      C          (2)   
 4        20.2        0.0         1.6    (17824)      C        (750)   
 
    repeat_end repeat_left  ID  SW_score  
 0         357          34   1        16 

In [51]:
# Filter for LINE/L1 elements
l1_elements = bed_df[bed_df["repeat_class_family"].str.contains("LINE/L1")]

print(l1_elements.head())

                chrom  start   end repeat_name repeat_class_family  score  \
5   contig_1000|arrow   1918  2483     L1_Mur1             LINE/L1   3769   
7   contig_1000|arrow   2562  2989     L1_Mur1             LINE/L1   2760   
8   contig_1000|arrow   2987  3313     Lx3_Mus             LINE/L1   1955   
10  contig_1000|arrow   3524  3856     Lx3_Mus             LINE/L1   6353   
12  contig_1000|arrow   3889  4618     Lx3_Mus             LINE/L1   6353   

    divergence  deletions  insertions query_left strand repeat_start  \
5          8.6        0.2         1.2    (17010)      C       (1629)   
7          9.6        1.6         0.0    (16504)      C       (2165)   
8         11.5        2.7         0.0    (16180)      +         5221   
10        11.5        0.7         2.1    (15637)      +         5587   
12        11.5        0.7         2.1    (14875)      +         5921   

    repeat_end repeat_left  ID  SW_score  
5         5197        4638   6        16  
7         4661    

In [37]:

# Check distinct repeat names to identify 'young' L1s
unique_l1_names = l1_elements["repeat_name"].unique()
unique_l1_names.sort()  # sort for readability
unique_l1_names

array(['HAL1', 'HAL1M8', 'HAL1ME', 'HAL1b', 'L1Lx_I', 'L1Lx_II',
       'L1Lx_III', 'L1Lx_IV', 'L1M', 'L1M1', 'L1M2', 'L1M2a', 'L1M2c',
       'L1M3', 'L1M3a', 'L1M3b', 'L1M3c', 'L1M3d', 'L1M3de', 'L1M3e',
       'L1M3f', 'L1M4', 'L1M4a1', 'L1M4a2', 'L1M4b', 'L1M4c', 'L1M5',
       'L1M6', 'L1M6B', 'L1M7', 'L1M8', 'L1MA10', 'L1MA4', 'L1MA4A',
       'L1MA5', 'L1MA5A', 'L1MA6', 'L1MA7', 'L1MA8', 'L1MA9', 'L1MB1',
       'L1MB2', 'L1MB3', 'L1MB4', 'L1MB5', 'L1MB7', 'L1MB8', 'L1MC',
       'L1MC1', 'L1MC2', 'L1MC3', 'L1MC4', 'L1MC4a', 'L1MC5', 'L1MC5a',
       'L1MCa', 'L1MCb', 'L1MCc', 'L1MD', 'L1MD1', 'L1MD2', 'L1MD3',
       'L1MDa', 'L1MDb', 'L1ME1', 'L1ME2', 'L1ME2z', 'L1ME3', 'L1ME3A',
       'L1ME3B', 'L1ME3C', 'L1ME3Cz', 'L1ME3D', 'L1ME3E', 'L1ME3F',
       'L1ME3G', 'L1ME4a', 'L1ME4b', 'L1ME4c', 'L1ME5', 'L1MEa', 'L1MEb',
       'L1MEc', 'L1MEd', 'L1MEf', 'L1MEg', 'L1MEg1', 'L1MEg2', 'L1MEh',
       'L1MEi', 'L1MEj', 'L1MdA_I', 'L1MdA_II', 'L1MdA_III', 'L1MdA_IV',
       'L1MdA_V

In [59]:
# Filter: young L1s (e.g., L1Md_T, L1Md_A, etc.)
#young_l1_families = []
#pattern = "|".join(young_l1_families)
young_l1 = l1_elements[l1_elements["repeat_name"].str.contains(r'\bL1MdGf_I\b', regex=True)]
young_l1 = young_l1[np.abs(l1_elements["start"]-l1_elements["end"])>5800]


# Display a preview
print(young_l1.head())

                   chrom    start      end repeat_name repeat_class_family  \
4624   contig_1002|arrow   930099   936637    L1MdGf_I             LINE/L1   
5332   contig_1002|arrow  1369379  1376303    L1MdGf_I             LINE/L1   
10800  contig_1002|arrow  5188681  5195783    L1MdGf_I             LINE/L1   
10862  contig_1009|arrow     4798    11321    L1MdGf_I             LINE/L1   
11041  contig_1009|arrow   173923   180401    L1MdGf_I             LINE/L1   

       score  divergence  deletions  insertions query_left strand  \
4624   42036         1.1        0.4         0.9  (4271769)      C   
5332   39635         0.6        0.1         0.1  (3832103)      +   
10800  41283         0.7        0.1         0.1    (12623)      +   
10862  39282         1.8        0.0         0.0  (1338090)      C   
11041  41190         2.3        0.2         0.1  (1169010)      C   

      repeat_start  repeat_end repeat_left    ID  SW_score  
4624           (0)        6579          92  4014       

  young_l1 = young_l1[np.abs(l1_elements["start"]-l1_elements["end"])>5800]


In [None]:
# Output to bedfile
young_l1.to_csv('young_l1.bed', sep='\t', header=False, index=False)

bedgraph_L1 = young_l1[['chrom', 'start', 'end',]]
bedgraph_L1['chrom'] = "chrX"
bedgraph_L1['count'] = np.abs(l1_elements["start"]-l1_elements["end"])

bedgraph_L1.to_csv('young_l1.bedgraph', sep='\t', header=False, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bedgraph_L1['chrom'] = "chrX"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bedgraph_L1['count'] = np.abs(l1_elements["start"]-l1_elements["end"])
