# Program name: coordinate_extension
- Purpose: Extend chromosome coordinates of CpG islands from 850K chip to a value of choice (both input and output in .bed)
- Author: Vera Laub
- Last update: 04/15/2025

In [21]:
# 1. Retrieve methylation sites (CpG only).

# Read every line of the input BED file. The context manager closes the
# file even if reading raises, which the manual open()/close() pair did not.
with open("../data/regulatory_elements/chromhmm_E11_Limb_mm10.bed", "r") as file:
    meth_coordinates = file.readlines()

# Show the first ten raw lines as a sanity check.
print(meth_coordinates[:10])

['chr1\t3531624\t3531843\t27\n', 'chr1\t3670619\t3671074\t34\n', 'chr1\t3671654\t3672156\t45\n', 'chr1\t4491701\t4493673\t165\n', 'chr1\t4496947\t4497608\t47\n', 'chr1\t4571641\t4572075\t44\n', 'chr1\t4689184\t4689397\t24\n', 'chr1\t4785376\t4785814\t49\n', 'chr1\t4807559\t4808103\t73\n', 'chr1\t4857465\t4858372\t83\n']


In [22]:
# Process data to remove whitespace and convert numeric data.
# Each raw line becomes a list of its whitespace-separated fields, with the
# second and third fields (start and end) converted to int.
parsed_coordinates = []
for raw_line in meth_coordinates:
    # split() without arguments already ignores leading/trailing whitespace,
    # so no separate strip() is needed.
    fields = raw_line.split()
    if not fields:
        # A blank line (e.g. a trailing newline at the end of the file) has no
        # fields; indexing it would raise IndexError, so skip it.
        continue
    fields[1] = int(fields[1])
    fields[2] = int(fields[2])
    parsed_coordinates.append(fields)
meth_coordinates = parsed_coordinates
print(meth_coordinates[:10])

[['chr1', 3531624, 3531843, '27'], ['chr1', 3670619, 3671074, '34'], ['chr1', 3671654, 3672156, '45'], ['chr1', 4491701, 4493673, '165'], ['chr1', 4496947, 4497608, '47'], ['chr1', 4571641, 4572075, '44'], ['chr1', 4689184, 4689397, '24'], ['chr1', 4785376, 4785814, '49'], ['chr1', 4807559, 4808103, '73'], ['chr1', 4857465, 4858372, '83']]


In [23]:
## Store coordinates of peak center in new list
# Take the first three elements (chromosome, start, end) from each sublist
meth_coordinate_center = [sublist[:3] for sublist in meth_coordinates]

# Output the new list of lists
print(meth_coordinate_center[:10])

# Calculate peak centers as the midpoint of start and end.
# Floor division keeps the arithmetic in integers (no float round trip) and
# gives the same result as int((start + end) / 2) for non-negative coordinates.
meth_coordinate_center = [
    [chrom, (start + end) // 2] for chrom, start, end in meth_coordinate_center
]

# Output result
print(meth_coordinate_center[:10])

[['chr1', 3531624, 3531843], ['chr1', 3670619, 3671074], ['chr1', 3671654, 3672156], ['chr1', 4491701, 4493673], ['chr1', 4496947, 4497608], ['chr1', 4571641, 4572075], ['chr1', 4689184, 4689397], ['chr1', 4785376, 4785814], ['chr1', 4807559, 4808103], ['chr1', 4857465, 4858372]]
[['chr1', 3531733], ['chr1', 3670846], ['chr1', 3671905], ['chr1', 4492687], ['chr1', 4497277], ['chr1', 4571858], ['chr1', 4689290], ['chr1', 4785595], ['chr1', 4807831], ['chr1', 4857918]]


In [24]:
i = 1000     # value to extend coordinates with, i can be changed depending on extension requirements.
meth_coordinates_extended = []     # new list to store information of extended sites


# 2. Extend methylation coordinates with value of choice (i, see above).
# Each entry of meth_coordinate_center is [chromosome, center]; the extended
# window spans i positions on either side of the center.
for chrom, center in meth_coordinate_center:
    # Clamp at 0: a center closer than i to the chromosome start would
    # otherwise produce a negative start coordinate.
    start = max(0, center - i)
    # NOTE: stop is not clamped to the chromosome length; that would require
    # chromosome sizes, which this notebook does not load.
    stop = center + i
    meth_coordinates_extended.append([chrom, start, stop])

print(meth_coordinates_extended[:10])

[['chr1', 3530733, 3532733], ['chr1', 3669846, 3671846], ['chr1', 3670905, 3672905], ['chr1', 4491687, 4493687], ['chr1', 4496277, 4498277], ['chr1', 4570858, 4572858], ['chr1', 4688290, 4690290], ['chr1', 4784595, 4786595], ['chr1', 4806831, 4808831], ['chr1', 4856918, 4858918]]


In [25]:
# Define standard chromosomes.
# The input is an mm10 (mouse) annotation, and mouse has 19 autosomes, so the
# range must run to 20 (exclusive) to include chr19. range(1, 19) stopped at
# chr18 and silently dropped every chr19 entry.
standard_chromosomes = {f'chr{number}' for number in range(1, 20)} | {'chrX', 'chrY'}

# Filter the list: keep only entries whose chromosome is a standard one
meth_coordinates_extended = [entry for entry in meth_coordinates_extended if entry[0] in standard_chromosomes]

# Output result
print(meth_coordinates_extended[:10])

[['chr1', 3530733, 3532733], ['chr1', 3669846, 3671846], ['chr1', 3670905, 3672905], ['chr1', 4491687, 4493687], ['chr1', 4496277, 4498277], ['chr1', 4570858, 4572858], ['chr1', 4688290, 4690290], ['chr1', 4784595, 4786595], ['chr1', 4806831, 4808831], ['chr1', 4856918, 4858918]]


In [26]:
# 3. Save extended coordinates of methylation site in new file

def reconstruct_extended_sites(coordinates, handle=None):
    """Write each entry of *coordinates* as one tab-separated line.

    coordinates: iterable of sequences; items 0, 1 and 2 of each entry
        (chromosome, start, stop) are written, followed by a newline.
    handle: writable text file to write to. When omitted, the module-level
        ``output_file`` is used, as in the original one-argument call.
    """
    if handle is None:
        handle = output_file
    handle.writelines(
        f"{entry[0]}\t{entry[1]}\t{entry[2]}\n" for entry in coordinates
    )


# The context manager closes (and flushes) the file even if writing raises.
with open("../data/regulatory_elements/chromhmm_E11_Limb_mm10_heterochromatin.bed", "w") as output_file:
    reconstruct_extended_sites(meth_coordinates_extended, output_file)

### Application of program

In [14]:
# 1. Retrieve methylation sites (CpG only).

# Read every line of the input BED file. The context manager closes the
# file even if reading raises.
with open("../files/2024-01-10_invHypermethylated_ChrStartStop.bed", "r") as file:
    meth_coordinates = file.readlines()

# Process data to remove whitespace and convert numeric data.
# Each raw line becomes a list of its whitespace-separated fields, with the
# second and third fields (start and end) converted to int.
parsed_coordinates = []
for raw_line in meth_coordinates:
    # split() without arguments already discards surrounding whitespace.
    fields = raw_line.split()
    if not fields:
        # Skip blank lines; indexing an empty field list would raise IndexError.
        continue
    fields[1] = int(fields[1])
    fields[2] = int(fields[2])
    parsed_coordinates.append(fields)
meth_coordinates = parsed_coordinates


i = 1000     # value to extend coordinates with, i can be changed depending on extension requirements.
meth_coordinates_extended = []     # new list to store information of extended sites


# 2. Extend methylation coordinates with value of choice (i, see above).
# Unlike a center-based window, this widens the original interval: i is
# subtracted from the start and added to the stop of every site.
for entry in meth_coordinates:
    chrom = entry[0]
    # Clamp at 0: a site closer than i to the chromosome start would
    # otherwise get a negative start coordinate.
    start = max(0, entry[1] - i)
    # NOTE: stop is not clamped to the chromosome length (no chromosome
    # sizes are loaded here).
    stop = entry[2] + i
    meth_coordinates_extended.append([chrom, start, stop])
    
    
# 3. Save extended coordinates of methylation site in new file

def reconstruct_extended_sites(coordinates, handle=None):
    """Write each entry of *coordinates* as one tab-separated line.

    coordinates: iterable of sequences; items 0, 1 and 2 of each entry
        (chromosome, start, stop) are written, followed by a newline.
    handle: writable text file to write to. When omitted, the module-level
        ``output_file`` is used, as in the original one-argument call.
    """
    if handle is None:
        handle = output_file
    for entry in coordinates:
        handle.write(f"{entry[0]}\t{entry[1]}\t{entry[2]}\n")


# The context manager closes (and flushes) the file even if writing raises.
with open("../files/2025-04-15_invHypermethylated_ChrStartStop_+-1000nt.bed", "w") as output_file:
    reconstruct_extended_sites(meth_coordinates_extended, output_file)

# Further file workup

In [39]:
# 1. Retrieve data from input file
file_path = "../data/regulatory_elements/chromhmm_E11_Limb_mm10.bed"

# 2. Apply filter for wanted feature
# List to store the results as [chromosome, start, end]
query_coords = []

# Open and read the file line by line
with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split('\t')
        if len(parts) < 4:
            # Blank or truncated line without an annotation column; indexing
            # parts[3] would raise IndexError, so skip it.
            continue
        if "Weak promoter" in parts[3]:  # Check for the keyword in the annotation column
            query_coords.append([parts[0], int(parts[1]), int(parts[2])])

# Output the filtered coordinates
for coord in query_coords:
    print(coord)

# 3. Save filtered coordinates of methylation site in new file

def reconstruct_filtered_sites(query_coords, handle=None):
    """Write each entry of *query_coords* as one tab-separated line.

    query_coords: iterable of sequences; items 0, 1 and 2 of each entry
        (chromosome, start, end) are written, followed by a newline.
    handle: writable text file to write to. When omitted, the module-level
        ``output_file`` is used, as in the original one-argument call.
    """
    if handle is None:
        handle = output_file
    for coord in query_coords:
        handle.write("\t".join(str(value) for value in coord[:3]) + "\n")


# The context manager closes (and flushes) the file even if writing raises.
with open("../data/regulatory_elements/chromhmm_E11_Limb_mm10_weak-promoter.bed", "w") as output_file:
    reconstruct_filtered_sites(query_coords, output_file)

['chr1', 4571200, 4571400]
['chr1', 4572000, 4572200]
['chr1', 4785000, 4786200]
['chr1', 4807400, 4807600]
['chr1', 4808200, 4808800]
['chr1', 4857200, 4857400]
['chr1', 4857600, 4858000]
['chr1', 4858200, 4859000]
['chr1', 5083000, 5083600]
['chr1', 6213800, 6214600]
['chr1', 6215200, 6215800]
['chr1', 6358800, 6359000]
['chr1', 6382800, 6383400]
['chr1', 6441200, 6441600]
['chr1', 7088800, 7089000]
['chr1', 7089600, 7090000]
['chr1', 7206200, 7206800]
['chr1', 7397600, 7398800]
['chr1', 9544800, 9545200]
['chr1', 9545600, 9546400]
['chr1', 9699600, 9701400]
['chr1', 9747600, 9747800]
['chr1', 9748400, 9748600]
['chr1', 9797600, 9798000]
['chr1', 9798200, 9799000]
['chr1', 9848000, 9848800]
['chr1', 9942600, 9942800]
['chr1', 9943000, 9943200]
['chr1', 9944000, 9944600]
['chr1', 10009200, 10009400]
['chr1', 10037000, 10037600]
['chr1', 10038200, 10039000]
['chr1', 10039200, 10039800]
['chr1', 10040000, 10040400]
['chr1', 10231600, 10232200]
['chr1', 10233400, 10234000]
['chr1', 10993