# Program name: coordinate_extension
- Purpose: Extend chromosome coordinates of CpG islands from 850K chip by a value of choice (both input and output in .bed)
- Author: Vera Laub
- Last update: 04/28/2025

In [1]:
# 1. Retrieve methylation sites (CpG only).

# Read every line of the input BED file. The context manager closes the
# handle even if reading fails part-way, which the manual open()/close()
# pair did not guarantee.
with open("../files/filename.bed", "r") as file:
    meth_coordinates = file.readlines()

print(meth_coordinates[:10])

['chr18\t44946523\t44946725\n', 'chr1\t165325940\t165326142\n', 'chr5\t176238274\t176238476\n', 'chr18\t6976783\t6976985\n', 'chr11\t131992572\t131992774\n', 'chr4\t141715158\t141715360\n', 'chr5\t158104457\t158104659\n', 'chr1\t215355021\t215355223\n', 'chr13\t78973324\t78973526\n', 'chr8\t11203853\t11204055\n']


In [2]:
# Process data to remove whitespace and convert numeric data.
# Every raw line becomes a list of its whitespace-separated fields, with
# fields 1 and 2 (start and stop) converted to int.
# Blank lines (e.g. a trailing empty line at the end of the file) are skipped
# instead of raising an IndexError.
parsed_coordinates = []
for raw_line in meth_coordinates:
    fields = raw_line.split()     # split() without arguments also drops the newline
    if not fields:
        continue
    fields[1] = int(fields[1])
    fields[2] = int(fields[2])
    parsed_coordinates.append(fields)
meth_coordinates = parsed_coordinates

print(meth_coordinates[:10])

[['chr18', 44946523, 44946725], ['chr1', 165325940, 165326142], ['chr5', 176238274, 176238476], ['chr18', 6976783, 6976985], ['chr11', 131992572, 131992774], ['chr4', 141715158, 141715360], ['chr5', 158104457, 158104659], ['chr1', 215355021, 215355223], ['chr13', 78973324, 78973526], ['chr8', 11203853, 11204055]]


In [3]:
i = 200     # value to extend coordinates with, i can be changed depending on extension requirements.


# 2. Extend methylation coordinates with value of choice (i, see above).
# Each site [chrom, start, stop] becomes [chrom, start - i, stop + i].
# The start is clamped at 0 because a negative start is not a valid BED
# coordinate. The stop is NOT clamped: chromosome lengths are not known here,
# so an extended stop may run past the end of a chromosome.
meth_coordinates_extended = [
    [site[0], max(0, site[1] - i), site[2] + i]
    for site in meth_coordinates
]

print(meth_coordinates_extended[:10])

[['chr18', 44946323, 44946925], ['chr1', 165325740, 165326342], ['chr5', 176238074, 176238676], ['chr18', 6976583, 6977185], ['chr11', 131992372, 131992974], ['chr4', 141714958, 141715560], ['chr5', 158104257, 158104859], ['chr1', 215354821, 215355423], ['chr13', 78973124, 78973726], ['chr8', 11203653, 11204255]]


In [5]:
# 3. Save extended coordinates of methylation sites in new file

def reconstruct_extended_sites(coordinates):
    """Write one tab-separated BED line per entry of *coordinates*.

    Each entry must provide chromosome, start and stop at indices 0, 1 and 2.
    The lines are written to the module-level ``output_file`` handle, which
    has to be open for writing when this function is called.
    """
    for site in coordinates:
        output_file.write(f"{site[0]}\t{site[1]}\t{site[2]}\n")


# The context manager closes (and flushes) the file even if writing fails.
with open("../files/filename_+-200nt.bed", "w") as output_file:
    reconstruct_extended_sites(meth_coordinates_extended)

### Application of the program

In [14]:
# 1. Retrieve methylation sites (CpG only).

# Process data to remove whitespace and convert numeric data.
# Every line of the BED file becomes a list of its whitespace-separated
# fields, with fields 1 and 2 (start and stop) converted to int.
# The context manager closes the file even if parsing fails; blank lines are
# skipped instead of raising an IndexError.
meth_coordinates = []
with open("../files/2024-01-10_invHypermethylated_ChrStartStop.bed", "r") as file:
    for raw_line in file:
        fields = raw_line.split()     # split() without arguments also drops the newline
        if not fields:
            continue
        fields[1] = int(fields[1])
        fields[2] = int(fields[2])
        meth_coordinates.append(fields)


i = 1000     # value to extend coordinates with, i can be changed depending on extension requirements.


# 2. Extend methylation coordinates with value of choice (i, see above).
# Each site [chrom, start, stop] becomes [chrom, start - i, stop + i].
# The start is clamped at 0 because a negative start is not a valid BED
# coordinate. The stop is NOT clamped: chromosome lengths are not known here,
# so an extended stop may run past the end of a chromosome.
meth_coordinates_extended = [
    [site[0], max(0, site[1] - i), site[2] + i]
    for site in meth_coordinates
]
    
    
# 3. Save extended coordinates of methylation sites in new file

def reconstruct_extended_sites(coordinates):
    """Write one tab-separated BED line per entry of *coordinates*.

    Each entry must provide chromosome, start and stop at indices 0, 1 and 2.
    The lines are written to the module-level ``output_file`` handle, which
    has to be open for writing when this function is called.
    """
    for site in coordinates:
        output_file.write(f"{site[0]}\t{site[1]}\t{site[2]}\n")


# The context manager closes (and flushes) the file even if writing fails.
with open("../files/2025-04-15_invHypermethylated_ChrStartStop_+-1000nt.bed", "w") as output_file:
    reconstruct_extended_sites(meth_coordinates_extended)

### Application of the program

In [40]:
# Load the ';'-separated export and keep only the columns needed downstream
with open('../data_raw/RNBeads_Export_EPIC-Daten_Manuscript-Buhlmann_2025_04_18.csv', 'r') as file:
    lines = file.readlines()  # materialise as a list so the header can be sliced off

# Skip the header row (lines[0]) and split every remaining record on ';'
records = (row.strip().split(';') for row in lines[1:])

# Keep columns 1, 3 and 4 of each record, in that order
# (judging by the printed sample these are a decimal value, a chromosome name
# and a position -- confirm against the export's header row)
meth_coordinates = [[fields[1], fields[3], fields[4]] for fields in records]

print(meth_coordinates[:10])

[['0.542788', '11', '28984905'], ['0.450184', '4', '131400334'], ['0.441475', '6', '30431596'], ['0.414939', 'X', '102531708'], ['0.407380', '5', '160974837'], ['0.407169', '6', '46425781'], ['0.405515', '11', '28978110'], ['0.405280', '16', '82660727'], ['0.397828', '11', '28699290'], ['0.395300', '10', '23481776']]


In [41]:
# Turn every [value, chromosome, position] entry into
# [value, 'chr' + chromosome, position, position + 1] and drop entries whose
# position is not a plain digit string.

cleaned = []
for entry in meth_coordinates:
    # Prefix the chromosome identifier first, so that rejected entries are
    # reported with the 'chr' prefix already applied
    entry[1] = 'chr' + entry[1]

    if not entry[2].isdigit():
        print(f"Removing bad entry: {entry}")
        continue

    # Stop coordinate is start + 1; stored as a string like the start itself
    # NOTE(review): whether the exported position is 0- or 1-based is not
    # visible here -- confirm that [pos, pos + 1) is the intended BED interval
    entry.append(str(int(entry[2]) + 1))
    cleaned.append(entry)

meth_coordinates = cleaned

print(meth_coordinates[:10])

Removing bad entry: ['0.076022', 'chr', '']
Removing bad entry: ['0.054873', 'chr', '']
Removing bad entry: ['0.048530', 'chr', '']
Removing bad entry: ['0.041960', 'chr', '']
Removing bad entry: ['0.039425', 'chr', '']
Removing bad entry: ['0.039276', 'chr', '']
Removing bad entry: ['0.034419', 'chr', '']
Removing bad entry: ['0.025568', 'chr', '']
Removing bad entry: ['0.024130', 'chr', '']
Removing bad entry: ['0.023715', 'chr', '']
Removing bad entry: ['0.021188', 'chr', '']
Removing bad entry: ['0.018407', 'chr', '']
Removing bad entry: ['0.015827', 'chr', '']
Removing bad entry: ['0.015746', 'chr', '']
Removing bad entry: ['0.015112', 'chr', '']
Removing bad entry: ['0.013519', 'chr', '']
Removing bad entry: ['0.012846', 'chr', '']
Removing bad entry: ['0.006753', 'chr', '']
Removing bad entry: ['0.006248', 'chr', '']
Removing bad entry: ['0.004233', 'chr', '']
Removing bad entry: ['0.002728', 'chr', '']
Removing bad entry: ['0.001770', 'chr', '']
Removing bad entry: ['0.001249',

In [43]:
# Convert the textual columns of meth_coordinates to numbers, in place

# column index -> converter: value (col 0) to float, start/end (cols 2, 3) to int
column_casts = ((0, float), (2, int), (3, int))

for entry in meth_coordinates:
    for column, cast in column_casts:
        entry[column] = cast(entry[column])

print(meth_coordinates[:10])

[[0.542788, 'chr11', 28984905, 28984906], [0.450184, 'chr4', 131400334, 131400335], [0.441475, 'chr6', 30431596, 30431597], [0.414939, 'chrX', 102531708, 102531709], [0.40738, 'chr5', 160974837, 160974838], [0.407169, 'chr6', 46425781, 46425782], [0.405515, 'chr11', 28978110, 28978111], [0.40528, 'chr16', 82660727, 82660728], [0.397828, 'chr11', 28699290, 28699291], [0.3953, 'chr10', 23481776, 23481777]]


In [None]:
# Save filtered coordinates of methylation sites in new files

# Hypermethylated sites: keep entries whose value in column 0 is above 0.2
hypermethylated_sites = [site for site in meth_coordinates if site[0] > 0.2]

# One BED line (chrom, start, end) per retained site
with open('../files/2025-04-28_hypermethylatedInv_deltabeta>0.2_ChrStartStop.bed', 'w') as bed_file:
    bed_file.writelines(
        f"{site[1]}\t{site[2]}\t{site[3]}\n" for site in hypermethylated_sites
    )

# Hypomethylated sites: keep entries whose value in column 0 is below -0.2
hypomethylated_sites = [site for site in meth_coordinates if site[0] < -0.2]

# One BED line (chrom, start, end) per retained site
with open('../files/2025-04-28_hypomethylatedInv_deltabeta<-0.2_ChrStartStop.bed', 'w') as bed_file:
    bed_file.writelines(
        f"{site[1]}\t{site[2]}\t{site[3]}\n" for site in hypomethylated_sites
    )

In [57]:
# Extend coordinates of hyper- and hypomethylated sites

i = 1000     # value to extend coordinates with, i can be changed depending on extension requirements.

# Entries are indexed as [value, chrom, start, stop]; the extended lists hold
# [chrom, start - i, stop + i].
# The start is clamped at 0 because a negative start is not a valid BED
# coordinate. The stop is NOT clamped: chromosome lengths are not known here,
# so an extended stop may run past the end of a chromosome.

# Extend methylation coordinates for hypermethylated sites.
hypermethylated_coordinates_extended = [
    [site[1], max(0, site[2] - i), site[3] + i]
    for site in hypermethylated_sites
]

print(hypermethylated_coordinates_extended[:10])

# Extend methylation coordinates for hypomethylated sites.
hypomethylated_coordinates_extended = [
    [site[1], max(0, site[2] - i), site[3] + i]
    for site in hypomethylated_sites
]

print(hypomethylated_coordinates_extended[:10])

[['chr11', 28983905, 28985906], ['chr4', 131399334, 131401335], ['chr6', 30430596, 30432597], ['chrX', 102530708, 102532709], ['chr5', 160973837, 160975838], ['chr6', 46424781, 46426782], ['chr11', 28977110, 28979111], ['chr16', 82659727, 82661728], ['chr11', 28698290, 28700291], ['chr10', 23480776, 23482777]]
[['chr8', 6577451, 6579452], ['chr5', 76170312, 76172313], ['chr9', 74289826, 74291827], ['chr6', 1409267, 1411268], ['chr4', 120664941, 120666942], ['chr20', 20800233, 20802234], ['chr6', 25874233, 25876234], ['chr7', 30386954, 30388955], ['chr20', 46911835, 46913836], ['chr1', 155930763, 155932764]]


In [58]:
def reconstruct_extended_sites(coordinates):
    """Write one tab-separated BED line per entry of *coordinates*.

    Each entry must provide chromosome, start and stop at indices 0, 1 and 2.
    The lines are written to the module-level ``output_file`` handle, which
    has to be open for writing when this function is called.
    """
    for site in coordinates:
        output_file.write(f"{site[0]}\t{site[1]}\t{site[2]}\n")


# Save extended coordinates of hypermethylated sites in new file.
# The context manager closes (and flushes) the file even if writing fails.
with open("../files/2025-04-28_hypermethylatedInv_deltabeta>0.2_ChrStartStop+-1000nt.bed", "w") as output_file:
    reconstruct_extended_sites(hypermethylated_coordinates_extended)


# Save extended coordinates of hypomethylated sites in new file.
# Rebinding output_file here is what redirects the function to the second file.
with open("../files/2025-04-28_hypomethylatedInv_deltabeta<-0.2_ChrStartStop+-1000nt.bed", "w") as output_file:
    reconstruct_extended_sites(hypomethylated_coordinates_extended)