In [1]:
import re

**Pattern to match**
* Regular expressions are customarily written as raw strings (`r"..."`) so that backslashes are passed to the regex engine unchanged.

In [2]:
# Compiled pattern for SNP identifiers of the form "<chr>:<pos>_<ref>_<alt>".
# Adjacent raw-string pieces are concatenated by Python into one pattern.
pattern = re.compile(
    r"([0-9]+)"   # group 1: "chr" digits
    r":"
    r"([0-9]+)"   # group 2: "pos" digits
    r"_([ACGT])"  # group 3: "ref" base
    r"_([ACGT])"  # group 4: "alt" base
)
pattern

re.compile(r'([0-9]+):([0-9]+)_([ACGT])_([ACGT])', re.UNICODE)

**String search**
* If the pattern is found, `match` holds a match object with the results. Otherwise, `match` is `None`.

In [3]:
# Case where the search succeeds.
# The identifier is stored as `snp_id` so the built-in id() is not shadowed.
snp_id = "1:100_A_G"
match = pattern.search(snp_id)
if match:
    # group(0) is the full text matched by the pattern.
    print(match.group(0))
else:
    print("Query failed.")

1:100_A_G


In [4]:
# Case where the search fails: "rs100" has no "<digits>:<digits>_<base>_<base>" part,
# so pattern.search returns None and the else branch runs.
# The identifier is stored as `snp_id` so the built-in id() is not shadowed.
snp_id = "rs100"
match = pattern.search(snp_id)
if match:
    print(match.group(0))
else:
    print("Query failed.")

Query failed.


**Extract match groups**
* `match.group(0)` contains the entire match.
* `match.group(i)` contains the i-th capture group, numbered from 1 in the order the groups appear in the pattern.

In [5]:
# Search an identifier that fits the pattern so that `match` is a match object
# whose groups can be extracted in the next cell.
# The identifier is stored as `snp_id` so the built-in id() is not shadowed.
snp_id = "1:100_A_G"
match = pattern.search(snp_id)

In [6]:
# Build a record from the match: group(0) is the whole matched identifier and
# groups 1-4 are the "chr", "pos", "ref" and "alt" components, in pattern order.
snp = {
    "id": match.group(0),
    "chr": match.group(1),
    "pos": match.group(2),
    "ref": match.group(3),
    "alt": match.group(4),  # fourth group; group(3) would repeat the "ref" value
}
snp

{'id': '1:100_A_G', 'chr': '1', 'pos': '100', 'ref': 'A', 'alt': 'G'}

**Find all**
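* Because the pattern has several capture groups, the result is a list of tuples: one tuple per match, holding that match's groups in order.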

In [7]:
# Three space-separated identifiers; "rs123" does not fit the pattern, so
# findall yields hits only for the first two.
ids = "1:100_A_G 2:200_C_T rs123"
# Each hit is a tuple of the four captured groups for one match.
hits = pattern.findall(ids)

In [8]:
# Convert each findall hit, a 4-tuple of captured strings, into a SNP record.
# findall returns only the groups, so the "id" is rebuilt from its parts.
# The comprehension variable is `chrom` rather than `chr` so the built-in
# chr() is not shadowed, and no loop variables leak into the notebook namespace.
snps = [
    {
        "id": f"{chrom}:{pos}_{ref}_{alt}",
        "chr": chrom,
        "pos": pos,
        "ref": ref,
        "alt": alt,
    }
    for chrom, pos, ref, alt in hits
]

In [9]:
# Display the list of SNP records as the cell's output.
snps

[{'id': '1:100_A_G', 'chr': '1', 'pos': '100', 'ref': 'A', 'alt': 'G'},
 {'id': '2:200_C_T', 'chr': '2', 'pos': '200', 'ref': 'C', 'alt': 'T'}]

**String substitution**
* "\1" is replaced by the contents of the first group, "\2" by the contents of the second, etc.

In [10]:
# `orig` captures four groups; the replacement template `reformat` refers back
# to them as \1 through \4 and joins them with colons after a "chr" prefix.
orig = r"([0-9]+):([0-9]+)_([ACGT])_([ACGT])"
reformat = r"chr\1:\2:\3:\4"
re.sub(pattern=orig, repl=reformat, string="1:100_A_G")

'chr1:100:A:G'