# Example from data

First, load the required libraries and define a helper function for reading the data.

In [1]:
# Load Python standard libraries
import csv
from pathlib import Path
import itertools
from collections import Counter

# Load our libraries
import catcoocc
import malign

# Load data
def read_data(data_path, fields, sep=" ", delimiter="\t"):
    """Read groups of symbol sequences from a delimited text file with a header row.

    Parameters
    ----------
    data_path : str or path-like
        Location of the file to read. The file is decoded as UTF-8.
    fields : sequence of str
        Names of the columns to extract, in the order they should appear in
        each returned entry. It is iterated once per row, so it must be
        re-iterable (e.g. a list or tuple).
    sep : str
        Separator between the symbols inside a single cell (default: a space).
    delimiter : str
        Column delimiter of the file (default: a tab).

    Returns
    -------
    list of list of list of str
        One entry per data row; each entry holds, for every requested field,
        the list of symbols obtained by splitting that cell on ``sep``.

    Raises
    ------
    KeyError
        If a requested field is not among the columns of a row being read.
    """
    # The encoding is given explicitly so that non-ASCII symbols are decoded
    # the same way regardless of the platform's default locale; `newline=""`
    # lets the csv module do its own newline handling, as its documentation
    # requires for correct parsing of quoted fields.
    with open(data_path, encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=delimiter)
        return [[row[field].split(sep) for field in fields] for row in reader]

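Now load the data, estimate a scoring matrix from the symbol co-occurrences, and compare the resulting alignments with the naive ones.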

In [2]:
# Imports needed only by this cell are gathered at its top so that the
# dependencies are visible before any work is done.
import random

# Build full path
# NOTE(review): this is an absolute, machine-specific location; adjust
# `base_path` to the local checkout of the repository before running.
base_path = Path("/home/tresoldi/src/malign")
full_path = base_path / "resources" / "cmudict" / "cmudict.tsv"

# Each entry of `pairs` holds two symbol sequences: orthography and segments.
pairs = read_data(full_path, ["Orthography", "Segments"])
print(f"Read {len(pairs)} for {full_path}")

# Draw a small sample for the demonstrations below. A private, seeded
# generator keeps the sample identical across runs without touching the
# global `random` state; the size is capped so small files do not fail.
sample = random.Random(42).sample(pairs, min(5, len(pairs)))

# First, demo with dumb alignments
for pair in sample:
    print(malign.tabulate_alms(malign.multi_align(pair, method="dumb")))

# Collect co-occurrences: every symbol of the first sequence is paired with
# every symbol of the second sequence of the same entry.
# TODO: add gaps
cooccs = list(
    itertools.chain.from_iterable(
        itertools.product(seq_a, seq_b) for seq_a, seq_b in pairs
    )
)

# Count the co-occurrences and the individual symbols of each side
counter_cc = Counter(cooccs)
counter_a = Counter(itertools.chain.from_iterable(seq_a for seq_a, _ in pairs))
counter_b = Counter(itertools.chain.from_iterable(seq_b for _, seq_b in pairs))

# Compute the approximate score for all co-occurrences: the joint count
# divided by the product of the counts of the two symbols involved.
scores = {
    coocc: count / (counter_a[coocc[0]] * counter_b[coocc[1]])
    for coocc, count in counter_cc.items()
}
if not scores:
    # Without any co-occurrence there is no mean or range to scale with.
    raise ValueError(f"No co-occurrences could be collected from {full_path}")

# Statistics used for scaling
values = scores.values()
mean = sum(values) / len(values) # use np
score_diff = max(values) - min(values)

# Scale over mean: center each score on the mean, divide by the observed
# range, and multiply by the constant factor `correct`.
correct = 10
if score_diff:
    scaled_scores = {
        pair: ((score - mean) / score_diff) * correct
        for pair, score in scores.items()
    }
else:
    # All scores are identical, so every centered score is zero; this avoids
    # dividing by a zero range.
    scaled_scores = {pair: 0.0 for pair in scores}

scorer = malign.ScoringMatrix(scaled_scores)

print("=============")

# Then, align the same sample again, this time with the "anw" method and the
# scoring matrix derived from the co-occurrences.
for pair in sample:
    print(malign.tabulate_alms(malign.multi_align(pair, matrix=scorer, method="anw")))

Read 134373 for /home/tresoldi/src/malign/resources/cmudict/cmudict.tsv
| Idx   | Seq   |   Score |  #0  |  #1  |  #2  |  #3  |  #4  |  #5  |  #6  |  #7  |
|-------|-------|---------|------|------|------|------|------|------|------|------|
| 0     | A     |   -0.79 |  B   |  E   |  R   |  G   |  L   |  U   |  N   |  D   |
| 0     | B     |   -0.79 |  b   |  ɝ   |  ɡ   |  l   |  ʌ   |  n   |  d   |  -   |
| Idx   | Seq   |   Score |  #0  |  #1  |  #2  |  #3  |  #4  |
|-------|-------|---------|------|------|------|------|------|
| 0     | A     |   -0.54 |  E   |  C   |  K   |  E   |  R   |
| 0     | B     |   -0.54 |  -   |  ɛ   |  k   |  ɝ   |  -   |
| Idx   | Seq   |   Score |  #0  |  #1  |  #2  |  #3  |  #4  |  #5  |
|-------|-------|---------|------|------|------|------|------|------|
| 0     | A     |   -0.75 |  P   |  A   |  N   |  D   |  E   |  R   |
| 0     | B     |   -0.75 |  p   |  æ   |  n   |  d   |  ɝ   |  -   |
| Idx   | Seq   |   Score |  #0  |  #1  |  #2  |  #3  |  #4 