# Demonstration using data

Run demonstration code on the various resources distributed with `malign`.

In [1]:
# Import Python standard libraries
import csv
import itertools
from collections import Counter
from pathlib import Path

# Import 3rd-party libraries
import catcoocc
import malign
from sklearn.preprocessing import StandardScaler
import numpy as np


# Build resource path
RES_PATH = Path("../resources")

def read_data(filepath, fields, sep=" ", delimiter="\t"):
    """
    Read sequence pairs from CSV/TSV files.
    """

    pairs = []
    
    with open(filepath, encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=delimiter)
        for row in reader:
            pair = [row[field].split(sep) for field in fields]
            pairs.append(pair)
            
    return pairs

# TODO: have independent method, also using "windows"
# TODO: not actually using the cooccs library
# TODO: perform some kind of correction for low frequency, bonferroni perhaps?
def infer_matrix(pairs, add_gaps=True):
    """
    Infers a scoring matrix from a collection of pairs.
    """
    
    # Add gaps to mislength if necessary and requested
    # TODO: use dumb alignments?
    # TODO: make independent function
    if add_gaps:
        new_pairs = []
        for pair in pairs:
            len_a, len_b = len(pair[0]), len(pair[1])
            if len_a < len_b:
                new_pairs.append([pair[0] + ["-"]*(len_b-len_a), pair[1]])
            elif len_a > len_b:
                new_pairs.append([pair[0], pair[1] + ["-"]*(len_a-len_b)])
            else:
                new_pairs.append(pair)
                
        pairs = new_pairs
    
    # Collect co-occurrences over the whole sequences
    cooccs = itertools.chain(*[itertools.product(seq_a, seq_b) for seq_a, seq_b in pairs])

    # Compute the approximate-score for all co-occurrences
    counter_cc = Counter(cooccs)
    counter_a = Counter(itertools.chain(*[seq_a for seq_a, _ in pairs]))
    counter_b = Counter(itertools.chain(*[seq_b for _, seq_b in pairs]))

    # Collect scores as mean
    abs_scores = {}
    for (coocc_a, coocc_b), count in counter_cc.items():
        score = count / (counter_a[coocc_a] * counter_b[coocc_b])
        abs_scores[coocc_a, coocc_b] = score

    # Scale scores
    values = list(abs_scores.values())
    arr_values = np.array(values).reshape(-1, 1)
    scaler = StandardScaler()
    scaler.fit(arr_values)

    # Build new dictionary and return
    scores = {pair:scaled_value for pair, scaled_value in zip(abs_scores.keys(), scaler.transform(arr_values))}
    
    return scores

## cmudict

In [2]:
# Read data
# TODO: correct gaps used, as in "non-sequitur"
pairs = read_data(RES_PATH / "cmudict" / "cmudict.tsv", ["Orthography", "Segments"])

# Build first align demo
scores = infer_matrix(pairs)
for k, v in scores.items():
    if v > 1:
        print(k, v)

('X', 'k') [1.39503831]
('X', 's') [1.15719308]
('C', 'k') [1.12027641]
('L', 'l') [1.25036742]
('M', 'm') [2.67011908]
('T', 't') [1.27287724]
('P', 'p') [4.09731883]
('O', 'oʊ') [1.071184]
('Q', 'k') [1.66853672]
('Q', 'w') [7.2508129]
('D', 'd') [2.37499207]
('B', 'b') [3.90002956]
('H', 'h') [2.8892416]
('K', 'k') [1.7384103]
('U', 'aʊ') [1.62071127]
('H', 'ʃ') [1.40738123]
('U', 'j') [1.45847552]
('U', 'u') [1.88066626]
('F', 'f') [6.20621251]
('G', 'ɡ') [3.16492652]
('G', 'ŋ') [2.54272557]
('W', 'w') [6.34641498]
('T', 'θ') [1.17958661]
('H', 'θ') [2.89269145]
('T', 'ð') [1.0116901]
('H', 'ð') [2.69997116]
('U', 'ʊ') [1.20782672]
('-', 'u') [1.88453353]
('Y', 'i') [1.03595084]
('V', 'v') [8.41124576]
('W', 'aʊ') [1.98763216]
('Z', 'z') [2.12897522]
('J', 'dʒ') [11.38843662]
('G', 'dʒ') [1.81468676]
('C', 'tʃ') [1.79910842]
('-', 'j') [4.85848347]
('H', 'tʃ') [1.73290049]
('Y', 'ɔɪ') [2.10706715]
("'", 's') [1.34705188]
("'", 'ɪ') [1.55184819]
("'", 'z') [2.54079986]
('J', 'ɔɪ') [

In [3]:
import random

scorer = malign.ScoringMatrix(scores)
for pair in random.sample(pairs, 5):
    print(malign.tabulate_alms(malign.multi_align(pair, matrix=scorer, method="anw")))

| Idx   | Seq   |   Score |  #0  |  #1  |  #2  |  #3  |  #4  |  #5  |  #6  |
|-------|-------|---------|------|------|------|------|------|------|------|
| 0     | A     |    0.56 |  G   |  E   |  I   |  T   |  N   |  E   |  R   |
| 0     | B     |    0.56 |  ɡ   |  -   |  aɪ  |  t   |  n   |  -   |  ɝ   |
| Idx   | Seq   |   Score |  #0  |  #1  |  #2  |  #3  |  #4  |  #5  |  #6  |  #7  |  #8  |  #9  |  #10  |
|-------|-------|---------|------|------|------|------|------|------|------|------|------|------|-------|
| 0     | A     |    0.68 |  M   |  I   |  S   |  M   |  A   |  N   |  A   |  -   |  G   |  E   |   D   |
| 0     | B     |    0.68 |  m   |  ɪ   |  s   |  m   |  æ   |  n   |  -   |  ɪ   |  dʒ  |  -   |   d   |
| Idx   | Seq   |   Score |  #0  |  #1  |  #2  |  #3  |  #4  |
|-------|-------|---------|------|------|------|------|------|
| 0     | A     |    2.02 |  B   |  -   |  U   |  G   |  H   |
| 0     | B     |    2.02 |  b   |  j   |  u   |  ɡ   |  -   |
| Idx   | Seq   

