In [26]:
/**
 * Fixture data for the similarity experiments.
 *
 * Each inner array is one album: the first entry is the base title and the
 * remaining entries are variant spellings / editions that are compared
 * against it.
 */
const ALBUM_VARIANTS = [
  [
    "Random Access Memories",
    "Random Access Memories (Deluxe Edition)",
    "Random Access Memories - 10th Anniversary",
    "Random Access Memories [Explicit]"
  ],
  [
    "To Pimp a Butterfly",
    "To Pimp a Butterfly (Deluxe)",
    "To Pimp A Butterfly",
    "To Pimp a Butterfly - Remastered"
  ],
  [
    "1989",
    // \u2019 is the typographic apostrophe; written as an escape so the
    // fixture survives re-encoding of the file.
    "1989 (Taylor\u2019s Version)",
    "1989 (Deluxe Edition)",
    "1989 [Explicit]"
  ],
  [
    "The Dark Side of the Moon",
    "Dark Side of the Moon",
    "The Dark Side of the Moon (Remastered)",
    "The Dark Side of the Moon - 2011 Remaster"
  ]
];

/**
 * Tokens that describe a release variant rather than the title itself.
 * They are dropped by `normalizeTitle` after tokenisation.
 */
const STOP_TOKENS = new Set([
  "feat", "ft", "featuring",
  "remaster", "remastered",
  "version", "edit",
  "mono", "stereo",
  "explicit", "clean",
  "live"
]);

/**
 * Turns a raw title into a list of comparable tokens.
 *
 * Steps: lowercase, NFKD-decompose, strip Latin combining accents, replace
 * punctuation/symbols with spaces, split on whitespace, and drop
 * `STOP_TOKENS`.
 *
 * Letters and digits of any script are kept (not just ASCII `a-z0-9`), so
 * non-Latin titles no longer normalise to an empty token list and letters
 * that NFKD cannot decompose (e.g. "ø", "æ") no longer split a word in two.
 * Output for input that is ASCII after accent stripping is unchanged.
 *
 * @param input Raw title text.
 * @returns Tokens in order of appearance; may contain duplicates, may be empty.
 */
export function normalizeTitle(input: string): string[] {
  return input
    .toLowerCase()
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")     // strip accents
    // Remove punctuation/symbols. Kept: lowercase and caseless letters,
    // digits, whitespace, and any remaining combining marks (so decomposed
    // non-Latin scripts stay in one token). Uppercase letters can only appear
    // here via compatibility decomposition (e.g. "™" -> "TM") and are
    // dropped, as before.
    .replace(/[^\p{Ll}\p{Lo}\p{Lm}\p{M}\p{N}\s]/gu, " ")
    .split(/\s+/)
    .filter(Boolean)
    .filter(token => !STOP_TOKENS.has(token));
}

/**
 * Jaccard index of two token lists, treated as sets:
 * |A ∩ B| / |A ∪ B|.
 *
 * @param a First token list (duplicates are ignored).
 * @param b Second token list (duplicates are ignored).
 * @returns A value in [0, 1]; 0 when either list is empty.
 */
export function jaccardSimilarity(a: string[], b: string[]): number {
  if (!a.length || !b.length) {
    return 0;
  }

  const left = new Set(a);
  const right = new Set(b);

  // Count the distinct tokens present on both sides.
  const shared = [...left].filter(token => right.has(token)).length;
  const total = left.size + right.size - shared;

  if (total === 0) {
    return 0;
  }
  return shared / total;
}
/**
 * Overlap (Szymkiewicz–Simpson) coefficient of two token lists, treated as
 * sets: |A ∩ B| / min(|A|, |B|).
 *
 * @param a First token list (duplicates are ignored).
 * @param b Second token list (duplicates are ignored).
 * @returns A value in [0, 1]; 0 when either list is empty.
 */
export function overlapCoefficient(a: string[], b: string[]): number {
  if (!a.length || !b.length) {
    return 0;
  }

  const left = new Set(a);
  const right = new Set(b);

  // Count the distinct tokens present on both sides.
  const shared = [...left].filter(token => right.has(token)).length;
  const smaller = Math.min(left.size, right.size);

  return shared / smaller;
}

/**
 * Jaccard index of two raw titles, computed on their normalised tokens.
 *
 * @param a First raw title.
 * @param b Second raw title.
 * @returns The result of `jaccardSimilarity` on the two token lists.
 */
export function titleJaccard(a: string, b: string): number {
  const tokensA = normalizeTitle(a);
  const tokensB = normalizeTitle(b);
  return jaccardSimilarity(tokensA, tokensB);
}

/**
 * Overlap coefficient of two raw titles, computed on their normalised tokens.
 *
 * @param a First raw title.
 * @param b Second raw title.
 * @returns The result of `overlapCoefficient` on the two token lists.
 */
export function titleOverlap(a: string, b: string): number {
  const tokensA = normalizeTitle(a);
  const tokensB = normalizeTitle(b);
  return overlapCoefficient(tokensA, tokensB);
}

/**
 * Collects the distinct 3-character windows of `str` after padding it with
 * two leading spaces and one trailing space.
 *
 * @param str Text to slice; an empty string yields the single gram "   ".
 * @returns The set of distinct trigrams.
 */
function trigrams(str: string): Set<string> {
  const padded = "  " + str + " ";
  const lastStart = padded.length - 3;
  const result = new Set<string>();

  let start = 0;
  while (start <= lastStart) {
    result.add(padded.substring(start, start + 3));
    start += 1;
  }

  return result;
}

/**
 * Jaccard index over the character trigrams of two strings, compared
 * case-insensitively (both inputs are lowercased first).
 *
 * @param a First string.
 * @param b Second string.
 * @returns Shared trigrams divided by the size of the trigram union.
 */
export function charNgramJaccard(a: string, b: string): number {
  const gramsA = trigrams(a.toLowerCase());
  const gramsB = trigrams(b.toLowerCase());

  const shared = Array.from(gramsA).reduce(
    (count, gram) => (gramsB.has(gram) ? count + 1 : count),
    0
  );
  const total = gramsA.size + gramsB.size - shared;

  return shared / total;
}

In [27]:
/** Result of splitting a raw title into its core name and its qualifiers. */
export interface ParsedTitle {
  /** Title with qualifiers removed, whitespace collapsed and trimmed. */
  core: string;
  /**
   * Extracted qualifiers: bracket contents first (in order of appearance),
   * followed by the text to the right of the separator, if any.
   */
  disambiguation: string[];
}

/**
 * Separators that introduce a trailing qualifier. They are tried in this
 * order and the first one present in the title wins, so a dash takes
 * priority over a colon even when the colon appears earlier in the string.
 */
const SPLIT_CHARS = [" - ", " – ", " — ", ": "];

/**
 * Splits a title such as "Album (Deluxe) - Remastered" into a core title
 * ("Album") and its qualifiers (["Deluxe", "Remastered"]).
 *
 * Leading/trailing whitespace of `input` is ignored, so a title that begins
 * with a bracket keeps that bracket even when the raw string is padded.
 * Empty brackets ("()", "[ ]") are removed without adding an empty qualifier.
 *
 * @param input Raw title text.
 * @returns The core title and the list of extracted qualifiers.
 */
export function disambiguateTitle(input: string): ParsedTitle {
  const disambiguation: string[] = [];

  // 1. Extract bracketed content, but NOT if it starts the (trimmed) title.
  const bracketRegex = /\([^)]*\)|\[[^\]]*\]|\{[^}]*\}/g;

  let remaining = input.trim().replace(bracketRegex, (match: string, offset: number) => {
    if (offset === 0) {
      // A leading bracket is part of the name, e.g. "(What's the Story) ...".
      return match;
    }

    const inner = match.slice(1, -1).trim();
    if (inner.length > 0) {
      disambiguation.push(inner);
    }
    return "";
  });

  // 2. Split on the highest-priority separator present (its first occurrence).
  for (const sep of SPLIT_CHARS) {
    const idx = remaining.indexOf(sep);
    if (idx > -1) {
      const right = remaining.slice(idx + sep.length).trim();
      if (right.length > 0) {
        disambiguation.push(right);
      }
      remaining = remaining.slice(0, idx).trim();
      break;
    }
  }

  // 3. Normalize whitespace left behind by the removals above.
  remaining = remaining.replace(/\s+/g, " ").trim();

  return {
    core: remaining,
    disambiguation
  };
}

In [28]:
/**
 * Prints, for every group in `ALBUM_VARIANTS`, how similar each variant is
 * to the group's first title. Each variant gets six scores: token Jaccard,
 * token overlap and character-trigram Jaccard, first on the raw titles and
 * then on the core titles returned by `disambiguateTitle`.
 */
function testAlbumVariants() {
  const fmt = (score: number): string => score.toFixed(3);

  for (const group of ALBUM_VARIANTS) {
    const base = group[0];

    const baseParsed = disambiguateTitle(base);
    const rawBaseTokens = normalizeTitle(base);
    const coreBaseTokens = normalizeTitle(baseParsed.core);

    console.log(`\nBase album: "${base}"`);
    console.log(`Core title: "${baseParsed.core}"`);

    // Every entry after the first is a variant of the base title.
    for (const candidate of group.slice(1)) {
      const candidateParsed = disambiguateTitle(candidate);
      const rawCandidateTokens = normalizeTitle(candidate);
      const coreCandidateTokens = normalizeTitle(candidateParsed.core);

      // Scores on the titles as given.
      const rawJaccard = jaccardSimilarity(rawBaseTokens, rawCandidateTokens);
      const rawOverlap = overlapCoefficient(rawBaseTokens, rawCandidateTokens);
      const rawChargram = charNgramJaccard(base, candidate);

      // Scores on the core titles only.
      const coreJaccard = jaccardSimilarity(coreBaseTokens, coreCandidateTokens);
      const coreOverlap = overlapCoefficient(coreBaseTokens, coreCandidateTokens);
      const coreChargram = charNgramJaccard(baseParsed.core, candidateParsed.core);

      console.log(`  ↳ "${candidate}"`);
      console.log(`     Core: "${candidateParsed.core}"`);

      console.log(`     Jaccard index:                 ${fmt(rawJaccard)}`);
      console.log(`     Overlap coefficient:          ${fmt(rawOverlap)}`);
      console.log(`     Char n-gram Jaccard:           ${fmt(rawChargram)}`);

      console.log(`     Jaccard (disambiguated):       ${fmt(coreJaccard)}`);
      console.log(`     Overlap (disambiguated):       ${fmt(coreOverlap)}`);
      console.log(`     Char n-gram (disambiguated):   ${fmt(coreChargram)}`);
    }
  }
}

testAlbumVariants();


Base album: "Random Access Memories"
Core title: "Random Access Memories"
  ↳ "Random Access Memories (Deluxe Edition)"
     Core: "Random Access Memories"
     Jaccard index:                 0.600
     Overlap coefficient:          1.000
     Char n-gram Jaccard:           0.575
     Jaccard (disambiguated):       1.000
     Overlap (disambiguated):       1.000
     Char n-gram (disambiguated):   1.000
  ↳ "Random Access Memories - 10th Anniversary"
     Core: "Random Access Memories"
     Jaccard index:                 0.600
     Overlap coefficient:          1.000
     Char n-gram Jaccard:           0.548
     Jaccard (disambiguated):       1.000
     Overlap (disambiguated):       1.000
     Char n-gram (disambiguated):   1.000
  ↳ "Random Access Memories [Explicit]"
     Core: "Random Access Memories"
     Jaccard index:                 1.000
     Overlap coefficient:          1.000
     Char n-gram Jaccard:           0.676
     Jaccard (disambiguated):       1.000
     Overlap (