# Characters

In [1]:
import os
from homoglypher.process_data import HomoglyphJSON
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Preliminaries

Build a record for each font file

In [2]:
# Directory whose entries are each handed to HomoglyphJSON.
indir = "data_googlefonts"
fnames = os.listdir(indir)

# One HomoglyphJSON per directory entry; keep only its `.record` attribute.
records = [HomoglyphJSON(fname, indir).record for fname in fnames]

Compile into a big dataframe

In [3]:
# Stack every per-file record into a single frame, then report its row count.
fonts = pd.concat(records)
n_entries = len(fonts)
print(f"Number of entries: {n_entries:,}")

Number of entries: 146,654


## Character pair metrics

How often do characters appear together across the whole dataset? First, group by filename, then homoglyph, then decimal. Make a co-occurrence matrix and stack it. Return all nonzero pairs

In [None]:
# Count the rows of `fonts` for every (FILE, GROUP, DEC) combination and pivot
# DEC into columns: one row per (FILE, GROUP), one column per DEC.
# `fill_value=0` fills missing combinations directly, so the counts stay
# integers instead of being promoted to float by NaN + fillna.
grouped = (
    fonts
    .groupby(['FILE', 'GROUP', 'DEC'])
    .size()
    .unstack(fill_value=0)
)

# DEC x DEC matrix: entry (a, b) sums, over all (FILE, GROUP) rows, the product
# of that row's counts for a and b.
coocc = grouped.T.dot(grouped)

# Zero the diagonal so a character is never paired with itself. Work on an
# explicit copy rather than writing through `coocc.values`: with pandas
# Copy-on-Write that array is read-only and an in-place write raises.
coocc_values = coocc.to_numpy(copy=True)
np.fill_diagonal(coocc_values, 0)
coocc = pd.DataFrame(coocc_values, index=coocc.index, columns=coocc.columns)

Now we can stack the co-occurrences and return a dataframe of character-to-character counts. From that, we'll filter out 0 values

In [None]:
# Flatten the co-occurrence matrix into one row per (row label, column label).
# Both index levels are named 'DEC', so they are moved out one at a time: the
# outer level becomes the 'PAIR' column, the inner level stays 'DEC'.
stacked = coocc.stack().to_frame()
outer_as_column = stacked.reset_index(level=0)
labelled = outer_as_column.rename(columns={'DEC': 'PAIR', 0: 'COUNT'})
pairs = labelled.reset_index()

# Keep only pairs that occur at least once.
pairs = pairs.loc[pairs['COUNT'] > 0]

In [None]:
# Preview the first five pair rows.
pairs.iloc[:5]

With the counts made, we can find the conditional probability of one character pairing with another. First, create a long table that has a row for every character-character count

In [None]:
# Expand `pairs` so that each (DEC, PAIR) row appears COUNT times.
# np.repeat only accepts integer repeat counts, and COUNT may be float after
# the upstream NaN-filling, so cast it explicitly before repeating.
repeats = pairs['COUNT'].astype(int).to_numpy()

# Repeat row positions once and select both columns together, rather than
# repeating each column separately through `apply`.
row_positions = np.repeat(np.arange(len(pairs)), repeats)
long = pairs[['DEC', 'PAIR']].iloc[row_positions]

Now this can be fed through `pd.crosstab()`. As above, we'll also filter out 0 values

In [None]:
# Row-normalised cross-tabulation: each DEC row sums to 1 across its PAIR columns.
prob_table = pd.crosstab(long['DEC'], long['PAIR'], normalize='index')

# Back to long form with one (DEC, PAIR, PROB) row per cell of the table.
pair_probs = prob_table.stack().rename('PROB').reset_index()

# Drop combinations that never occur.
pair_probs = pair_probs.loc[pair_probs['PROB'] > 0]

Now we can stick these probabilities back onto the dataframe of pairs

In [None]:
# Inner-join the probabilities onto the counts using the columns the two frames share.
pairs = pairs.merge(pair_probs)

## Pair counts

A quick overview of raw pair counts. Here are some frequently occurring characters.

In [None]:
# Summary statistics for the raw pair counts.
quant = 0.95
counts = pairs['COUNT']
high_quant = counts.quantile(quant)

summary_lines = [
    "The number of pairs:",
    f"\n+ Mean: {counts.mean():.02f}",
    f"\n+ Standard deviation: {counts.std():.02f}",
    f"\n+ Variance: {counts.var():.02f}",
    f"\n+ {quant} quantile: {high_quant:.02f}",
]
print(*summary_lines)

In [None]:
# Attach the character for each DEC code point, rank rows by COUNT (largest
# first), then drop PROB and fix the column order.
with_char = pairs.assign(CHAR=pairs['DEC'].map(chr))
ranked = with_char.sort_values('COUNT', ascending=False)
pair_counts = (
    ranked
    .drop(columns=['PROB'])
    .reindex(columns=['DEC', 'PAIR', 'CHAR', 'COUNT'])
)
pair_counts.head(25)

In [None]:
# Histogram of the COUNT column; title and axis labels are set so the figure
# can be read on its own when the notebook is skimmed.
ax = pair_counts['COUNT'].hist(figsize=(15,5), bins=100)
ax.set(
    title="Distribution of pair counts",
    xlabel="COUNT",
    ylabel="Number of pairs",
);

How many times does a decimal appear in a pair? Here's something interesting: font files tend to map glyphs onto lower code points (rather than the other way around)

In [None]:
# Number of pair rows per DEC value, largest first.
rows_per_dec = pair_counts.groupby('DEC').size()
dec_counts = rows_per_dec.sort_values(ascending=False)

# Show the top 25 alongside the character each DEC code point encodes.
dec_chars = dec_counts.index.map(chr)
pd.DataFrame({'CHAR': dec_chars, 'COUNT': dec_counts}).iloc[:25]

## Pair probabilities

Let's look at probabilities

In [None]:
# Summary statistics for the pair probabilities.
quant = 0.95
probs = pairs['PROB']
high_quant = probs.quantile(quant)

summary_lines = [
    "The probability that a character is a homoglyph for another:",
    f"\n+ Mean: {probs.mean():.02f}",
    f"\n+ Standard deviation: {probs.std():.02f}",
    f"\n+ Variance: {probs.var():.02f}",
    f"\n+ {quant} quantile: {high_quant:.02f}",
]
print(*summary_lines)

In [None]:
# Histogram of the PROB column; title and axis labels are set so the figure
# can be read on its own when the notebook is skimmed.
ax = pairs['PROB'].hist(figsize=(15,5), bins=100)
ax.set(
    title="Distribution of pair probabilities",
    xlabel="PROB",
    ylabel="Number of pairs",
);

Pairs with a probability that's at least one standard deviation above the mean

In [None]:
# Threshold: the mean probability plus one standard deviation.
prob_col = pairs['PROB']
sigma = prob_col.mean() + prob_col.std()

# Keep pairs at or above the threshold, ordered by PROB and then COUNT, both descending.
high_prob = (
    pairs
    .loc[prob_col >= sigma]
    .sort_values(['PROB', 'COUNT'], ascending=False)
)

With `COUNT` and `PROB` in mind, some of these top pairs seem like they must consistently be homoglyphs (over and above the particularities of a single font file)

In [None]:
# Top 50 high-probability pairs, with the character for each DEC code point.
high_prob_chars = high_prob['DEC'].map(chr)
(
    high_prob
    .assign(CHAR=high_prob_chars)
    .reindex(columns=['DEC', 'PAIR', 'CHAR', 'COUNT', 'PROB'])
    .head(50)
)