In [17]:
import urllib.request
import re

In [18]:
def calc_stats(url, stats=None):
    if stats is None:
        stats = {}
    try:
        text = urllib.request.urlopen(url).read().decode('utf-8')
        text = ''.join(c.lower() for c in text if c.isalnum())
        text = re.sub(r'[0-9]+', '', text)
        for char in text:
            if char in stats:
                stats[char] += 1
            else:
                stats[char] = 1
        total_characters = len(text)
        for char in stats:
            stats[char] /= total_characters
    except Exception as e:
        print(f"Error: {e}")
    return stats

In [19]:
english_url = 'https://www.gutenberg.org/cache/epub/22274/pg22274-images.html'
finnish_url = 'https://www.gutenberg.org/cache/epub/15643/pg15643-images.html'
german_url = 'https://www.gutenberg.org/cache/epub/6996/pg6996.html'
print(f'english: {calc_stats(english_url)}\nfinnish: {calc_stats(finnish_url)}\ngerman:{calc_stats(german_url)}')

english: {'d': 0.0477811868592382, 'o': 0.07552570530569483, 'c': 0.02472311606675148, 't': 0.08748971039437252, 'y': 0.020111501908254135, 'p': 0.022945820549277858, 'e': 0.11731085834019307, 'h': 0.06149442490458729, 'm': 0.026042056424455586, 'l': 0.03901631370201302, 'a': 0.08140948888722592, 'n': 0.06808912669310783, 'g': 0.021664296939310036, 'r': 0.062467260345730746, 's': 0.05969842101324553, 'u': 0.029634064207139117, 'f': 0.02274002843672828, 'i': 0.0694548379854823, 'v': 0.008297163810521589, 'b': 0.016519494125570605, 'k': 0.009784479533038988, 'z': 0.0006080221507146599, 'x': 0.0011412108059567461, 'w': 0.021776547182518896, 'j': 0.00332073636159545, 'q': 0.0009260645064730974, 'é': 2.806256080221507e-05}
finnish: {'d': 0.025349037887113072, 'o': 0.05807341411787266, 'c': 0.007397464358677454, 't': 0.07772131546534697, 'y': 0.015842226762152713, 'p': 0.033705296230464574, 'e': 0.07680677351073481, 'h': 0.023564206007950613, 'm': 0.03264324750897948, 'l': 0.0449158105128072

In [20]:
def guess_language(languages, url):
    input_stats = calc_stats(url)
    min_difference = float('inf')  # init with high value
    detected_language = ''

    for language_name, language_stats, _ in languages:
        if len(language_stats) == 0:
            continue

        difference = 0
        for char, expected_frequency in language_stats.items():
            if char in input_stats:
                difference += abs(expected_frequency - input_stats[char])
            else:
                difference += expected_frequency

        average_difference = difference / len(language_stats)

        if average_difference < min_difference:
            min_difference = average_difference
            detected_language = language_name

    return detected_language

In [21]:
english = {}
english_sources = [
    'https://www.gutenberg.org/cache/epub/71828/pg71828-images.html',
    'https://www.gutenberg.org/cache/epub/71827/pg71827-images.html',
    'https://www.gutenberg.org/cache/epub/71831/pg71831-images.html'
]
finnish = {}
finnish_sources = [
    'https://www.gutenberg.org/cache/epub/11296/pg11296.html',
    'https://www.gutenberg.org/cache/epub/71606/pg71606.html',
    'https://www.gutenberg.org/cache/epub/71702/pg71702.html',
    'https://www.gutenberg.org/cache/epub/11296/pg11296.html'
]
german = {}
german_sources = [
    'https://www.gutenberg.org/cache/epub/44956/pg44956-images.html',
    'https://www.gutenberg.org/cache/epub/7205/pg7205-images.html',
    'https://www.gutenberg.org/cache/epub/65661/pg65661-images.html',
    'https://www.gutenberg.org/cache/epub/38126/pg38126-images.html',
    'https://www.gutenberg.org/cache/epub/65662/pg65662-images.html'
]

In [22]:
languages = [
    ('english', english, english_sources),
    ('finnish', finnish, finnish_sources),
    ('german', german, german_sources)
]        
guess_list = [
    ('https://www.gutenberg.org/cache/epub/71824/pg71824-images.html', 'english'),
    ('https://www.gutenberg.org/cache/epub/41907/pg41907-images.html', 'german'),
    ('https://www.gutenberg.org/cache/epub/41907/pg41907-images.html', 'german'),
    ('https://www.gutenberg.org/cache/epub/45263/pg45263.html', 'finnish')
]

In [23]:
# # Update language statistics
for _, language, sources in languages:
    for source in sources:
        language = calc_stats(source, language)
        print(f'{_} -> {language}')

english -> {'d': 0.0477607617106933, 'o': 0.05382264168806198, 'c': 0.033866957597834244, 't': 0.0949100946052255, 'y': 0.010546442667700554, 'p': 0.030865602874448023, 'e': 0.09952314187628061, 'h': 0.04667653217862512, 'm': 0.01912842371960607, 'l': 0.0581601233451975, 'a': 0.10922203713429794, 'n': 0.07526322433182382, 'g': 0.02337235958551325, 'r': 0.05209598925237753, 's': 0.061940274957002744, 'u': 0.017883588461633827, 'f': 0.02467636537408173, 'i': 0.07854803407320916, 'v': 0.015393917945689343, 'b': 0.014060045127391336, 'k': 0.007616656109892636, 'z': 0.0005984676523162163, 'x': 0.008124395615294624, 'w': 0.012486672542394276, 'j': 0.0005573300453302618, 'q': 0.0003668572896966637, 'ò': 0.0008678344487447958, 'ü': 0.00020737862151823692, 'ö': 0.00017300336088613786, 'â': 0.0004705466004557821, 'û': 0.0003059961725119637, 'ä': 5.6916415144950895e-05, 'ê': 4.733642447698886e-05, 'î': 0.00019892568857591747, 'ô': 3.5502318357741644e-05, 'ḍ': 6.649640581291293e-05, 'ḷ': 9.0164618

In [24]:
for url, initial_language in guess_list:
    detected_language = guess_language(languages, url)
    print(f'{initial_language} -> {detected_language} (URL: {url})')

english -> english (URL: https://www.gutenberg.org/cache/epub/71824/pg71824-images.html)
german -> german (URL: https://www.gutenberg.org/cache/epub/41907/pg41907-images.html)
german -> german (URL: https://www.gutenberg.org/cache/epub/41907/pg41907-images.html)
finnish -> finnish (URL: https://www.gutenberg.org/cache/epub/45263/pg45263.html)
