In [7]:
import numpy as np
import pandas as pd
from glob import glob
import re
from collections import Counter
from sklearn.preprocessing import LabelEncoder

'''
The code was adapted from the documentation of sklearn
'''
paths = glob('text.txt/*')

lang_keys = ['A', 'B', "C"]
lang_paths = []
test_paths = []

for lang in lang_keys:
    lang_paths.append(list(filter(re.compile(f'.*lang{lang}').match, paths)))
lang_A, lang_B, lang_C = lang_paths

test_paths.append(list(filter(re.compile('.*test').match, paths)))

def read_data(paths):
    all_data = []
    for path in paths:
        with open(path) as f:
            all_data.append(f.readlines())
    return all_data

lang_A_data = read_data(lang_A)
lang_B_data = read_data(lang_B)
lang_C_data = read_data(lang_C)
test_data = read_data(test_paths[0])

def initial_distribution(samples):
    formatted_data = sum(samples, [])
    first_data = [string[0] for string in formatted_data]
    counts = dict(Counter(first_data))
    total = sum(counts.values())

    initial_dist = {key: value/total for key, value in counts.items()}
    return initial_dist

le = LabelEncoder()
le.fit(list(''.join(sum(lang_A_data, []))))
mapping = dict(zip(list(le.classes_), [i for i in range(0, len(le.classes_))]))


def transition_matrix(samples):
    formatted_transitions = list(''.join(sum(samples, [])))
    le.transform(formatted_transitions)

    df = pd.DataFrame(formatted_transitions)
    # create a new column, shifting the data
    df['shift'] = df[0].shift(-1)
    # add a count column
    df['count'] = 1
    trans_matrix = df.groupby([0, 'shift']).count().unstack().fillna(0)
    # normalisation step
    norm_trans_matrix = trans_matrix.div(trans_matrix.sum(axis=1), axis=0).values
    norm_trans_matrix[np.where(norm_trans_matrix == 0)] = 0.001
    return norm_trans_matrix


trans_mat_a = transition_matrix(lang_A_data)
trans_mat_b = transition_matrix(lang_B_data)
trans_mat_c = transition_matrix(lang_C_data)

prior_probs = {"A": 0.3333, "B": 0.3333, "C": 0.3333}

def markov_likelihood(test_string, transition_matrix):
    total_prob = 1
    for char_idx in range(len(test_string)-1):
        prob = transition_matrix[mapping.get(test_string[char_idx])][mapping.get(test_string[char_idx+1])]
        total_prob *= np.log(prob)
    return total_prob

def marginalization(test_string):
    marginal = (
    markov_likelihood(test_string, trans_mat_a)*prior_probs.get("A") *
    markov_likelihood(test_string, trans_mat_b)*prior_probs.get("B") *
    markov_likelihood(test_string, trans_mat_c)*prior_probs.get("C")
    )
    return marginal

def bayes(test_string, transition_matrix, language):
    likelihood = markov_likelihood(test_string, transition_matrix)
    prior = prior_probs.get(language)
    marginal = marginalization(test_string)

    norm = likelihood / marginal

    return norm * prior

def bayesianify_test_data(tests):
    test_case_probs = []
    for string in processed_test_data:
        lang_probs = []
        lang_probs.append( bayes(string, trans_mat_a, "A") )
        lang_probs.append( bayes(string, trans_mat_b, "B") )
        lang_probs.append( bayes(string, trans_mat_c, "C") )

        test_case_probs.append(lang_probs)
    return test_case_probs

processed_test_data = sum(test_data, [])

output = bayesianify_test_data(processed_test_data)

lang_array = ["A", "B", "C"]

for testcase in output:
    idx = np.argmax(testcase)
    print(f"Language {lang_array[idx]} is the most probable")


KeyError: 0