In [1]:
import os
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [7]:
def load_data(data_dir, classes=('e', 'j', 's'), indices=range(10)):
    """Read the labelled documents ``<label><index>.txt`` stored in ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Directory that contains the text files.
    classes : iterable of str, optional
        Label prefixes to load. Defaults to ``('e', 'j', 's')``
        (e for English, j for Japanese, s for Spanish).
    indices : iterable of int, optional
        File indices to load for every label. Defaults to ``range(10)``,
        i.e. files ``<label>0.txt`` .. ``<label>9.txt``.

    Returns
    -------
    X : list of str
        Document texts with every newline character removed.
    y : list of str
        Label of each document, aligned with ``X``.

    Raises
    ------
    OSError
        Propagated from ``open`` (e.g. ``FileNotFoundError``) when an
        expected file cannot be read.
    """
    # Materialise once so a one-shot iterator can be reused for every label.
    indices = list(indices)

    X = []  # List of documents
    y = []  # List of labels, one per document
    for label in classes:
        for index in indices:
            path = os.path.join(data_dir, f'{label}{index}.txt')
            with open(path, 'r', encoding='utf-8') as handle:
                X.append(handle.read().replace('\n', ''))
            y.append(label)
    return X, y


def calculate_prior_probabilities(y):
    """Estimate the class priors p(y) with additive smoothing (alpha = 0.5).

    For every label c in ('e', 'j', 's') the estimate is
    ``(count(c) + 0.5) / (len(y) + 3 * 0.5)``.

    Parameters
    ----------
    y : iterable of str
        Training labels.

    Returns
    -------
    dict
        Maps each of 'e', 'j', 's' to its smoothed prior probability.
    """
    classes = ['e', 'j', 's']
    alpha = 0.5

    label_counts = Counter(y)
    # Normalise over the full label set, not over the labels that happen to
    # occur in ``y``: using len(set(y)) makes the three priors sum to more
    # than 1 whenever one of the classes is missing from the sample.
    denominator = len(y) + len(classes) * alpha
    return {c: (label_counts[c] + alpha) / denominator for c in classes}

# Character vocabulary: the 26 lowercase letters a-z followed by the space
# character (27 symbols in total). It is passed to the helper functions in
# this notebook, which iterate over it one character at a time.
vocabulary = 'abcdefghijklmnopqrstuvwxyz '

def calculate_class_conditional_probability1(X, y, vocabulary, c, smoothing_param=0.5):
    """Estimate the smoothed character distribution p(char | class c).

    For every character ``ch`` in ``vocabulary`` the estimate is
    ``(count(ch) + smoothing_param) / (N + len(vocabulary) * smoothing_param)``
    where the counts are taken over the documents labelled ``c`` and ``N`` is
    the number of in-vocabulary characters in those documents.

    Parameters
    ----------
    X : sequence of str
        Documents.
    y : sequence of str
        Label of each document, aligned with ``X``.
    vocabulary : str
        Characters that make up the model's alphabet.
    c : str
        Class label whose distribution is estimated.
    smoothing_param : float, optional
        Additive smoothing pseudo-count (default 0.5).

    Returns
    -------
    dict
        Maps each vocabulary character to its probability, in vocabulary order.
    """
    char_counts = Counter()
    for document, label in zip(X, y):
        if label == c:
            # Counter.update on a string tallies its individual characters.
            char_counts.update(document)

    # Normalise over in-vocabulary characters only. Counting every character
    # of the document (digits, punctuation, ...) would inflate the
    # denominator, so the probabilities would no longer sum to 1.
    in_vocab_total = sum(char_counts[ch] for ch in vocabulary)
    denominator = in_vocab_total + len(vocabulary) * smoothing_param

    return {ch: (char_counts[ch] + smoothing_param) / denominator for ch in vocabulary}

In [15]:
# Directory holding the <label><index>.txt documents.
data_dir = 'languageID'
X, y = load_data(data_dir)

#Q1: smoothed prior probability of each language
prior_probabilities = calculate_prior_probabilities(y)
print(f"Prior Probabilities: {prior_probabilities}")

Prior Probabilities: {'e': 0.3333333333333333, 'j': 0.3333333333333333, 's': 0.3333333333333333}


In [17]:
theta_e = calculate_class_conditional_probability1(X, y, vocabulary, 'e')
print("Class Conditional Probability for English:")
# One line per vocabulary character: the probability rounded to four
# decimals, followed by a literal "\\" (LaTeX table row terminator).
for prob in theta_e.values():
    print(f"{prob:.4f} \\\\")

Class Conditional Probability for English:
0.0602 \\
0.0111 \\
0.0215 \\
0.0220 \\
0.1054 \\
0.0189 \\
0.0175 \\
0.0472 \\
0.0554 \\
0.0014 \\
0.0037 \\
0.0290 \\
0.0205 \\
0.0579 \\
0.0645 \\
0.0168 \\
0.0006 \\
0.0538 \\
0.0662 \\
0.0801 \\
0.0267 \\
0.0093 \\
0.0155 \\
0.0012 \\
0.0138 \\
0.0006 \\
0.1792 \\


In [18]:
theta_j = calculate_class_conditional_probability1(X, y, vocabulary, 'j')
print("Class Conditional Probability for Japanese:")
# One line per vocabulary character: the probability rounded to four
# decimals, followed by a literal "\\" (LaTeX table row terminator).
for prob in theta_j.values():
    print(f"{prob:.4f} \\\\")

Class Conditional Probability for Japanese:
0.1318 \\
0.0109 \\
0.0055 \\
0.0172 \\
0.0602 \\
0.0039 \\
0.0140 \\
0.0318 \\
0.0970 \\
0.0023 \\
0.0574 \\
0.0014 \\
0.0398 \\
0.0567 \\
0.0912 \\
0.0009 \\
0.0001 \\
0.0428 \\
0.0422 \\
0.0570 \\
0.0706 \\
0.0002 \\
0.0197 \\
0.0000 \\
0.0142 \\
0.0077 \\
0.1234 \\


In [19]:
theta_s = calculate_class_conditional_probability1(X, y, vocabulary, 's')
print("Class Conditional Probability for Spanish:")
# One line per vocabulary character: the probability rounded to four
# decimals, followed by a literal "\\" (LaTeX table row terminator).
for prob in theta_s.values():
    print(f"{prob:.4f} \\\\")

Class Conditional Probability for Spanish:
0.1046 \\
0.0082 \\
0.0375 \\
0.0397 \\
0.1138 \\
0.0086 \\
0.0072 \\
0.0045 \\
0.0499 \\
0.0066 \\
0.0003 \\
0.0529 \\
0.0258 \\
0.0542 \\
0.0725 \\
0.0243 \\
0.0077 \\
0.0593 \\
0.0658 \\
0.0356 \\
0.0337 \\
0.0059 \\
0.0001 \\
0.0025 \\
0.0079 \\
0.0027 \\
0.1683 \\


In [20]:
#Q4
def calculate_bag_of_characters(X, y, vocabulary):
    """Count how often each vocabulary character occurs across all of ``X``.

    Parameters
    ----------
    X : iterable of str
        Documents whose characters are tallied together.
    y : any
        Not used by this function; kept as part of the signature.
    vocabulary : str
        Characters to report; anything else in the documents is ignored.

    Returns
    -------
    dict
        Maps each vocabulary character to its total count (0 when absent),
        in vocabulary order.
    """
    totals = Counter()
    for document in X:
        totals.update(document)

    # Looking up a missing key in a Counter yields 0 without inserting it.
    return {symbol: totals[symbol] for symbol in vocabulary}

# Read the single document e10.txt (newlines stripped) and echo it.
filename = os.path.join(data_dir, 'e10.txt')
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read().replace('\n', '')
print(text)

X_test = [text]  # List of documents
y_test = ['e']   # Matching labels

# Bag-of-characters vector of that document, one count per output line,
# each followed by a literal "\\".
bag_of_chars = calculate_bag_of_characters(X_test, y_test, vocabulary)
for count in bag_of_chars.values():
    print(f"{count} \\\\")

except when the winds rise to a high speed we seem to live in a very tranquil world at night when the glare of the sun passes out of our atmosphere the stars and planets seem to move across the heavens with a stately and solemn slowness it was one of the first discoveries of modern astronomy that this movement is only apparent the apparent creeping of the stars across the heavens at night is accounted for by the fact that the earth turns upon its axis once in every twentyfour hours when we remember the size of the earth we see that this implies a prodigious speedfig the milky wayit had remained unchanged since noon of the previous daya long low quietlooking cloud not very dense or brilliant or in any way remarkable except for its size at  pm the professor left the spectroscope for a short time and on returning half an hour later to his observations he was astonished to find the gigantic sun flame shattered to pieces the solar atmosphere was filled with flying debris and some of these p

In [26]:
#Q5: test data e10.txt

# Calculate log p(x | y) for a document x and language y (characters missing from theta_y are skipped)
def calculate_log_likelihood(document, theta_y):
    """Return the log-likelihood of ``document`` under the table ``theta_y``.

    Adds up ``np.log(theta_y[char])`` for every character of ``document``
    that is a key of ``theta_y``; other characters contribute nothing.

    Parameters
    ----------
    document : str
        Text to score, processed character by character.
    theta_y : dict
        Maps a character to its probability.

    Returns
    -------
    The accumulated log-probability; the integer 0 when no character of
    ``document`` is a key of ``theta_y``.
    """
    # sum() starts from 0 and adds left to right, one term per known character.
    return sum(np.log(theta_y[symbol]) for symbol in document if symbol in theta_y)
         

# Dump the three fitted tables, each followed by a horizontal rule.
rule = '-------------------------------------------------------------'
for theta in (theta_e, theta_j, theta_s):
    print(theta)
    print(rule)

document = X_test[0]
lang = y_test[0]

# Despite their names, the p_x_given_* variables hold log-likelihoods
# (the values returned by calculate_log_likelihood).
p_x_given_e, p_x_given_j, p_x_given_s = (
    calculate_log_likelihood(document, table)
    for table in (theta_e, theta_j, theta_s)
)

for log_likelihood in (p_x_given_e, p_x_given_j, p_x_given_s):
    print(log_likelihood)

{'a': 0.0601685114819098, 'b': 0.011134974392863043, 'c': 0.021509995043779945, 'd': 0.021972575582355856, 'e': 0.1053692383941847, 'f': 0.018932760614571286, 'g': 0.017478936064761277, 'h': 0.047216256401784236, 'i': 0.055410540227986124, 'j': 0.001420783082768875, 'k': 0.0037336857756484387, 'l': 0.028977366595076822, 'm': 0.020518751032545846, 'n': 0.057921691723112505, 'o': 0.06446390219725756, 'p': 0.01675202378985627, 'q': 0.0005617049396993227, 'r': 0.053824549810011564, 's': 0.06618205848339666, 't': 0.08012555757475633, 'u': 0.026664463902197257, 'v': 0.009284652238559392, 'w': 0.015496448042293078, 'x': 0.001156451346439782, 'y': 0.013844374690236246, 'z': 0.0006277878737815959, ' ': 0.1792499586981662}
-------------------------------------------------------------
{'a': 0.1317656102589189, 'b': 0.010866906600510151, 'c': 0.005485866033054963, 'd': 0.01722631818022992, 'e': 0.06020475907613823, 'f': 0.003878542227191726, 'g': 0.014011670568503443, 'h': 0.03176211607673224, 'i'

In [12]:
# Sanity check: number of symbols in the character vocabulary.
len(vocabulary)

27

In [33]:
#Q6
# Log prior of each language.
log_prior_e, log_prior_j, log_prior_s = (
    np.log(prior_probabilities[label]) for label in 'ejs'
)

# log-likelihood + log prior. No normalising term is subtracted, so these
# are log posteriors up to an additive constant shared by all classes.
log_posterior_e = p_x_given_e + log_prior_e
log_posterior_j = p_x_given_j + log_prior_j
log_posterior_s = p_x_given_s + log_prior_s

log_posterior = np.array([log_posterior_e, log_posterior_j, log_posterior_s])

for label, score in zip('ejs', (log_posterior_e, log_posterior_j, log_posterior_s)):
    print(f"Log Posterior Probability (y = {label} | x):", score)

# Predict the class label: position of the largest score, in e/j/s order.
predicted_class = np.argmax(log_posterior)
predicted_language = ['e', 'j', 's'][predicted_class]
print(f"Predicted Language: y = {predicted_language}")

Log Posterior Probability (y = e | x): -7842.964059349239
Log Posterior Probability (y = j | x): -8772.531691363689
Log Posterior Probability (y = s | x): -8468.380656299232
Predicted Language: y = e


In [36]:
#Q7
def load_test_data(data_dir):
    """Read the documents ``<label>10.txt`` .. ``<label>19.txt`` from ``data_dir``.

    Labels are visited in the order e, j, s (English, Japanese, Spanish),
    ten files per label.

    Parameters
    ----------
    data_dir : str
        Directory that contains the text files.

    Returns
    -------
    (list of str, list of str)
        The document texts with newline characters removed, and the label
        of each document in the same order.
    """
    documents, labels = [], []
    for label in 'ejs':
        for index in range(10, 20):
            path = os.path.join(data_dir, f'{label}{index}.txt')
            with open(path, 'r', encoding='utf-8') as handle:
                contents = handle.read()
            documents.append(contents.replace('\n', ''))
            labels.append(label)
    return documents, labels


def confusion_matrix(X, y, theta_e, theta_j, theta_s, prior_probabilities):
    """Classify every document in ``X`` and tally true vs. predicted labels.

    A document's score for a language is ``np.log(prior)`` plus the value
    returned by ``calculate_log_likelihood``; the language with the highest
    score is the prediction.

    Parameters
    ----------
    X : sequence of str
        Documents to classify.
    y : sequence of str
        True label ('e', 'j' or 's') of each document.
    theta_e, theta_j, theta_s : dict
        Character probability tables for English, Japanese and Spanish.
    prior_probabilities : dict
        Prior probability of each label; its length sets the matrix size.

    Returns
    -------
    numpy.ndarray
        Integer matrix in which rows are true labels and columns are
        predicted labels, both in the order e, j, s.
    """
    label_order = 'ejs'
    tables = {'e': theta_e, 'j': theta_j, 's': theta_s}
    size = len(prior_probabilities)
    counts = np.zeros((size, size), dtype=int)

    for idx, document in enumerate(X):
        actual = y[idx]
        scores = {
            lang: np.log(prior_probabilities[lang]) + calculate_log_likelihood(document, table)
            for lang, table in tables.items()
        }
        # On ties max() keeps the first key in insertion order (e, j, s).
        best = max(scores, key=scores.get)
        counts[label_order.index(actual)][label_order.index(best)] += 1

    return counts

        

X_test2, y_test2 = load_test_data(data_dir)
# Keep the result under its own name. Assigning it to `confusion_matrix`
# would rebind the function name to the returned array, so running this cell
# a second time would fail with "'numpy.ndarray' object is not callable".
test_confusion = confusion_matrix(X_test2, y_test2, theta_e, theta_j, theta_s, prior_probabilities)
print("Confusion Matrix:")
print(test_confusion)

Confusion Matrix:
[[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]
