In [3]:
# true: save metrics to metrics folder
SAVE_METRICS = True

## Sentence loading

Load the sentences from the files.

In [4]:
western_sentences = []
with open('../data/western_sentences.txt', encoding='utf-8') as file:
    for line in file:
        western_sentences.append(line)
        
eastern_sentences = []
with open('../data/eastern_sentences.txt', encoding='utf-8') as file:
    for line in file:
        eastern_sentences.append(line)

Strip newlines and join lists.

In [5]:
western_sentences = [sentence.strip() for sentence in western_sentences]
eastern_sentences = [sentence.strip() for sentence in eastern_sentences]
sentences = western_sentences + eastern_sentences

## Data set creation

Create corresponding labels.

In [6]:
# 1 if Western Armenian, 0 otherwise
y = [1] * len(western_sentences) + [0] * len(eastern_sentences)

Tokenize sentences using character-level n-grams.

In [7]:
def tokenize_characters(sentences, n):
    
    tokenized_sentences = []

    for sentence in sentences:
        
        sentence_tokens = []
        
        # sliding window of size n
        for i in range(len(sentence) - (n - 1)):
            
            # extract and append n-gram
            token = sentence[i:i+n]
            sentence_tokens.append(token)
        
        tokenized_sentences.append(sentence_tokens)
    
    return tokenized_sentences

## Data preparation for learning

Feature extraction using TF-IDF vectorization.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_character_ngram_vectorizer(n):
    return TfidfVectorizer(analyzer='char', ngram_range=(n, n)) 

In [9]:
def get_feature_extracted_X(tokenized_sentences, vectorizer):
    joined_tokens = [' '.join(tokens) for tokens in tokenized_sentences]
    X = vectorizer.fit_transform(joined_tokens)
    return X

## Training learning models per character-level n-gram

In [10]:
import tqdm as tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# dictionary for metrics per n
metrics_dict = {}

# n values for ngrams
ns = [3, 4, 5]

for n in tqdm.tqdm(ns):
    
    # dictionary of metrics for current n
    n_metrics = {}
    
    # tokenize sentences
    tokenized_sentences = tokenize_characters(sentences, n=n)
    
    # get vectorizer
    tfidf_vectorizer = get_tfidf_character_ngram_vectorizer(n=n)
    
    # get features
    X = get_feature_extracted_X(tokenized_sentences=tokenized_sentences, vectorizer=tfidf_vectorizer)
    
    # train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    
    # models to train
    lr = LogisticRegression(random_state=1)
    nb = BernoulliNB()
    dt = DecisionTreeClassifier(random_state=1)
    svm = SVC(random_state=1)
    mlp = MLPClassifier(hidden_layer_sizes=(64, 32), random_state=1)
    
    models = [('Logistic Regression', lr), ('Naive Bayes', nb), ('Decision Tree', dt), ('SVM', svm), ('MLP', mlp)]
    
    # initialize dictionary for metrics of each model
    model_metrics = {model_name: {} for model_name, _ in models}

    # fit models, obtain predictions and metrics    
    for model_name, model in models:
        
        # fit model
        model.fit(X_train, y_train)
        
        # obtain predictions
        y_pred = model.predict(X_test)
    
        # obtain metrics
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
    
        # store metrics
        model_metrics[model_name]['Accuracy'] = acc
        model_metrics[model_name]['F1'] = f1
        model_metrics[model_name]['Precision'] = precision
        model_metrics[model_name]['Recall'] = recall
    
    # store metrics for current n
    n_metrics['Metrics'] = model_metrics
    metrics_dict[n] = n_metrics

100%|██████████| 3/3 [00:30<00:00, 10.18s/it]


## Results

Extract metrics and store to file.

In [11]:
import pandas as pd

# dataframe columns
metrics_to_include = ['Accuracy', 'F1', 'Precision', 'Recall']

# dictionary for dataframes per n value
n_dataframes = {}

for n, n_metrics in metrics_dict.items():
    
    # get metrics for current n
    metrics_data = n_metrics['Metrics']
    
    # make dictionary to store data for dataframe
    data_dict = {}
    
    # extract metrics to include
    for metric in metrics_to_include:
        metric_values = {model: metrics[metric] for model, metrics in metrics_data.items()}
        # store metric
        data_dict[metric] = metric_values
    
    # create dataframe for current n
    n_df = pd.DataFrame(data_dict)
    
    # add dataframe to dictionary
    n_dataframes[n] = n_df

Results for $n=3$

In [12]:
n_dataframes[3]

Unnamed: 0,Accuracy,F1,Precision,Recall
Logistic Regression,0.979592,0.984615,0.969697,1.0
Naive Bayes,0.918367,0.935484,0.966667,0.90625
Decision Tree,0.938776,0.953846,0.939394,0.96875
SVM,0.979592,0.984615,0.969697,1.0
MLP,0.979592,0.984615,0.969697,1.0


Results for $n=4$

In [13]:
n_dataframes[4]

Unnamed: 0,Accuracy,F1,Precision,Recall
Logistic Regression,0.979592,0.984615,0.969697,1.0
Naive Bayes,0.979592,0.984615,0.969697,1.0
Decision Tree,0.938776,0.953846,0.939394,0.96875
SVM,0.979592,0.984615,0.969697,1.0
MLP,0.979592,0.984615,0.969697,1.0


Results for $n=5$

In [14]:
n_dataframes[5]

Unnamed: 0,Accuracy,F1,Precision,Recall
Logistic Regression,0.959184,0.969697,0.941176,1.0
Naive Bayes,0.979592,0.984615,0.969697,1.0
Decision Tree,0.959184,0.96875,0.96875,0.96875
SVM,0.959184,0.969697,0.941176,1.0
MLP,0.979592,0.984615,0.969697,1.0


## Save metrics

In [15]:
if SAVE_METRICS:
    
    # create list of dataframes
    df_list = [n_dataframes[n] for n in ns]
    
    for n in ns:
        n_dataframes[n].to_csv(f'../metrics/{n}gram.csv')