In [77]:
import subprocess

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

In [15]:
# NOTE(review): initialised empty and never filled in the cells visible here;
# the training file english.train is handed to negsel2.jar by name instead.
english_train = []

def load_data(filename):
    """Read a text file into a list of strings, one entry per line.

    Only the trailing newline is removed from each line, so a final line
    that is not newline-terminated keeps all of its characters.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        The lines of the file, in order, without their trailing newline.
        An empty file yields an empty list.
    """
    with open(filename, 'r') as f:
        # rstrip('\n') instead of line[:-1]: slicing would drop a real
        # character from a last line that has no terminating newline.
        return [line.rstrip('\n') for line in f]
    
# Load both test sets as lists of strings (one entry per line of each file).
english_test, tagalog_test = (
    load_data(test_file) for test_file in ('english.test', 'tagalog.test')
)

In [75]:
# Show which Java runtime is available: the negsel2.jar commands in this
# notebook are all launched through the `java` executable.
subprocess.getoutput("java -version")

'openjdk version "1.8.0_282"\nOpenJDK Runtime Environment (build 1.8.0_282-8u282-b08-0ubuntu1~20.04-b08)\nOpenJDK 64-Bit Server VM (build 25.282-b08, mixed mode)'

In [3]:
# Compute the average of the tagalog testset.
# negsel2 reads tagalog.test on stdin; awk then sums the first column of its
# output and divides by the number of output lines (NR), i.e. prints the mean.
# IPython's `!` capture stores the printed lines in `avg` as a list of strings.
# NOTE(review): -n 10 presumably is the string length and -r 4 the matching
# parameter varied later in this notebook; -c / -l semantics are not shown
# here -- confirm against the negsel2 documentation.
avg = !java -jar negsel2.jar -self english.train -n 10 -r 4 -c -l < tagalog.test | awk '{n+=$1}END{print n/NR}'

In [41]:
# Get individual test scores


testresults_tagalog = !java -jar negsel2.jar -self english.train -n 10 -r 4 -c -l < tagalog.test
testresults_tagalog = np.array([float(t) for t in testresults_tagalog])

testresults_english = !java -jar negsel2.jar -self english.train -n 10 -r 4 -c -l < english.test
testresults_english = np.array([float(t) for t in testresults_english])

In [42]:
# Smallest and largest score over both test sets taken together.
score_sets = (testresults_english, testresults_tagalog)
global_min = min(min(scores) for scores in score_sets)
global_max = max(max(scores) for scores in score_sets)
print(global_min, global_max)

0.0 31.28750387473102


In [43]:
# Normalize (need not scale by min because it's 0).
# The division happens in place on both arrays, so re-running this cell
# without recomputing the scores would divide them a second time.
for scores in (testresults_tagalog, testresults_english):
    scores /= global_max

In [57]:
# Stack all scores into one vector: Tagalog scores first, English scores after.
testresults = np.concatenate((testresults_tagalog, testresults_english))

In [59]:
# Boolean ground truth aligned with `testresults`: the leading entries (the
# Tagalog scores) are True, every remaining entry (English) is False.
labels = np.arange(testresults.shape[0]) < testresults_tagalog.shape[0]

In [62]:
# Area under the ROC curve for the r=4 scores, with Tagalog as the positive
# class (labels are True for the Tagalog entries).
roc_auc_score(labels, testresults)

0.7916097138691454

In [89]:
def get_scores(filename, r=4, self_file='english.train', n=10, jar='negsel2.jar'):
    """Run negsel2 on a file of test strings and return its output as floats.

    The contents of ``filename`` are fed to the Java process on standard
    input, and every non-empty line it prints on standard output is parsed
    as one float.

    Parameters
    ----------
    filename : str or path-like
        File whose contents are piped to negsel2.
    r : int, default 4
        Value passed to negsel2's ``-r`` option.
    self_file : str, default 'english.train'
        Value passed to negsel2's ``-self`` option.
    n : int, default 10
        Value passed to negsel2's ``-n`` option.
    jar : str, default 'negsel2.jar'
        Path of the negsel2 jar file.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per non-empty output line (empty if
        negsel2 printed nothing).

    Raises
    ------
    OSError
        If ``filename`` cannot be opened or ``java`` cannot be started.
    RuntimeError
        If the Java process exits with a non-zero status; the message
        carries its standard error output.
    ValueError
        If a non-empty output line is not a valid float.
    """
    # Argument list instead of a shell string: file names containing spaces
    # or shell metacharacters are passed through untouched.
    command = [
        'java', '-jar', str(jar),
        '-self', str(self_file),
        '-n', str(n),
        '-r', str(r),
        '-c', '-l',
    ]
    with open(filename, 'rb') as test_strings:
        completed = subprocess.run(
            command,
            stdin=test_strings,
            stdout=subprocess.PIPE,
            # Keep stderr separate so JVM warnings never end up in the
            # text that is parsed as scores.
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
    if completed.returncode != 0:
        raise RuntimeError(
            f"negsel2 exited with status {completed.returncode}: "
            f"{completed.stderr.strip()}"
        )
    return np.array(
        [float(line) for line in completed.stdout.splitlines() if line.strip()],
        dtype=float,
    )

In [90]:
# Sweep r and report how well the negsel2 scores separate Tagalog (positive
# class) from English (negative class).
# The scores are passed to roc_auc_score unnormalised: ROC AUC depends only on
# the ranking of the scores, so dividing them all by the same positive constant
# leaves it unchanged (the r=4 value equals the normalised result above).
for r in range(1, 10):
    testresults_tagalog = get_scores('tagalog.test', r)
    testresults_english = get_scores('english.test', r)

    testresults = np.concatenate((testresults_tagalog, testresults_english))

    # Tagalog scores come first in `testresults`, so exactly those leading
    # entries are labelled True.
    labels = np.zeros(testresults.shape[0], dtype=bool)
    labels[:testresults_tagalog.shape[0]] = True

    ras = roc_auc_score(labels, testresults)

    print(f'ROC AUC Score for r={r}:', ras)

ROC AUC Score for r=1: 0.5435347184253692
ROC AUC Score for r=2: 0.7396459814106069
ROC AUC Score for r=3: 0.8311235647895024
ROC AUC Score for r=4: 0.7916097138691454
ROC AUC Score for r=5: 0.7282440313468198
ROC AUC Score for r=6: 0.6680847913249499
ROC AUC Score for r=7: 0.5907258064516129
ROC AUC Score for r=8: 0.5201612903225806
ROC AUC Score for r=9: 0.5120967741935484


* The best score is observed for r=3 (AUC ≈ 0.83).
* For r=1 the score is close to chance level (AUC ≈ 0.54), which can be explained by the fact that the model then matches too many strings and is thus underfitting.
* For r=9 we observe a similarly poor score (AUC ≈ 0.51), which makes sense considering that the strings are only 10 letters long: the model is then overfitting to the provided training data.

## Exercise 3.

The folder `lang` contains strings from 4 other languages. Determine which of these languages can be best
discriminated from English using the negative selection algorithm, and for which of the languages this is
most difficult. Can you explain your findings?

In [96]:
# Folder and file names of the additional language samples compared with English.
languages_dir = 'lang/'
languages = [
    f'{language_name}.txt'
    for language_name in ('hiligaynon', 'middle-english', 'plautdietsch', 'xhosa')
]

In [101]:
# For each r, score the English test set once and compare it against every
# other language: the other language is the positive class, English the
# negative class, and the ROC AUC is printed with three decimals.
for r in range(2, 7):
    print(f'Computing for r={r}:')

    testresults_english = get_scores('english.test', r)
    n_english = testresults_english.shape[0]

    for language in languages:
        testresults_lang = get_scores(f'{languages_dir}{language}', r)
        n_lang = testresults_lang.shape[0]

        testresults = np.concatenate((testresults_lang, testresults_english))

        # The other language's scores come first, so those entries are True.
        labels = np.arange(n_lang + n_english) < n_lang

        ras = roc_auc_score(labels, testresults)

        print(f'\tROC AUC Score for language "{language}": {ras:.3f}')

Computing for r=2:
	ROC AUC Score for language "hiligaynon.txt": 0.752
	ROC AUC Score for language "middle-english.txt": 0.514
	ROC AUC Score for language "plautdietsch.txt": 0.707
	ROC AUC Score for language "xhosa.txt": 0.852
Computing for r=3:
	ROC AUC Score for language "hiligaynon.txt": 0.840
	ROC AUC Score for language "middle-english.txt": 0.542
	ROC AUC Score for language "plautdietsch.txt": 0.775
	ROC AUC Score for language "xhosa.txt": 0.889
Computing for r=4:
	ROC AUC Score for language "hiligaynon.txt": 0.797
	ROC AUC Score for language "middle-english.txt": 0.534
	ROC AUC Score for language "plautdietsch.txt": 0.753
	ROC AUC Score for language "xhosa.txt": 0.832
Computing for r=5:
	ROC AUC Score for language "hiligaynon.txt": 0.730
	ROC AUC Score for language "middle-english.txt": 0.522
	ROC AUC Score for language "plautdietsch.txt": 0.701
	ROC AUC Score for language "xhosa.txt": 0.765
Computing for r=6:
	ROC AUC Score for language "hiligaynon.txt": 0.671
	ROC AUC Score fo