In [None]:
import pandas as pd

# Load the annotation table. Later cells read the "Dokument" column as the
# text to split into sentences and treat every other column as an annotation.
df = pd.read_csv("annotations.csv", sep=',')
df.head()

In [None]:
# We have four annotated columns, each corresponding to an investigative question.
# `cols` is every column of `df` except "Dokument"; remove() raises ValueError
# if that column is missing.
cols = df.columns.tolist()
cols.remove("Dokument")
cols

In [None]:
# find how many rows have a string value in at least one annotation column
def has_value(row):
    """Return True when `row` holds a str in any of the columns listed in `cols`."""
    for name in cols:
        if isinstance(row[name], str):
            return True
    return False

# Flag each row that has at least one string-valued annotation column.
df["has_value"] = df.apply(has_value, axis=1)
# NOTE(review): this head() is not the last expression of the cell, so its
# result is presumably never displayed.
df.head()

# count the number of "has_value":
df["has_value"].value_counts()

In [None]:
# mapping from ColumnName -> IBn, and the reverse IBn -> ColumnName
ibs = ["IB1", "IB2", "IB3", "IB4"]
col_to_ib = {column: label for label, column in zip(ibs, cols)}
ib_to_col = {label: column for column, label in col_to_ib.items()}

# row label -> {IBn: cell value} for every non-missing annotation cell;
# rows without any annotation are left out
to_dict = {}
for row_label in df.index:
    annotations = {
        col_to_ib[column]: df.loc[row_label, column]
        for column in cols
        if not pd.isna(df.loc[row_label, column])
    }
    if annotations:
        to_dict[row_label] = annotations
to_dict

In [None]:
# Rename the annotation columns of `df` to their IB labels.
# `cols` keeps the original column names, so cells that index `df` by `cols`
# have to run before this one (re-running them afterwards would fail).
rename_map = {col: col_to_ib[col] for col in cols}
df = df.rename(columns=rename_map)
df.head()

In [None]:
def column_statistics(colname):
    """Print the name of column `colname` of the global `df` and how many distinct values it has."""
    series = df[colname]
    print(f"Column: {colname}")
    print(f"Number of unique values: {len(series.unique())}")

# one report per IB label
for ib_name in ib_to_col:
    column_statistics(ib_name)

In [None]:
# Norwegian descriptions, presumably the titles of the investigative questions
# IB1..IB4 (TODO confirm). ib1..ib4 are not referenced by the other cells
# visible in this notebook.
ib1 = "Tilknytning til åstedet eller gjerningsadressen"
ib2 = "Død, dødsårsak og skader"
ib3 = "Fornærmede involvert i konflikt"
ib4 = "Våpen, drapsvåpen og våpenbruk"

In [None]:
# nltk sentence tokenizer for norwegian
from nltk.tokenize import sent_tokenize

def sentencize(text):
    """Split `text` into sentences using NLTK's `sent_tokenize` with the Norwegian model."""
    sentences = sent_tokenize(text, language='norwegian')
    return sentences

sentencize(df.iloc[1]["Dokument"])

In [None]:
# Build a frame with one row per sentence that remembers its paragraph id.

# 1. the paragraph id is the row label of the original frame
df["Paragraph"] = df.index

# 2. one column per sentence position, then stack into one row per sentence
#    (the row label of the source paragraph is kept as index)
sentences_wide = df.apply(lambda row: pd.Series(sentencize(row["Dokument"])), axis=1)
sentences_long = sentences_wide.stack().reset_index(level=1, drop=True)
df_sentences = sentences_long.to_frame("Sentence")

# 3. bring the paragraph id back in through the shared index
df_sentences = df_sentences.join(df["Paragraph"], how="left")

# 4. placeholder column for each of IB1..IB4
for ib in ibs:
    df_sentences[ib] = None
df_sentences.head(25)


In [None]:
# keep only sentences longer than 10 characters (10 or fewer are dropped)
df_sentences = df_sentences[df_sentences["Sentence"].str.len() > 10]
df_sentences.shape

In [None]:
# For each sentence, check whether any annotation of its paragraph (looked up
# in `to_dict`) occurs inside it, and record one boolean per IB.

paragraphs_with_matches = set()

# row counter -> {"Paragraph", "Sent", "IB1".."IB4"}
sentence_data = {}

for row_idx, (_, row) in enumerate(df_sentences.iterrows()):
    para_id = row["Paragraph"]
    sent = row["Sentence"]

    sentence_data[row_idx] = {
        "Paragraph": para_id,
        "Sent": sent,
        "IB1": False,
        "IB2": False,
        "IB3": False,
        "IB4": False,
    }

    if para_id not in to_dict:
        continue

    # lowercase once per sentence instead of once per candidate fragment
    sent_lower = sent.lower()

    # then, find matching IBs for this paragraph ID, and check each sentence!
    for ib, text in to_dict[para_id].items():
        # one annotation cell may hold several ";"-separated excerpts, and each
        # excerpt may itself span several sentences
        for cand_sent in text.split(";"):
            for cs in sentencize(cand_sent):
                cs = cs.strip().lower()
                # An empty fragment is a substring of every string and would
                # flag every sentence of the paragraph, so skip it.
                if not cs:
                    continue
                if cs in sent_lower:
                    paragraphs_with_matches.add(para_id)
                    sentence_data[row_idx][ib] = True
len(paragraphs_with_matches)

In [None]:
# One row per entry of `sentence_data`. The index holds the enumeration
# counter used as key there, so the labels run 0..n-1 in insertion order.
sent_df = pd.DataFrame.from_dict(sentence_data, orient="index")
sent_df.head(25)

In [None]:
# replace the set of matched paragraph ids by an ascending list and show it
paragraphs_with_matches = sorted(paragraphs_with_matches)
print(paragraphs_with_matches)

In [None]:
# index labels of the sentences where any of IB1, IB2, IB3, IB4 is True
has_any_ib = sent_df[ibs].any(axis=1)
sentences_with_matches = sent_df.index[has_any_ib].tolist()
print(sentences_with_matches)

In [None]:
import nltk

def preprocess(document, stop=True, stem=True, lower=True):
    """Tokenize a Norwegian text and return its cleaned tokens.

    Args:
        document: Raw text to tokenize.
        stop: Drop tokens found in NLTK's Norwegian stop word list. The
            comparison is case-insensitive, so sentence-initial stop words
            such as "Det" are removed as well.
        stem: Replace each token by its Norwegian Snowball stem.
        lower: Lowercase the tokens. This flag used to be ignored (tokens
            were always lowercased, but only after stop word filtering).

    Returns:
        list[str]: tokens that are purely alphabetic and at least two
        characters long.
    """
    tokens = nltk.word_tokenize(document, language='norwegian')
    if lower:
        # lowercase first so that every later step sees normalized tokens
        tokens = [t.lower() for t in tokens]
    if stop:
        # a set gives constant-time membership tests
        stop_words = set(nltk.corpus.stopwords.words('norwegian'))
        tokens = [t for t in tokens if t.lower() not in stop_words]
    if stem:
        stemmer = nltk.stem.snowball.NorwegianStemmer()
        tokens = [stemmer.stem(t) for t in tokens]
    # drop single-character tokens and anything that is not purely alphabetic
    return [t for t in tokens if len(t) >= 2 and t.isalpha()]

print(preprocess(df.iloc[0]["Dokument"]))


In [None]:
import numpy as np

def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 & set2| / |set1 | set2| of two sets.

    Two empty sets share no evidence of similarity, so the result is 0.0 in
    that case instead of a ZeroDivisionError.
    """
    intersection = len(set1.intersection(set2))
    union = len(set1) + len(set2) - intersection
    if union == 0:
        return 0.0
    return intersection / union

def filter_results(scores, threshold=0, top_n=20):
    """Rank the positions of `scores` whose value exceeds `threshold`.

    Returns a list of (position, score) pairs ordered by descending score
    (ties keep their original order), cut to the first `top_n` pairs.
    `top_n == 0` means "no cut".
    """
    ranked = sorted(
        ((position, value) for position, value in enumerate(scores) if value > threshold),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked if top_n == 0 else ranked[:top_n]

def print_results(results, docs, sent_df):
    """Print each (position, score) pair of `results` with its sentence and tokens.

    The sentence is read positionally from the "Sent" column of `sent_df`,
    the token list from `docs` at the same position.
    """
    for position, score in results:
        sentence = sent_df.iloc[position]['Sent']
        tokens = docs[position]
        print(f"--- {sentence}")
        print(f"------ Score: {score} - {tokens}")
        print()

In [None]:
from gensim.corpora import Dictionary
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer

def get_simple_metrics(query, docs):
    """Score every document in `docs` against `query` with four lexical measures.

    Args:
        query: Raw query text; it is tokenized with `preprocess`.
        docs: Sequence of documents, each already a list of tokens.

    Returns:
        dict mapping "string", "tfidf", "jaccard" and "bm25" to one score per
        document, min-max scaled to [0, 1]. A measure whose scores are all
        equal is returned as all zeros.
    """
    query_tokens = preprocess(query)
    raw_scores = {}

    # token hits: how many query tokens (repetitions included) occur in each document
    hits = np.zeros(len(docs))
    for position, doc in enumerate(docs):
        hits[position] = sum(1 for token in query_tokens if token in doc)
    raw_scores["string"] = hits

    # tf-idf + cosine; the documents are pre-tokenized, so tokenizer and
    # preprocessor are identity functions
    vectorizer = TfidfVectorizer(
        analyzer='word',
        tokenizer=lambda doc: doc,
        preprocessor=lambda doc: doc,
        ngram_range=(1, 3),
    )
    doc_matrix = vectorizer.fit_transform(docs)
    query_matrix = vectorizer.transform([query_tokens])
    raw_scores["tfidf"] = cosine_similarity(query_matrix, doc_matrix).flatten()

    # jaccard on the token sets
    query_set = set(query_tokens)
    raw_scores["jaccard"] = [jaccard_similarity(query_set, set(doc)) for doc in docs]

    # bm25
    raw_scores["bm25"] = BM25Okapi(docs).get_scores(query_tokens)

    # min-max normalize each measure
    normalized = {}
    for name, values in raw_scores.items():
        high, low = np.max(values), np.min(values)
        if high != low:
            normalized[name] = (values - low) / (high - low)
        else:
            normalized[name] = np.zeros(len(values))

    return normalized


In [None]:
import fasttext.util

# Fetch the fastText vectors for language code 'no' and load them from the
# local cc.no.300.bin file. if_exists='ignore' presumably skips the download
# when the file is already present (see the fastText docs).
fasttext.util.download_model('no', if_exists='ignore')  # Norwegian ('no'), not English
fasttext_model = fasttext.load_model('cc.no.300.bin')

In [None]:
from sentence_transformers import SentenceTransformer, util
sbert_model = SentenceTransformer("NbAiLab/nb-sbert-base")

In [None]:
# embed every sentence of the corpus once: a fastText sentence vector and an
# SBERT encoding, stored as one array per row
sentence_texts = sent_df["Sent"]
sent_df["fasttext"] = sentence_texts.apply(fasttext_model.get_sentence_vector)
sent_df["sbert"] = sentence_texts.apply(sbert_model.encode)

In [None]:
from scipy.spatial.distance import cosine, euclidean, cityblock, minkowski

def compute_similarity_with_scipy(query_sentence, dataframe, column="fasttext", top_n=20):
    """Rank the vectors stored in `dataframe[column]` by similarity to a query vector.

    Args:
        query_sentence: Query vector.
        dataframe: Frame whose `column` holds one vector per row.
        column: Name of the vector column.
        top_n: Number of best matches to keep per metric.

    Returns:
        dict mapping "cosine", "euclidean", "manhattan" and "minkowski" to a
        list of (similarity, index label) pairs, best first. Cosine
        similarity is 1 - cosine distance; the other distances are turned
        into similarities as (max_distance - distance) / max_distance.
    """
    vectors = dataframe[column].values
    sentence_indexes = dataframe.index.values

    metric_mapping = {
        'cosine': cosine,
        'euclidean': euclidean,
        'manhattan': cityblock,
        'minkowski': lambda u, v: minkowski(u, v, p=2)
    }

    top_similar_matches = {}

    for metric, distance_func in metric_mapping.items():
        distances = [distance_func(query_sentence, vector) for vector in vectors]

        if metric == 'cosine':
            similarity_scores = [1 - distance for distance in distances]
        else:
            # default=0 keeps an empty frame from raising in max()
            max_distance = max(distances, default=0)
            if max_distance == 0:
                # Every vector coincides with the query: all are maximally
                # similar. Avoids the 0/0 the scaling would otherwise produce.
                similarity_scores = [1.0 for _ in distances]
            else:
                similarity_scores = [(max_distance - distance) / max_distance for distance in distances]

        top_indices = sorted(
            range(len(similarity_scores)),
            key=lambda i: similarity_scores[i],
            reverse=True,
        )[:top_n]
        top_similar_matches[metric] = [(similarity_scores[i], sentence_indexes[i]) for i in top_indices]

    return top_similar_matches


In [None]:
def get_true_index(ib="IB1", verbose=True):
    """Collect the sentences and paragraphs annotated for one IB column.

    Reads the module-level `sent_df`.

    Args:
        ib: Name of the boolean annotation column in `sent_df`.
        verbose: When true, print the selected sentences and the sorted
            paragraph ids.

    Returns:
        (ib_indexes, ib_paragraphs): the set of index labels of the rows
        where `sent_df[ib]` is True, and the set of their "Paragraph" values.
    """
    ib_df = sent_df[sent_df[ib] == True]
    ib_indexes = set(ib_df.index.values)
    ib_paragraphs = set(ib_df["Paragraph"].values)

    if verbose:
        print("True sentences:")
        for i in sorted(ib_indexes):
            # `i` is an index label, so it has to be looked up with .loc;
            # .iloc is only equivalent while the index is exactly 0..n-1
            print(f"-- IDX {i} - {sent_df.loc[i, 'Sent']}")

        print("True paragraphs:")
        print(sorted(ib_paragraphs))
    return ib_indexes, ib_paragraphs

def get_sentids(filtered_scores):
    """Return the first element of every (sentence id, score) pair, in order."""
    sentence_ids = []
    for sent_id, _score in filtered_scores:
        sentence_ids.append(sent_id)
    return sentence_ids

# token lists for every sentence, in the row order of sent_df
docs = [preprocess(sentence) for sentence in sent_df["Sent"].tolist()]
def build_results(query, TOP_N):
    """Retrieve sentence ids for `query` with every similarity method.

    Args:
        query: Raw query text.
        TOP_N: Maximum number of hits per method. NOTE(review): 0 is passed
            on to `filter_results`, where it means "no limit", but yields no
            hits for the embedding methods.

    Returns:
        dict mapping each key returned by `get_simple_metrics`, plus
        "fasttext" and "sbert", to a list of sentence ids, best first. The
        embedding methods rank by cosine similarity.
    """
    lexical_scores = get_simple_metrics(query, docs)
    index_results = {
        similarity_type: get_sentids(filter_results(scores, top_n=TOP_N))
        for similarity_type, scores in lexical_scores.items()
    }

    fasttext_hits = compute_similarity_with_scipy(
        fasttext_model.get_sentence_vector(query), sent_df, column="fasttext", top_n=TOP_N
    )
    index_results["fasttext"] = [sent_id for _, sent_id in fasttext_hits["cosine"]]

    sbert_hits = compute_similarity_with_scipy(
        sbert_model.encode(query), sent_df, column="sbert", top_n=TOP_N
    )
    index_results["sbert"] = [sent_id for _, sent_id in sbert_hits["cosine"]]

    return index_results

# IB1

In [None]:
ib1_indexes, ib1_paragraphs = get_true_index("IB1", verbose=True)

In [None]:
from sklearn.metrics import precision_score

def compare_with_index(query, sent_df, ib_indexes, ib_paragraphs, print_sents=False, print_diff=False):
    """Compare what each retrieval method finds for `query` with the annotated truth.

    Each method is asked for as many sentences as there are annotated ones
    (`len(ib_indexes)`).

    Args:
        query: Raw query text handed to `build_results`.
        sent_df: Sentence frame with "Sent" and "Paragraph" columns.
        ib_indexes: Set of annotated sentence ids.
        ib_paragraphs: Set of annotated paragraph ids.
        print_sents: Print the retrieved sentences per method.
        print_diff: Print the retrieved sentences that are not annotated.

    Returns:
        dict: method name -> dict with
            "matching_sentences": retrieved ids that are annotated,
            "matching_paragraphs": their paragraphs that are annotated,
            "sent_accuracy" / "para_accuracy": share of the annotated
                sentences / paragraphs that was found, rounded to two
                decimals (0.0 when nothing is annotated),
            "model_sentences": all retrieved ids,
            "model_paragraphs": paragraphs of the matching sentences.
    """
    def _share(found, expected):
        # fraction of `expected` covered by `found`; an empty `expected`
        # gives 0.0 instead of a ZeroDivisionError
        return len(found) / len(expected) if len(expected) else 0.0

    if print_sents:
        print(f"Searching with '{query}'\n")
    index_results = build_results(query, len(ib_indexes))
    final_matching_paragraphs = {}

    for metric, indexes in index_results.items():
        matching_indexes = set(indexes).intersection(ib_indexes)
        # NOTE(review): derived from the matching sentences only, so this is
        # not the paragraph set of everything the method retrieved — confirm
        # that this is what "model_paragraphs" is meant to hold.
        paragraphs = set(sent_df[sent_df.index.isin(matching_indexes)]["Paragraph"].values)
        matching_paragraphs = paragraphs.intersection(ib_paragraphs)
        paragraph_share = _share(matching_paragraphs, ib_paragraphs)

        if print_sents:
            print(f"Results for {metric}: {sorted(indexes)}")
            print(f"Matching indexes: {sorted(matching_indexes)}")
            print(f"Matching paragraphs: {sorted(matching_paragraphs)} ({paragraph_share:.2%})")
            for i in indexes:
                print(f"-- {sent_df.iloc[i]['Sent']}")
            print("-"*40)

        if print_diff:
            non_matching_indexes = set(indexes).difference(ib_indexes)
            print(f"Non-matching indexes: {sorted(non_matching_indexes)}")
            for i in non_matching_indexes:
                print(f"-- {sent_df.iloc[i]['Sent']}")

        final_matching_paragraphs[metric] = {
            "matching_sentences": matching_indexes,
            "matching_paragraphs": matching_paragraphs,
            "sent_accuracy": round(_share(matching_indexes, ib_indexes), 2),
            "para_accuracy": round(paragraph_share, 2),
            "model_sentences": indexes,
            "model_paragraphs": paragraphs,
        }

    return final_matching_paragraphs

# Who has a connection to the crime scene / the address of the offence?
# Earlier query wordings, kept for reference:
# q = "person i, rundt, eller i nærheten av huset"
# q = "person i eller rundt hus eller adressen"
q = "person med tilknytning til boligen / adressen"
matches = compare_with_index(q, sent_df, ib1_indexes, ib1_paragraphs)
matches

# IB2

In [None]:
ib2_indexes, ib2_paragraphs = get_true_index("IB2", verbose=True)

In [None]:
# What happened - cause of death
q = "årsak til dødsfallet"
compare_with_index(q, sent_df, ib2_indexes, ib2_paragraphs)

In [None]:
ib3_indexes, ib3_paragraphs = get_true_index("IB3", verbose=True)

In [None]:
# Was the victim involved in a conflict?
q = "fornærmede / avdøde involvert i konflikt" 
compare_with_index(q, sent_df, ib3_indexes, ib3_paragraphs)

In [None]:
ib4_indexes, ib4_paragraphs = get_true_index("IB4", verbose=True)

In [None]:
# The murder weapon: what do we know about it?
q = "informasjon om våpenet"
compare_with_index(q, sent_df, ib4_indexes, ib4_paragraphs)

In [None]:
s = "inneholder en rekke rom som på gjerningstidspunktet ble utleid til arbeidstakere fra forskjellige østeuropeiske land"
def find_index(sent, sent_df):
    """Return the index label of the first row whose "Sent" contains `sent`.

    `sent` is matched as a literal substring (regex=False), so text such as
    "[adresse]" is not read as a regular expression.

    Raises:
        IndexError: if no row contains `sent`.
    """
    hits = sent_df[sent_df["Sent"].str.contains(sent, regex=False)].index.values
    if len(hits) == 0:
        raise IndexError(f"no sentence contains {sent!r}")
    return hits[0]

find_index(s, sent_df)
idx = find_index("som er angitt i tiltalen leide C husrom gjennom F i [adresse]", sent_df)
sent_df.iloc[idx]["Sent"]

In [None]:
# find the paragraphs of any sentence index
# e.g. 9 --> paragraph containing the sentence index
def find_paragraphs(sent_index, sent_df):
    """Return the "Paragraph" value of the first row labelled `sent_index`."""
    label_matches = sent_df.index == sent_index
    paragraph_values = sent_df.loc[label_matches, "Paragraph"].values
    return paragraph_values[0]
assert find_paragraphs(0, sent_df) == 0
assert find_paragraphs(5, sent_df) == 1
assert find_paragraphs(6, sent_df) == 2

In [None]:
# Query wording per retrieval method ("sbert", "string") and per IB.
queries = {
    "sbert": {
        "IB1": "person med tilknytning til boligen / adressen",
        "IB2": "årsak til dødsfallet",
        "IB3": "fornærmede / avdøde involvert i konflikt",
        "IB4": "informasjon om våpenet",
    },
    "string": {
        "IB1": "boligen",
        "IB2": "dødsårsak",
        "IB3": "konflikt",
        "IB4": "drapsvåpen",
    }
}
print(queries.keys())


# "<IB>_<model type>" -> list of retrieved-sentence records
total_retrieved = {}
USE_INVESTIGATOR = False

for ib in ["IB1", "IB2", "IB3", "IB4"]:
    print(ib)

    for model_type in queries.keys():
        _id = f"{ib}_{model_type}"

        # Pick the query BEFORE searching. The search used to run once per IB
        # with whatever `q` was left over from the previous iteration (or an
        # earlier cell), so the hits did not belong to the recorded query.
        q = queries[model_type][ib]
        model_results_all = build_results(q, TOP_N=100)
        model_results = model_results_all[model_type]

        total_retrieved[_id] = []

        indexes, paragraphs = get_true_index(ib, verbose=False)
        print(sorted(indexes))
        print(sorted(paragraphs))

        if USE_INVESTIGATOR:
            for paragraph in sorted(paragraphs):
                tmp = {
                    "type": None,  # "etterforsker" (investigator), "modell" (model) or "begge" (both)
                    "paragraf": paragraph,
                    "setning": None,
                    "tekst": None,
                }

                # find the investigator's excerpts for this paragraph; a
                # missing cell (NaN) or empty string yields no excerpts
                investigator_sent = df.iloc[paragraph][ib]
                if isinstance(investigator_sent, str) and investigator_sent:
                    investigator_sent = [s.strip() for s in investigator_sent.split(";")]
                else:
                    investigator_sent = []

                for inv_sents in investigator_sent:
                    # one excerpt may span several sentences
                    for inv_sent in sentencize(inv_sents):
                        _tmp = tmp.copy()
                        inv_index = find_index(inv_sent, sent_df)
                        _tmp["setning"] = inv_index
                        _tmp["tekst"] = sent_df.iloc[inv_index]["Sent"]
                        if inv_index in model_results:
                            _tmp["type"] = "begge"
                        else:
                            _tmp["type"] = "etterforsker"
                        total_retrieved[_id].append(_tmp)

        for model_sent in model_results:
            # records already present for this sentence are relabelled
            # "begge"; otherwise the sentence is added as a model-only hit
            already_listed = [x for x in total_retrieved[_id] if x["setning"] == model_sent]
            if already_listed:
                for x in already_listed:
                    x["type"] = "begge"
            else:
                total_retrieved[_id].append({
                    "type": "modell",
                    "query": q,
                    "paragraf": find_paragraphs(model_sent, sent_df),
                    "setning": model_sent,
                    "tekst": sent_df.iloc[model_sent]["Sent"],
                })

        # sort by paragraph -> setning
        total_retrieved[_id] = sorted(total_retrieved[_id], key=lambda x: (x["paragraf"], x["setning"]))

In [None]:
# dump every record, grouped under its "<IB>_<model type>" key
for result_id, entries in total_retrieved.items():
    print(result_id)
    for entry in entries:
        print(entry)
    print()

# stats

In [None]:
# tag every record with the key of the list it belongs to
# ("<IB>_<model type>") and collect all records into one frame
all_entries = []
for result_id, entries in total_retrieved.items():
    for entry in entries:
        entry["IB"] = result_id
        all_entries.append(entry)

res_df = pd.DataFrame(all_entries)
res_df.head(30)

In [None]:
# get some stats...

# how many sentences are retrieved by each model?
res_df.groupby("IB")["setning"].nunique()


In [None]:
# Export the result table without the row index; only the listed columns are
# written, so the "query" field of the records is not part of the file.
res_df.to_csv("fritekst_vs_sbert.csv", index=False, columns=["IB", "type", "paragraf", "setning", "tekst"])