In [118]:
import pandas as pd
import os
from pathlib import Path
import nltk
from nltk.corpus import wordnet

In [119]:
# Fetch every NLTK resource this notebook relies on so that it also runs on a
# machine where they have not been installed yet.
nltk.download('punkt_tab')  # tokenizer data needed by nltk.word_tokenize
nltk.download('averaged_perceptron_tagger_eng')  # model used by nltk.pos_tag
nltk.download('wordnet')  # corpus behind nltk.corpus.wordnet

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [120]:
# Show the kernel's working directory; the CSV paths are built from it.
current_dir = os.getcwd()
print(current_dir)

c:\Users\willi\Desktop\Uni\3rd_year\NLU\comp38412-not-broken\src\notebooks


In [121]:
# Read both splits; the relative paths climb two directory levels from the
# current working directory into the "data" folder.
working_dir = Path(os.getcwd())
train_df = pd.read_csv(working_dir / "../../data/train.csv")
test_df = pd.read_csv(working_dir / "../../data/test.csv")

In [122]:
# Write the "Claim" column of each split to its own CSV file (index omitted).
for split_df, out_name in ((train_df, "train_claims.csv"), (test_df, "test_claims.csv")):
    temp = split_df["Claim"]
    temp.to_csv(out_name, index=False)


In [123]:
# Pull the claim column of each split into a plain Python list.
train_claims = train_df["Claim"].to_list()
test_claims = test_df["Claim"].to_list()

n_train_claims, n_test_claims = len(train_claims), len(test_claims)
print(f"Number of claims in train set: {n_train_claims}")
print(f"Number of claims in test set: {n_test_claims}")
print(f"Total number of claims: {n_train_claims + n_test_claims}")

# Distinct claim texts that occur in both the train and the test split.
overlapping_claims = set(train_claims).intersection(test_claims)
print(f"Number of overlapping claims: {len(overlapping_claims)}")

Number of claims in train set: 21508
Number of claims in test set: 4688
Total number of claims: 26196
Number of overlapping claims: 115


In [124]:
def _get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag yields None.
    """
    # Attribute names are resolved lazily so `wordnet` is only touched when a
    # prefix actually matches.
    attr_by_prefix = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    for prefix, attr_name in attr_by_prefix.items():
        if tag.startswith(prefix):
            return getattr(wordnet, attr_name)
    return None

In [125]:
# Count the train claims that contain a "/" (the "_or" in the names reflects
# that the slash is later replaced by the word "or").
train_claims_with_or = sum(1 for claim in train_claims if "/" in claim)

# For the test split, additionally POS-tag every slash claim after replacing
# each "/" with " or ", and keep the result next to the original text.
test_claims_with_or = 0
ord_test_claims: list[dict] = []

for claim in test_claims:
    if "/" not in claim:
        continue
    test_claims_with_or += 1
    tokens = nltk.word_tokenize(claim.replace("/", " or "))
    # dict() keeps a single tag per distinct token: if a token occurs more
    # than once in the claim, the tag of its last occurrence wins.
    pos_claim = dict(nltk.pos_tag(tokens))
    ord_test_claims.append({"original_claim": claim, "pos_claim": pos_claim})

print(f"Number of claims with / in train set: {train_claims_with_or}")
print(f"Number of claims with / in test set: {test_claims_with_or}")


Number of claims with / in train set: 1
Number of claims with / in test set: 478


In [126]:
claim_structure: list[dict] = []


def _describe_slash_claim(original_claim: str, pos_tags: dict):
    """Collect the slash-separated alternatives of a claim with their POS tags.

    Args:
        original_claim: Claim text containing at least one "/".
        pos_tags: Mapping from token to POS tag for this claim.

    Returns:
        A dict with the keys "original_claim", "words_around_or" (one
        {"word", "pos"} entry per alternative) and "word_before_or" (the word
        directly preceding the first alternative, with its tag). Returns None
        when the claim cannot be described: fewer than two words precede the
        first "/", or a required word has no entry in ``pos_tags``.
    """
    segments = [segment.strip().split(" ") for segment in original_claim.split("/")]
    before_words = segments[0]
    if len(before_words) < 2:
        # Nothing precedes the first alternative (e.g. "cats/dogs are pets"),
        # so there is no "word before" to report; indexing [-2] would fail.
        return None

    word_before_or = before_words[-2]
    # The first alternative is the last word before the first "/"; every
    # further alternative is the first word that follows a "/".
    words_around_or = [before_words[-1]] + [words[0] for words in segments[1:]]

    try:
        return {
            "original_claim": original_claim,
            "words_around_or": [{"word": word, "pos": pos_tags[word]} for word in words_around_or],
            "word_before_or": {
                "word": word_before_or,
                "pos": pos_tags[word_before_or]
            }
        }
    except KeyError:
        # A word taken from the whitespace split is not a key of the token
        # mapping (e.g. it still carries punctuation, or it is empty because
        # of "//"); such claims are skipped rather than aborting the cell.
        return None


for claim_dict in ord_test_claims:
    original_claim = claim_dict["original_claim"]

    # Only claims that split into two, three or four parts are analysed;
    # anything else is printed for manual inspection and left out.
    if not 2 <= len(original_claim.split("/")) <= 4:
        print(original_claim)
        continue

    entry = _describe_slash_claim(original_claim, claim_dict["pos_claim"])
    if entry is not None:
        claim_structure.append(entry)

In [127]:
# Preview only the first few records; printing the whole list floods the
# notebook output with one line containing every record.
print(claim_structure[:5])

[{'original_claim': 'People do not have the right to self-harm/harm others', 'words_around_or': [{'word': 'self-harm', 'pos': 'NN'}, {'word': 'harm', 'pos': 'NN'}], 'word_before_or': {'word': 'to', 'pos': 'TO'}}, {'original_claim': 'People do not have the right to self-harm/harm others', 'words_around_or': [{'word': 'self-harm', 'pos': 'NN'}, {'word': 'harm', 'pos': 'NN'}], 'word_before_or': {'word': 'to', 'pos': 'TO'}}, {'original_claim': 'People do not have the right to self-harm/harm others', 'words_around_or': [{'word': 'self-harm', 'pos': 'NN'}, {'word': 'harm', 'pos': 'NN'}], 'word_before_or': {'word': 'to', 'pos': 'TO'}}, {'original_claim': 'People do not have the right to self-harm/harm others', 'words_around_or': [{'word': 'self-harm', 'pos': 'NN'}, {'word': 'harm', 'pos': 'NN'}], 'word_before_or': {'word': 'to', 'pos': 'TO'}}, {'original_claim': 'People do not have the right to self-harm/harm others', 'words_around_or': [{'word': 'self-harm', 'pos': 'NN'}, {'word': 'harm', 'p

In [138]:
# Tally (a) the claims whose slash-separated words did not all receive the
# same POS tag and (b) how often each POS tag occurs for the word that
# precedes the alternatives.
non_matching_pos_tags = 0
word_before_or_pos_counts = {}

for entry in claim_structure:
    distinct_tags = {item["pos"] for item in entry["words_around_or"]}
    if len(distinct_tags) != 1:
        non_matching_pos_tags += 1

    preceding_tag = entry["word_before_or"]["pos"]
    word_before_or_pos_counts[preceding_tag] = word_before_or_pos_counts.get(preceding_tag, 0) + 1

print(f"Number of claims with '/' in with words that have non-matching pos tags: {non_matching_pos_tags} out of {len(claim_structure)}")
# Most frequent preceding tag first.
print(sorted(word_before_or_pos_counts.items(), key=lambda pair: pair[1], reverse=True))


Number of claims with '/' in with words that have non-matching pos tags: 190 out of 471
[('VBZ', 98), ('JJ', 71), ('IN', 69), ('DT', 50), ('VB', 36), ('VBP', 33), ('TO', 24), ('VBG', 24), ('NNS', 23), ('RB', 17), ('NN', 15), ('RBR', 6), ('NNP', 5)]


In [130]:
# Pull the evidence column of each split into a plain Python list.
train_evidence = train_df["Evidence"].to_list()
test_evidence = test_df["Evidence"].to_list()

n_train_evidence, n_test_evidence = len(train_evidence), len(test_evidence)
print(f"Number of evidence in train set: {n_train_evidence}")
print(f"Number of evidence in test set: {n_test_evidence}")
print(f"Total number of evidence: {n_train_evidence + n_test_evidence}")

# Distinct evidence texts that occur in both the train and the test split.
overlapping_evidence = set(train_evidence).intersection(test_evidence)
print(f"Number of overlapping evidence: {len(overlapping_evidence)}")

Number of evidence in train set: 21508
Number of evidence in test set: 4688
Total number of evidence: 26196
Number of overlapping evidence: 710
