PMI(w, l) = log( P(w | l) / P(w) ), where P(w | l) is the probability that word w appears in the set of sentences of grade level l, and P(w) is the probability of word w in the entire training corpus.

PPMI(w, l) = max(PMI(w, l), 0).

Words with negative PMI scores are negatively correlated with l, which means that w tends to appear across different sentence levels.

In [None]:
def read_file(filename):
    """Return the lines of ``filename`` as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped
    from every line; empty lines are kept as empty strings.
    """
    with open(filename) as handle:
        return [raw_line.strip() for raw_line in handle]

In [None]:
from collections import Counter
from nltk.tokenize import word_tokenize

def get_vocab(y, moses=False):
    """Count token frequencies over an iterable of text lines.

    Each line in ``y`` is split on single spaces, so consecutive spaces
    produce empty-string tokens that are counted as well.

    ``moses`` is accepted but not used by this function.

    Returns a ``Counter`` mapping token -> number of occurrences.
    """
    return Counter(token for line in y for token in line.split(" "))

In [None]:
# Placeholder to be filled in before running the notebook. The following cell
# iterates over this value and passes each item to read_file as a file name.
grade_files = "" # file mapping grade to articles

In [None]:
# Collect the sentences of every grade level (2 through 10) and build one
# vocabulary counter per grade.
grade_levels = range(2, 11)
all_grade_data = {grade: [] for grade in grade_levels}
for grade in grade_levels:
    if isinstance(grade_files, dict):
        # grade -> file name(s): read only the files that belong to this grade.
        # Without this, every grade would receive the same sentences and the
        # per-grade vocabularies would be identical.
        files = grade_files.get(grade, [])
        if isinstance(files, str):
            files = [files]  # a single file name, not an iterable of names
    else:
        # Any other iterable is treated as before: every file for every grade.
        files = grade_files
    for filename in files:
        all_grade_data[grade].extend(read_file(filename))

# grade -> Counter of token frequencies in that grade's sentences.
grade_vocab = {
    grade: get_vocab(all_grade_data[grade]) for grade in grade_levels
}

In [None]:
# Vocabulary (token -> count) over all articles, built with get_vocab.
# NOTE(review): this pools the sentences held in all_grade_data; if the corpus
# contains articles outside those grade levels, add them here — confirm.
all_vocab = get_vocab(
    [line for grade_lines in all_grade_data.values() for line in grade_lines]
)

In [None]:
import math

# PMI(w, l) = log( P(w | l) / P(w) ) with
#   P(w | l) = count of w in grade l / total token count of grade l
#   P(w)     = count of w in the corpus / total token count of the corpus
PMI = {}

# Totals do not depend on the word, so compute them once up front.
corpus_total = sum(all_vocab.values())
grade_totals = {
    grade: sum(grade_vocab[grade].values()) for grade in range(2, 11)
}

for word, corpus_count in all_vocab.items():
    p_word = corpus_count / corpus_total
    for grade in range(2, 11):
        grade_count = grade_vocab[grade].get(word, 0)
        if grade_count == 0:
            # log(0) is undefined; leave the pair out of the table. Lookups
            # elsewhere in this file test `(word, grade) in PMI` first.
            continue
        p_word_given_grade = grade_count / grade_totals[grade]
        PMI[(word, grade)] = math.log(p_word_given_grade / p_word)

In [None]:
import pickle  # no earlier import of pickle is visible in this file

# Persist the PMI table and the per-grade vocabularies. The context managers
# guarantee each file is flushed and closed once it has been written.
with open("data/PMI_nltk.pkl", "wb") as f:
    pickle.dump(PMI, f)
with open("data/GradeVocab_nltk.pkl", "wb") as f:
    pickle.dump(grade_vocab, f)

In [None]:
import pickle
# Reload the stored PMI table and per-grade vocabularies, closing each file
# after reading.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by a trusted run of this notebook.
with open("data/PMI_nltk.pkl", "rb") as f:
    PMI = pickle.load(f)
with open("data/GradeVocab_nltk.pkl", "rb") as f:
    grade_vocab = pickle.load(f)

In [None]:
def get_positive_PMI(grade):
    """Return the words of ``grade_vocab[grade]`` with a strictly positive PMI.

    Reads the module-level ``grade_vocab`` and ``PMI`` tables. Raises
    ``KeyError`` if ``grade`` is not in ``grade_vocab`` or if a
    ``(word, grade)`` pair is missing from ``PMI``.
    """
    return [
        candidate
        for candidate in grade_vocab[grade]
        if PMI[(candidate, grade)] > 0
    ]

In [None]:
# Words with a positive PMI for grade 12.
# NOTE(review): the vocabulary-building cell in this file only covers grades
# 2-10, so grade 12 must come from the pickled tables — confirm it is present.
words = get_positive_PMI(12)

In [None]:
# Inspect a single word: its PMI score for every grade from 2 to 12 that has
# an entry in the table (grades without an entry are skipped).
word = "washington"
[PMI[key] for key in ((word, grade) for grade in range(2, 13)) if key in PMI]

In [None]:
def read_file(filename, sep="~"):
    """Read ``filename`` and split every line into fields.

    Each line is stripped of surrounding whitespace first and then split on
    ``sep``; the result is one list of fields per line.
    """
    with open(filename) as handle:
        return [raw_line.strip().split(sep) for raw_line in handle]

In [None]:
# Grade number (as a string, '2' through '12') -> grade tag such as '<TWO>'.
token = {
    str(grade): "<" + name + ">"
    for grade, name in enumerate(
        (
            "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN",
            "EIGHT", "NINE", "TEN", "ELEVEN", "TWELVE",
        ),
        start=2,
    )
}
# Reverse lookup: grade tag -> grade as an int.
inv_map = {tag: int(grade) for grade, tag in token.items()}

In [None]:
# Load the evaluation inputs. read_file returns one list of fields per line,
# split on "~" by default or on the separator given as second argument.
dev_src = read_file("../data/dev.src.nograde")
dev_tgt = read_file("../data/dev.tgt")
# Tab-separated grade file; columns 0 and 1 are decoded with inv_map below.
grades =  read_file("../data/dev.src-tgt.grade", "\t")
# Split on two spaces; columns 0 and 1 are used below as BPE strings.
# NOTE(review): this path says "test" while the other inputs are "dev" files —
# confirm the rows line up with dev_src / dev_tgt.
oracle_const = read_file("../experiments/exp-1/data/test.const.src.oracle.del", "  ")

In [None]:
# Integer grade levels decoded from the grade tags: column 0 feeds
# source_grade, column 1 feeds target_grade.
source_grade = [inv_map[row[0]] for row in grades]
target_grade = [inv_map[row[1]] for row in grades]

# Lowercased first field of every source / target line.
src_text = [fields[0].lower() for fields in dev_src]
tgt_text = [fields[0].lower() for fields in dev_tgt]

# Lowercased columns of the oracle-constraint file: column 0 feeds src_bpes,
# column 1 feeds oracle_bpes.
src_bpes = [columns[0].lower() for columns in oracle_const]
oracle_bpes = [columns[1].lower() for columns in oracle_const]

In [None]:
from tqdm import tqdm

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()


def flatten(t):
    """Concatenate the sub-lists of ``t`` into one flat list."""
    return [item for sublist in t for item in sublist]


# Whitespace-split tokens of the named entities found in each source sentence:
# all_entities holds them lowercased, all_entities_caps in original casing.
all_entities = []
all_entities_caps = []
for row in tqdm(dev_src):
    # Run the pipeline once per sentence and reuse the result for both
    # variants; parsing the same text twice doubled the run time.
    sentence_ents = nlp(row[0]).ents
    entities = flatten([ent.text.lower().split() for ent in sentence_ents])
    all_entities.append(entities)
    all_entities_caps.append(
        flatten([ent.text.split() for ent in sentence_ents])
    )

In [None]:
# Store both entity lists (lowercased and original casing) in one pickle file,
# presumably so the spaCy pass does not have to be repeated.
# NOTE(review): the file is named "test_ents" although the entities in this
# file are extracted from dev_src — confirm the intended split.
with open("data/test_ents.pkl","wb") as f:
    pickle.dump([all_entities, all_entities_caps], f)

In [None]:
# Reload the two entity lists in the order they were written:
# lowercased tokens first, original-case tokens second.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("data/test_ents.pkl","rb") as f:
    all_entities, all_entities_caps = pickle.load(f)

In [None]:
import sys
# Make the project-local readability helpers importable.
sys.path.append("../readability/")
from compute_grade_stats import get_text_grade_score, clip_value
from SARI import SARIsent
import numpy as np
# NOTE(review): this module-level flag is not read by any code visible in this
# file; get_grade takes its own clip_val parameter (default True).
clip_val=False
import sacrebleu

def get_sari(src, tgt, out):
    """Return element 0 of ``SARIsent`` for one sentence triple.

    All three strings are stripped of surrounding whitespace. ``SARIsent`` is
    called with the source, the system output and a one-element reference
    list holding ``tgt``, in that order.
    """
    source = src.strip()
    output = out.strip()
    references = [tgt.strip()]
    return SARIsent(source, output, references)[0]

def get_bleu(tgt, out):
    """Return the ``score`` of ``sacrebleu.sentence_bleu`` for one sentence.

    ``out`` is passed as the hypothesis and ``tgt`` as the only reference.
    """
    result = sacrebleu.sentence_bleu(out, [tgt])
    return result.score

def get_grade(text, clip_val=True, grade_type="ARI"):
    """Return ``get_text_grade_score(text, clip_val, grade_type)``.

    Thin wrapper that supplies defaults: ``clip_val=True`` and
    ``grade_type="ARI"``.
    """
    grade_score = get_text_grade_score(text, clip_val, grade_type)
    return grade_score

In [None]:
from nltk.tokenize import word_tokenize
import string

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

words = []
pos_const = []
neg_const = []            # per sentence: words selected as negative constraints
overlap_score_pos = []
overlap_score_neg = []    # per sentence: (prec_score, recall_score)
include_ent = True        # exclude named-entity tokens from the constraints
include_stop_words = True  # exclude stop words from the constraints
include_punct=True
neg_new_source = []       # per sentence: source tokens minus the constraints
count = 0

# The stop-word list is the same for every sentence; a set gives O(1) lookups.
stop_word_set = set(stop_words)

for i, sentence in enumerate(tqdm(src_text)):
    tg = target_grade[i]
    # The source grade is estimated from the text (ARI) rather than read from
    # source_grade.
    sg = int(get_grade(sentence, clip_val=True, grade_type="ARI"))

    # Tokenize once and reuse the tokens below.
    tgt_words = word_tokenize(tgt_text[i])
    src_words = word_tokenize(sentence)

    # Oracle: source tokens that also occur in the target sentence.
    tgt_word_set = set(tgt_words)
    pos_oracle = [x for x in src_words if x in tgt_word_set]

    # Candidates: positive PMI at the source grade, negative PMI at the target
    # grade. Pairs without a PMI entry are ignored.
    neg_words = [
        word
        for word in src_words
        if (word, sg) in PMI
        and (word, tg) in PMI
        and PMI[(word, sg)] > 0
        and PMI[(word, tg)] < 0
    ]

    set_exclude = set()
    if include_ent:
        set_exclude.update(all_entities[i])
    if include_stop_words:
        set_exclude |= stop_word_set

    neg_words = [x for x in neg_words if x not in set_exclude]
    neg_const.append(neg_words)

    new_pos = [x for x in src_words if x not in neg_words]
    # The unmodified filtered list is what gets written out later, even when
    # it is empty; the fallback below only affects the scores.
    neg_new_source.append(new_pos)
    if len(new_pos) == 0:
        new_pos = src_words

    oracle_set = set(pos_oracle)
    kept_set = set(new_pos)
    shared = len(oracle_set & kept_set)
    # Score 0 for an empty denominator instead of raising ZeroDivisionError
    # (e.g. when source and target share no token); this matches the
    # `rec_score = 0` fallback used by the BPE overlap cell in this file.
    # NOTE(review): prec_score divides by the oracle size and recall_score by
    # the kept-word count, the reverse of the usual definitions — confirm the
    # labels are intended.
    prec_score = shared / len(oracle_set) if oracle_set else 0
    recall_score = shared / len(kept_set) if kept_set else 0
    overlap_score_neg.append((prec_score, recall_score))

In [None]:
def print_prec_rec(scores):
    """Print the mean of each component of a list of score pairs.

    ``scores`` is a sequence whose items hold the precision at index 0 and the
    recall at index 1. Raises ``ZeroDivisionError`` when ``scores`` is empty.
    """
    n_scores = len(scores)
    mean_precision = sum(pair[0] for pair in scores) / n_scores
    mean_recall = sum(pair[1] for pair in scores) / n_scores
    print("Precision: ", mean_precision)
    print("Recall: ", mean_recall)

In [None]:
# Print the mean precision and recall over the per-sentence pairs collected in
# overlap_score_neg.
print_prec_rec(overlap_score_neg)

In [None]:
import pickle
# Write one line per sentence: the tokens of neg_new_source joined by spaces.
with open("neg_const_test_ar.txt","w") as f:
    for words in neg_new_source:
        print(" ".join(words), file=f)

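Here, run the following in bash to generate the BPE files: `python $bpe_scripts_path/apply_bpe.py --codes ../experiments/exp-1/data/bpe < neg_const_dev_recall.txt > neg_const_dev_recall.bpe` (substitute the file names used in the surrounding cells, e.g. neg_const_test_ar.txt and neg_const_test_ar.bpe).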

In [None]:
# Read the BPE-segmented file: one list of whitespace-separated tokens per line.
with open("neg_const_test_ar.bpe") as f:
    neg_const_bpe = [line.split() for line in f]

In [None]:
# Per-sentence overlap between the oracle BPE tokens and the source BPE tokens
# that do not occur in the corresponding neg_const_bpe line.
overlap_score = []
for i, bpe_line in enumerate(neg_const_bpe):
    pos_oracle = oracle_bpes[i].split(" ")
    # The first source token is skipped.
    # NOTE(review): presumably a leading grade tag — confirm.
    pos_words = [x for x in src_bpes[i].split(" ")[1:] if x not in bpe_line]
    oracle_set = set(pos_oracle)
    pos_set = set(pos_words)
    shared = len(oracle_set.intersection(pos_set))
    prec_score = shared / len(oracle_set)
    # An empty pos_set would divide by zero; score it as 0 instead.
    rec_score = shared / len(pos_set) if len(pos_set) > 0 else 0
    overlap_score.append((prec_score, rec_score))

In [None]:
# Split the score pairs into two parallel lists: first components in prec,
# second components in rec.
prec = [pair[0] for pair in overlap_score]
rec = [pair[1] for pair in overlap_score]

In [None]:
# Mean of the per-sentence precision scores.
sum(prec)/len(prec)

In [None]:
# Mean of the per-sentence recall scores.
# includes more words than required
sum(rec)/len(rec)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the per-sentence precision scores, using matplotlib's defaults.
plt.hist(prec)