In [1]:
import pandas as pd
from nltk import word_tokenize, pos_tag, sent_tokenize
from nltk.tokenize import TweetTokenizer
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from openie import StanfordOpenIE
from collections import Counter, OrderedDict, defaultdict
from nltk.corpus import stopwords
import re
import itertools as it
import language_check
import pickle
from spacy import displacy
import en_core_web_sm
import gender_guesser.detector as gender

In [2]:
# Shared NLP resources used by the helper functions in this notebook.
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
tool = language_check.LanguageTool('en-US')
# Load the spaCy pipeline exactly once. Calling spacy.load("en_core_web_sm") and
# then en_core_web_sm.load() builds the same pipeline twice and discards the first.
nlp = en_core_web_sm.load()
# Gender detector; read as the global `d` when choosing a possessive pronoun.
d = gender.Detector()
tknzr = TweetTokenizer()

From title_dep.py

In [3]:
# Idiom dictionary; its 'idiom' and 'example' columns are read when collecting
# example sentences for each candidate.
idioms = pd.read_csv('idioms_def.csv')
articles = pd.read_csv('nyt_150_.csv')
# `dependencies` is indexed by row position in run(), and each entry is unpacked
# as 4-tuples (word, word, dependency label, score) in replace_pattern().
# NOTE(review): pickle.load can execute arbitrary code from the file — only load
# a pickle that was produced by a trusted step of this project.
with open('nyt_words.pickle', 'rb') as f:
    dependencies = pickle.load(f)

In [4]:
# nyt_bert_lead_verb = pd.read_csv('final/nyt_bert_lead_verb_.csv')
# nyt_bert_lead = pd.read_csv('final/nyt_bert_lead_.csv')
# nyt_bert_title_verb = pd.read_csv('final/nyt_bert_title_verb_.csv')
# nyt_bert_title = pd.read_csv('final/nyt_bert_title_.csv')
# nyt_use_lead_verb = pd.read_csv('final/nyt_use_lead_verb_.csv')
# nyt_use_lead = pd.read_csv('final/nyt_use_lead_.csv')
# nyt_use_title_verb = pd.read_csv('final/nyt_use_title_verb_.csv')
# nyt_use_title = pd.read_csv('final/nyt_use_title_.csv')

In [4]:
# Candidate tables read from the 'final' directory. The rows of `verb` are the
# ones iterated when example sentences are collected for each candidate.
bert = pd.read_csv('final/nyt_bert.csv')
verb = pd.read_csv('final/nyt_bert_verb.csv')

### Get idioms with examples from https://idioms.thefreedictionary.com

In [5]:
# Attach example sentences to every candidate row of `verb`.
# Examples come from the local `idioms` table when it has any for the candidate;
# otherwise they are scraped from the idiom's page on idioms.thefreedictionary.com.
idx = []      # NOTE(review): never filled in this cell — presumably a leftover; confirm before removing
bad_idx = []  # NOTE(review): never filled in this cell — presumably a leftover; confirm before removing
records = []
# The progress total must match the frame that is actually iterated (`verb`).
for i, item in tqdm(verb.iterrows(), total=len(verb)):
    example = list(idioms[idioms['idiom'] == item['candidate']]['example'].dropna())

    if len(example) == 0:
        # URL slug: lower-case, hyphens and spaces become '+', apostrophes are percent-encoded.
        name = item['candidate'].lower().replace('-', ' ').replace(' ', '+').replace("'", '%27')
        link = f"https://idioms.thefreedictionary.com/{name}"
        # A timeout keeps a single stalled connection from hanging the whole loop.
        page_html = requests.get(link, timeout=30).content
        soup = BeautifulSoup(page_html, 'html.parser')
        # Example sentences are the <span class="illustration"> elements of the page.
        example = [ex.text for ex in soup.find_all('span', 'illustration')]

    records.append({'article': item['article'], 'lead': item['lead'], 'content': item['content'],
                    'candidate': item['candidate'], 'pos': item['pos'], 'definition': item['definition'],
                    'example': example, 'score': item['score'], 'type': item['type']})
# Build the frame once from the accumulated records.
with_example = pd.DataFrame(records)

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))




## Modification

In [6]:
def title_chunks(title):
    """
    Describe every spaCy noun chunk found in `title`.

    Returns a list with one tuple per noun chunk:
    (chunk text, text of the chunk root's head token, dependency label of the chunk root).
    """
    return [
        (np_chunk.text, np_chunk.root.head.text, np_chunk.root.dep_)
        for np_chunk in nlp(title).noun_chunks
    ]

In [7]:
def get_lemma(word):
    """
    Lemmatize a single word tagged in isolation.

    The word is lemmatized as a verb when its POS tag contains 'V'. Otherwise the
    placeholder "one's" is returned unchanged, and any other word gets the
    lemmatizer's default treatment.
    """
    tag = pos_tag([word])[0][1]
    if 'V' in tag:
        return lemmatizer.lemmatize(word, 'v')
    if word == "one's":
        return "one's"
    return lemmatizer.lemmatize(word)

In [8]:
def get_lemma_sentence(word, sentence):
    """
    Lemmatize `word` using the POS tag it receives inside `sentence`.

    When the word occurs several times, the tag of its last occurrence is used;
    when it does not occur at all, the default (non-verb) lemmatization applies.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    tags_for_word = [tag for token, tag in tagged_tokens if token == word]
    last_tag = tags_for_word[-1] if tags_for_word else ''
    if 'V' in last_tag:
        return lemmatizer.lemmatize(word, 'v')
    return lemmatizer.lemmatize(word)

In [9]:
def is_popular(word, sentence):
    """
    Check whether `word` names a well-known person.

    Returns True when spaCy labels an entity of `sentence` whose text equals
    `word` as PERSON (if several entities match, the last one decides).
    Otherwise the people-search site is queried and True is returned when the
    result page contains at least one element with id 'item_Name'.

    Raises requests.RequestException when the lookup fails or times out.
    """
    label = ''
    for ent in nlp(sentence).ents:
        if ent.text == word:
            label = ent.label_
    if label == 'PERSON':
        return True

    search_url = 'https://people3.azurewebsites.net/People/Search'
    # Pass the term through `params` so it is URL-encoded; concatenating it into
    # the URL corrupts the query for names containing '&', '#', '+' and similar.
    # The timeout keeps a stalled connection from hanging the caller.
    response = requests.get(search_url, params={'SearchString': word}, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    return len(soup.find_all(id='item_Name')) > 0

### Pattern

In [10]:
def get_pattern_verb(idiom, text):
    """
    get pattern for modification where idiom is verb

    idiom: the idiom string; the placeholder "one's" is kept as a single unit.
    text:  list of example sentences; they are joined with spaces and parsed together.

    Returns (replace, order):
      replace - list of (chunk text, head text, dependency label, position) for
                every noun chunk whose text or head lemma belongs to the idiom
                and whose dependency label qualifies (see below);
      order   - dict mapping each idiom lemma to its index in the idiom.
    """
    # The tokenizer yields "one's" as 'one' followed by "'s"; glue them back.
    tokens = []
    pending_one = False
    for item in word_tokenize(idiom):
        if pending_one:
            pending_one = False
            if item == "'s":
                tokens.append("one's")
                continue
            # A bare 'one' (not followed by "'s") is an ordinary idiom word: keep it.
            tokens.append('one')
        if item == 'one':
            pending_one = True
        else:
            tokens.append(item)
    if pending_one:
        # Idiom ended on a bare 'one'.
        tokens.append('one')

    lemmas = [get_lemma(word) for word in tokens]
    has_preposition = any(tag == 'IN' for _, tag in pos_tag(idiom.split()))
    order = {key: i for i, key in enumerate(lemmas)}
    subject_object_deps = ('nsubj', 'dobj')
    doc = nlp(' '.join(text))
    replace = []
    for chunk in doc.noun_chunks:
        chunk_dep = chunk.root.dep_
        # Substring match, so labels such as 'nsubjpass' qualify as well.
        dependency = any(label in chunk_dep for label in subject_object_deps)
        # Objects of a preposition count only when the idiom contains a preposition.
        # This must test the chunk's own label: comparing the loop variable over
        # ('nsubj', 'dobj') against 'pobj' can never be true.
        if has_preposition and chunk_dep == 'pobj':
            dependency = True
        if not dependency:
            continue

        related_lemmas = (get_lemma(chunk.text), get_lemma(chunk.root.head.text))
        positions = [order[lemma] for lemma in related_lemmas if lemma in order]
        if not positions:
            # Neither the chunk nor its head belongs to the idiom.
            continue
        replace.append((chunk.text, chunk.root.head.text, chunk_dep, min(positions)))
    return replace, order

In [11]:
def get_pattern_adj(idiom, text):
    """
    get pattern for modification where idiom is adj

    idiom: the idiom string.
    text:  list of example sentences; they are joined with spaces and parsed together.

    Returns (replace, order):
      replace - list of (head text, matched idiom lemma, dependency label, position)
                for every 'amod' token whose text is a substring of an idiom lemma;
      order   - dict mapping each idiom lemma to its index in the idiom.
    """
    lemmas = [get_lemma(word) for word in word_tokenize(idiom)]
    order = {key: i for i, key in enumerate(lemmas)}
    doc = nlp(' '.join(text))
    replace = []
    for token in doc:
        if token.dep_ != 'amod':
            continue
        # Substring match: the token only has to appear inside an idiom lemma.
        matches = [k for k in lemmas if token.text in k]
        if not matches:
            continue
        # When several lemmas contain the token, the last one is used.
        token_k = matches[-1]

        positions = [order[token_k]]
        head_lemma = get_lemma(token.head.text)
        if head_lemma in order:
            positions.append(order[head_lemma])
        replace.append((token.head.text, token_k, token.dep_, min(positions)))
    return replace, order

In [31]:
def _possessive_for(subj_name, article):
    """Choose the possessive that replaces "one's" for the given subject."""
    # An empty subject cannot be looked up, and split()[0] below would fail on it.
    if not subj_name.strip() or not is_popular(subj_name, article):
        return 'its'
    guessed = d.get_gender(subj_name.split()[0])
    # 'female' has to be tested first because 'male' is a substring of it.
    if 'female' in guessed:
        return 'her'
    if 'male' in guessed:
        return 'his'
    # The guess contains neither label: use a neutral pronoun so that no
    # unassigned (or stale) value is substituted.
    return 'their'


def replace_pattern(pattern, dependencies, order, idx):
    """
    Build modified versions of the idiom in row `idx` of `with_example`.

    pattern:      tuples from get_pattern_verb / get_pattern_adj; item[1] is the
                  idiom-side word, item[2] a dependency label, item[3] a position.
    dependencies: iterable of 4-tuples unpacked as (subj, obj, dep, score).
    order:        dict mapping idiom lemma -> position in the idiom.
    idx:          row label in the global `with_example` frame.

    Returns a list of (sentence, total score) tuples, deduplicated, excluding
    any sentence equal to the original candidate idiom.
    """
    # One (initially empty) slot per idiom-side lemma found in the pattern.
    final = {get_lemma(item[1]): [] for item in pattern}
    new = {}
    # Re-key the slots by the lemma each word gets inside the example sentences.
    for sent in with_example['example'][idx]:
        for word in word_tokenize(sent):
            if word in final:
                k = get_lemma_sentence(word, sent)
                try:
                    new[k] += final[word]
                except KeyError:
                    # NOTE: the slot list is shared, not copied.
                    new[k] = final[word]
    final = new
    # Idiom lemmas without a slot keep their own word via a zero-score placeholder.
    not_in = {item: [(item, '', i, 'None', 0)] for item, i in order.items() if item not in final}
    final = {**final, **not_in}

    subject_patterns = [item for item in pattern if re.search('nsubj', item[2])]
    object_patterns = [item for item in pattern if re.search('obj', item[2])]

    subject = False
    for subj, obj, dep, score in dependencies:
        if re.search('nsubj', dep) or dep == 'ROOT':
            subject = True
            try:
                for p in subject_patterns:
                    final[get_lemma(p[1])].append((subj, p[1], p[3], dep, score))
            except KeyError:
                # No slot for this lemma; the remaining patterns for this
                # dependency are skipped as well.
                pass
        elif re.search('obj', dep):
            try:
                for p in object_patterns:
                    final[get_lemma(p[1])].append((p[1], subj, p[3], dep, score))
            except KeyError:
                pass

    if not subject:
        # No subject-like dependency at all: place every dependency word after
        # the idiom word of the subject patterns instead.
        for subj, obj, dep, score in dependencies:
            try:
                for p in subject_patterns:
                    final[get_lemma(p[1])].append((p[1], subj, p[3], dep, score))
            except KeyError:
                pass

    # NOTE(review): a slot that is still empty here makes the product below
    # empty, so no modification is produced. A placeholder mapping for such
    # slots used to be built at this point but was never used; confirm whether
    # placeholders should be substituted before changing results.
    all_names = sorted(final)
    combinations = it.product(*(final[name] for name in all_names))
    candidates = []
    for item in combinations:
        ordered = sorted(item, key=lambda x: x[2])
        if not ordered:
            # No slots at all: nothing to assemble.
            continue
        subj_exist = any(re.search('nsubj', x[3]) or x[3] == 'ROOT' for x in ordered)
        subj_name = ordered[0][0] if subj_exist else ordered[0][1]

        words = []
        scores = 0
        for w1, w2, _, _, s in ordered:
            words.extend([w1, w2] if subj_exist else [w2, w1])
            scores += s
        final_sent = re.sub(r'\s+', ' ', ' '.join(words)).strip()
        if "one's" in final_sent:
            pronoun = _possessive_for(subj_name, with_example['article'][idx])
            final_sent = final_sent.replace("one's", pronoun)
        candidates.append((final_sent, scores))

    original = with_example['candidate'][idx]
    return [cand for cand in set(candidates) if cand[0] != original]

## Run

In [30]:
def run(idx):
    """
    Generate modifications for row `idx` of `with_example`.

    Verb candidates go through get_pattern_verb + replace_pattern; Adjective
    candidates go through get_pattern_adj + replace_pattern_adj. Any other
    part of speech yields None.

    NOTE(review): replace_pattern_adj is not defined in the cells shown here —
    confirm it exists before running the Adjective branch.
    """
    pos = with_example['pos'][idx]
    if pos == 'Verb':
        candidate = with_example['candidate'][idx]
        examples = with_example['example'][idx]
        pattern, order = get_pattern_verb(candidate, examples)
        return replace_pattern(pattern, dependencies[idx], order, idx)
    if pos == 'Adjective':
        candidate = with_example['candidate'][idx]
        examples = with_example['example'][idx]
        pattern, order = get_pattern_adj(candidate, examples)
        return replace_pattern_adj(pattern, dependencies[idx], order, idx)
    return None

In [37]:
# For every row, keep the non-zero-score modifications, best score first.
mods = []
for i in tqdm(range(len(with_example))):
    # run() returns None for parts of speech other than Verb/Adjective;
    # treat that as "no modifications" instead of failing on iteration.
    candidates = run(i) or []
    scored = [cand for cand in candidates if cand[1] != 0]
    mods.append(sorted(scored, key=lambda cand: cand[1], reverse=True))

HBox(children=(FloatProgress(value=0.0, max=150.0), HTML(value='')))




In [244]:
# Spot-check: display the modifications generated for row 2.
run(2)

{'inch-perfect': []}
Everything Is nsubj
I Done nsubj
Russia of pobj
Inquiry of pobj
{'inch-perfect': [('Everything', 'inch-perfect', 0, 'nsubj', 1), ('I', 'inch-perfect', 0, 'nsubj', 0), ('inch-perfect', 'Russia', 0, 'pobj', 3), ('inch-perfect', 'Inquiry', 0, 'pobj', 1)]}


[('I inch-perfects', 0),
 ('Russia inch-perfect', 3),
 ('Everythings inch-perfect', 1),
 ('Inquiry inch-perfect', 1)]

In [43]:
# Export the rows that have a modification to CSV.
# NOTE(review): `df` and its 'modification' column are not created in the cells
# shown here — presumably built from `with_example` and `mods`; confirm.
df.dropna(subset=['modification'])[['article', 'lead', 'content', 'candidate', 'pos', 'definition', 'modification']].to_csv('nyt_150_mod_2.csv')