# Set globals

In [1]:
# Root folders on the mounted Google Drive.
path_extractions = '/content/drive/My Drive/WIKING/extractions'
path_git = '/content/drive/My Drive/WIKING/WIKING_git'
path_my_tools = '/content/drive/My Drive/my_tools'

# Input files.
# The Wikipedia revision dumps (CRISPR_*) are not part of the repository; get the
# latest file versions here: https://files.webis.de/wikipedia-tracing-innovations/
file_CRISPR_de = '{}/wikipedia_entries/CRISPR_de'.format(path_extractions)
file_CRISPR_en = '{}/wikipedia_entries/CRISPR_en'.format(path_extractions)
file_CRISPR_gene_editing_en = '{}/wikipedia_entries/CRISPR_gene_editing_en'.format(path_extractions)
# Event and account tables are read from the git checkout.
file_events = '{}/data/CRISPR_events - events.csv'.format(path_git)
file_accounts = '{}/data/CRISPR_events - accounts.csv'.format(path_git)

# mount Google Drive
# All path_* / file_* variables above point below '/content/drive', so the drive
# has to be mounted before any of them is opened.
from google.colab import drive
drive.mount('/content/drive')

# add additional folders to path variable
# NOTE(review): presumably this is what makes `from article.article import Article`
# below importable (package expected under <path_git>/code) - confirm.
import sys
sys.path.append(path_git + '/code')

# import Wolfgang's classes:
!pip install Levenshtein # required by Wolfgang's code
from article.article import Article
# from scraper.scraper import Scraper

# import other modules
!pip install fuzzywuzzy # https://github.com/seatgeek/fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import networkx as nx
!pip install python-louvain
from community import community_louvain
import itertools
import json
import os
import pprint
import re
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree

pp = pprint.PrettyPrinter(indent=4)

# # import pywikibot
# os.chdir(path_extractions)
# !pip install pywikibot
# pywikibot_config = r"""# -*- coding: utf-8  -*-

# mylang = 'en'
# family = 'wikipedia'
# usernames['wikipedia']['en'] = 'test'"""
# with open('user-config.py', 'w', encoding="utf-8") as f:
#     f.write(pywikibot_config)
# import pywikibot # https://doc.wikimedia.org/pywikibot/master/api_ref/pywikibot.page.html

# set working directory
os.chdir(path_git)
os.getcwd()  # result is discarded: not the last expression of the cell, so nothing is displayed

# Load data
# Build one Article object per English revision dump; the later cells iterate
# over their revisions. (file_CRISPR_de is defined above but not loaded here.)
CRISPR_en = Article(file_CRISPR_en)
CRISPR_gene_editing_en = Article(file_CRISPR_gene_editing_en)
# Loading of the event/account tables is currently disabled:
# events = pd.read_csv(file_events)
# accounts = pd.read_csv(file_accounts)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Functions

In [11]:
def make_soup(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'}, timeout=30):
  """Download `url` and return its HTML parsed with BeautifulSoup's 'html.parser'.

  Parameters
  ----------
  url : str
      Address of the page to fetch.
  headers : dict
      HTTP request headers; by default a browser-like User-Agent is sent.
      The default dict is shared between calls and must not be mutated.
  timeout : float or tuple or None
      Passed to `requests.get`. Without a timeout a single stalled connection
      blocks the notebook forever; pass None to restore that old behaviour.

  Returns
  -------
  bs4.BeautifulSoup
      Parsed response body.

  Raises
  ------
  requests.exceptions.RequestException
      On connection problems or when `timeout` is exceeded.

  Notes
  -----
  The HTTP status code is deliberately NOT checked: callers in this notebook
  inspect the text of Wikipedia's "no such article" page, which is served
  with an error status.
  """
  response = requests.get(url, headers=headers, timeout=timeout)
  return BeautifulSoup(response.text, 'html.parser')

In [None]:
def cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp']):
  """Group similar left and right contexts and label every row with its group ids.

  For each side ('left' and 'right') the distinct context strings are compared
  pairwise with fuzzywuzzy. Every pair scoring at least `score_cutoff` becomes
  an edge of a graph, and Louvain community detection on that graph yields the
  community id of each context. Contexts without any sufficiently similar
  partner get their own, previously unused id.

  Parameters
  ----------
  lr_contexts : iterable of 12-tuples
      Rows in the column order
      (revindx, timestamp, left_id, left, keyphrase, right, right_id, revid,
      url, user, userid, comment). The incoming left_id/right_id values are
      placeholders and get overwritten.
  score_cutoff : int
      Minimum similarity score for two contexts to be linked.
  scorer : callable
      fuzzywuzzy scorer used for the comparison.
  sort_by : list of str
      Column names the returned frame is sorted by (not modified here).

  Returns
  -------
  pandas.DataFrame
      One row per input tuple with 'left_id' and 'right_id' filled in,
      sorted by `sort_by`.

  Notes
  -----
  Fixes compared to the previous version:
  * `return` was indented into the `for side` loop, so only the left side was
    ever clustered and 'right_id' kept its placeholder value.
  * the result of `sort_values` was discarded, so rows were never sorted.
  * ids of unmatched contexts were assigned by iterating a set; they are now
    assigned in order of first appearance.
  NOTE(review): the Louvain partition itself may still number communities
  differently between runs - confirm whether a fixed random_state is wanted.
  """
  columns = ['revindx','timestamp','left_id','left','keyphrase','right','right_id','revid','url','user','userid','comment',]
  df = pd.DataFrame(lr_contexts, columns=columns)
  for side in ['left', 'right']:
    unique_contexts = list(df[side].unique())
    # Similarity graph: nodes are contexts, edges link contexts scoring >= score_cutoff.
    G = nx.Graph()
    for position, query in enumerate(unique_contexts):
      # Compare only against later contexts so that every pair is scored once.
      candidates = unique_contexts[position + 1:]
      if not candidates:
        continue # last context: nothing left to compare with
      scores = process.extractBests(query, candidates, score_cutoff=score_cutoff, limit=len(unique_contexts), scorer=scorer) # https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/process.py
      G.add_edges_from((query, match[0]) for match in scores)
    # Communities exist only for contexts that appear in at least one edge.
    community_dic = community_louvain.best_partition(G)
    # Give the remaining ("lonesome") contexts fresh ids as well, so that every
    # context can be looked up when filling the '<side>_id' column below.
    next_id = max(community_dic.values()) + 1 if community_dic else 0
    for context in unique_contexts:
      if context not in community_dic:
        community_dic[context] = next_id
        next_id += 1
    # add community_ids to df (Series.map also copes with an empty frame)
    df['{}_id'.format(side)] = df[side].map(community_dic)
  # sort_values returns a new frame, so its result has to be the return value.
  return df.sort_values(by=sort_by)

# Revision history into table

In [4]:
# Articles whose revision history is exported, one Excel file per article.
articles = [CRISPR_en, CRISPR_gene_editing_en]

for article in articles:
  print(article.name)
  # Size deltas are computed up front and added as an extra column below.
  differences = article.calculate_revision_size_difference()
  revisions = []
  for revision in article.yield_revisions():
    revisions.append((
        revision.index,
        revision.timestamp.string,
        revision.revid,
        revision.url,
        revision.user,
        revision.userid,
        revision.size,
        revision.comment,
    ))
  df = pd.DataFrame(
      revisions,
      columns=['revindx','timestamp','revid','url','user','userid','size','comment',],
  )
  # Position 7 places 'diff' between 'size' and 'comment'.
  # https://stackoverflow.com/questions/18674064/how-do-i-insert-a-column-at-a-specific-column-index-in-pandas
  df.insert(loc=7, column='diff', value=differences)
  df.to_excel(path_extractions + '/wikipedia_entries/revision_history_{}.xlsx'.format(article.name))
# Displays the table of the article processed last.
df

CRISPR_en
CRISPR_gene_editing_en


Unnamed: 0,revindx,timestamp,revid,url,user,userid,size,diff,comment
0,0,2019-02-17 06:32:48,883727671,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,77238,77238,Split from CRISPR page
1,1,2019-02-17 06:33:54,883727741,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,77551,313,
2,2,2019-02-17 06:35:41,883727860,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,77771,220,rectify ref 8
3,3,2019-02-17 06:37:14,883727959,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,78031,260,/* Editing */ rectify ref 25
4,4,2019-02-17 06:39:37,883728113,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,78024,-7,/* Editing */
...,...,...,...,...,...,...,...,...,...
194,194,2020-12-10 20:28:06,993476518,https://en.wikipedia.org/w/index.php?title=CRI...,Nisse Hulta,11218895,96234,243,Link to the Nobel Prize site
195,195,2020-12-15 10:27:37,994366355,https://en.wikipedia.org/w/index.php?title=CRI...,Smartse,1640548,95784,-450,rm [[WP:REFSPAM]] - see talk
196,196,2020-12-19 07:34:37,995107732,https://en.wikipedia.org/w/index.php?title=CRI...,Bagumba,8005368,95699,-85,/* top */ [[WP:NAMB]]
197,197,2020-12-22 17:57:54,995747724,https://en.wikipedia.org/w/index.php?title=CRI...,PaulTConley,35393367,95975,276,/* Predecessors */ Added detail on: zinc finge...


# Check which entries linking to CRISPR actually mention CRISPR

In [38]:
# Classify every Wikipedia entry that links to one of the CRISPR articles:
#   relevant_entries   - 'CRISPR' occurs in the page outside of navigation boxes
#   irrelevant_entries - 'CRISPR' occurs only inside navboxes / 'series' sidebars
#   strange_entries    - 'CRISPR' does not occur in the page text at all
#   no_entries         - Wikipedia has no article under that title
relevant_entries = []
irrelevant_entries = []
strange_entries = []
no_entries = []
# Titles contain non-ASCII characters (e.g. the dash in "April–June 2020 in science"),
# so the encoding is stated explicitly instead of relying on the platform default.
# Blank lines are skipped; an empty title would silently fetch the Wikipedia main page.
with open(path_extractions + '/wikipedia_entries/List_of_entries_linking_to_either_CRISPR_entry.txt', encoding='utf-8') as f:
    titles = sorted({line.strip() for line in f if line.strip()})
for title in titles:
  print(title)
  # Percent-encode the title: an unescaped '?' or '#' would otherwise be parsed as
  # the start of a query string / fragment and the wrong page would be requested.
  url = 'https://en.wikipedia.org/wiki/{}'.format(quote(title))
  soup = make_soup(url)
  page_text = soup.text
  if 'Wikipedia does not have an article with this exact name.' in page_text: # just a quick hack...
    print('\tNo Soup!')
    no_entries.append(title)
    continue
  if 'CRISPR' not in page_text:
    print('\tNo CRISPR!')
    strange_entries.append(title)
    continue
  # kill navboxes first, then 'series' boxes, so that a mention of CRISPR inside
  # pure navigation elements does not count as relevance
  # https://stackoverflow.com/questions/39885359/beautifulsoup-decompose
  for css_class in ("navbox", "sidebar vertical-navbox nomobile plainlist"):
    for match in soup.find_all(attrs={"class": css_class}):
      match.decompose()
  # Check for relevance (soup.text is re-read because the tree was just modified)
  if 'CRISPR' in soup.text:
    relevant_entries.append(title)
  else:
    irrelevant_entries.append(title)
print('relevant_entries')
pp.pprint(relevant_entries)
# One title per line, no trailing newline after the last title.
with open(path_extractions + '/wikipedia_entries/List_of_RELEVANT_entries_linking_to_either_CRISPR_entry.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(relevant_entries))
with open(path_extractions + '/wikipedia_entries/List_of_IRRELEVANT_entries_linking_to_either_CRISPR_entry.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(irrelevant_entries))

15-Cis-phytoene desaturase
2010s
2012 in science
2014 in science
2015 in science
2015 in the United States
2016 in science
2017 in science
2017 in the United States
2018 in science
2019
2019 in science
2020 in science
ATUM
Aaron Traywick
Accelerating expansion of the universe
Adam Bogdanove
Addgene
Adenoviridae
Alex Marson
Alu element
Anatoly B. Kolomeisky
Andrea Crisanti (scientist)
Anna Dumitriu
Anti-CRISPR
Antibiotic
Antidote
Antisense RNA
Antiviral drug
April–June 2020 in science
Arabidopsis thaliana
Ardipithecus
Ardipithecus ramidus
Ark Invest
	No CRISPR!
Artificial cell
Asian Scientist
Avian malaria
BASF
BLESS
Bacteria
Bacterial DNA binding protein
Bacterial small RNA
Bacteriophage
Behavioural genetics
BioGRID
Biohub
Biotechnology risk
Box jellyfish
Breakthrough of the Year
Brian Hanley (microbiologist)
Brief Answers to the Big Questions
	No CRISPR!
Bryan R. Cullen
CAS
CAS2 (disambiguation)
CASPR
CITE-Seq
COVID-19 pandemic in Karnataka
COVID-19 testing
CRISPR
CRISPR Therapeutics


In [12]:
# Scratch cell: fetch only the first title of the list, once as raw HTML via a
# plain requests.get (no custom headers) and once through make_soup.
with open(path_extractions + '/wikipedia_entries/List_of_entries_linking_to_either_CRISPR_entry.txt') as f:
    titles = sorted(set(line.strip() for line in f))
for title in titles[:1]:
  url = 'https://en.wikipedia.org/wiki/{}'.format(title)
  soup = make_soup(url)
  html = requests.get(url).text


In [6]:
# Scratch cell: displays the complete raw HTML of one entry as the cell output.
requests.get('https://en.wikipedia.org/wiki/Alu_element').text

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Alu element - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YCB1DB@KrezOszGlJBDCNwAAAAM","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Alu_element","wgTitle":"Alu element","wgCurRevisionId":1005493888,"wgRevisionId":1005493888,"wgArticleId":367077,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles lacking reliable references","Articles lacking reliable references from December 2017","CS1 maint: DOI inactive as of January 2021","Wikipedia articles in need of u

# Extract keyphrase contexts

## Define keyphrases

In [None]:
# Keyphrases whose left/right contexts are extracted from every revision.
# Only the entries that are not commented out are processed by the extraction
# cell below; the commented-out entries are kept as the pool to choose from.
# The extraction cell searches with lower=True, hence all entries are lowercase.
keyphrases = [                              
        # # discoveries
        # 'first',
        # 'discover',
        # 'invent',
        # 'history',
        # # naming things
        # 'name', # CRISPR acronym
        # 'acronym', # CRISPR acronym
        # # applications/ patents
        # 'application',
        # 'editing',
        # 'engineering',
        # 'patent',
        # 'property',
        # 'right',
        # 'trial',
        # 'clinical',
        # 'patient',
        # 'approve', # e.g. FDA approved...
        # # awards
        # 'award',
        # 'prize',
        # 'nobel',
        
        
        # # people:
        
        # 'doudna', 
        #   'charpentier', # Doudna lab?
          'jinek', # Doudna lab/ first author of 2012 paper (Unsung Hero)
          'wiedenheft', # Doudna lab (Unsung Hero)
          'haurwitz', # Doudna lab (Unsung Hero)

        # 'church',
          'mali', # Church lab/ first author of 2013 paper (Unsung Hero)
          'yang', # Church lab/ first author of 2015 paper (mentioned in Wikipedia?) (Unsung Hero)

        # 'zhang', # Broad
          'cong', # Broad (Unsung Hero)
          ' ran ', # Broad (Unsung Hero); padded with spaces, presumably to avoid matches inside longer words
                
        # 'šikšnys', # (Unsung Hero)
        #     'šiksnys', # only first accent
        #     'sikšnys', # only second accent
        #     'siksnys', # no accents
        #   'gašiūnas', # Siksnys lab/ first author of 2012 paper (Unsung Hero)
        #     'gasiūnas', # only first accent
        #     'gašiunas', # only second accent
        #     'gasiunas', # no accents
        #   'horvath', # also in connection with Šikšnys, but interesting in its own right
        #   'barrangou', # also in connection with Šikšnys, but interesting in its own right
        
        # 'brouns',
        # 'mojica',
        # 'ishino',
        # 'jansen',
        # 'marraffini',
        # 'sontheimer',
        # 'koonin',
        # 'moineau',
        # 'bolotin',
        # 'sorokin',
        # 'makarova',
        # 'pourcel',
        # 'overbeek',
        # 'liang',
        # ' xu', # added space to avoid false positives!
        # 'jiankui',
        # 'nishimasu',
        # 'shibata',
        # 'lander', # review
        # # institutions:
        # 'osaka',
        # 'alicante',
        # 'danisco',
        # 'broad',
        # 'harvard',
        # 'university of california', 
        # 'berkeley', 
]

## Extract and cluster keyphrase contexts WIKIPEDIA

In [None]:
# For every article and every active keyphrase: collect the text to the left and
# right of each keyphrase occurrence in every revision, cluster similar contexts
# and write one Excel sheet per (article, keyphrase).
articles = [
  CRISPR_en,
  CRISPR_gene_editing_en
]

for article in articles:
  print(article.name)
  # Create the per-article output folder up front; otherwise to_excel fails for a
  # newly added article only after the (slow) extraction of the first keyphrase.
  out_dir = path_extractions + '/keyphrase_contexts/{}'.format(article.name)
  os.makedirs(out_dir, exist_ok=True)
  for keyphrase in keyphrases:
    print('\t', keyphrase)
    # Row layout must match the column list in cluster_lr_contexts; the two 0s are
    # placeholders for left_id / right_id, which the clustering fills in.
    lr_contexts = [
      (revision.index, revision.timestamp.string, 0, left, keyphrase, right, 0, revision.revid, revision.url, revision.user, revision.userid, revision.comment,)
      for revision in article.yield_revisions()
      for left, right in revision.get_lr_contexts(keyphrase, width=100, lower=True)
      ]
    df = cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp'])
    # Sheets without any hit are marked with an 'EMPTY_' filename prefix.
    prefix = '' if len(df) > 0 else 'EMPTY_'
    df.to_excel(out_dir + '/{}{}_{}.xlsx'.format(prefix, keyphrase, article.name))

CRISPR_en
	 jinek




	 wiedenheft




	 haurwitz
	 mali




	 yang
	 cong




	  ran 




CRISPR_gene_editing_en
	 jinek




	 wiedenheft
	 haurwitz
	 mali




	 yang
	 cong




	  ran 




## Extract keyphrase contexts EXTERNAL ACCOUNTS

In [None]:
# External accounts of the CRISPR history: short name -> PubMed ID.
accounts = dict([
  ('Hsu_et_al_2014', '24906146'),
  ('Doudna_Charpentier_2014', '25430774'),
  ('Adli_2018', '29765029'),
  ('Ledford_2015', '26040877'),
  ('Pennisi_2013', '23970676'),
  ('Lander_2016', '26771483'),
  ('Mojica_Montoliu_2016', '27401123'),
  ('Mojica_Rodriguez-Valera_2016', '27234458'),
  ('Ishino_et_al_2018', '29358495'),
  ('van_Erp_et_al_2015', '25914022'),
  ('Morange_2015a', '25740136'),
  ('Sherkov_2015', '25748913'),
  ('Ledford_2016a', '27443723'),
  ('Han_She_2017', '29150001'),
  ('Morange_2015b', '26648028'),
  ('Liang_et_al_2017', '28623876'),
  ('Javed_et_al_2018', '30078067'),
  ('Ledford_2016b', '27652544'),
  ('Ledford_Callaway_2020', '33028993'),
])

# Replace every PubMed ID by the URL of its PubMed page (same keys, same order).
accounts = dict(
  (name, 'https://pubmed.ncbi.nlm.nih.gov/{}/'.format(pmid))
  for name, pmid in accounts.items()
)
accounts

{'Adli_2018': 'https://pubmed.ncbi.nlm.nih.gov/29765029/',
 'Doudna_Charpentier_2014': 'https://pubmed.ncbi.nlm.nih.gov/25430774/',
 'Han_She_2017': 'https://pubmed.ncbi.nlm.nih.gov/29150001/',
 'Hsu_et_al_2014': 'https://pubmed.ncbi.nlm.nih.gov/24906146/',
 'Ishino_et_al_2018': 'https://pubmed.ncbi.nlm.nih.gov/29358495/',
 'Javed_et_al_2018': 'https://pubmed.ncbi.nlm.nih.gov/30078067/',
 'Lander_2016': 'https://pubmed.ncbi.nlm.nih.gov/26771483/',
 'Ledford_2015': 'https://pubmed.ncbi.nlm.nih.gov/26040877/',
 'Ledford_2016a': 'https://pubmed.ncbi.nlm.nih.gov/27443723/',
 'Ledford_2016b': 'https://pubmed.ncbi.nlm.nih.gov/27652544/',
 'Ledford_Callaway_2020': 'https://pubmed.ncbi.nlm.nih.gov/33028993/',
 'Liang_et_al_2017': 'https://pubmed.ncbi.nlm.nih.gov/28623876/',
 'Mojica_Montoliu_2016': 'https://pubmed.ncbi.nlm.nih.gov/27401123/',
 'Mojica_Rodriguez-Valera_2016': 'https://pubmed.ncbi.nlm.nih.gov/27234458/',
 'Morange_2015a': 'https://pubmed.ncbi.nlm.nih.gov/25740136/',
 'Morange_20

# Extract citation contexts

## Define citations

In [None]:
# Publications whose citation contexts are extracted from the article revisions.
# Each entry lists the identifiers a Wikipedia reference may carry (DOI, PMID, PMC;
# an empty string means "not available") plus a short name used for progress
# output and for the output filename. Only entries that are not commented out
# are processed by the extraction cell below.
citations = [
        # {'DOI':'10.1016/j.cell.2014.05.010',    'PMID':'24906146',  'PMC':'',         'name':'Hsu_et_al_2014'},
        # {'DOI':'10.1126/science.1258096',       'PMID':'25430774',  'PMC':'',         'name':'Doudna_Charpentier_2014'},
        # {'DOI':'10.1038/s41467-018-04252-2',    'PMID':'29765029',  'PMC':'5953931',  'name':'Adli_2018'},
        {'DOI':'10.1038/522020a',               'PMID':'26040877',  'PMC':'',         'name':'Ledford_2015'},
        # {'DOI':'10.1126/science.341.6148.833',  'PMID':'23970676',  'PMC':'',         'name':'Pennisi_2013'},
        # {'DOI':'10.1016/j.cell.2015.12.041',    'PMID':'26771483',  'PMC':'',         'name':'Lander_2016'},
        # {'DOI':'10.1016/j.tim.2016.06.005',     'PMID':'27401123',  'PMC':'',         'name':'Mojica_Montoliu_2016'},
        # {'DOI':'10.1111/febs.13766',            'PMID':'27234458',  'PMC':'',         'name':'Mojica_Rodriguez-Valera_2016'},
        # {'DOI':'10.1128/JB.00580-17',           'PMID':'29358495',  'PMC':'5847661',  'name':'Ishino_et_al_2018'},
        {'DOI':'10.1016/j.coviro.2015.03.011',  'PMID':'25914022',  'PMC':'',         'name':'van_Erp_et_al_2015'},
        {'DOI':'10.1007/s12038-015-9532-6',     'PMID':'25740136',  'PMC':'',         'name':'Morange_2015a'},
        {'DOI':'10.1038/nbt.3160',              'PMID':'25748913',  'PMC':'',         'name':'Sherkov_2015'},
        {'DOI':'10.1007/978-3-642-34657-6_1',   'PMID':'',          'PMC':'',         'name':'Mojica_Garett_2013'},
        # {'DOI':'10.1038/535342a',               'PMID':'27443723',  'PMC':'',         'name':'Ledford_2016a'},
        # {'DOI':'10.1016/bs.pmbts.2017.10.001',  'PMID':'29150001',  'PMC':'',         'name':'Han_She_2017'},
        {'DOI':'10.1007/s12038-015-9575-8',     'PMID':'26648028',  'PMC':'',         'name':'Morange_2015b'},
        # {'DOI':'10.1002/jgm.2963',              'PMID':'28623876',  'PMC':'',         'name':'Liang_et_al_2017'},
        # {'DOI':'10.1007/s00284-018-1547-4',     'PMID':'30078067',  'PMC':'',         'name':'Javed_et_al_2018'},
        # {'DOI':'10.1038/537460a',               'PMID':'27652544',  'PMC':'',         'name':'Ledford_2016b'},
        {'DOI':'10.1007/978-3-642-34657-6_3',   'PMID':'',          'PMC':'',         'name':'Makarova_Koonin_2013'},
        {'DOI':'10.1007/978-3-642-34657-6_11',  'PMID':'',          'PMC':'',         'name':'Horvath_et_al_2013'},
        # {'DOI':'10.1038/d41586-020-02765-9',    'PMID':'33028993',  'PMC':'',         'name':'Ledford_Callaway_2020'}, 
]

## Extract and cluster citation contexts

In [None]:
# For every article and every active citation: find, per revision, the references
# that carry one of the citation's identifiers, look up the superscript (e.g. '[5]')
# under which the reference is cited in that revision, collect the text to the
# left and right of every occurrence of that superscript, cluster similar
# contexts and write one Excel sheet per (article, citation).
articles = [
  CRISPR_en,
  # CRISPR_gene_editing_en,
]

for article in articles:
  print(article.name)
  # Create the per-article output folder up front; otherwise to_excel fails for a
  # newly added article only after the (slow) extraction of the first citation.
  out_dir = path_extractions + '/citation_contexts/{}'.format(article.name)
  os.makedirs(out_dir, exist_ok=True)
  for citation in citations:
    print('\t', citation['name'])
    # Identifiers that are actually available for this citation (empty strings dropped).
    wanted_ids = {k: v for k, v in citation.items() if v and k in ('DOI', 'PMC', 'PMID')}
    lr_contexts = []
    for revision in article.yield_revisions():
      # A reference counts as a match if ANY of its identifiers agrees. Collecting
      # the matches in one set per revision avoids emitting the same contexts twice
      # for a reference that carries e.g. both the DOI and the PMID.
      matched_citations = {
          reference for reference in revision.get_references()
          if any(reference.get_identifiers()[id_type] == id_num for id_type, id_num in wanted_ids.items())
          }
      for matched_citation in matched_citations:
        # NOTE(review): assumes get_superscript returns the same value on repeated
        # calls for the same revision - confirm.
        superscript = matched_citation.get_superscript(revision)
        # Skip references without a superscript BEFORE building the search pattern;
        # previously this check ran only after .strip() had already been called on it.
        if not superscript:
          continue
        # Turn '[5]' into the escaped pattern '\[5\]' so that the brackets are
        # searched literally instead of being interpreted as a character class.
        pattern = r'\[{}\]'.format(superscript.strip().strip('[]'))
        for left, right in revision.get_lr_contexts(
            pattern,
            width=100,
            lower=True, # search in lowercase text of revision
            ):
          # Row layout must match the column list in cluster_lr_contexts; the two 0s
          # are placeholders for left_id / right_id, which the clustering fills in.
          lr_contexts.append(
            (revision.index, revision.timestamp.string, 0, left, superscript, right, 0, revision.revid, revision.url, revision.user, revision.userid, revision.comment,)
          )
    df = cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp'])
    # Sheets without any hit are marked with an 'EMPTY_' filename prefix.
    prefix = '' if len(df) > 0 else 'EMPTY_'
    df.to_excel(out_dir + '/{}{}_{}.xlsx'.format(prefix, citation['name'], article.name))

CRISPR_en
	 Ledford_2015
	 van_Erp_et_al_2015
	 Morange_2015a
	 Sherkov_2015
	 Mojica_Garett_2013
	 Morange_2015b
	 Makarova_Koonin_2013
	 Horvath_et_al_2013


# Experiments and snippets

In [None]:
# revision = CRISPR_en.get_revision(revid=725648335)
# print(revision.url, "\n")
# pp.pprint(revision.get_lr_contexts('was'))
# references = revision.get_references()
# reference = references[1]
# print("REFERENCE\n", reference.get_text())
# print("REFERENCE ID\n", reference.get_id())
# print("REFERENCE Number\n", reference.get_number_via_id())
# print("REFERENCE Superscript\n", reference.get_superscript(revision))

In [None]:
# def occurance_spans(keyphrase, article):
#   ''' 
#   returns a list of time spans during which a keyphrase 'keyphrase' continuously appears in an article 'article'
#   '''
#   gen = article.yield_revisions()
#   last_rev_was_match = False
#   spans = []
#   for revision in gen:
#     if keyphrase in revision.get_text():
#     # if keyphrase in i.get_text().split('\nLocus structure\n')[0]: # Hack to exclude everything after the history section and practically only search in the history section
#       last_rev_that_matched = revision
#       if not last_rev_was_match:
#         span_beg = revision
#         last_rev_was_match = True
#     else:
#       if last_rev_was_match:
#         spans.append((span_beg, last_rev_that_matched))
#         last_rev_was_match = False
#   if last_rev_was_match:
#     spans.append((span_beg, last_rev_that_matched))
#   print(
#       '"{}" was found in the following revisions of {}:\n{}'.format(keyphrase, 'Test', '\n'.join(['Span {}: {} - {}'.format(indx + 1, revision[0].timestamp, revision[1].timestamp) for indx,revision in enumerate(spans)])) 
#       if spans else '"{}" was not found in any revision of "{}"'.format(keyphrase, 'Test')
#       )
#   return spans