# Set globals

In [1]:
# set paths
# All directories live below /content/drive, i.e. on the Google Drive that is
# mounted further down in this cell; adjust them when running outside Colab.
path_extractions = '/content/drive/My Drive/WIKING/extractions'
path_git = '/content/drive/My Drive/WIKING/WIKING_git'
path_my_tools = '/content/drive/My Drive/my_tools'

# infiles
# Article dumps (one per Wikipedia entry) and the hand-curated CSV tables from the git repository.
file_CRISPR_de = path_extractions + '/wikipedia_entries/CRISPR_de' # get latest file version here: https://files.webis.de/wikipedia-tracing-innovations/
file_CRISPR_en = path_extractions + '/wikipedia_entries/CRISPR_en' # get latest file version here: https://files.webis.de/wikipedia-tracing-innovations/
file_CRISPR_gene_editing_en = path_extractions + '/wikipedia_entries/CRISPR_gene_editing_en' # get latest file version here: https://files.webis.de/wikipedia-tracing-innovations/
file_events = path_git + '/data/CRISPR_events - events.csv'
file_accounts = path_git + '/data/CRISPR_events - accounts.csv'

# mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# add additional folders to path variable
import sys
sys.path.append(path_git + '/code')

# import Wolfgang's classes:
!pip install Levenshtein # required by Wolfgang's code
from article.article import Article
# from scraper.scraper import Scraper

# import other modules
!pip install fuzzywuzzy # https://github.com/seatgeek/fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import networkx as nx
!pip install python-louvain
from community import community_louvain
import re
import os
import pickle
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import pprint
pp = pprint.PrettyPrinter(indent=4)
from lxml import etree
import lxml # because I want to be able to call lxml.html to avoid a name conflict with solo html 
import lxml.html # `import lxml` alone does not load the html submodule; import it explicitly so lxml.html.fromstring is always available

# # import pywikibot
# os.chdir(path_extractions)
# !pip install pywikibot
# pywikibot_config = r"""# -*- coding: utf-8  -*-

# mylang = 'en'
# family = 'wikipedia'
# usernames['wikipedia']['en'] = 'test'"""
# with open('user-config.py', 'w', encoding="utf-8") as f:
#     f.write(pywikibot_config)
# import pywikibot # https://doc.wikimedia.org/pywikibot/master/api_ref/pywikibot.page.html

# set working directory
os.chdir(path_git)
os.getcwd() # NOTE: not displayed, because it is not the last expression of the cell

# Load data
# Only the two English articles are loaded here; file_CRISPR_de is configured above but not loaded in this cell.
CRISPR_en = Article(file_CRISPR_en)
CRISPR_gene_editing_en = Article(file_CRISPR_gene_editing_en)
# events = pd.read_csv(file_events)
# accounts = pd.read_csv(file_accounts)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Functions

In [2]:
def make_soup(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1)'}, timeout=30):
  """Download `url` and return its body parsed with BeautifulSoup's 'html.parser'.

  Args:
    url: Address of the page to fetch.
    headers: HTTP headers sent with the request (a browser-like User-Agent by default).
      The default dict is shared between calls and must not be mutated.
    timeout: Seconds to wait for the server before giving up. Without a timeout
      `requests.get` can block forever on a stalled connection.

  Returns:
    BeautifulSoup object built from the response text.

  Raises:
    requests.exceptions.RequestException: on connection problems or timeouts.

  Note:
    HTTP error statuses are deliberately NOT turned into exceptions: callers in this
    notebook inspect the text of error pages (e.g. Wikipedia's "does not have an
    article" page) themselves.
  """
  response = requests.get(url, headers=headers, timeout=timeout)
  return BeautifulSoup(response.text, 'html.parser')

In [3]:
def cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp']):
  """Group similar left/right keyphrase contexts into communities and label each row.

  For each side ('left', 'right') a graph is built whose nodes are the unique context
  strings; two contexts are connected when their fuzzy-match score reaches
  `score_cutoff`. Louvain community detection on that graph yields one community id
  per connected context; contexts without any similar partner ("lonesome" contexts)
  each receive their own fresh id. The ids are written to the 'left_id' / 'right_id'
  columns.

  Args:
    lr_contexts: Iterable of 12-tuples in the column order
      (revindx, timestamp, left_id, left, keyphrase, right, right_id, revid, url,
      user, userid, comment). The incoming left_id/right_id values are overwritten.
    score_cutoff: Minimum fuzzy score for two contexts to count as similar.
    scorer: fuzzywuzzy scoring function.
    sort_by: Column name(s) the returned frame is sorted by (not mutated here).

  Returns:
    pandas.DataFrame with the columns above, sorted by `sort_by`.
  """
  columns = ['revindx','timestamp','left_id','left','keyphrase','right','right_id','revid','url','user','userid','comment',]
  df = pd.DataFrame(lr_contexts, columns=columns)
  for side in ['left', 'right']:
    # determine communities of similarity for left and right contexts
    unique_contexts = list(df[side].unique())
    length = len(unique_contexts)
    G = nx.Graph()
    for position, query in enumerate(unique_contexts):
      # Compare each context only with the ones after it, so every pair is scored once.
      candidates = unique_contexts[position + 1:]
      if not candidates:
        continue # last context: nothing left to compare with
      scores = process.extractBests(query, candidates, score_cutoff=score_cutoff, limit=length, scorer=scorer) # https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/process.py
      G.add_edges_from((query, match[0]) for match in scores)
    community_dic = community_louvain.best_partition(G)
    # add community_ids for lonesome contexts as well; these ids fill the 'left_id'/'right_id' columns below
    first_free_id = max(community_dic.values()) + 1 if community_dic else 0
    # Iterate in first-appearance order (not set order) so that ids are reproducible between runs.
    lonesome_contexts = [context for context in unique_contexts if context not in community_dic]
    for community_id, context in enumerate(lonesome_contexts, start=first_free_id):
      community_dic[context] = community_id
    # add community_ids to df
    df['{}_id'.format(side)] = df[side].map(community_dic)
  # sort_values returns a new frame; the result has to be kept, otherwise sort_by has no effect
  df = df.sort_values(by=sort_by)
  return df

In [4]:
# https://stackoverflow.com/questions/6822725/rolling-or-sliding-window-iterator
def window(seq, n=2):
    """Yield overlapping tuples of width n that slide over the iterable `seq`.

    s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...

    Nothing is yielded when `seq` holds fewer than n items.
    """
    iterator = iter(seq)
    current = tuple(itertools.islice(iterator, n))
    if len(current) != n:
        # Fewer than n items were available, so the iterator is already exhausted.
        return
    yield current
    for item in iterator:
        # Drop the oldest element and append the newest one.
        current = current[1:] + (item,)
        yield current

# Check names


In [2]:
# Read candidate author names from a tab-separated file (name in the second column).
with open(path_extractions + '/author_names.txt', 'r') as reader:
  names = sorted({
      fields[1].title() # normalize style (what about accents?!)
      for fields in [line.strip().split('\t') for line in reader.readlines()][1:] # [1:] skips the first line (presumably a header)
      if len(fields[1].title()) > 2 # exclude short names
  })

# The list read from the file is replaced by a fixed pair of names for this run.
names = ['Doudna','Charpentier']

include_sections = ['History', 'Applications', 'Discovery and properties', 'The significance for evolution and possible applications']
exclude_sections = ['Contents', 'References', 'Reference', 'Further reading', 'External links', 'See also', 'Notes', 'Web sites', ]

articles = [
  CRISPR_gene_editing_en,
  # CRISPR_en,
]

# name -> {section name -> [(revision index, revision url), ...]}
name_dict = dict((name, {}) for name in names)
for article in articles:
  for revision in article.yield_revisions():
    # only top-level sections that are not on the exclusion list
    for section in [s for s in revision.section_tree().subsections if s.level == 1 and s.name not in exclude_sections]:
      for name in names:
        # with_headings=True -> there might also be a section named after Doudna at some point...
        if name not in section.get_text(level=3, with_headings=True):
          continue
        name_dict[name].setdefault(section.name, []).append((revision.index, revision.url))
  # The dict accumulates over all articles processed so far and is dumped once per article.
  with open(path_extractions + f'/name_dict{article.name}.pickle', 'wb') as handle:
    pickle.dump(name_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Reload the dump of the last article processed above and show it.
with open(path_extractions + f'/name_dict{article.name}.pickle', 'rb') as handle:
  name_dict = pickle.load(handle)

pp.pprint(name_dict)

# print(article.name)
# names_found = sorted(set(
#     name
#     for revision in article.yield_revisions()
#     for name in names
#     for section in revision.get_arno_sections()
#     if section.level == 2
#     if not section.heading in exclude_sections
#     if name in section.get_all_text()
# ))
# print('\tNames found:',len(names_found))

# with open(path_extractions + f'/author_names_in_{article.name}.txt', 'w') as writer:
#   for name in names_found:
#     writer.write(name + '\n')

0 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883727671
1 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883727741
2 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883727860
3 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883727959
4 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883728113
5 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883728392
6 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883728593
7 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883729514
8 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883729569
9 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883729682
10 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883730286
11 https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=883730589
12 https://en.wikipedia.or

In [12]:
# Scratch check: splitting on '[edit]' and keeping the first part drops the edit-link suffix (the trailing space stays).
'sdfgdfgh [edit]'.split('[edit]')[0]

'sdfgdfgh '

In [14]:
# Show, per section name, the (revision index, revision url) pairs recorded for 'Doudna'.
pp.pprint(name_dict['Doudna'])

{   'History': [   (   188,
                       'https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=583328838'),
                   (   189,
                       'https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=583328914'),
                   (   190,
                       'https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=583346775'),
                   (   191,
                       'https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=583365114'),
                   (   192,
                       'https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=583509901'),
                   (   193,
                       'https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=583542972'),
                   (   194,
                       'https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=583706288'),
                   (   195,
                       'https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=583918416'),
                   (   196,
    

In [14]:
# Scratch check: a string multiplied by True is returned unchanged (True counts as 1),
# so `* condition` can be used to switch a string on or off.
('HEADING' + ("\n0\n0" if 5==5 else '')) * True

'HEADING\n0\n0'

# Extract sections

## Using the functions I already pushed to the git

In [None]:
# Pick a single revision by its revision id and display its URL.
# Alternative revision from the other article (disabled):
# revision = CRISPR_en.get_revision(revid=701817377)
revision = CRISPR_gene_editing_en.get_revision(revid=986682164)
revision.url

'https://en.wikipedia.org/w/index.php?title=CRISPR gene editing&oldid=986682164'

In [None]:
# Take the first section returned for the requested heading 'History' and display its text.
# NOTE(review): get_specific_sections / get_all_text come from the project's article code; exact semantics not visible here.
section = revision.get_specific_sections(['History'])[0]
section.get_all_text()

'Predecessors\n\nIn the early 2000s, researchers developed zinc finger nucleases (ZFNs), synthetic proteins whose DNA-binding domains enable them to create double-stranded breaks in DNA at specific points. In 2010, synthetic nucleases called transcription activator-like effector nucleases (TALENs) provided an easier way to target a double-stranded break to a specific location on the DNA strand. Both zinc finger nucleases and TALENs require the design and creation of a custom protein for each targeted DNA sequence, which is a much more difficult and time-consuming process than that of designing guide RNAs. CRISPRs are much easier to design because the process requires synthesizing only a short RNA sequence, a procedure that is already widely used for many other molecular biology techniques (e.g. creating oligonucleotide primers).[12]\nWhereas methods such as RNA interference (RNAi) do not fully suppress gene function, CRISPR, ZFNs, and TALENs provide full irreversible gene knockout.[13]

In [None]:
# List the section headings of the selected revision (in the output, sub-sections appear as 'Parent/Child' paths).
revision.get_section_headings()

['Contents',
 'Synopsis',
 'History',
 'History/Predecessors',
 'History/Discovery',
 'History/Patents and commercialization',
 'History/Recent events',
 'Genome engineering',
 'Genome engineering/Major components',
 'Genome engineering/Structure',
 'Genome engineering/Delivery',
 'Genome engineering/Controlled genome editing',
 'CRISPR screening',
 'Applications',
 'Applications/Disease models',
 'Applications/Biomedicine',
 'Applications/Biomedicine/CRISPR in the treatment of infection',
 'Applications/Biomedicine/CRISPR and cancer',
 'Applications/Knockdown/activation',
 'Applications/RNA editing',
 'Applications/Gene drive',
 'Applications/In vitro genetic depletion',
 'Applications/Prime editing',
 'Society and culture',
 'Society and culture/Human germline modification',
 'Society and culture/Policy barriers to genetic engineering',
 'Society and culture/Recognition',
 'See also',
 'References']

In [None]:
# Same lookup done by hand: keep the sections whose heading equals one of the wanted headings
# (here only 'History'), take the first hit and display its text. Raises IndexError if there is no hit.
[section for section in revision.get_arno_sections() if any(section.heading == heading for heading in ['History'])][0].get_all_text()

'Repeated sequences\n\nThe discovery of clustered DNA repeats occurred independently in three parts of the world. The first description of what would later be called CRISPR is from Osaka University researcher Yoshizumi Ishino and his colleagues in 1987. They accidentally cloned part of a CRISPR sequence together with the "iap" gene (isozyme conversion of alkaline phosphatase)[14] that was their target. The organization of the repeats was unusual. Repeated sequences are typically arranged consecutively, without interspersed different sequences.[14][11] They did not know the function of the interrupted clustered repeats.\nIn 1993, researchers of Mycobacterium tuberculosis in the Netherlands published two articles about a cluster of interrupted direct repeats (DR) in that bacterium. They recognized the diversity of the sequences that intervened the direct repeats among different strains of M. tuberculosis[15] and used this property to design a typing method that was named spoligotyping, w

In [None]:
# Heading of the second section in the list (index 1).
revision.get_arno_sections()[1].heading

'History'

In [None]:
# Print the text of every reference returned for the second section (index 1), one per line.
for ref in revision.get_arno_sections()[1].get_all_references():
  print(ref.get_text())

"Cpf1 Nuclease". abmgood.com. Retrieved 2017-12-14.
Abudayyeh OO, Gootenberg JS, Konermann S, Joung J, Slaymaker IM, Cox DB,  et al. (August 2016). "C2c2 is a single-component programmable RNA-guided RNA-targeting CRISPR effector". Science. 353 (6299): aaf5573. doi:10.1126/science.aaf5573. PMC 5127784. PMID 27256883.
Baltimore D, Berg P, Botchan M, Carroll D, Charo RA, Church G, Corn JE, Daley GQ, Doudna JA, Fenner M, Greely HT, Jinek M, Martin GS, Penhoet E, Puck J, Sternberg SH, Weissman JS, Yamamoto KR (April 2015). "Biotechnology. A prudent path forward for genomic engineering and germline gene modification". Science. 348 (6230): 36–38. Bibcode:2015Sci...348...36B. doi:10.1126/science.aab1028. PMC 4394183. PMID 25791083.
Barrangou R (November 2015). "Diversity of CRISPR-Cas immune systems and molecular machines". Genome Biology. 16: 247. doi:10.1186/s13059-015-0816-9. PMC 4638107. PMID 26549499.
Barrangou R, Fremaux C, Deveau H, Richards M, Boyaval P, Moineau S,  et al. (March 2007

In [None]:
# Print every section heading; for sections that have children, also print the
# list of child headings on an indented line.
for section in revision.get_arno_sections():
  print(section.heading)
  if section.children:
    print('\t', [c.heading for c in section.children])

Contents
History
	 ['Repeated sequences', 'CRISPR-associated systems', 'Cas9', 'Cas12a (formerly Cpf1)', 'Cas13 (formerly C2c2)']
Repeated sequences
CRISPR-associated systems
Cas9
Cas12a (formerly Cpf1)
Cas13 (formerly C2c2)
Locus structure
	 ['Repeats and spacers', 'CRISPR RNA structures', 'Cas genes and CRISPR subtypes']
Repeats and spacers
CRISPR RNA structures
Cas genes and CRISPR subtypes
Mechanism
	 ['Spacer acquisition', 'Biogenesis', 'Interference']
Spacer acquisition
	 ['Protospacer adjacent motifs', 'Insertion variants']
Protospacer adjacent motifs
Insertion variants
Biogenesis
Interference
Evolution
	 ['Coevolution', 'Rates']
Coevolution
Rates
Identification
Use by phages
Applications
	 ['CRISPR gene editing', 'CRISPR as diagnostic tool']
CRISPR gene editing
CRISPR as diagnostic tool
See also
Notes
References
Further reading
External links


## Building the functions (prior to pushing them to the git)

In [None]:
class Section:
  """One heading of an HTML document together with the HTML that follows it.

  Instances are created unlinked; the tree/sequence attributes (`children`,
  `parent`, `next`, `previous`) are filled in by the code that builds the sections.

  Attributes:
    heading_html: Raw HTML of the heading element (e.g. '<h2>...</h2>').
    heading: Plain text of the heading ('' if there is no heading).
    text_html: Raw HTML between this heading and the next one.
    text: Plain text of `text_html` ('' if there is none).
    level: Heading level parsed from the tag name (2 for '<h2 ...>'); 0 if there is no heading.
    children: Direct sub-sections.
    parent: Enclosing section or None.
    next / previous: Neighbouring sections in document order or None.

  Requires `lxml.html` to be imported and `re` to be available.
  """

  def __init__(self, heading_html, text_html):
    """Parse heading and body HTML into plain text and determine the heading level.

    Raises:
      ValueError: if a non-empty `heading_html` does not start with an '<hN' tag.
    """
    self.heading_html = heading_html
    self.heading = self._plain_text(heading_html)
    self.text_html = text_html
    self.text = self._plain_text(text_html)
    self.level = self._parse_level(heading_html)

    self.children = []
    self.parent = None
    self.next = None
    self.previous = None

  @staticmethod
  def _plain_text(html):
    """Return the concatenated text nodes of `html`; '' for None, empty or blank input."""
    # lxml refuses to parse empty/blank documents, so those are handled up front.
    if not html or not html.strip():
      return ''
    return ''.join(lxml.html.fromstring(html).itertext())

  @staticmethod
  def _parse_level(heading_html):
    """Return the digit N of a leading '<hN' tag; 0 when there is no heading at all."""
    if not heading_html or not heading_html.strip():
      return 0
    match = re.match(r'<h(\d)', heading_html.strip(), flags=re.IGNORECASE) # so far, only one-digit levels
    if match is None:
      raise ValueError('heading_html does not start with a heading tag: {!r}'.format(heading_html[:40]))
    return int(match.group(1))

  def get_html(self):
    """Return heading HTML followed by body HTML (missing parts count as '')."""
    return (self.heading_html or '') + (self.text_html or '')

  def get_text(self):
    """Return the plain text of this section only (without heading and children)."""
    return self.text

  def get_all_text(self):
    """Return this section's text followed by heading and text of each direct child.

    All parts are separated by a blank line. Only direct children are included,
    not grandchildren.
    """
    # Own text goes into the same list as the child parts so that it is separated
    # from the first child heading as well; an empty own text adds no separator.
    parts = [self.text] if self.text else []
    for child in self.children:
      parts.append('\n\n'.join([child.heading, child.text]))
    return '\n\n'.join(parts)


In [None]:
# Split the revision's HTML at its heading elements (h1..h6) and link the
# resulting Section objects into a tree plus a doubly linked sequence.
headline_range = range(1,7)
heading_pattern = re.compile(r'|'.join([r'<h{0}.*?h{0}>'.format(i) for i in headline_range]))
heading_matches = list(heading_pattern.finditer(revision.html)) # scan the document once, reuse for starts and ends
starts = [m.start() for m in heading_matches]
ends = [m.end() for m in heading_matches]

headings_html = [revision.html[start:end] for start, end in zip(starts,ends)]
# A section's body runs from the end of its heading to the start of the next heading;
# the last body runs to the end of the document. Without any heading there are no sections.
texts_html = [revision.html[end:start] for end, start in zip(ends, starts[1:] + [len(revision.html)])]

sections = [Section(heading_html.strip(), text_html.strip()) for heading_html, text_html in zip(headings_html, texts_html)]

# Stack of the currently open ancestors, outermost first. Before a section is attached,
# every entry of the same or a deeper level is closed, so the top of the stack is always
# the nearest preceding section with a smaller level, even when heading levels are
# skipped (e.g. h1 -> h3 -> h2), where a per-level lookup would pick a stale or missing parent.
open_sections = []
last_section = None
for section in sections:
  if last_section is not None:
    last_section.next = section
    section.previous = last_section
  while open_sections and open_sections[-1].level >= section.level:
    open_sections.pop()
  if open_sections:
    section.parent = open_sections[-1]
    section.parent.children.append(section)
  open_sections.append(section)
  last_section = section

In [None]:
# Spot check: pretty-print the combined text of the section at index 17.
pp.pprint(sections[17].get_all_text())

('CRISPR associated proteincrystal structure of a crispr-associated protein '
 'from Thermus thermophilusIdentifiersSymbolCRISPR_assocPfamPF08798Pfam '
 'clanCL0362InterProIPR010179CDDcd09727Available protein structures:Pfam\n'
 ' \xa0structures / ECOD\n'
 ' \xa0PDBRCSB PDB; PDBe; PDBjPDBsumstructure summary\n'
 'CRISPR associated protein Cas2crystal structure of a hypothetical protein '
 'tt1823 from Thermus '
 'thermophilusIdentifiersSymbolCRISPR_Cas2PfamPF09827InterProIPR019199CDDcd09638Available '
 'protein structures:Pfam\n'
 ' \xa0structures / ECOD\n'
 ' \xa0PDBRCSB PDB; PDBe; PDBjPDBsumstructure summary\n'
 'CRISPR-associated protein '
 'Cse1IdentifiersSymbolCRISPR_Cse1PfamPF09481InterProIPR013381CDDcd09729Available '
 'protein structures:Pfam\n'
 ' \xa0structures / ECOD\n'
 ' \xa0PDBRCSB PDB; PDBe; PDBjPDBsumstructure summary\n'
 'The cas genes in the adaptor and effector modules of the CRISPR-Cas system '
 'are believed to have evolved from two different ancestral modules. A '

In [None]:
# Print the outline of the sections built above: every heading, and for sections
# with children an indented line listing the child headings.
for s in sections:
  print(s.heading)
  if s.children:
    print('\t', [c.heading for c in s.children])

Contents
History
	 ['Repeated sequences', 'CRISPR-associated systems', 'Cas9', 'Cas12a (formerly Cpf1)', 'Cas13 (formerly C2c2)']
Repeated sequences
CRISPR-associated systems
Cas9
Cas12a (formerly Cpf1)
Cas13 (formerly C2c2)
Locus structure
	 ['Repeats and spacers', 'CRISPR RNA structures', 'Cas genes and CRISPR subtypes']
Repeats and spacers
CRISPR RNA structures
Cas genes and CRISPR subtypes
Mechanism
	 ['Spacer acquisition', 'Biogenesis', 'Interference']
Spacer acquisition
	 ['Protospacer adjacent motifs', 'Insertion variants']
Protospacer adjacent motifs
Insertion variants
Biogenesis
Interference
Evolution
	 ['Coevolution', 'Rates']
Coevolution
Rates
Identification
Use by phages
Applications
	 ['CRISPR gene editing', 'CRISPR as diagnostic tool']
CRISPR gene editing
CRISPR as diagnostic tool
See also
Notes
References
Further reading
External links


# Evolution of headings

In [None]:
articles = [
  CRISPR_gene_editing_en,
  CRISPR_en,
]

# XPath union selecting all heading elements h1..h6 below the root.
xpath_expression = "|".join(".//h{}".format(level) for level in range(1, 7))
for article in articles:
  # print(article.name)
  unique_headings = set()   # heading outlines seen so far
  evolving_headings = []    # one record per revision that introduces a new outline
  for revision in article.yield_revisions():
    # Outline of this revision: (tag, heading text without the '[edit]' link) per heading.
    headings = tuple(
      (heading.tag, ''.join(heading.itertext()).split('[edit]')[0].strip())
      for heading in revision.etree_from_html().xpath(xpath_expression)
    )
    if headings not in unique_headings:
      unique_headings.add(headings)
      evolving_headings.append(
          (revision.index, revision.timestamp.string, revision.revid, revision.url, headings)
      )
  df = pd.DataFrame(evolving_headings, columns=['revindx','timestamp','revid','url','new_headings',])
  # df.to_excel(path_extractions + '/wikipedia_entries/headings_history_{}.xlsx'.format(article.name))

  # Output: one block per new outline, headings indented by their level
  with open(path_extractions + '/wikipedia_entries/headings_history_{}.txt'.format(article.name), 'w') as writer:
    for indx, row in df.iterrows():
      # date (counter/revision index) url
      header = row.timestamp[:10] + ' ({}/{}) '.format(indx, row.revindx) + row.url + '\n\n'
      # indentation grows by three spaces per level below h2
      outline = ''.join(
        '\t{}{}'.format('   '*(int(tag[-1])-2) ,text) + '\n'
        for tag, text in row.new_headings
      )
      writer.write(header + outline + '\n')

# Revision history into table

In [None]:
articles = [
  CRISPR_en,
  CRISPR_gene_editing_en
]

# For every article: one row per revision, plus the size difference to the
# preceding revision, exported to an Excel sheet.
for article in articles:
  print(article.name)
  differences = article.calculate_revision_size_difference()
  revisions = list(
      (revision.index, revision.timestamp.string, revision.revid, revision.url, revision.user, revision.userid, revision.size, revision.comment)
      for revision in article.yield_revisions()
  )
  df = pd.DataFrame(
      revisions,
      columns=['revindx','timestamp','revid','url','user','userid','size','comment',],
  )
  # place 'diff' directly before 'comment' (position 7)
  # https://stackoverflow.com/questions/18674064/how-do-i-insert-a-column-at-a-specific-column-index-in-pandas
  df.insert(loc=df.columns.get_loc('comment'), column='diff', value=differences)
  target_file = path_extractions + '/wikipedia_entries/revision_history_{}.xlsx'.format(article.name)
  df.to_excel(target_file)
# display the table of the last article processed
df

CRISPR_en
CRISPR_gene_editing_en


Unnamed: 0,revindx,timestamp,revid,url,user,userid,size,diff,comment
0,0,2019-02-17 06:32:48,883727671,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,77238,77238,Split from CRISPR page
1,1,2019-02-17 06:33:54,883727741,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,77551,313,
2,2,2019-02-17 06:35:41,883727860,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,77771,220,rectify ref 8
3,3,2019-02-17 06:37:14,883727959,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,78031,260,/* Editing */ rectify ref 25
4,4,2019-02-17 06:39:37,883728113,https://en.wikipedia.org/w/index.php?title=CRI...,Helito,28440460,78024,-7,/* Editing */
...,...,...,...,...,...,...,...,...,...
194,194,2020-12-10 20:28:06,993476518,https://en.wikipedia.org/w/index.php?title=CRI...,Nisse Hulta,11218895,96234,243,Link to the Nobel Prize site
195,195,2020-12-15 10:27:37,994366355,https://en.wikipedia.org/w/index.php?title=CRI...,Smartse,1640548,95784,-450,rm [[WP:REFSPAM]] - see talk
196,196,2020-12-19 07:34:37,995107732,https://en.wikipedia.org/w/index.php?title=CRI...,Bagumba,8005368,95699,-85,/* top */ [[WP:NAMB]]
197,197,2020-12-22 17:57:54,995747724,https://en.wikipedia.org/w/index.php?title=CRI...,PaulTConley,35393367,95975,276,/* Predecessors */ Added detail on: zinc finge...


# Check which entry pointing to CRISPR really mentions CRISPR 

## Get incoming entries and users

In [None]:
# Collect the main-namespace pages that link to each target entry
# (Special:WhatLinksHere) and store titles and URLs per target.
target_entries = [
  'CRISPR',
  'CRISPR_gene_editing',
]

# Combined results over all targets.
incomming_entries_titles = []
incomming_entries_urls = []

for target in target_entries:
  print(target)
  # url = 'https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/{}&hideredirs=1&hidetrans=1&limit=500'.format(target) # note: only first 500...
  url = 'https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/{}&namespace=0&limit=500&hideredirs=1&hidetrans=1'.format(target) # note: only first 500...
  target_soup = make_soup(url)
  # Collected per target, so that each output file lists only the pages linking to
  # THIS target instead of everything gathered for earlier targets as well.
  target_titles = []
  target_urls = []
  for link_list in target_soup.find_all(attrs={"id": "mw-whatlinkshere-list"}):
    for anchor in link_list.find_all('a'):
      href = anchor.get('href', '') # anchors without href are skipped instead of raising KeyError
      # keep article links only: '/wiki/<title>' without a namespace prefix such as 'Talk:'
      if href.startswith('/wiki/') and ':' not in href:
        target_titles.append(href[6:]) # strip the leading '/wiki/'
        target_urls.append('https://en.wikipedia.org' + href)
  incomming_entries_titles.extend(target_titles)
  incomming_entries_urls.extend(target_urls)
  # Save incoming entries: sorted, one per line, no trailing newline
  with open(path_extractions + '/wikipedia_entries/{}_incomming_entries_titles.txt'.format(target), 'w') as f:
    f.write('\n'.join(sorted(target_titles)))
  with open(path_extractions + '/wikipedia_entries/{}_incomming_entries_urls.txt'.format(target), 'w') as f:
    f.write('\n'.join(sorted(target_urls)))

CRISPR
CRISPR_gene_editing


## Filter relevant entries

In [None]:
# Classify every incoming entry by where (if at all) 'CRISPR' occurs on its page:
#   TEXT        - in the visible text outside navigation boxes
#   ELSE        - only in the markup outside navigation boxes (e.g. link targets)
#   BOX         - only inside navigation / series boxes
#   NO_CRISPR   - nowhere on the page
#   NO_ENTRIES  - the page does not exist
def _save_titles(label, titles):
  """Write `titles` one per line (no trailing newline) to '<label>_incomming_entries_titles.txt'."""
  with open(path_extractions + '/wikipedia_entries/{}_incomming_entries_titles.txt'.format(label), 'w') as f:
    f.write('\n'.join(titles))

with open(path_extractions + '/wikipedia_entries/CRISPR_incomming_entries_titles.txt', 'r') as f:
    CRISPR_incomming_entries_titles = {i.strip() for i in f.readlines()}
with open(path_extractions + '/wikipedia_entries/CRISPR_gene_editing_incomming_entries_titles.txt', 'r') as f:
    CRISPR_gene_editing_incomming_entries_titles = {i.strip() for i in f.readlines()}
incomming_entries_titles = sorted(CRISPR_incomming_entries_titles | CRISPR_gene_editing_incomming_entries_titles)

relevant_entries_text = []
relevant_entries_else = []
irrelevant_entries = []
no_crispr = []
no_entries = []

for title in incomming_entries_titles:
  print(title)
  url = 'https://en.wikipedia.org/wiki/{}'.format(title)
  soup = make_soup(url)
  if 'Wikipedia does not have an article with this exact name.' in soup.text: # just a quick hack...
    print('\tNo Soup!')
    no_entries.append(title)
    continue
  if 'CRISPR' not in str(soup.html): # should not happen!
    print('\tNo CRISPR!')
    no_crispr.append(title)
    continue
  # kill navboxes and 'series' boxes
  # https://stackoverflow.com/questions/39885359/beautifulsoup-decompose
  for css_class in ["navbox", "sidebar vertical-navbox nomobile plainlist"]:
    for match in soup.find_all(attrs={"class": css_class}):
      match.decompose()
  # Check for relevance
  if 'CRISPR' not in str(soup.html):
    irrelevant_entries.append(title)
    continue
  if 'CRISPR' in soup.text:
    relevant_entries_text.append(title)
  else:
    relevant_entries_else.append(title)

# `relevant_entries` was never defined in this cell; build it from the two relevant lists
# so the cell no longer stops with a NameError before the files are saved.
relevant_entries = relevant_entries_text + relevant_entries_else
print('relevant_entries')
pp.pprint(relevant_entries)

# Save files
_save_titles('TEXT', relevant_entries_text)
_save_titles('ELSE', relevant_entries_else)
_save_titles('BOX', irrelevant_entries)
_save_titles('NO_CRISPR', no_crispr)
_save_titles('NO_ENTRIES', no_entries)

15-Cis-phytoene_desaturase
2010s
2012_in_science
2014_in_science
2015_in_science
2015_in_the_United_States
2016_in_science
2017_in_science
2017_in_the_United_States
2018_in_science
2019
2019_in_science
2020_in_science
ATUM
Aaron_Traywick
Accelerating_expansion_of_the_universe
Adam_Bogdanove
Addgene
Adenoviridae
Alex_Marson
Alu_element
Anatoly_B._Kolomeisky
Andrea_Crisanti_(scientist)
Anna_Dumitriu
Anti-CRISPR
Antibiotic
Antidote
Antisense_RNA
Antiviral_drug
April%E2%80%93June_2020_in_science
Arabidopsis_thaliana
Ardipithecus
Ardipithecus_ramidus
Ark_Invest
Artificial_cell
Asian_Scientist
Avian_malaria
BASF
BLESS
Bacteria
Bacterial_DNA_binding_protein
Bacterial_small_RNA
Bacteriophage
Behavioural_genetics
BioGRID
Biohub
Biotechnology_risk
Box_jellyfish
Breakthrough_of_the_Year
Brian_Hanley_(microbiologist)
Brief_Answers_to_the_Big_Questions
Bryan_R._Cullen
CAS
CAS2_(disambiguation)
CASPR
CITE-Seq
COVID-19_pandemic_in_Karnataka
COVID-19_testing
CRISPR
CRISPR-Display
CRISPR/Cas_Tools
CRIS

# Extract keyphrase contexts

## Define keyphrases

In [None]:
# Keyphrases whose left/right contexts are extracted from the article revisions.
# All entries are lowercase; the extraction cell of this notebook calls get_lr_contexts with lower=True.
# Some entries carry deliberate leading/trailing spaces to avoid matches inside longer words.
# Indented entries belong to the less indented entry above them (lab members, spelling variants, related terms).
keyphrases = [                              
        # discoveries
        'first',
        'discover',
        'invent',
        'independent',
        'history',
        # naming things
        'name', # CRISPR acronym
        'acronym', # CRISPR acronym
        # applications/ patents
        'application',
        'editing',
        'efforts to edit genomes',
        'engineering',
        'patent',
        'property',
        'right',
        'trial',
        'clinical',
        'patient',
        'approve', # e.g. FDA approved...
        # awards
        'award',
        'prize',
        'nobel',
        # people:
        'doudna', 
          'charpentier', # Doudna lab?
          'jinek', # Doudna lab/ first author of 2012 paper (Unsung Hero)
          'wiedenheft', # Doudna lab (Unsung Hero)
          'haurwitz', # Doudna lab (Unsung Hero)

        'church',
          'mali', # Church lab/ first author of 2013 paper (Unsung Hero)
          'yang', # Church lab/ first author of 2015 paper (mentioned in Wikipedia?) (Unsung Hero)

        'zhang', # Broad
          'cong', # Broad (Unsung Hero)
          ' ran ', # Broad (Unsung Hero); surrounding spaces so that only the standalone word matches
                
        'šikšnys', # (Unsung Hero)
            'šiksnys', # only first accent
            'sikšnys', # only second accent
            'siksnys', # no accents
          'gašiūnas', # Siksnys lab/ first author of 2012 paper (Unsung Hero)
            'gasiūnas', # only first accent
            'gašiunas', # only second accent
            'gasiunas', # no accents

        'horvath', # Danisco Boyz
          'barrangou', # Danisco Boyz
        
        'brouns',
        'mojica',
        'ishino',
        'jansen',
        'marraffini',
        'sontheimer',
        'koonin',
          'lamarck', # Koonin claimed that CRISPR was Lamarckian
        'moineau',
          'laval',
        'bolotin',
        'sorokin',
        'makarova',
        'pourcel',
          'adaptive immunity', # 2005 studies: 1) Pourcel, 2) Mojica, 3) Bolotin
          'plasmids', # 2005 studies: 1) Pourcel, 2) Mojica, 3) Bolotin
          'rejected', # 2005 studies: 1) Pourcel, 2) Mojica, 3) Bolotin
        'overbeek',
        'liang',
        ' xu', # added space to avoid false positives!
        'jiankui',
        'nishimasu',
        'shibata',
        'lander', # review
        # institutions:
        'osaka',
        'alicante',
        'danisco',
        'broad',
        'harvard',
        'university of california', 
        'berkeley', 
]

## Extract and cluster keyphrase contexts WIKIPEDIA

In [None]:
articles = [
  CRISPR_en,
  # CRISPR_gene_editing_en
]

# For every article and keyphrase: gather the left/right contexts of the keyphrase
# over all revisions, cluster them and export one Excel sheet per keyphrase.
for article in articles:
  print(article.name)
  for keyphrase in keyphrases:
    print('\t', keyphrase)
    # the two zeros are placeholders for left_id / right_id, which the clustering fills in
    lr_contexts = list(
      (revision.index, revision.timestamp.string, 0, left, keyphrase, right, 0, revision.revid, revision.url, revision.user, revision.userid, revision.comment,)
      for revision in article.yield_revisions()
      for left, right in revision.get_lr_contexts(keyphrase, width=100, lower=True)
    )
    df = cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp'])
    # sheets without any hit are marked with an 'EMPTY_' prefix in the file name
    empty_marker = '' if len(df) > 0 else 'EMPTY_'
    df.to_excel(path_extractions + '/keyphrase_contexts/{}/{}{}_{}.xlsx'.format(article.name, empty_marker, keyphrase, article.name))

CRISPR_en
	 laval


## Extract keyphrase contexts EXTERNAL ACCOUNTS

In [None]:
# External accounts of the CRISPR history: short label -> PubMed URL,
# built from (label, PubMed id) pairs.
accounts = {
  label: 'https://pubmed.ncbi.nlm.nih.gov/{}/'.format(pubmed_id)
  for label, pubmed_id in (
    ('Hsu_et_al_2014', '24906146'),
    ('Doudna_Charpentier_2014', '25430774'),
    ('Adli_2018', '29765029'),
    ('Ledford_2015', '26040877'),
    ('Pennisi_2013', '23970676'),
    ('Lander_2016', '26771483'),
    ('Mojica_Montoliu_2016', '27401123'),
    ('Mojica_Rodriguez-Valera_2016', '27234458'),
    ('Ishino_et_al_2018', '29358495'),
    ('van_Erp_et_al_2015', '25914022'),
    ('Morange_2015a', '25740136'),
    ('Sherkov_2015', '25748913'),
    ('Ledford_2016a', '27443723'),
    ('Han_She_2017', '29150001'),
    ('Morange_2015b', '26648028'),
    ('Liang_et_al_2017', '28623876'),
    ('Javed_et_al_2018', '30078067'),
    ('Ledford_2016b', '27652544'),
    ('Ledford_Callaway_2020', '33028993'),
  )
}
accounts

{'Adli_2018': 'https://pubmed.ncbi.nlm.nih.gov/29765029/',
 'Doudna_Charpentier_2014': 'https://pubmed.ncbi.nlm.nih.gov/25430774/',
 'Han_She_2017': 'https://pubmed.ncbi.nlm.nih.gov/29150001/',
 'Hsu_et_al_2014': 'https://pubmed.ncbi.nlm.nih.gov/24906146/',
 'Ishino_et_al_2018': 'https://pubmed.ncbi.nlm.nih.gov/29358495/',
 'Javed_et_al_2018': 'https://pubmed.ncbi.nlm.nih.gov/30078067/',
 'Lander_2016': 'https://pubmed.ncbi.nlm.nih.gov/26771483/',
 'Ledford_2015': 'https://pubmed.ncbi.nlm.nih.gov/26040877/',
 'Ledford_2016a': 'https://pubmed.ncbi.nlm.nih.gov/27443723/',
 'Ledford_2016b': 'https://pubmed.ncbi.nlm.nih.gov/27652544/',
 'Ledford_Callaway_2020': 'https://pubmed.ncbi.nlm.nih.gov/33028993/',
 'Liang_et_al_2017': 'https://pubmed.ncbi.nlm.nih.gov/28623876/',
 'Mojica_Montoliu_2016': 'https://pubmed.ncbi.nlm.nih.gov/27401123/',
 'Mojica_Rodriguez-Valera_2016': 'https://pubmed.ncbi.nlm.nih.gov/27234458/',
 'Morange_2015a': 'https://pubmed.ncbi.nlm.nih.gov/25740136/',
 'Morange_20

# Extract citation contexts

## Define citations

In [None]:
# Publications whose citation contexts are extracted from the article revisions.
# Each entry carries the identifiers a Wikipedia reference may be matched on
# (DOI, PMID, PMC) plus a short name that is used for the output file.
# An empty string means that this identifier is not set for the publication.
# Entries that are commented out are not processed.
citations = [
        # dict(DOI='10.1016/j.cell.2014.05.010', PMID='24906146', PMC='', name='Hsu_et_al_2014'),
        # dict(DOI='10.1126/science.1258096', PMID='25430774', PMC='', name='Doudna_Charpentier_2014'),
        # dict(DOI='10.1038/s41467-018-04252-2', PMID='29765029', PMC='5953931', name='Adli_2018'),
        dict(DOI='10.1038/522020a', PMID='26040877', PMC='', name='Ledford_2015'),
        # dict(DOI='10.1126/science.341.6148.833', PMID='23970676', PMC='', name='Pennisi_2013'),
        # dict(DOI='10.1016/j.cell.2015.12.041', PMID='26771483', PMC='', name='Lander_2016'),
        # dict(DOI='10.1016/j.tim.2016.06.005', PMID='27401123', PMC='', name='Mojica_Montoliu_2016'),
        # dict(DOI='10.1111/febs.13766', PMID='27234458', PMC='', name='Mojica_Rodriguez-Valera_2016'),
        # dict(DOI='10.1128/JB.00580-17', PMID='29358495', PMC='5847661', name='Ishino_et_al_2018'),
        dict(DOI='10.1016/j.coviro.2015.03.011', PMID='25914022', PMC='', name='van_Erp_et_al_2015'),
        dict(DOI='10.1007/s12038-015-9532-6', PMID='25740136', PMC='', name='Morange_2015a'),
        dict(DOI='10.1038/nbt.3160', PMID='25748913', PMC='', name='Sherkov_2015'),
        dict(DOI='10.1007/978-3-642-34657-6_1', PMID='', PMC='', name='Mojica_Garett_2013'),
        # dict(DOI='10.1038/535342a', PMID='27443723', PMC='', name='Ledford_2016a'),
        # dict(DOI='10.1016/bs.pmbts.2017.10.001', PMID='29150001', PMC='', name='Han_She_2017'),
        dict(DOI='10.1007/s12038-015-9575-8', PMID='26648028', PMC='', name='Morange_2015b'),
        # dict(DOI='10.1002/jgm.2963', PMID='28623876', PMC='', name='Liang_et_al_2017'),
        # dict(DOI='10.1007/s00284-018-1547-4', PMID='30078067', PMC='', name='Javed_et_al_2018'),
        # dict(DOI='10.1038/537460a', PMID='27652544', PMC='', name='Ledford_2016b'),
        dict(DOI='10.1007/978-3-642-34657-6_3', PMID='', PMC='', name='Makarova_Koonin_2013'),
        dict(DOI='10.1007/978-3-642-34657-6_11', PMID='', PMC='', name='Horvath_et_al_2013'),
        # dict(DOI='10.1038/d41586-020-02765-9', PMID='33028993', PMC='', name='Ledford_Callaway_2020'),
]

## Extract and cluster citation contexts

In [None]:
# Wikipedia article objects whose revision histories are searched for citations.
articles = [
  CRISPR_en,
  # CRISPR_gene_editing_en,
]

# For every article/citation pair: find the references of each revision that
# carry one of the citation's identifiers, look up the superscript (e.g. '[5]')
# under which the reference is cited in that revision, collect the left/right
# text context of every occurrence of that superscript, cluster the contexts and
# write one Excel file per citation into citation_contexts/<article name>/.
for article in articles:
  print(article.name)

  # Create the per-article output folder up front. Without it, to_excel raises
  # only after the (slow) scan over all revisions has already finished.
  out_dir = path_extractions + '/citation_contexts/{}'.format(article.name)
  os.makedirs(out_dir, exist_ok=True)

  for citation in citations:
    print('\t', citation['name'])

    # Only identifiers that are actually set can be used for matching.
    wanted_ids = {k: v for k, v in citation.items() if v and k in ('DOI', 'PMC', 'PMID')}

    lr_contexts = []
    for revision in article.yield_revisions():
      # Collect every matching reference exactly once per revision. Matching per
      # identifier type separately would process a reference that carries e.g.
      # both the DOI and the PMID twice and thus duplicate all of its contexts.
      matched_citations = set()
      for reference in revision.get_references():
        identifiers = reference.get_identifiers()
        if any(identifiers[id_type] == id_num for id_type, id_num in wanted_ids.items()):
          matched_citations.add(reference)

      for matched_citation in matched_citations:
        superscript = matched_citation.get_superscript(revision)
        # Skip references without a superscript BEFORE building the search
        # pattern; otherwise .strip() fails on None and an empty superscript
        # would trigger a pointless search for '[]'.
        if not superscript:
          continue

        # Turn '[5]' into the pattern '\[5\]'. Per the original author's note the
        # search string is consumed by a regex search (presumably re.finditer),
        # which would otherwise interpret the brackets as a character class.
        pattern = r'\[{}\]'.format(superscript.strip().strip('[]'))

        # The search runs on the lowercased text of the revision (lower=True).
        for left, right in revision.get_lr_contexts(pattern, width=100, lower=True):
          # The two 0 entries are placeholders; presumably they are the
          # left/right cluster ids filled in by cluster_lr_contexts -- TODO confirm.
          lr_contexts.append((
            revision.index, revision.timestamp.string,
            0, left, superscript, right, 0,
            revision.revid, revision.url, revision.user, revision.userid, revision.comment,
          ))

    df = cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp'])

    # Files without any hit get an 'EMPTY_' prefix so they are easy to spot.
    prefix = '' if len(df) > 0 else 'EMPTY_'
    df.to_excel(out_dir + '/{}{}_{}.xlsx'.format(prefix, citation['name'], article.name))

CRISPR_en
	 Ledford_2015
	 van_Erp_et_al_2015
	 Morange_2015a
	 Sherkov_2015
	 Mojica_Garett_2013
	 Morange_2015b
	 Makarova_Koonin_2013
	 Horvath_et_al_2013


# Experiments and snippets

In [None]:
# revision = CRISPR_en.get_revision(revid=725648335)
# print(revision.url, "\n")
# pp.pprint(revision.get_lr_contexts('was'))
# references = revision.get_references()
# reference = references[1]
# print("REFERENCE\n", reference.get_text())
# print("REFERENCE ID\n", reference.get_id())
# print("REFERENCE Number\n", reference.get_number_via_id())
# print("REFERENCE Superscript\n", reference.get_superscript(revision))

In [None]:
# def occurance_spans(keyphrase, article):
#   ''' 
#   Returns a list of time spans during which the keyphrase 'keyphrase' continuously appears in the article 'article'.
#   '''
#   gen = article.yield_revisions()
#   last_rev_was_match = False
#   spans = []
#   for revision in gen:
#     if keyphrase in revision.get_text():
#     # if keyphrase in i.get_text().split('\nLocus structure\n')[0]: # Trick 17 to exclude everything after the history section and practically search only in the history section
#       last_rev_that_matched = revision
#       if not last_rev_was_match:
#         span_beg = revision
#         last_rev_was_match = True
#     else:
#       if last_rev_was_match:
#         spans.append((span_beg, last_rev_that_matched))
#         last_rev_was_match = False
#   if last_rev_was_match:
#     spans.append((span_beg, last_rev_that_matched))
#   print(
#       '"{}" was found in the following revisions of {}:\n{}'.format(keyphrase, 'Test', '\n'.join(['Span {}: {} - {}'.format(indx + 1, revision[0].timestamp, revision[1].timestamp) for indx,revision in enumerate(spans)])) 
#       if spans else '"{}" was not found in any revision of "{}"'.format(keyphrase, 'Test')
#       )
#   return spans