## Set globals

In [1]:
# Base directories on the mounted Google Drive
path_extractions = '/content/drive/My Drive/WIKING/extractions'
path_git = '/content/drive/My Drive/WIKING/WIKING_git'
path_my_tools = '/content/drive/My Drive/my_tools'

# Input files
# Wikipedia article dumps; get the latest file versions here:
# https://files.webis.de/wikipedia-tracing-innovations/
file_CRISPR_de = '{}/wikipedia_entries/CRISPR_de'.format(path_extractions)
file_CRISPR_en = '{}/wikipedia_entries/CRISPR_en'.format(path_extractions)
file_CRISPR_gene_editing_en = '{}/wikipedia_entries/CRISPR_gene_editing_en'.format(path_extractions)
# Tables kept in the git folder
file_events = '{}/data/CRISPR_events - events.csv'.format(path_git)
file_accounts = '{}/data/CRISPR_events - accounts.csv'.format(path_git)

# mount Google Drive
# All paths configured above live under /content/drive, so the drive has to be
# mounted before anything is imported or loaded from those locations.
from google.colab import drive
drive.mount('/content/drive')

# add git folder to path variable
# (appends <path_git>/code to the module search path so that modules stored
# there can be imported below)
import sys
sys.path.append(path_git + '/code')

# import Wolfgang's classes:
from article.article import Article

# import other modules
!pip install fuzzywuzzy # https://github.com/seatgeek/fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import networkx as nx
!pip install python-louvain
from community import community_louvain
import re
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import pprint
pp = pprint.PrettyPrinter(indent=4)
from lxml import etree

# set working directory
# From here on, relative paths resolve against the git folder.
os.chdir(path_git)
# NOTE(review): the return value is not used; since this is not the last
# expression of the cell it is presumably not displayed either.
os.getcwd()

# Load data
# NOTE(review): Article(...) presumably reads the article's revision history
# from the given file -- confirm in article/article.py.
CRISPR_en = Article(file_CRISPR_en)
CRISPR_gene_editing_en = Article(file_CRISPR_gene_editing_en)
# The event and account tables are currently not loaded:
# events = pd.read_csv(file_events)
# accounts = pd.read_csv(file_accounts)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).






## Experiments and snippets

In [6]:
# revision = CRISPR_en.get_revision(revid=725648335)
# print(revision.url, "\n")
# references = revision.get_references()
# reference = references[1]
# print("REFERENCE\n", reference.get_text())
# print("REFERENCE ID\n", reference.get_id())
# print("REFERENCE Number\n", reference.get_number_via_id())
# print("REFERENCE Superscript\n", reference.get_superscript(revision))

https://en.wikipedia.org/w/index.php?title=CRISPR&oldid=725648335 

REFERENCE
 Horvath P, Barrangou R (January 2010). "CRISPR/Cas, the immune system of bacteria and archaea". Science. 327 (5962): 167–70. Bibcode:2010Sci...327..167H. doi:10.1126/Science.1179555. PMID 20056882.
REFERENCE ID
 cite_note-pmid20056882-1
REFERENCE Number
 1
REFERENCE Superscript
 [1]


'[1]'

In [None]:
# def occurance_spans(keyphrase, article):
#   ''' 
#   Returns a list of time spans during which the keyphrase 'keyphrase' continuously appears in the article 'article'.
#   '''
#   gen = article.yield_revisions()
#   last_rev_was_match = False
#   spans = []
#   for revision in gen:
#     if keyphrase in revision.get_text():
#     # if keyphrase in i.get_text().split('\nLocus structure\n')[0]: # trick to cut off everything after the history section, i.e. to practically search only in the history section
#       last_rev_that_matched = revision
#       if not last_rev_was_match:
#         span_beg = revision
#         last_rev_was_match = True
#     else:
#       if last_rev_was_match:
#         spans.append((span_beg, last_rev_that_matched))
#         last_rev_was_match = False
#   if last_rev_was_match:
#     spans.append((span_beg, last_rev_that_matched))
#   print(
#       '"{}" was found in the following revisions of {}:\n{}'.format(keyphrase, 'Test', '\n'.join(['Span {}: {} - {}'.format(indx + 1, revision[0].timestamp, revision[1].timestamp) for indx,revision in enumerate(spans)])) 
#       if spans else '"{}" was not found in any revision of "{}"'.format(keyphrase, 'Test')
#       )
#   return spans

In [None]:
# def find_all(text, keyphrase): # https://stackoverflow.com/questions/4664850/how-to-find-all-occurrences-of-a-substring
#   ''' Returns indices for all matches of keyphrase '''
#   start = 0
#   while True:
#     start = text.find(keyphrase, start)
#     if start == -1: 
#       return
#     yield start
#     start += len(keyphrase) # use start += 1 to find overlapping matches

## Functions

In [8]:
def cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp']):
  '''
  Groups similar left and similar right contexts into communities and returns them as a sorted DataFrame.

  For each side ('left', 'right') the unique context strings are compared pairwise with
  fuzzywuzzy; every pair scoring at least 'score_cutoff' becomes an edge of a graph, and the
  Louvain communities of that graph become the ids written to the 'left_id' / 'right_id'
  column. Contexts without any sufficiently similar partner get an id of their own.

  lr_contexts:  iterable of 12-tuples in the column order
                (revindx, timestamp, left_id, left, keyphrase, right, right_id, revid, url, user, userid, comment);
                the incoming left_id / right_id values are placeholders and get overwritten
  score_cutoff: minimum similarity score for two contexts to be linked
  scorer:       fuzzywuzzy scoring function used for the comparison
  sort_by:      column name(s) the result is sorted by (the argument is not modified)

  Returns a new DataFrame with one row per entry of lr_contexts, sorted by 'sort_by'.
  '''
  columns = ['revindx','timestamp','left_id','left','keyphrase','right','right_id','revid','url','user','userid','comment',]
  df = pd.DataFrame(lr_contexts, columns=columns)
  for side in ['left', 'right']:
    # determine communities of similarity for this side's contexts
    G = nx.Graph()
    unique_contexts = list(df[side].unique())
    length = len(unique_contexts)
    for position, query in enumerate(unique_contexts):
      # Similarity is symmetric, so each context is only compared with the ones after it.
      candidates = unique_contexts[position + 1:]
      if not candidates:
        continue # last context: nothing left to compare with
      scores = process.extractBests(query, candidates, score_cutoff=score_cutoff, limit=length, scorer=scorer) # https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/process.py
      G.add_edges_from((query, match[0]) for match in scores)
    community_dic = community_louvain.best_partition(G)
    # Contexts without a similar partner never became a node of G and therefore have no
    # community yet. Give each of them its own id, continuing after the highest community id
    # and following the order of first appearance so that the numbering is reproducible.
    next_id = max(community_dic.values()) + 1 if community_dic else 0
    for context in unique_contexts:
      if context not in community_dic:
        community_dic[context] = next_id
        next_id += 1
    # add community ids to df (fills the 'left_id' / 'right_id' column)
    df['{}_id'.format(side)] = df[side].map(community_dic)
  # Sort only after BOTH sides have their ids; sort_values returns a new frame, so its result must be returned.
  return df.sort_values(by=sort_by)

## Extract keyphrase contexts

### Define keyphrases

In [25]:
# Keyphrases whose left/right contexts are extracted from the article revisions.
# Only the entries that are not commented out are processed; the commented-out
# entries are kept as a catalogue of further candidates.
# NOTE(review): all entries are lower-case, presumably because the extraction
# below is called with lower=True -- confirm before adding mixed-case entries.
keyphrases = [                              
        # discoveries
        # 'first',
        'discover',
        # # naming things
        # 'name', # CRISPR acronym
        # 'acronym', # CRISPR acronym
        # # patents
        # 'patent',
        # 'property',
        # 'right',
        # # clinical trials
        # 'trial',
        # 'clinical',
        # 'patient',
        # 'approve', # e.g. FDA approved...
        # # awards
        # 'award',
        # 'prize',
        # 'nobel',
        # # people:
        # 'doudna', 
        # 'charpentier', 
        # 'zhang', 
        # 'šikšnys', # with all accents
        #   'šiksnys', # only first accent
        #   'sikšnys', # only second accent
        #   'siksnys', # no accents
        # 'gašiūnas', # with all accents
        #   'gasiūnas', # only first accent
        #   'gašiunas', # only second accent
        #   'gasiunas', # no accents
        # 'horvath', # also in connection with Šikšnys, but interesting in its own right
        # 'barrangou', # also in connection with Šikšnys, but interesting in its own right
        # 'church',
        # 'brouns',
        # 'mojica',
        # 'ishino',
        # 'jansen',
        # 'marraffini',
        # 'sontheimer',
        # 'koonin',
        # 'moineau',
        # 'bolotin',
        # 'sorokin',
        # 'makarova',
        # 'pourcel',
        # 'overbeek',
        # 'liang',
        # ' xu', # added space!
        # 'jiankui',
        # 'nishimasu',
        # 'shibata',
        # 'lander', # review
        # # institutions:
        # 'osaka',
        # 'alicante',
        # 'danisco',
        # 'broad',
        # 'harvard',
        # 'university of california', 
        # 'berkeley', 
]

### Extract and cluster keyphrase contexts

In [26]:
# Articles whose revision histories are searched for the keyphrases
articles = [
  # CRISPR_en,
  CRISPR_gene_editing_en
]

for article in articles:
  print(article.name)
  for keyphrase in keyphrases:
    print('\t', keyphrase)
    # One row per keyphrase hit in any revision; the two 0 entries are placeholders
    # for left_id / right_id, which cluster_lr_contexts is expected to fill in.
    lr_contexts = []
    for revision in article.yield_revisions():
      for left, right in revision.get_lr_contexts(keyphrase, width=100, lower=True):
        lr_contexts.append(
          (revision.index, revision.timestamp.string, 0, left, keyphrase, right, 0, revision.revid, revision.url, revision.user, revision.userid, revision.comment,)
        )
    df = cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp'])
    # One spreadsheet per keyphrase and article
    df.to_excel(path_extractions + '/keyphrase_contexts/{}_{}.xlsx'.format(keyphrase, article.name))

CRISPR_gene_editing_en
	 discover




## Extract citation contexts

### Define citations

In [35]:
# Publications whose citation contexts are extracted. Every entry is a dict with
# the keys 'DOI', 'PMID', 'PMC' and 'name'; an identifier that is not set is ''.
# Rows are written as (DOI, PMID, PMC, name); commented-out rows are skipped.
citations = [
        dict(zip(('DOI', 'PMID', 'PMC', 'name'), row))
        for row in [
                # ('10.1016/j.cell.2014.05.010',   '24906146',  '',         'Hsu_et_al_2014'),
                ('10.1126/science.1258096',       '25430774',  '',         'Doudna_Charpentier_2014'),
                ('10.1038/s41467-018-04252-2',    '29765029',  '5953931',  'Adli_2018'),
                # ('10.1038/522020a',              '26040877',  '',         'Ledford_2015'),
                ('10.1126/science.341.6148.833',  '23970676',  '',         'Pennisi_2013'),
                ('10.1016/j.cell.2015.12.041',    '26771483',  '',         'Lander_2016'),
                ('10.1016/j.tim.2016.06.005',     '27401123',  '',         'Mojica_Montoliu_2016'),
                ('10.1111/febs.13766',            '27234458',  '',         'Mojica_Rodriguez-Valera_2016'),
                ('10.1128/JB.00580-17',           '29358495',  '5847661',  'Ishino_et_al_2018'),
                ('10.1016/j.coviro.2015.03.011',  '25914022',  '',         'van_Erp_et_al_2015'),
                ('10.1007/s12038-015-9532-6',     '25740136',  '',         'Morange_2015a'),
                ('10.1038/nbt.3160',              '25748913',  '',         'Sherkov_2015'),
                ('10.1007/978-3-642-34657-6_1',   '',          '',         'Mojica_Garett_2013'),
                ('10.1038/535342a',               '27443723',  '',         'Ledford_2016a'),
                ('10.1016/bs.pmbts.2017.10.001',  '29150001',  '',         'Han_She_2017'),
                ('10.1007/s12038-015-9575-8',     '26648028',  '',         'Morange_2015b'),
                ('10.1002/jgm.2963',              '28623876',  '',         'Liang_et_al_2017'),
                ('10.1007/s00284-018-1547-4',     '30078067',  '',         'Javed_et_al_2018'),
                ('10.1038/537460a',               '27652544',  '',         'Ledford_2016b'),
                ('10.1007/978-3-642-34657-6_3',   '',          '',         'Makarova_Koonin_2013'),
                ('10.1007/978-3-642-34657-6_11',  '',          '',         'Horvath_et_al_2013'),
                ('10.1038/d41586-020-02765-9',    '33028993',  '',         'Ledford_Callaway_2020'),
        ]
]

### Extract and cluster citation contexts

In [34]:
# Articles whose revision histories are searched for the citations
articles = [    
  CRISPR_en,
  # CRISPR_gene_editing_en
]

# Keys of a citation dict that hold identifiers usable for matching a reference
citation_id_types = ('DOI', 'PMC', 'PMID')

def _reference_matches(reference, wanted_ids):
  '''
  Returns True if at least one identifier of 'reference' equals the corresponding
  identifier in 'wanted_ids' (a dict mapping identifier type to identifier value).
  '''
  identifiers = reference.get_identifiers()
  return any(identifiers[id_type] == id_num for id_type, id_num in wanted_ids.items())

for article in articles:
  print(article.name)
  for citation in citations:
    print('\t', citation['name'])
    # Non-empty identifiers of this citation. They do not depend on the revision,
    # so they are determined once per citation instead of once per revision.
    wanted_ids = {k: v for k, v in citation.items() if v and k in citation_id_types}
    lr_contexts = []
    for revision in article.yield_revisions():
      # Collect every matching reference exactly once, in the order in which the revision lists them.
      # A reference that matches via several identifier types (e.g. DOI and PMID) must not be
      # collected once per type, otherwise all of its contexts end up in the table multiple times.
      target_matches = dict.fromkeys(
        reference for reference in revision.get_references() if _reference_matches(reference, wanted_ids)
      )
      for target_match in target_matches:
        # The superscript is the same for all contexts of this reference, so look it up only once.
        superscript = target_match.get_superscript(revision)
        for left, right in revision.get_lr_contexts(superscript, width=100, lower=True):
          # The two 0 entries are placeholders for left_id / right_id.
          lr_contexts.append(
            (revision.index, revision.timestamp.string, 0, left, superscript, right, 0, revision.revid, revision.url, revision.user, revision.userid, revision.comment,)
          )
    df = cluster_lr_contexts(lr_contexts, score_cutoff=75, scorer=fuzz.ratio, sort_by=['left_id','timestamp'])
    # One spreadsheet per citation and article
    df.to_excel(path_extractions + '/citation_contexts/{}_{}.xlsx'.format( citation['name'], article.name))

CRISPR_en
	 Hsu_et_al_2014




	 Ledford_2015
