## Set globals

In [1]:
# set paths (all below '/content/drive', the mount point passed to drive.mount() later in this cell)
path_extractions = '/content/drive/My Drive/WIKING/extractions'
path_git = '/content/drive/My Drive/WIKING/WIKING_git'
path_my_tools = '/content/drive/My Drive/my_tools'

# infiles: article extractions plus the two CSV sheets kept in the git folder
file_CRISPR_de = path_extractions + '/CRISPR_de' # get latest file version here: https://files.webis.de/wikipedia-tracing-innovations/
file_CRISPR_en = path_extractions + '/CRISPR_en' # get latest file version here: https://files.webis.de/wikipedia-tracing-innovations/
file_CRISPR_gene_editing_en = path_extractions + '/CRISPR_gene_editing_en' # get latest file version here: https://files.webis.de/wikipedia-tracing-innovations/
file_events = path_git + '/data/CRISPR_events - events.csv'
file_accounts = path_git + '/data/CRISPR_events - accounts.csv'

# outfiles: one Excel sheet of keyphrase contexts per English article
file_contexts_CRISPR_en = path_extractions + '/contexts_CRISPR_en.xlsx'
file_contexts_CRISPR_gene_editing_en = path_extractions + '/contexts_CRISPR_gene_editing_en.xlsx'

# mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# add git folder to path variable
import sys
sys.path.append(path_git + '/code')

# import Wolfgang's classes:
from article.article import Article

# import other modules
!pip install fuzzywuzzy # https://github.com/seatgeek/fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import networkx as nx
!pip install python-louvain
from community import community_louvain
import re
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import pprint
pp = pprint.PrettyPrinter(indent=4)

# set working directory to the git checkout
os.chdir(path_git)
os.getcwd() # last expression of the cell, so the new working directory is shown as cell output

Mounted at /content/drive
Collecting fuzzywuzzy
  Downloading https://files.pythonhosted.org/packages/43/ff/74f23998ad2f93b945c0309f825be92e04e0348e062026998b5eefef4c33/fuzzywuzzy-0.18.0-py2.py3-none-any.whl
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0






'/content/drive/My Drive/WIKING/WIKING_git'

## Functions

In [3]:
def find_all(text, keyphrase, overlapping=False): # https://stackoverflow.com/questions/4664850/how-to-find-all-occurrences-of-a-substring
  '''
  Yields the start index of every match of keyphrase in text, from left to right.

  text        -- string to search in
  keyphrase   -- substring to search for (compared case-sensitively)
  overlapping -- False (default): the search resumes behind the previous match,
                 True: the search resumes one character after the previous match start

  An empty keyphrase yields nothing.
  '''
  if not keyphrase:
    # an empty keyphrase "matches" at every position without ever advancing,
    # which would make the loop below spin forever on the same index
    return
  step = 1 if overlapping else len(keyphrase)
  start = text.find(keyphrase)
  while start != -1:
    yield start
    start = text.find(keyphrase, start + step)

In [6]:
def keyphrase_in_context(keyphrase, article, width=50, score_cutoff=75, scorer=fuzz.ratio):
  '''
  Returns df with one row per occurrence of keyphrase in any revision of article.

  keyphrase    -- phrase to look for; matched case-insensitively (revision text and
                  keyphrase are both lower-cased before searching)
  article      -- object providing yield_revisions(); each revision must provide
                  get_text(), index, timestamp.string, revid and url
  width        -- maximum number of characters kept on each side of a match
  score_cutoff -- minimum fuzzy score for two contexts to be linked as similar
  scorer       -- fuzzywuzzy scorer used for the comparison

  Columns: revindx, timestamp, left_id, left, keyphrase, right, right_id, revid, url.
  'left'/'right' are the stripped contexts, cut at the nearest line break.
  'left_id'/'right_id' are community ids: contexts linked by fuzzy similarity are
  partitioned with Louvain, every remaining context gets an id of its own.
  Ids are assigned separately for each side.

  Raises ValueError for an empty keyphrase.
  '''
  needle = keyphrase.lower()
  if not needle:
    raise ValueError('keyphrase must not be empty')
  contexts = []
  for revision in article.yield_revisions():
    text = revision.get_text().lower()
    for indx in find_all(text, needle):
      end = indx + len(needle)
      # clamp the start at 0: a negative start index counts from the END of text,
      # which empties the left context of matches close to the beginning
      left = text[max(0, indx - width):indx].rsplit('\n', 1)[-1].strip() # '\n' is the boundary for context
      right = text[end:end + width].split('\n', 1)[0].strip() # '\n' is the boundary for context
      contexts.append((revision.index, revision.timestamp.string, 0, left, needle, right, 0, revision.revid, revision.url,))
  df = pd.DataFrame(contexts, columns=['revindx','timestamp','left_id','left','keyphrase','right','right_id', 'revid','url',])
  # determine communities of similarity for left and right contexts
  for side in ['left', 'right']:
    unique_contexts = list(df[side].unique())
    G = nx.Graph()
    for position, query in enumerate(unique_contexts):
      # compare each context only with the ones after it, so every pair is scored once
      candidates = unique_contexts[position + 1:]
      if not candidates:
        continue
      scores = process.extractBests(query, candidates, score_cutoff=score_cutoff, limit=len(unique_contexts), scorer=scorer) # https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/process.py
      G.add_edges_from((query, match[0]) for match in scores)
    community_dic = community_louvain.best_partition(G)
    # add community_ids for lonesome contexts as well: they never entered G, so they
    # continue the numbering in order of first appearance
    next_id = max(community_dic.values()) + 1 if community_dic else 0
    for context in unique_contexts:
      if context not in community_dic:
        community_dic[context] = next_id
        next_id += 1
    # add community_ids to df (map also copes with an empty df)
    df['{}_id'.format(side)] = df[side].map(community_dic)
  return df

## Load data

In [7]:
# Wrap both English extraction files in the project's Article class.
CRISPR_en = Article(file_CRISPR_en)
CRISPR_gene_editing_en = Article(file_CRISPR_gene_editing_en)
# The event and account sheets are currently not loaded:
# events = pd.read_csv(file_events)
# accounts = pd.read_csv(file_accounts)

## Extract contexts

In [None]:
# Writes one Excel file of contexts per (article, keyphrase) pair.
# Un-comment the articles to be processed; with both entries disabled the loop does nothing.
for article, article_name in [
                # (CRISPR_en, 'CRISPR_en'),
                # (CRISPR_gene_editing_en, 'CRISPR_gene_editing_en'),
                ]:
  print(article_name)
  for keyphrase in [
                    # terms
                    'first',
                    'discover',
                    'patent',
                    'nobel',

                    # people:
                    'doudna', 
                    'charpentier', 
                    'zhang', 
                    'siksnys',
                      # 'purified', # in case Šikšnys is hard to find because of the spelling
                    'horvath', # also in connection with Šikšnys, but interesting in its own right
                    'barrangou', # also in connection with Šikšnys, but interesting in its own right
                    'church',
                    'lander', # review
                    
                    # institutions:
                    'broad',
                    'harvard',
                    'university of california', 
                    'berkeley',
                    
                    ]:
    print('\t' + keyphrase)
    df = keyphrase_in_context(keyphrase, article, width=100, score_cutoff=75)
    # sort_values returns a new frame; keep it so the file is actually written sorted
    df = df.sort_values(by=['left_id','timestamp'])
    df.to_excel(path_extractions + '/contexts_{}_{}.xlsx'.format(article_name, keyphrase))

CRISPR_en
	nobel


### CRISPR_en: "first"



In [None]:
# Contexts of "first" in the CRISPR article, written to Excel in extraction order.
df = keyphrase_in_context('first', CRISPR_en, width=50, score_cutoff=75)
df.to_excel(file_contexts_CRISPR_en)
# sort_values returns a new frame: only the displayed cell output is sorted, not df or the file
df.sort_values(by=['left_id','timestamp'])



Unnamed: 0,revindx,timestamp,left_id,left,keyphrase,right,right_id,revid,url
25,133,2012-01-07 04:36:56,0,n of the crispr-cas system in 2012[21] provided a,first,step toward realization of some of the several pr,33,470025144,https://en.wikipedia.org/w/index.php?title=CRI...
27,134,2012-08-01 19:27:33,0,n of the crispr-cas system in 2012[21] provided a,first,step toward realization of some of the several pr,33,505309163,https://en.wikipedia.org/w/index.php?title=CRI...
29,135,2012-08-22 21:44:35,0,n of the crispr-cas system in 2012[21] provided a,first,step toward realization of some of the several pr,33,508685743,https://en.wikipedia.org/w/index.php?title=CRI...
31,136,2012-12-06 19:12:03,0,n of the crispr-cas system in 2012[21] provided a,first,step toward realization of some of the several pr,33,526751001,https://en.wikipedia.org/w/index.php?title=CRI...
33,137,2012-12-11 08:13:19,0,n of the crispr-cas system in 2012[21] provided a,first,step toward realization of some of the several pr,33,527490124,https://en.wikipedia.org/w/index.php?title=CRI...
...,...,...,...,...,...,...,...,...,...
26284,2039,2020-11-02 21:24:41,75,th the newly acquired spacer inserted between the,first,and second direct repeats.[95][116],5,986762186,https://en.wikipedia.org/w/index.php?title=CRI...
26297,2040,2020-11-03 09:53:58,75,th the newly acquired spacer inserted between the,first,and second direct repeats.[95][116],5,986842914,https://en.wikipedia.org/w/index.php?title=CRI...
26310,2041,2020-11-03 20:16:34,75,th the newly acquired spacer inserted between the,first,and second direct repeats.[95][116],5,986927023,https://en.wikipedia.org/w/index.php?title=CRI...
26323,2042,2020-11-03 22:56:29,75,th the newly acquired spacer inserted between the,first,and second direct repeats.[95][116],5,986949106,https://en.wikipedia.org/w/index.php?title=CRI...


### CRISPR_gene_editing_en: "first"



In [None]:
# Contexts of "first" in the CRISPR gene editing article, written to Excel in extraction order.
df = keyphrase_in_context('first', CRISPR_gene_editing_en, width=50, score_cutoff=75)
df.to_excel(file_contexts_CRISPR_gene_editing_en)
# sort_values returns a new frame: only the displayed cell output is sorted, not df or the file
df.sort_values(by=['left_id','timestamp'])

Unnamed: 0,revindx,timestamp,left_id,left,keyphrase,right,right_id,revid,url
3,0,2019-02-17 06:32:48,0,340.2.[131] yang's white button mushroom was the,first,organism genetically modified with the crispr/cas,3,883727671,https://en.wikipedia.org/w/index.php?title=CRI...
10,1,2019-02-17 06:33:54,0,340.2.[131] yang's white button mushroom was the,first,organism genetically modified with the crispr/cas,3,883727741,https://en.wikipedia.org/w/index.php?title=CRI...
17,2,2019-02-17 06:35:41,0,340.2.[131] yang's white button mushroom was the,first,organism genetically modified with the crispr/cas,3,883727860,https://en.wikipedia.org/w/index.php?title=CRI...
24,3,2019-02-17 06:37:14,0,340.2.[132] yang's white button mushroom was the,first,organism genetically modified with the crispr/cas,3,883727959,https://en.wikipedia.org/w/index.php?title=CRI...
31,4,2019-02-17 06:39:37,0,340.2.[131] yang's white button mushroom was the,first,organism genetically modified with the crispr/cas,3,883728113,https://en.wikipedia.org/w/index.php?title=CRI...
...,...,...,...,...,...,...,...,...,...
1447,184,2020-10-23 09:36:41,13,ay:table-row}.mw-parser-output .portal>ul>li>s...,first,-child{display:table-cell;padding:0.2em;vertic...,15,984993980,https://en.wikipedia.org/w/index.php?title=CRI...
1458,185,2020-11-02 04:29:36,13,ay:table-row}.mw-parser-output .portal>ul>li>s...,first,-child{display:table-cell;padding:0.2em;vertic...,15,986649176,https://en.wikipedia.org/w/index.php?title=CRI...
1469,186,2020-11-02 10:34:00,13,ay:table-row}.mw-parser-output .portal>ul>li>s...,first,-child{display:table-cell;padding:0.2em;vertic...,15,986682078,https://en.wikipedia.org/w/index.php?title=CRI...
1480,187,2020-11-02 10:35:18,13,ay:table-row}.mw-parser-output .portal>ul>li>s...,first,-child{display:table-cell;padding:0.2em;vertic...,15,986682164,https://en.wikipedia.org/w/index.php?title=CRI...


# Old stuff...

### Occurrence spans

In [None]:
def occurance_spans(keyphrase, article):
  '''
  Returns a list of spans during which keyphrase continuously appears in article.

  keyphrase -- substring looked up (case-sensitively) in the text of every revision
  article   -- object providing yield_revisions(); each revision must provide
               get_text() and timestamp

  Each span is a tuple (first_revision, last_revision) of one uninterrupted run of
  matching revisions, in the order in which yield_revisions() delivers them.
  A summary of the spans is printed as a side effect.
  '''
  spans = []
  span_beg = None # first revision of the run that is currently open, None if no run is open
  last_rev_that_matched = None
  for revision in article.yield_revisions():
    # searching only text.split('\nLocus structure\n')[0] instead would restrict the
    # lookup to everything before that heading (practically the history section)
    if keyphrase in revision.get_text():
      if span_beg is None:
        span_beg = revision
      last_rev_that_matched = revision
    elif span_beg is not None:
      spans.append((span_beg, last_rev_that_matched))
      span_beg = None
  if span_beg is not None:
    # the keyphrase is still present in the last revision: close the open run
    spans.append((span_beg, last_rev_that_matched))
  # report the real article name; the old placeholder remains the fallback
  article_name = getattr(article, 'name', 'Test')
  if spans:
    listing = '\n'.join('Span {}: {} - {}'.format(number, first.timestamp, last.timestamp) for number, (first, last) in enumerate(spans, start=1))
    print('"{}" was found in the following revisions of {}:\n{}'.format(keyphrase, article_name, listing))
  else:
    print('"{}" was not found in any revision of "{}"'.format(keyphrase, article_name))
  return spans

### Unique contexts

In [None]:
def unique_contexts(keyphrase, article, width=50):
  ''' 
  Returns a pandas df with three columns:
  1) Context: unique contexts of width in which keyphrase appears in article
  2) Timestamp: timestamp for first occurrence of each context
  3) Url: url for first occurrence of each context

  keyphrase -- phrase to look for; matched case-insensitively
  article   -- object providing name and yield_revisions(); each revision must
               provide get_text(), timestamp.string and url
  width     -- maximum number of characters kept on each side of a match

  In-text references like '[23]' are removed before searching. A context is
  "left + keyphrase + right", lower-cased, cut at the nearest line break on
  both sides and stripped. The number of contexts found is printed.

  Raises ValueError for an empty keyphrase.
  '''
  needle = keyphrase.lower()
  if not needle:
    raise ValueError('keyphrase must not be empty')
  kill = re.compile(r'\s*\[.*?\]')
  occurrence = re.compile(re.escape(needle)) # literal, non-overlapping matches
  seen = set()
  rows = []
  for revision in article.yield_revisions():
    # kill intext refs like '[23]', then lower-case; searching the text in the same
    # form in which the contexts are cut keeps the match indices valid
    text = kill.sub('', revision.get_text()).lower()
    for match in occurrence.finditer(text):
      # clamp the start at 0: a negative start index counts from the END of text,
      # which empties the left context of matches close to the beginning
      before = text[max(0, match.start() - width):match.start()]
      after = text[match.end():match.end() + width]
      # make '\n' the boundary for context
      context = (before.rsplit('\n', 1)[-1] + needle + after.split('\n', 1)[0]).strip()
      # add unique contexts and add their meta data
      if context not in seen:
        seen.add(context)
        rows.append((context, revision.timestamp.string, revision.url))
  print('Found {} unique contexts for "{}" in article "{}".'.format(len(rows), keyphrase, article.name))
  return pd.DataFrame(rows, columns=['Context', 'Timestamp', 'Url'])

### Left-right contexts

In [None]:
def lr_context_graph(keyphrase, article, width=50):
  ''' 
  Returns networkx DiGraph for links between contexts to the left and to the right of keyphrase

  keyphrase -- phrase to look for; matched case-insensitively (revision text and
               keyphrase are both lower-cased before searching)
  article   -- object providing yield_revisions(); each revision must provide
               get_text(), timestamp.string and url
  width     -- maximum number of characters kept on each side of a match

  Nodes are stripped context strings, cut at the nearest line break. Each node
  carries the lists 'as_left' and 'as_right' with one (timestamp, url) tuple per
  occurrence in that role. Each edge left -> right carries the list 'occurrences'
  built the same way. The keyphrase as passed in is stored as graph attribute
  'keyphrase'.

  Raises ValueError for an empty keyphrase.
  '''
  needle = keyphrase.lower()
  if not needle:
    raise ValueError('keyphrase must not be empty')
  DG = nx.DiGraph(keyphrase=keyphrase)
  for revision in article.yield_revisions():
    text = revision.get_text().lower()
    metadata = (revision.timestamp.string, revision.url)
    for indx in find_all(text, needle):
      end = indx + len(needle)
      # clamp the start at 0: a negative start index counts from the END of text,
      # which empties the left context of matches close to the beginning
      left = text[max(0, indx - width):indx].rsplit('\n', 1)[-1].strip() # '\n' is the boundary for context
      right = text[end:end + width].split('\n', 1)[0].strip() # '\n' is the boundary for context
      # add/update nodes for both contexts
      for node, role in ((left, 'as_left'), (right, 'as_right')):
        if node not in DG:
          DG.add_node(node, as_left=[], as_right=[])
        DG.nodes[node][role].append(metadata)
      # add/update edge from context left to context right
      if DG.has_edge(left, right):
        DG.edges[left, right]['occurrences'].append(metadata)
      else:
        DG.add_edge(left, right, occurrences=[metadata])
  return DG

### Node communities

In [None]:
def node_community_dic(DG, score_cutoff=90, scorer=fuzz.ratio):
  '''
  Returns a dictionary mapping context nodes of DG to a community id.

  DG           -- graph whose nodes carry the lists 'as_left' and 'as_right'
  score_cutoff -- minimum fuzzy score for two contexts to be linked as similar
  scorer       -- fuzzywuzzy scorer used for the comparison

  Left contexts are compared with left contexts only, right contexts with right
  contexts only. Every pair reaching score_cutoff becomes an edge of an undirected
  similarity graph, which is then partitioned with Louvain. Nodes without any
  sufficiently similar partner never enter that graph.
  '''
  G = nx.Graph()
  for side in ['left', 'right']:
    choices = [node for node, data in DG.nodes.data() if data['as_{}'.format(side)]]
    for position, query in enumerate(choices):
      # compare each context only with the ones after it, so every pair is scored once
      candidates = choices[position + 1:]
      if not candidates:
        continue
      scores = process.extractBests(query, candidates, score_cutoff=score_cutoff, limit=len(choices), scorer=scorer) # https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/process.py
      G.add_edges_from((query, match[0]) for match in scores)
  return community_louvain.best_partition(G)

### Master node for each community

In [None]:
def master_nodes_dic(DG, ncd):
  '''
  Returns dictionary with community_ids as keys and nodes + latest occurrence as value

  DG  -- graph whose nodes carry the lists 'as_left' and 'as_right' of
         (timestamp, url) tuples
  ncd -- dictionary mapping nodes of DG to community ids

  For each community the winner is the tuple (node, latest) with the greatest
  'latest'; ties are broken by the greater node string. 'latest' is the greater
  of the timestamps of the LAST 'as_left' and the LAST 'as_right' entry, so both
  lists are presumed to be in chronological order -- TODO confirm against the
  code that fills them.
  '''
  # group the nodes by community id (reads the parameter ncd, not a global)
  members = {}
  for node, community_id in ncd.items():
    members.setdefault(community_id, []).append(node)
  result = {}
  for community_id, nodes in members.items():
    candidates = []
    for node in nodes:
      attrs = DG.nodes[node]
      latest = max(attrs[role][-1][0] for role in ('as_left', 'as_right') if attrs[role])
      candidates.append((node, latest))
    result[community_id] = max(candidates, key=lambda item: (item[1], item[0]))
  return result

### Reduced LR contexts graph (using ``lr_context_graph``, ``node_community_dic``, and ``master_nodes_dic``)

In [None]:
def reduce_lr_context_graph(DG, ncd, mnd):
  '''
  Placeholder, not implemented yet: does nothing and returns None.
  '''
  pass

## Load data

In [None]:
# Wrap both English extraction files in the project's Article class
# (same statements as in the "Load data" cell further up).
CRISPR_en = Article(file_CRISPR_en)
CRISPR_gene_editing_en = Article(file_CRISPR_gene_editing_en)

# The event and account sheets are currently not loaded:
# events = pd.read_csv(file_events)
# accounts = pd.read_csv(file_accounts)

## Experiments

### Extract LR contexts and merge nodes 

In [None]:
# Pipeline: left/right context graph -> community id per node -> one representative node per community
DG = lr_context_graph('first', CRISPR_gene_editing_en, width=50)
ncd = node_community_dic(DG)
# sorted(ncd.items(), key=lambda item:(item[1], item[0]))
mnd = master_nodes_dic(DG, ncd)
mnd # last expression of the cell: displayed as cell output

{0: ("340.2.[145] yang's white button mushroom was the",
  '2020-11-02 10:35:49'),
 1: ('t position 8 (help); no-break space character in |',
  '2020-01-31 21:33:44'),
 2: ('demonstrated in 2012 in:[25][26]', '2019-02-18 01:48:30'),
 3: ('organism genetically modified with the crispr-cas',
  '2020-11-02 10:35:49'),
 4: ('= at position 5 (help); no-break space character i',
  '2020-01-31 21:33:44')}

### Older stuff

In [None]:
# Unique contexts of "first" per article, each written to its Excel outfile.
# contexts_first is reused: after this cell it holds the gene editing result only.
contexts_first = unique_contexts('first', CRISPR_en, width=50)
contexts_first.to_excel(file_contexts_CRISPR_en)

contexts_first = unique_contexts('first', CRISPR_gene_editing_en, width=50)
contexts_first.to_excel(file_contexts_CRISPR_gene_editing_en)

Found 143 unique contexts for "first" in article "CRISPR_en".
Found 23 unique contexts for "first" in article "CRISPR_gene_editing_en".


In [None]:
# Spans of revisions whose text contains 'CRISPR Timeline' (case-sensitive substring test)
timeline = occurance_spans('CRISPR Timeline', CRISPR_en)

"CRISPR Timeline" was found in the following revisions of Test:
Span 1: {'datetime': datetime.datetime(2016, 9, 28, 9, 26, 29),
 'day': 28,
 'hour': 9,
 'minute': 26,
 'month': 9,
 'second': 29,
 'string': '2016-09-28 09:26:29',
 'year': 2016} - {'datetime': datetime.datetime(2016, 12, 28, 8, 5, 49),
 'day': 28,
 'hour': 8,
 'minute': 5,
 'month': 12,
 'second': 49,
 'string': '2016-12-28 08:05:49',
 'year': 2016}
Span 2: {'datetime': datetime.datetime(2019, 3, 18, 14, 33, 42),
 'day': 18,
 'hour': 14,
 'minute': 33,
 'month': 3,
 'second': 42,
 'string': '2019-03-18 14:33:42',
 'year': 2019} - {'datetime': datetime.datetime(2019, 3, 18, 14, 34, 29),
 'day': 18,
 'hour': 14,
 'minute': 34,
 'month': 3,
 'second': 29,
 'string': '2019-03-18 14:34:29',
 'year': 2019}


In [None]:
# history_trick17 = occurance_spans('\nLocus structure\n', CRISPR_en)

"
Locus structure
" was not found in any revision of "Test"


In [None]:
# Spans of revisions that contain 'Locus structure' on a line of its own
history = occurance_spans('\nLocus structure\n', CRISPR_en)

"
Locus structure
" was found in the following revisions of Test:
Span 1: {'datetime': datetime.datetime(2013, 11, 24, 21, 3, 53),
 'day': 24,
 'hour': 21,
 'minute': 3,
 'month': 11,
 'second': 53,
 'string': '2013-11-24 21:03:53',
 'year': 2013} - {'datetime': datetime.datetime(2016, 9, 22, 22, 1, 43),
 'day': 22,
 'hour': 22,
 'minute': 1,
 'month': 9,
 'second': 43,
 'string': '2016-09-22 22:01:43',
 'year': 2016}
Span 2: {'datetime': datetime.datetime(2016, 9, 23, 0, 51, 24),
 'day': 23,
 'hour': 0,
 'minute': 51,
 'month': 9,
 'second': 24,
 'string': '2016-09-23 00:51:24',
 'year': 2016} - {'datetime': datetime.datetime(2017, 9, 18, 8, 58, 26),
 'day': 18,
 'hour': 8,
 'minute': 58,
 'month': 9,
 'second': 26,
 'string': '2017-09-18 08:58:26',
 'year': 2017}
Span 3: {'datetime': datetime.datetime(2017, 9, 18, 9, 0, 40),
 'day': 18,
 'hour': 9,
 'minute': 0,
 'month': 9,
 'second': 40,
 'string': '2017-09-18 09:00:40',
 'year': 2017} - {'datetime': datetime.datetime(2018, 9, 24, 

In [None]:
# Spans of revisions whose text contains 'Mojica' (case-sensitive substring test)
mojica = occurance_spans('Mojica', CRISPR_en)
# NOTE(review): the disabled loop below iterates `doudna`, which this cell does not define
# for span in doudna:
#   print(span[0].timestamp, span[1].timestamp)

"Mojica" was found in the following revisions of Test:
Span 1: 2010-01-11T02:11:54Z - 2020-09-02T08:32:27Z
