## Set globals

In [122]:
# Base directories on the mounted Google Drive
path_extractions = '/content/drive/My Drive/WIKING/extractions'
path_git = '/content/drive/My Drive/WIKING/WIKING_git'
path_my_tools = '/content/drive/My Drive/my_tools'

# Input files.
# The latest versions of the three CRISPR extraction files can be downloaded
# from https://files.webis.de/wikipedia-tracing-innovations/
file_CRISPR_de = f'{path_extractions}/CRISPR_de'
file_CRISPR_en = f'{path_extractions}/CRISPR_en'
file_CRISPR_gene_editing_en = f'{path_extractions}/CRISPR_gene_editing_en'
# The event and account tables live in the git checkout.
file_events = f'{path_git}/data/CRISPR_events - events.csv'
file_accounts = f'{path_git}/data/CRISPR_events - accounts.csv'

# Output files (written next to the extractions)
file_contexts_CRISPR_en = f'{path_extractions}/contexts_CRISPR_en.xlsx'
file_contexts_CRISPR_gene_editing_en = f'{path_extractions}/contexts_CRISPR_gene_editing_en.xlsx'

# mount Google Drive (Colab only); the paths defined above all live below
# the mount point /content/drive
from google.colab import drive
drive.mount('/content/drive')

# add git folder to path variable so that modules below <path_git>/code
# can be imported
import sys
sys.path.append(path_git + '/code')

# import Wolfgang's classes
# (presumably resolved via the sys.path entry added above -- TODO confirm):
from article.article import Article

# import other modules
import re
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pprint
# shared pretty printer with an indentation of 4 spaces
pp = pprint.PrettyPrinter(indent=4)

# set working directory to the git folder; the last expression shows the
# resulting working directory as the cell output
os.chdir(path_git)
os.getcwd()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'/content/drive/My Drive/WIKING/WIKING_git'

## Functions

In [123]:
def occurance_spans(keyphrase, article):
  '''
  Return the time spans during which `keyphrase` continuously appears in `article`.

  The revisions are visited in the order produced by article.yield_revisions().
  Every maximal run of consecutive revisions whose text contains `keyphrase`
  (case-sensitive substring match) forms one span.

  Args:
    keyphrase: substring to look for in the text of each revision.
    article: object providing yield_revisions() and a `name` attribute; each
      yielded revision must provide get_text() and a `timestamp` attribute.

  Returns:
    A list of (first_revision, last_revision) tuples, one per span, in the
    order in which the spans were encountered. The list is empty if the
    keyphrase never occurs.

  Side effects:
    Prints a summary of the spans found (or a "not found" message), naming
    the article that was searched.
  '''
  spans = []
  span_beg = None             # first revision of the currently open span (None: no open span)
  last_rev_that_matched = None
  for revision in article.yield_revisions():
    # To search only the part of a revision that precedes a given section
    # heading, match against
    # revision.get_text().split('\nLocus structure\n')[0] instead.
    if keyphrase in revision.get_text():
      if span_beg is None:
        span_beg = revision
      last_rev_that_matched = revision
    elif span_beg is not None:
      # first non-matching revision after a run of matches closes the span
      spans.append((span_beg, last_rev_that_matched))
      span_beg = None
  if span_beg is not None:
    # the keyphrase was still present in the last revision visited
    spans.append((span_beg, last_rev_that_matched))

  if spans:
    listing = '\n'.join(
        'Span {}: {} - {}'.format(number, first.timestamp, last.timestamp)
        for number, (first, last) in enumerate(spans, start=1))
    print('"{}" was found in the following revisions of {}:\n{}'.format(keyphrase, article.name, listing))
  else:
    print('"{}" was not found in any revision of "{}"'.format(keyphrase, article.name))
  return spans

In [124]:
def find_all(s, sub): # https://stackoverflow.com/questions/4664850/how-to-find-all-occurrences-of-a-substring
  '''
  Yield the start index of every non-overlapping occurrence of `sub` in `s`.

  Occurrences are reported from left to right. After a match the search
  resumes behind the matched text, so overlapping occurrences are skipped
  (e.g. 'aba' in 'ababa' is reported once, at index 0).

  Args:
    s: string to search in.
    sub: substring to search for. An empty `sub` yields nothing.

  Yields:
    Integer start indices into `s`.
  '''
  if not sub:
    # s.find('', start) always "matches" at `start` and the step would be 0,
    # so the loop below would never terminate for an empty pattern.
    return
  step = len(sub) # use step = 1 to find overlapping matches
  start = s.find(sub)
  while start != -1:
    yield start
    start = s.find(sub, start + step)

In [129]:
def unique_contexts(keyphrase, article, width=100):
  '''
  Collect the distinct contexts in which `keyphrase` appears across all
  revisions of `article`.

  For every revision, bracketed in-text references such as '[23]' are removed
  and the text is lower-cased. The keyphrase is matched case-insensitively
  (it is lower-cased as well). For each non-overlapping occurrence the context
  consists of up to `width` characters to the left, the keyphrase itself and
  up to `width` characters to the right; the context never extends across a
  line break ('\\n') and is stripped of surrounding whitespace.

  Args:
    keyphrase: phrase to look for. An empty keyphrase yields no contexts.
    article: object providing yield_revisions() and a `name` attribute; each
      yielded revision must provide get_text(), `timestamp.string` and `url`.
    width: maximum number of characters of context on each side.

  Returns:
    A pandas DataFrame with one row per unique context and three columns:
    1) 'Context': the (lower-cased) context
    2) 'Timestamp': timestamp string of the revision in which the context first occurred
    3) 'Url': url of the revision in which the context first occurred

  Side effects:
    Prints the number of unique contexts found.
  '''
  kill = re.compile(r'\s*\[.*?\]') # matches in-text refs like '[23]' incl. preceding whitespace
  needle = keyphrase.lower()       # the text is lower-cased, so the keyphrase has to be as well
  contexts = []
  timestamps = []
  urls = []
  seen = set()                     # constant-time membership test for the uniqueness check
  # An empty pattern would match at every position, so skip the search entirely.
  revisions = article.yield_revisions() if needle else ()
  pattern = re.compile(re.escape(needle))
  for revision in revisions:
    # To search only the part of a revision that precedes a given section
    # heading, use revision.get_text().split('\nLocus structure\n')[0] here.
    text = kill.sub('', revision.get_text()).lower()
    for match in pattern.finditer(text):
      beg, end = match.span()
      # Clamp the left boundary at 0: a negative start index would count from
      # the end of the text and drop the left context of early matches.
      before = text[max(0, beg - width):beg]
      after = text[end:end + width]
      # make '\n' the boundary for context on both sides
      before = before.rsplit('\n', 1)[-1]
      after = after.split('\n', 1)[0]
      context = (before + needle + after).strip()
      # only add unique contexts and add their meta data
      if context not in seen:
        seen.add(context)
        contexts.append(context)
        timestamps.append(revision.timestamp.string)
        urls.append(revision.url)
  print('Found {} unique contexts for "{}" in article "{}".'.format(len(contexts), keyphrase, article.name))
  return pd.DataFrame({'Context':contexts,
        'Timestamp':timestamps,
        'Url':urls})

## Load data

In [126]:
# Create one Article object per English extraction file
# (presumably this reads the revision history from the file -- TODO confirm).
CRISPR_en = Article(file_CRISPR_en)
CRISPR_gene_editing_en = Article(file_CRISPR_gene_editing_en)

# The event and account tables are currently not loaded:
# events = pd.read_csv(file_events)
# accounts = pd.read_csv(file_accounts)

## Experiments

In [130]:
# Extract the unique contexts of the word 'first' (at most 50 characters to
# either side) from both English articles and save each result as an Excel
# file. After the loop `contexts_first` holds the result for the article
# processed last (CRISPR_gene_editing_en).
for _article, _outfile in (
    (CRISPR_en, file_contexts_CRISPR_en),
    (CRISPR_gene_editing_en, file_contexts_CRISPR_gene_editing_en),
):
  contexts_first = unique_contexts('first', _article, width=50)
  contexts_first.to_excel(_outfile)

Found 143 unique contexts for "first" in article "CRISPR_en".
Found 23 unique contexts for "first" in article "CRISPR_gene_editing_en".


In [9]:
# Time spans during which the phrase 'CRISPR Timeline' appears in the English CRISPR article
timeline = occurance_spans('CRISPR Timeline', CRISPR_en)

"CRISPR Timeline" was found in the following revisions of Test:
Span 1: {'datetime': datetime.datetime(2016, 9, 28, 9, 26, 29),
 'day': 28,
 'hour': 9,
 'minute': 26,
 'month': 9,
 'second': 29,
 'string': '2016-09-28 09:26:29',
 'year': 2016} - {'datetime': datetime.datetime(2016, 12, 28, 8, 5, 49),
 'day': 28,
 'hour': 8,
 'minute': 5,
 'month': 12,
 'second': 49,
 'string': '2016-12-28 08:05:49',
 'year': 2016}
Span 2: {'datetime': datetime.datetime(2019, 3, 18, 14, 33, 42),
 'day': 18,
 'hour': 14,
 'minute': 33,
 'month': 3,
 'second': 42,
 'string': '2019-03-18 14:33:42',
 'year': 2019} - {'datetime': datetime.datetime(2019, 3, 18, 14, 34, 29),
 'day': 18,
 'hour': 14,
 'minute': 34,
 'month': 3,
 'second': 29,
 'string': '2019-03-18 14:34:29',
 'year': 2019}


In [None]:
# history_trick17 = occurance_spans('\nLocus structure\n', CRISPR_en)

"
Locus structure
" was not found in any revision of "Test"


In [None]:
# Time spans during which a line consisting only of 'Locus structure'
# (presumably a section heading -- TODO confirm) appears in the English CRISPR article
history = occurance_spans('\nLocus structure\n', CRISPR_en)

"
Locus structure
" was found in the following revisions of Test:
Span 1: {'datetime': datetime.datetime(2013, 11, 24, 21, 3, 53),
 'day': 24,
 'hour': 21,
 'minute': 3,
 'month': 11,
 'second': 53,
 'string': '2013-11-24 21:03:53',
 'year': 2013} - {'datetime': datetime.datetime(2016, 9, 22, 22, 1, 43),
 'day': 22,
 'hour': 22,
 'minute': 1,
 'month': 9,
 'second': 43,
 'string': '2016-09-22 22:01:43',
 'year': 2016}
Span 2: {'datetime': datetime.datetime(2016, 9, 23, 0, 51, 24),
 'day': 23,
 'hour': 0,
 'minute': 51,
 'month': 9,
 'second': 24,
 'string': '2016-09-23 00:51:24',
 'year': 2016} - {'datetime': datetime.datetime(2017, 9, 18, 8, 58, 26),
 'day': 18,
 'hour': 8,
 'minute': 58,
 'month': 9,
 'second': 26,
 'string': '2017-09-18 08:58:26',
 'year': 2017}
Span 3: {'datetime': datetime.datetime(2017, 9, 18, 9, 0, 40),
 'day': 18,
 'hour': 9,
 'minute': 0,
 'month': 9,
 'second': 40,
 'string': '2017-09-18 09:00:40',
 'year': 2017} - {'datetime': datetime.datetime(2018, 9, 24, 

In [None]:
# Time spans during which 'Mojica' is mentioned in the English CRISPR article
mojica = occurance_spans('Mojica', CRISPR_en)
# Optional: print only the first and last timestamp of every span
# for span in mojica:
#   print(span[0].timestamp, span[1].timestamp)

"Mojica" was found in the following revisions of Test:
Span 1: 2010-01-11T02:11:54Z - 2020-09-02T08:32:27Z
