#Setup

In [None]:
#@title Google Drive {run: "auto"}
from google.colab import drive
import os

# Mount the user's Google Drive at /gdrive; force_remount=True is passed so an
# already-mounted drive is remounted when the cell is re-run.
drive.mount('/gdrive', force_remount=True)
# Folder name (editable through the Colab form) under "My Drive".
drive_folder = "COMP762_IntentionMining" # @param {type:"string"}
# Turn the folder name into an absolute path inside the mounted drive.
drive_folder = os.path.join("/gdrive/My Drive/", drive_folder)


Mounted at /gdrive


In [None]:
# @title Download datasets from Google Drive, and define Sentence, Token, GetAllText() and GetTextByCategories() {display-mode: "form"}
 
import os
import gdown
import json
import re
import nltk
nltk.download('stopwords')
 
class Token:
  """A word plus its annotation dict, identified by its formatted string.

  default_format: either a ``str.format`` template (e.g. ``"{lemma}/{pos}"``)
                  or a zero-argument callable returning such a template.
                  A callable lets several tokens follow a format that is
                  changed later (it is re-evaluated on every use).
  properties:     dict of annotations; its keys are the names available to
                  the format template.  The dict is stored, not copied.

  Equality, ordering and hashing are all based on the string produced by the
  default format, so two tokens with different properties compare equal when
  they render to the same text.
  """

  def __init__(self, default_format, properties):
    # Normalise to a callable so the rest of the class has a single code path.
    if not callable(default_format):
      self.default_format = lambda: default_format
    else:
      self.default_format = default_format
    self.properties = properties

  def __getitem__(self, ix):
    # Item access is forwarded to the properties dict.
    return self.properties[ix]

  def __setitem__(self, ix, val):
    self.properties[ix] = val

  def to_string(self, format_):
    """Render the token with the given ``str.format`` template."""
    return format_.format(**self.properties)

  def __string_rep(self):
    # The token rendered with its current default format.
    return self.to_string(self.default_format())

  def __str__(self):
    return self.__string_rep()

  def __repr__(self):
    return self.__str__() + ": " + self.properties.__repr__()

  def __hash__(self):
    # Must stay consistent with __eq__: both use the rendered string.
    return self.__string_rep().__hash__()

  # Comparisons with anything that is not a Token return NotImplemented so
  # Python falls back to its default handling instead of failing with an
  # AttributeError on the (name-mangled) private helper.
  def __eq__(self, other):
    if not isinstance(other, Token):
      return NotImplemented
    return self.__string_rep() == other.__string_rep()

  def __gt__(self, other):
    if not isinstance(other, Token):
      return NotImplemented
    return self.__string_rep() > other.__string_rep()

  def __lt__(self, other):
    if not isinstance(other, Token):
      return NotImplemented
    return self.__string_rep() < other.__string_rep()
 
 
class Sentence:
  """A parsed sentence: its JSON record plus a list of Token objects.

  json_:        dict with a 'tokens' list (one property dict per token, each
                with at least 'lemma') and, for every tree-based operation,
                a 'parse' string holding a bracketed constituency parse.
                Other methods additionally read the token keys 'pos',
                'after' and 'originalText'.
  fmt:          ``str.format`` template used to render each token.
  get_ancestor: when true, every token gets an "ancestor" property.

  Note: only the outer dict is copied.  The per-token dicts are shared with
  the caller, so the lower-cased lemmata and the added "ancestor" entries are
  visible in the data that was passed in.
  """

  # Lazily-filled cache of the English stop-word list (see withoutStopWords).
  _english_stopwords = None

  def __init__(self, json_, fmt, get_ancestor):
    self.json = {**json_}
    self.fmt = fmt
    # Lowercase lemmata (plural of lemma)
    for t in self.json['tokens']:
      t['lemma'] = t['lemma'].lower()
    # Create tokens.  They receive the bound getFormat method rather than the
    # template itself, so a later setFormat() changes how they render.
    self.tokens = [Token(self.getFormat, token) for token in self.json['tokens']]
    # Find each token's ancestor: the label of the node two levels above the
    # leaf in the parse tree.
    if get_ancestor:
      tree = self.makeParseTree()
      for ix, token in enumerate(self.tokens):
        tree_ix = tree.leaf_treeposition(ix)
        token["ancestor"] = tree[tree_ix[:-2]].label()

  def __getitem__(self, ix):
    return self.tokens[ix]

  def __setitem__(self, ix, val):
    self.tokens[ix] = val

  def getFormat(self):
    """Return the current token format template."""
    return self.fmt

  def setFormat(self, newFmt):
    """Replace the token format template."""
    self.fmt = newFmt

  def withoutPunctuation(self):
    """Return a new Sentence with punctuation removed from parse and tokens.

    A token is kept when the first character of its 'pos' tag is a letter.
    The 'after' text of a dropped token is appended to the previously kept
    token (if any).
    """
    without = Sentence({**self.json}, self.getFormat(), False)
    # Remove punctuation from parse.  Matches are cut out back to front so
    # the offsets of the remaining matches stay valid.
    parse = self.json["parse"]
    punct_ix = list(re.finditer(r"\([^a-zA-Z0-9(]\S* \S*[^a-zA-Z0-9)]\)", parse))
    for m in reversed(punct_ix):
      parse = parse[:m.start()] + parse[m.end():]
    without.json["parse"] = parse
    # Remove punctuation from tokens (property dicts are copied, so the
    # 'after' concatenation does not touch this sentence's tokens).
    tokens = []
    for t in self.tokens:
      if t["pos"][0].isalpha():
        tokens.append(Token(without.getFormat, {**t.properties}))
      elif tokens:
        tokens[-1]["after"] += t["after"]
    without.tokens = tokens
    # Done, leave the other fields untouched
    return without

  @staticmethod
  def __englishStopWords():
    # Load NLTK's English stop-word list once and reuse it for every
    # sentence; the import is local so the class does not depend on a
    # notebook-global `stopwords` name being defined first.
    if Sentence._english_stopwords is None:
      from nltk.corpus import stopwords
      Sentence._english_stopwords = frozenset(stopwords.words('english'))
    return Sentence._english_stopwords

  def withoutStopWords(self):
    """Return a new Sentence whose tokens exclude English stop words.

    Only the token list is filtered; the 'parse' string is left as it is.
    """
    without = Sentence({**self.json}, self.getFormat(), False)
    stop_words = Sentence.__englishStopWords()
    # Remove stopwords from tokens
    without.tokens = [
      Token(without.getFormat, {**t.properties})
      for t in self.tokens
      if t["lemma"].lower() not in stop_words
    ]
    # Done, leave the other fields untouched
    return without

  def makeParseTree(self):
    """Build an nltk Tree from 'parse' whose leaves are this sentence's Tokens."""
    from nltk.tree import Tree
    parse = self.json['parse']
    # Replace words with indices in parse string (back to front, so earlier
    # match offsets are not shifted by the replacements).
    indices = list(enumerate(re.finditer(r'\s[^ )]+\)', parse)))
    for ix, match in reversed(indices):
      parse = parse[:match.start() + 1] + str(ix) + parse[match.end()-1:]
    # Use parse string to create a tree
    tree = Tree.fromstring(parse)
    # Replace indices in tree with tokens.  The leaves hold the very same
    # Token objects as self.tokens, not copies.
    numeric_leaves = [leaf for leaf in tree.leaves() if leaf.isnumeric()]
    for lix, leaf in enumerate(numeric_leaves):
      tree[tree.leaf_treeposition(lix)] = self.tokens[int(leaf)]
    return tree

  def to_string(self, fmt=None, after=None):
    """Render the sentence.

    fmt:   token template; defaults to the sentence's own format.
    after: text placed after every token; by default each token's own
           'after' property is used.
    """
    if fmt is None:
      fmt = self.fmt
    return ''.join(
      t.to_string(fmt) + (t["after"] if after is None else after)
      for t in self.tokens
    )

  def __str__(self):
    return self.to_string()

  def __repr__(self):
    return self.__str__().strip() + ": " + repr(self.json)

  def __len__(self):
    return len(self.tokens)

  @staticmethod
  def __findLabelInTree(tree, label):
    # Positions of every subtree whose label equals `label`.
    return [p for p in tree.treepositions() if isinstance(tree[p], nltk.tree.Tree) and label == tree[p].label()]

  @staticmethod
  def __getParseStringFromTreeWithTokens(tree):
    # Work on a deep copy whose Token leaves are replaced by their original
    # text, then let nltk print the bracketed string.
    tree_copy = tree.copy(deep=True)
    for lix in range(len(tree_copy.leaves())):
      tree_copy[tree_copy.leaf_treeposition(lix)] = \
        tree_copy[tree_copy.leaf_treeposition(lix)].to_string("{originalText}")
    return str(tree_copy)

  @staticmethod
  def __treeToSentence(tree, fmt):
    # Rebuild a Sentence from a tree of Tokens (no ancestor lookup).
    return Sentence(
      {
        "parse": Sentence.__getParseStringFromTreeWithTokens(tree),
        "tokens": [token.properties for token in tree.leaves()]
      },
      fmt, False
    )

  @staticmethod
  def SwapPhrases(sentence_1, sentence_2, label, prefix=None, suffix=None):
    """Swap one randomly chosen `label` phrase between two sentences.

    prefix / suffix: optional text added to the 'originalText' and 'lemma'
                     of the first / last token of each chosen phrase.  A
                     single string is used for both sentences; a pair gives
                     one value per sentence.  Because the trees hold the
                     sentences' own Token objects, these additions also show
                     up in sentence_1 and sentence_2 themselves.

    Returns a pair of new Sentences, or (None, None) when either sentence
    has no phrase with that label.
    """
    import numpy as np

    # Build a tree from each sentence
    tree_1 = sentence_1.makeParseTree()
    tree_2 = sentence_2.makeParseTree()
    # Find phrase in both trees
    tree_1_phrases = Sentence.__findLabelInTree(tree_1, label)
    tree_2_phrases = Sentence.__findLabelInTree(tree_2, label)
    # If there is nothing to swap, return
    if not tree_1_phrases or not tree_2_phrases:
      return (None, None)
    # Pick a phrase at random from each sentence
    tree_1_index = tree_1_phrases[np.random.randint(len(tree_1_phrases))]
    tree_2_index = tree_2_phrases[np.random.randint(len(tree_2_phrases))]
    # DEBUG: mark the boundaries of the swapped phrases #
    keys = ("originalText", "lemma")
    if prefix is not None:
      if isinstance(prefix, str):
        prefix = [prefix, prefix]
      for k in keys:
        tree_1[tree_1_index].leaves()[0][k] = \
          prefix[0] + tree_1[tree_1_index].leaves()[0][k]
        tree_2[tree_2_index].leaves()[0][k] = \
          prefix[1] + tree_2[tree_2_index].leaves()[0][k]
    if suffix is not None:
      if isinstance(suffix, str):
        suffix = [suffix, suffix]
      for k in keys:
        tree_1[tree_1_index].leaves()[-1][k] = \
          tree_1[tree_1_index].leaves()[-1][k] + suffix[0]
        tree_2[tree_2_index].leaves()[-1][k] = \
          tree_2[tree_2_index].leaves()[-1][k] + suffix[1]
    #########
    # Swap
    swap = tree_1[tree_1_index]
    tree_1[tree_1_index] = tree_2[tree_2_index]
    tree_2[tree_2_index] = swap
    # Create Sentences from the modified trees
    return \
    (
      Sentence.__treeToSentence(tree_1, sentence_1.fmt),
      Sentence.__treeToSentence(tree_2, sentence_2.fmt)
    )
 
 
# Tarball with the pre-parsed dataset and the Google Drive URL it comes from.
archive = "Automating-Intention-Mining-parsed-data.tar.gz"
url = "https://drive.google.com/uc?id=1MYR04EN9wyEw5C-RhpAmX5Xnat-jiBSy"
print("Downloading {}: ".format(archive), end="")
# Download only when the archive is not already in the working directory.
# NOTE(review): the third positional argument is presumably gdown's `quiet`
# flag - confirm against the installed gdown version.
if not os.path.isfile(archive):
  gdown.download(url, archive, 0)
else:
  print('file already exists. Skipping download.')
print("done")
 
# Remove old paths
# (the shell command hard-codes "parsed"; keep it in sync with parsed_folder)
parsed_folder = 'parsed'
if os.path.exists(parsed_folder):
  !rm -r parsed
 
print("Extracting files... ")
!tar xvf Automating-Intention-Mining-parsed-data.tar.gz
print("Extracting files... done")
 
# Load data
# One JSON file is expected per (project, category): parsed/<project>/<category>.json
projects = ['DECA', 'bootstrap', 'docker', 'tensorflow', 'vscode']
categories = [
  'aspect evaluation', 'feature request', 'information giving',
  'information seeking', 'problem discovery', 'solution proposal', 'others'
]
 
# _parsed_cat_proj[category][project] -> list of raw sentence dicts
_parsed_cat_proj = {}
for c in categories:
  _parsed_cat_proj[c] = {}
  for p in projects:
    # NOTE(review): files are decoded as latin-1 - confirm this matches how
    # the JSON files were written.
    with open(os.path.join(parsed_folder, p, c + ".json"), 'r', encoding='latin-1') \
      as f:
      j = json.load(f)
      # Sanity check: the document id stored in the file is the category name.
      assert c == j["docId"]
      _parsed_cat_proj[c][p] = j["sentences"]
      # for s in j["sentences"]:
      #   _parsed_cat_proj[c][p].append(Sentence(s, "{lemma}/{pos}"))
 
def GetAllText(
  word="word", show_pos=False, remove_punctuation=False, remove_stopwords=False,
  get_ancestors=True, projects_to_exclude=None
):
  """Return every sentence of every category and project as a flat list.

  word:                "word" renders tokens with their original text,
                       "lemma" with their lemma.
  show_pos:            append "/{pos}" to the token format.
  remove_punctuation:  apply Sentence.withoutPunctuation() to each sentence.
  remove_stopwords:    apply Sentence.withoutStopWords() to each sentence
                       (done before punctuation removal).
  get_ancestors:       forwarded as Sentence's `get_ancestor` argument.
  projects_to_exclude: project name or list of names to skip
                       (compared case-insensitively).

  Raises ValueError when `word` is neither "word" nor "lemma".
  """
  if word == "word":
    fmt = "{originalText}"
  elif word == "lemma":
    fmt = "{lemma}"
  else:
    raise ValueError("Value (\"{}\") for @word not recognized.".format(word))
  if show_pos:
    fmt += "/{pos}"

  if projects_to_exclude is None:
    projects_to_exclude = []
  elif isinstance(projects_to_exclude, str):
    projects_to_exclude = [projects_to_exclude]
  excluded = {p.lower() for p in projects_to_exclude}

  def build(sentence_json):
    # Construct the Sentence, then apply the requested filters in order.
    sentence = Sentence(sentence_json, fmt, get_ancestors)
    if remove_stopwords:
      sentence = sentence.withoutStopWords()
    if remove_punctuation:
      sentence = sentence.withoutPunctuation()
    return sentence

  # The project filter sits before the sentence loop so it is evaluated once
  # per project rather than once per sentence.
  return [
    build(sentence)
    for by_project in _parsed_cat_proj.values()
    for project_name, project_text in by_project.items()
    if project_name.lower() not in excluded
    for sentence in project_text
  ]
 
 
def GetTextByCategories(
  word="word", show_pos=False, remove_punctuation=False, remove_stopwords=False,
  get_ancestors=True, projects_to_exclude=None
):
  """Return a dict mapping each category name to its list of sentences.

  word:                "word" renders tokens with their original text,
                       "lemma" with their lemma.
  show_pos:            append "/{pos}" to the token format.
  remove_punctuation:  apply Sentence.withoutPunctuation() to each sentence.
  remove_stopwords:    apply Sentence.withoutStopWords() to each sentence
                       (done before punctuation removal).
  get_ancestors:       forwarded as Sentence's `get_ancestor` argument.
  projects_to_exclude: project name or list of names to skip
                       (compared case-insensitively).

  Raises ValueError when `word` is neither "word" nor "lemma".
  """
  if word == "word":
    fmt = "{originalText}"
  elif word == "lemma":
    fmt = "{lemma}"
  else:
    raise ValueError("Value (\"{}\") for @word not recognized.".format(word))
  if show_pos:
    fmt += "/{pos}"

  if projects_to_exclude is None:
    projects_to_exclude = []
  elif isinstance(projects_to_exclude, str):
    projects_to_exclude = [projects_to_exclude]
  excluded = {p.lower() for p in projects_to_exclude}

  def build(sentence_json):
    # Construct the Sentence, then apply the requested filters in order.
    sentence = Sentence(sentence_json, fmt, get_ancestors)
    if remove_stopwords:
      sentence = sentence.withoutStopWords()
    if remove_punctuation:
      sentence = sentence.withoutPunctuation()
    return sentence

  # Every category keeps its key even when all of its projects are excluded
  # (its list is then empty).
  return {
    category_name: [
      build(sentence)
      for project_name, project_text in by_project.items()
      if project_name.lower() not in excluded
      for sentence in project_text
    ]
    for category_name, by_project in _parsed_cat_proj.items()
  }
 
 
print(
  '\n\n=======================================\n'
  + 'Use GetAllText() to get all text as a list.\n'
  + 'Use GetTextByCategories() to get a dictionary with category names as\n'
  + '  keys and lists of sentences belonging to that category as values.\n'
  + '=======================================\n'
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading Automating-Intention-Mining-parsed-data.tar.gz: file already exists. Skipping download.
done
Extracting files... 
parsed/
parsed/bootstrap/
parsed/bootstrap/information seeking.json
parsed/bootstrap/information giving.json
parsed/bootstrap/feature request.json
parsed/bootstrap/others.json
parsed/bootstrap/solution proposal.json
parsed/bootstrap/problem discovery.json
parsed/bootstrap/aspect evaluation.json
parsed/DECA/
parsed/DECA/information seeking.json
parsed/DECA/information giving.json
parsed/DECA/feature request.json
parsed/DECA/others.json
parsed/DECA/solution proposal.json
parsed/DECA/problem discovery.json
parsed/DECA/aspect evaluation.json
parsed/docker/
parsed/docker/information seeking.json
parsed/docker/information giving.json
parsed/docker/feature request.json
parsed/docker/others.json
parsed/docker/solution proposal.json
parsed/docker/problem

In [None]:
#@title StanfordNLP { display-mode: "form" }
force_reparse = False #@param {type:"boolean"}

if force_reparse:
  # Install stanfordnlp (Needed for CoreNLPParser to work)
  !pip install stanfordnlp
  import os
  import subprocess
  import google.colab.files
  
  #   Download the Stanford CoreNLP Java library and unzip it to a ./corenlp folder
  if not os.path.exists('./corenlp/'):
    !echo "Downloading CoreNLP..."
    !wget "http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip" -O corenlp.zip
    !unzip "corenlp.zip"
    !mv ./stanford-corenlp* ./corenlp
  
  
  base_folder = "data/"
  out_folder = "parsed/"
  
  try:
    os.mkdir(out_folder)
  except:
    pass
  
  for (path, _, files) in os.walk(base_folder):
    for category in files:
      dirout = path.replace(base, out_folder)
      try:
        os.mkdir(dirout)
      except:
        pass
      subprocess.call([
        "java", "-Xmx4G",
        "-cp", "./corenlp/*", "edu.stanford.nlp.pipeline.StanfordCoreNLP",
        "-annotators", "tokenize,ssplit,pos,lemma,parse",
        "-threads", "5",
        "-outputFormat", "json", 
        "-outputDirectory", dirout,
        "-ssplit.eolonly", 
        "-isOneDocument",
        "-file", os.path.join(path, category)
      ])
  
  !tar cvzf aim_parsed.tar.gz parsed
  google.colab.files.download('aim_parsed.tar.gz')

# Utility functions

In [None]:
# @title Preprocess(dataset__or__list_of_tuples)
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer as lem
# NLTK's English stop words as a set, with 'would' added by hand.
# Preprocess() below does not currently use it (its filter is commented out).
stop_words = set(stopwords.words('english')) 
stop_words.add('would')

def Preprocess(dataset):
    """Tokenize, filter, lowercase and lemmatize every sentence, in place.

    dataset: mutable sequence of (sentence_string, label) pairs.  Each entry
             is replaced by (list_of_lemmatized_words, label).  Nothing is
             returned.

    Only tokens made of two or more letters/periods are kept.
    """
    # Loop-invariant helpers are built once instead of once per sentence.
    word_pattern = re.compile('[a-zA-Z.][a-zA-Z.]+')
    lemmatizer = lem()
    for ix, (sent, label) in enumerate(dataset):
        words = [w.lower() for w in word_tokenize(sent) if word_pattern.fullmatch(w)]
        words = [lemmatizer.lemmatize(w) for w in words]
        # Removing stop words seems to hurt the model's performance.
        # words = [w for w in words if w not in stop_words]
        dataset[ix] = (words, label)

# def preprocess(data):
#   filtered = []
#   for i in data:
#     sent = re.sub(r'[^a-zA-Z ]+', '', i).lower()
#     sent = word_tokenize(sent)
#     tok_sent = []
#     for w in sent:
#       if not w in stop_words:
#         lemmatized = lem().lemmatize(w)
#         tok_sent.append(lemmatized)
#     filtered.append(tok_sent)
#   return filtered

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# @title word2tensor(dataset__or__list_of_tuples, text_vocab, label_vocab, pad_length)

def word2tensor(dataset, text_vocab, label_vocab, pad_length):
    """Convert (words, label) pairs to (index tensor, label tensor) pairs.

    dataset:     indexable collection whose items are (words, label), where
                 `words` is an already preprocessed and split sentence.
    text_vocab:  object with a `stoi` mapping from word to index; it must
                 contain '<pad>' when padding is applied.
    label_vocab: object with a `stoi` mapping from label to index, or None,
                 in which case every label tensor is empty.
    pad_length:  number of indices to pad each sentence to, or None for no
                 padding.  Sentences longer than pad_length are NOT
                 truncated; a warning is issued and they keep their length.

    Returns a list with one (sentence_tensor, label_tensor) tuple per item.
    """
    # Imported here so the function does not rely on another notebook cell
    # having imported `warnings` first.
    import warnings

    out = []

    # Text to numerical indices (tensors)
    for ix in range(len(dataset)):
        example = dataset[ix]
        sentence = [text_vocab.stoi[word] for word in example[0]]
        if label_vocab is not None:
            label = label_vocab.stoi[example[1]]
        else:
            label = []

        if pad_length is not None and len(sentence) > pad_length:
            warnings.warn(
                'The following sentence has {} characters which is longer '\
                'than your padding length ({}).\nSentence = "{}"'\
                .format(len(sentence), pad_length, sentence)
            )
        elif pad_length is not None:
            sentence = sentence + [text_vocab.stoi['<pad>']]*(pad_length-len(sentence))

        out.append((torch.tensor(sentence), torch.tensor(label)))

    return out

In [None]:
# @title Predict(model, vocab, text) {display-mode: "form"}

def Predict(model, vocab, text):
    """Return the predicted class index for one sentence or a list of them.

    model: module that is called on a stacked index tensor; the argmax is
           taken over dimension 1 of its output.
    vocab: text vocabulary passed to word2tensor().
    text:  a string or a list of strings.

    The padding length is read from the notebook-global
    saved['params']['_padded_string_length'].
    NOTE(review): `saved` is not defined in the visible part of the
    notebook - it must exist before this function is called.
    """
    # Not learning
    model.eval()
    # For strings
    if not isinstance(text, list):
        text = [text]
    # Create "dataset" with empty placeholder labels
    text_labels = [(sentence, '') for sentence in text]
    Preprocess(text_labels)
    # Word 2 tensor + padding
    text_labels = word2tensor(text_labels, vocab, None, saved['params']['_padded_string_length'])
    # (text,label)->[text,text,...]
    batch = torch.stack([sentence for sentence, _ in text_labels], dim=0)
    # Move the batch to wherever the model's parameters live (CPU or CUDA)
    batch = batch.to(next(model.parameters()).device)
    # Prediction; no gradients are needed for inference
    with torch.no_grad():
        scores = model(batch)
    return torch.argmax(scores.cpu(), 1)

In [None]:
# @title GetWordsByTfidf(dict_corpus) {display-mode: "form"}

def GetWordsByTfidf(dict_corpus):
  """Rank the tokens of each document by TF-IDF score.

  dict_corpus: list or dictionary of documents, each document being a list
               of Token objects.  If a dictionary is provided, the key names
               are used as column names in the words_by_tfidf dataframe.

  Returns (words_by_tfidf, tfidf_matrix, words):
    words_by_tfidf: DataFrame with one column per document holding new
                    Tokens (the originals' properties plus "score"), sorted
                    by descending score.
    tfidf_matrix:   the matrix returned by the vectorizer's fit_transform.
    words:          the vocabulary as a list ordered by feature index.
  """
  from sklearn.feature_extraction.text import TfidfVectorizer
  import pandas as pd
  import numpy as np

  # corpus
  if isinstance(dict_corpus, dict):
    corpus = list(dict_corpus.values())
    category_names = list(dict_corpus.keys())
  else:
    corpus = dict_corpus
    category_names = None

  # create vectorizer
  #   Note: documents are already token lists, so preprocessing and
  #         tokenizing are identity functions and nothing is lowercased.
  tfidf_vectorizer = TfidfVectorizer(
    use_idf=True, lowercase=False, preprocessor=lambda x: x, tokenizer=lambda x: x
  )

  # just send in all your docs here
  tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(corpus)

  # Feature list ordered by column index.  Derived from vocabulary_ because
  # get_feature_names() no longer exists in current scikit-learn releases.
  vocabulary = tfidf_vectorizer.vocabulary_
  words = sorted(vocabulary, key=vocabulary.get)

  words_by_tfidf = []
  for tf in tfidf_vectorizer_vectors:
    scores = tf.toarray().flatten()
    # np.argsort() always sorts in ascending order, so sort the negated
    # scores to get descending order.
    words_by_tfidf.append([
      Token(words[ix].default_format, {**words[ix].properties, "score": scores[ix]})
      for ix in np.argsort(-scores)
    ])

  words_by_tfidf = pd.DataFrame(data=words_by_tfidf, index=category_names).transpose()

  return words_by_tfidf, tfidf_vectorizer_vectors, words

In [None]:
# @title GetWordsByZInOut(dict_corpus) {display-mode: "form"}

def GetWordsByZInOut(dict_corpus):
  """Rank the tokens of each document by a z-scored in-minus-out count.

  dict_corpus: list or dictionary of documents, each document being a list
               of Token objects.  If a dictionary is provided, the key names
               are used as column names in the words_by_zinout dataframe.

  Returns (words_by_zinout, z_inout, vocabulary):
    words_by_zinout: DataFrame with one column per document holding new
                     Tokens (the originals' properties plus "score"), sorted
                     by descending score.
    z_inout:         DataFrame (words x documents) of the scores.
    vocabulary:      the vectorizer's token -> column index mapping.
  """
  from sklearn.feature_extraction.text import CountVectorizer
  import pandas as pd
  import numpy as np

  # corpus
  if isinstance(dict_corpus, dict):
    corpus = list(dict_corpus.values())
    category_names = list(dict_corpus.keys())
  else:
    corpus = dict_corpus
    category_names = None

  # create vectorizer (documents are already token lists)
  count_vectorizer = CountVectorizer(
    lowercase=False, preprocessor=lambda x: x, tokenizer=lambda x: x
  )

  # just send in all your docs here
  count_vectorizer_vectors = count_vectorizer.fit_transform(corpus)

  # Feature list ordered by column index.  Derived from vocabulary_ because
  # get_feature_names() no longer exists in current scikit-learn releases.
  vocabulary = count_vectorizer.vocabulary_
  words = sorted(vocabulary, key=vocabulary.get)

  # Use counts to calculate z-scored in-out
  #   Counts of the times the word appeared in a sentence of that category
  #   (after the transpose: one row per word, one column per category)
  count_in = pd.DataFrame(
    data=count_vectorizer_vectors.toarray(),
    index=category_names,
    columns=words
  ).transpose()
  #   zscores for in-category occurrences
  z_in = (count_in - count_in.mean(axis=0)) / count_in.std(axis=0)
  #   Counts of the times the word appeared in the other categories
  #   (row total minus the in-category count)
  count_out = -count_in.subtract(count_in.sum(axis=1), axis='rows')
  #   zscores for out-category occurrences
  z_out = (count_out - count_out.mean(axis=0)) / count_out.std(axis=0)
  #   z-scored in-out: final score representing words that are frequent in this
  #     category, but not in others
  z_inout = z_in - z_out

  # We originally wanted to divide, but given the nature of z-scores, that
  # favored words which appeared an average amount of times (denom near 0).
  # This measure, instead, favors words which appear frequently inside the
  # category and infrequently outside of the category.

  words_by_zinout = []
  for col in range(z_inout.shape[1]):
    column_scores = z_inout.iloc[:, col].to_numpy()
    # argsort of the negated scores gives descending order
    words_by_zinout.append([
      Token(words[ix].default_format, {**words[ix].properties, "score": column_scores[ix]})
      for ix in np.argsort(-column_scores)
    ])

  words_by_zinout = pd.DataFrame(data=words_by_zinout, index=category_names).transpose()

  return words_by_zinout, z_inout, count_vectorizer.vocabulary_

In [None]:
# @title CalculateOverlap() {display-mode: "form"}

def CalculateOverlap(df_sorted_words, N, ignore_pos=False):
  """Pairwise overlap of the top-N words of every column.

  df_sorted_words: DataFrame whose cells offer to_string(fmt) (Tokens with
                   'lemma' and 'pos'), each column sorted by some score.
  N:               number of leading rows compared; also the divisor of the
                   overlap count, even when fewer than N rows exist.
  ignore_pos:      compare by "{lemma}" instead of "{lemma}/{pos}".

  Returns a square DataFrame indexed and labelled by the input's columns,
  with 1 on the diagonal and |top_i & top_j| / N elsewhere.
  """
  import pandas as pd
  import numpy as np

  fmt = "{lemma}" if ignore_pos else "{lemma}/{pos}"
  top = df_sorted_words[:N]

  # Render each column to a set of strings once, instead of mapping the whole
  # frame with applymap and rebuilding the sets for every column pair.
  top_sets = [
    {entry.to_string(fmt) for entry in top.iloc[:, col]}
    for col in range(top.shape[1])
  ]

  size = len(top_sets)
  matrix = np.eye(size)
  for ix in range(size):
    for jx in range(ix + 1, size):
      # The measure is symmetric, so fill both triangles at once.
      matrix[ix, jx] = matrix[jx, ix] = len(top_sets[ix] & top_sets[jx]) / N

  return pd.DataFrame(matrix, top.columns, top.columns)

In [None]:
# @title PennToSimple() {display-mode: "form"}

# _PennSimpleDict = \
# {
#   'CC': 'other',
#   'CD': 'other',
#   'DT': 'determiners',
#   'EX': 'other',
#   'FW': 'other',
#   'IN': 'other',
#   'JJ': 'adjectives',
#   'JJR': 'adjectives',
#   'JJS': 'adjectives',
#   'LS': 'other',
#   'MD': 'verbs',  #will, would, can, could, etc.
#   'NN': 'nouns',
#   'NNS': 'nouns',
#   'NNP': 'nouns',
#   'NNPS': 'nouns',
#   'PDT': 'determiners',
#   'POS': 'other',
#   'PRP': 'pronouns',
#   'PRP$': 'other',
#   'RB': 'adverbs',
#   'RBR': 'adverbs',
#   'RBS': 'adverbs',
#   'RP': 'other',
#   'SYM': 'other',
#   'TO': 'other',
#   'UH': 'other',
#   'VB': 'verbs',
#   'VBD': 'verbs',
#   'VBG': 'verbs',
#   'VBN': 'verbs',
#   'VBP': 'verbs',
#   'VBZ': 'verbs',
#   'WDT': 'determiners',
#   'WP': 'other',
#   'WP$': 'other',
#   'WRB': 'adverbs'
# }

# Penn Treebank tag -> one-letter coarse class (J: adjectives, N: nouns,
# R: adverbs, V: verbs).  'MD' (will, would, can, could, etc.) counts as a verb.
_PennSimpleDict = {
  penn_tag: simple_tag
  for simple_tag, penn_tags in (
    ('J', ('JJ', 'JJS', 'JJR')),
    ('N', ('NN', 'NNS', 'NNP', 'NNPS')),
    ('R', ('RB', 'RBR', 'RBS')),
    ('V', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD')),
  )
  for penn_tag in penn_tags
}

def PennToSimple(tag):
  """Map a Penn POS tag to its coarse class; unknown tags are returned as is."""
  if tag in _PennSimpleDict:
    return _PennSimpleDict[tag]
  return tag

In [None]:
# @title WordByScore2TagByScore() {display-mode: "form"}

def _AggregateTagScores(series_tags_scores):
  """Sum the scores of a Series of (score, tag) tuples per tag.

  Returns a Series indexed by tag whose values are (total_score, tag) tuples.
  """
  import numpy as np
  import pandas as pd

  # Group scores by tag; the dict keeps the tags in first-seen order, and the
  # same key list provides both the data and the index, so the two cannot
  # get out of step.
  scores_by_tag = {}
  for score, tag in series_tags_scores:
    scores_by_tag.setdefault(tag, []).append(score)
  tags = list(scores_by_tag)
  totals = [(np.sum(scores_by_tag[tag]), tag) for tag in tags]
  # totals = [(np.mean(scores_by_tag[tag]), tag) for tag in tags]
  return pd.Series(data=totals, index=tags)

def WordByScore2TagByScore(words_sorted_by_score, pos_key="pos"):
  """Turn a frame of scored Tokens into per-column tag rankings.

  words_sorted_by_score: DataFrame of Tokens carrying a "score" property.
  pos_key:               Token property used as the tag (e.g. "pos" or
                         "ancestor"); it is passed through PennToSimple().

  Returns a DataFrame with the same columns whose cells are
  (total_score, tag) tuples sorted by descending total score.
  """
  import pandas as pd

  # DataFrame.map replaces the deprecated applymap in newer pandas releases.
  if hasattr(pd.DataFrame, "map"):
    cell_map = words_sorted_by_score.map
  else:
    cell_map = words_sorted_by_score.applymap
  tag_by_score = cell_map(lambda e: (e["score"], PennToSimple(e[pos_key])))
  tag_by_score = tag_by_score.apply(_AggregateTagScores)
  # The tag labels are dropped: each cell already carries its tag.
  tag_by_score.reset_index(drop=True, inplace=True)
  return tag_by_score.apply(
    lambda S: pd.Series(data=S.sort_values(ascending=False).to_list())
  )

In [None]:
# @title DisplayTopN(), DisplayOverlapMatrix() {display-mode: "form"}

def DisplayTopN(df_sorted_words, N, display_score=False):
  """Display the first N rows of a frame of Tokens.

  display_score: when true, every cell is rendered as the Token's default
                 format followed by "/" and its "score" with two decimals.
  """
  import pandas as pd

  top = df_sorted_words[:N]
  if display_score:
    # DataFrame.map replaces the deprecated applymap in newer pandas releases.
    cell_map = top.map if hasattr(pd.DataFrame, "map") else top.applymap
    # to_string() needs a template string, so extend the token's own
    # default template with the score.
    top = cell_map(
      lambda e: e.to_string(e.default_format() + "/{score:.2f}")
    )
  display(top)
  
def DisplayOverlapMatrix(matrix, upper_triangle=True, remove_pos=False):
  """Display an overlap matrix with values formatted to two decimals.

  matrix:         DataFrame of numbers; it is not modified.
  upper_triangle: blank out every cell below the diagonal.
  remove_pos:     currently unused.
  """
  import numpy as np
  import pandas as pd

  # DataFrame.map replaces the deprecated applymap in newer pandas releases.
  cell_map = matrix.map if hasattr(pd.DataFrame, "map") else matrix.applymap
  matrix = cell_map(lambda e: "{:.2f}".format(float(e)))
  if upper_triangle:
    rows, cols = matrix.shape
    # Positions strictly below the main diagonal.
    for (x, y) in zip(*np.tril_indices(rows, k=-1, m=cols)):
      matrix.iloc[x, y] = ' '
  display(matrix)

In [None]:
# @title POS tagging and chunking {display-mode: "form"}
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_tag(sentence):
  """Tokenize a sentence string with NLTK and return NLTK's POS tagging of it."""
  words = nltk.word_tokenize(sentence)
  return nltk.pos_tag(words)

def chunk_sentences(sentence):
  """POS-tag a sentence string and chunk it with a single NP rule.

  Returns the tree produced by nltk.RegexpParser.parse().
  """
  # NP = optional determiner, any number of adjectives, then a noun
  np_grammar = ('''
    NP: {<DT>?<JJ>*<NN>} # NP
    ''')
  tagged_words = pos_tag(sentence)
  return nltk.RegexpParser(np_grammar).parse(tagged_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Analysis

In [None]:
# @title Augmented sentences {display-mode: "form"}
from IPython.core.display import display, HTML
import numpy as np

# Demo of phrase-swap augmentation: show 10 random sentence pairs together
# with the sentences obtained by exchanging one ADJP phrase between them.
text = GetTextByCategories(word="word", show_pos=False, remove_punctuation=False, remove_stopwords=False)

len(text)
# Rebinds the notebook-level `categories` name to the keys of `text`.
categories = list(text.keys())

for ii in range(10):
  swap_1 = swap_2 = None
  # Redraw until both sentences contain an ADJP (SwapPhrases returns
  # (None, None) otherwise).  The two sentences may come from the same
  # category.
  while swap_1 is None or swap_2 is None:
    cat_1 = categories[np.random.randint(len(text))]
    cat_2 = categories[np.random.randint(len(text))]
    sent_1 = text[cat_1][np.random.randint(len(text[cat_1]))]
    sent_2 = text[cat_2][np.random.randint(len(text[cat_2]))]
    # The HTML prefix/suffix colour the swapped phrases: red for the phrase
    # taken from sentence 1, blue for the one taken from sentence 2.
    swap_1, swap_2 = Sentence.SwapPhrases(
      sent_1, sent_2, "ADJP",
      prefix=['<span style="color: red">', '<span style="color: blue">'],
      suffix="</span>"
    )

  # O1/O2: the two drawn sentences; S1/S2: the sentences after the swap.
  display(HTML("<b>O1:</b> " + str(sent_1)))
  display(HTML("<b>O2:</b> " + str(sent_2)))
  display(HTML("<b>S1:</b> " + str(swap_1)))
  display(HTML("<b>S2:</b> " + str(swap_2)))
  print()

print("\n\n\n")

# for ii in range(10):
#   cat_1 = categories[np.random.randint(len(text))]
#   cat_2 = categories[np.random.randint(len(text))]
#   sent_1 = text[cat_1][np.random.randint(len(text[cat_1]))]
#   sent_2 = text[cat_2][np.random.randint(len(text[cat_2]))]
#   swap_1, swap_2 = Sentence.SwapPhrases(sent_1, sent_2, "VP")

#   print(sent_1, sent_2)
#   print(swap_1, swap_2)



































## Baseline

### With stopwords

In [None]:
# @title TFIDF (w/ stopwords): top-10 words for each category + overlap matrix {display-mode: "form"}
# Q: What are the documents and what is the corpus?
# A: There are a few options.
#   One option is to consider each category as a document and the corpus as the
# set of categories. That is to say that we combine each category's sentences
# into a single string (document), thus obtaining 7 documents which form our
# corpus.
#   Another option is to ignore category boundaries. Each sentence is its own
# document and the corpus is the set of sentences. The TFIDF score should then
# indicate the importance of a given word (e.g., 'the') relative to the sentence
# rather than the category. Then, for each word which appears at least once in
# a given category's sentences, we would obtain a score which could be something
# like the mean of all of that word's tfidf scores (or the max). The issue with
# this analysis is that it is unclear how to combine the scores or what they
# mean.
#   Finally, an option that one could consider would be to set the documents to
# be the sentences of a category and the corpus to be only those sentences
# belonging to that category. In that case, however, it is unclear what the
# TFIDF represents. For example, let's say "Thank you" appears once in "Solution
# proposal." "Thank" may get a (decently) high score for that category and would
# consequently be considered (decently) meaningful. However, we know that the
# "others" category has many occurrences of the word "thank," meaning that it
# isn't diagnostic of "solution proposal" (even though it would get a decent
# score).

# The difference between the first and second analysis is the difference between
# calculating tfidf scores for a list of sentences and calculating them for a
# single document composed of those same sentences.

import warnings
import pandas as pd
import numpy as np

# Lemmatized sentences per category, punctuation removed, stop words kept.
text_by_categories = GetTextByCategories(word="lemma", show_pos=False, remove_punctuation=True, remove_stopwords=False)

# (1) Documents = category; Corpus = all categories
#   Combine all the sentences of a given category into a single document. Tfidf
# then is just the column corresponding to each category.

# Combine categories' sentences (list of words) into a single document (list)
documents = {}
for cat, sentences in text_by_categories.items():
  documents[cat] = []
  for s in sentences:
    documents[cat].extend(s.tokens)
  
# Rank every category's tokens by TF-IDF and show the top 10 as bare lemmata.
words_by_tfidf_with, tfidf, _ = GetWordsByTfidf(documents)
pd.options.display.max_rows = 100
DisplayTopN(words_by_tfidf_with.applymap(lambda t: Token("{lemma}", t.properties)), 10, False)

# Overlap in top-100
over = CalculateOverlap(words_by_tfidf_with, 100)
DisplayOverlapMatrix(over)

# Total TF-IDF per (simplified) POS tag, shown as (tag, rounded score).
tag_by_tfidf = WordByScore2TagByScore(words_by_tfidf_with, pos_key="pos")
display(tag_by_tfidf.applymap(lambda e: (e[1], round(e[0], 2))))

# Same aggregation, keyed by each token's "ancestor" phrase label.
ancestor_by_tfidf = WordByScore2TagByScore(words_by_tfidf_with, pos_key="ancestor")
display(ancestor_by_tfidf.applymap(lambda e: (e[1], round(e[0], 2))))

Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,be,be,the,be,the,the,for
1,the,to,be,the,be,to,thanks
2,i,the,to,you,i,be,you
3,to,a,a,to,to,a,i
4,a,would,i,a,a,you,be
5,it,it,and,do,not,i,the
6,not,i,in,this,and,and,thank
7,that,for,of,i,_,for,this
8,of,that,have,there,it,this,to
9,think,and,that,can,error,_,it


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,1.0,0.73,0.72,0.67,0.57,0.66,0.42
feature request,,1.0,0.71,0.66,0.58,0.64,0.42
information giving,,,1.0,0.74,0.69,0.7,0.42
information seeking,,,,1.0,0.62,0.69,0.45
problem discovery,,,,,1.0,0.66,0.39
solution proposal,,,,,,1.0,0.43
others,,,,,,,1.0


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,"(N, 2.61)","(N, 3.02)","(N, 3.65)","(N, 2.8)","(N, 3.7)","(N, 3.56)","(N, 2.1)"
1,"(V, 2.26)","(V, 2.5)","(V, 2.27)","(V, 2.5)","(V, 2.13)","(V, 2.39)","(V, 1.78)"
2,"(J, 1.12)","(J, 1.07)","(IN, 1.23)","(DT, 0.95)","(IN, 1.0)","(IN, 1.03)","(J, 0.84)"
3,"(IN, 1.06)","(IN, 1.05)","(DT, 1.01)","(IN, 0.93)","(DT, 0.94)","(DT, 1.03)","(IN, 0.83)"
4,"(R, 0.89)","(DT, 0.85)","(J, 0.9)","(PRP, 0.63)","(J, 0.79)","(J, 0.87)","(PRP, 0.75)"
5,"(DT, 0.85)","(R, 0.57)","(R, 0.7)","(J, 0.6)","(R, 0.68)","(R, 0.61)","(DT, 0.64)"
6,"(PRP, 0.65)","(PRP, 0.45)","(PRP, 0.46)","(R, 0.38)","(PRP, 0.53)","(PRP, 0.6)","(R, 0.57)"
7,"(CC, 0.29)","(TO, 0.41)","(CC, 0.28)","(TO, 0.29)","(CD, 0.43)","(TO, 0.44)","(TO, 0.16)"
8,"(TO, 0.26)","(CC, 0.23)","(TO, 0.28)","(CD, 0.2)","(CC, 0.29)","(CC, 0.27)","(CC, 0.12)"
9,"(PRP$, 0.11)","(CD, 0.16)","(CD, 0.28)","(WRB, 0.19)","(TO, 0.21)","(CD, 0.27)","(PRP$, 0.09)"


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,"(NP, 5.03)","(NP, 5.36)","(NP, 6.45)","(NP, 5.26)","(NP, 6.42)","(NP, 6.44)","(NP, 3.92)"
1,"(VP, 2.75)","(VP, 2.95)","(VP, 2.62)","(VP, 2.77)","(VP, 2.51)","(VP, 2.77)","(VP, 1.94)"
2,"(ADJP, 0.87)","(PP, 0.73)","(PP, 0.98)","(PP, 0.7)","(PP, 0.79)","(PP, 0.87)","(ADJP, 0.8)"
3,"(PP, 0.74)","(ADJP, 0.7)","(ADVP, 0.46)","(ADJP, 0.39)","(ADVP, 0.41)","(ADJP, 0.48)","(PP, 0.69)"
4,"(ADVP, 0.46)","(SBAR, 0.36)","(ADJP, 0.44)","(SBAR, 0.27)","(ADJP, 0.39)","(ADVP, 0.35)","(ADVP, 0.33)"
5,"(SBAR, 0.36)","(ADVP, 0.32)","(SBAR, 0.26)","(ADVP, 0.21)","(SBAR, 0.24)","(SBAR, 0.23)","(SBAR, 0.16)"
6,"(WHNP, 0.05)","(WHNP, 0.06)","(WHNP, 0.06)","(WHADVP, 0.19)","(WHADVP, 0.11)","(QP, 0.05)","(INTJ, 0.06)"
7,"(WHADVP, 0.03)","(QP, 0.04)","(WHADVP, 0.05)","(WHNP, 0.15)","(QP, 0.05)","(WHNP, 0.05)","(PRT, 0.05)"
8,"(QP, 0.03)","(WHADVP, 0.03)","(QP, 0.05)","(NP-TMP, 0.03)","(WHNP, 0.04)","(WHADVP, 0.04)","(QP, 0.04)"
9,"(PRT, 0.02)","(PRT, 0.02)","(PRT, 0.04)","(PRT, 0.03)","(NP-TMP, 0.04)","(PRT, 0.04)","(NP-TMP, 0.04)"


In [None]:
# @title In-Out (w/ stopwords): top-10 words for each category + overlap matrix {display-mode: "form"}
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

text_by_categories = GetTextByCategories(word="lemma", show_pos=False, remove_punctuation=True, remove_stopwords=False)

# One document per category: the concatenation of the tokens of every sentence
# labelled with that category.
documents = {
  cat: [token for s in sentences for token in s.tokens]
  for cat, sentences in text_by_categories.items()
}

words_by_zinout_with, _, _ = GetWordsByZInOut(documents)
# Re-wrap every token so that only its lemma is shown in the top-10 table.
DisplayTopN(words_by_zinout_with.applymap(lambda t: Token("{lemma}", t.properties)), 10, False)

over = CalculateOverlap(words_by_zinout_with, 100)
DisplayOverlapMatrix(over)
# Entries strictly above the diagonal, in row-major order.
upper = [over.iloc[ix, jx] for ix in range(over.shape[0]) for jx in range(ix + 1, over.shape[1])]
print(upper)


def _display_tag_ranking(tag_by_score, n_rows=5):
  """Display a tag ranking twice: labels only, then a truncated (label, score) preview.

  Args:
    tag_by_score: DataFrame whose cells are indexable pairs holding a numeric
      score at index 0 and a label at index 1.
    n_rows: Number of ranked rows shown in the preview before the filler row.
  """
  display(tag_by_score.applymap(lambda e: e[1]))
  # Take one row more than needed and overwrite it with a vertical ellipsis
  # (U+2507) to signal that the table continues.
  preview = tag_by_score[:n_rows + 1].applymap(lambda e: (e[1], round(e[0], 2)))
  preview.iloc[-1, :] = "\u2507"
  display(preview)


tag_by_zinout = WordByScore2TagByScore(words_by_zinout_with, pos_key="pos")
_display_tag_ranking(tag_by_zinout)

ancestor_by_zinout = WordByScore2TagByScore(words_by_zinout_with, pos_key="ancestor")
_display_tag_ranking(ancestor_by_zinout)

Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,think,would,the,you,error,to,thanks
1,i,to,we,do,i,you,for
2,not,a,in,what,the,_,you
3,be,nice,of,there,problem,solution,help
4,it,it,and,any,_,fix,thank
5,but,add,on,how,issue,workaround,sorry
6,like,if,have,can,when,work,hope
7,seem,great,will,be,same,use,appreciate
8,very,should,here,this,with,can,this
9,that,+1,currently,why,not,by,great


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,1.0,0.18,0.05,0.07,0.05,0.04,0.13
feature request,,1.0,0.09,0.07,0.06,0.08,0.13
information giving,,,1.0,0.06,0.13,0.16,0.04
information seeking,,,,1.0,0.06,0.12,0.12
problem discovery,,,,,1.0,0.17,0.05
solution proposal,,,,,,1.0,0.07
others,,,,,,,1.0


[0.18, 0.05, 0.07, 0.05, 0.04, 0.13, 0.09, 0.07, 0.06, 0.08, 0.13, 0.06, 0.13, 0.16, 0.04, 0.06, 0.12, 0.12, 0.17, 0.05, 0.07]


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,J,J,N,V,N,TO,N
1,R,V,IN,WRB,CD,N,J
2,PRP,TO,PRP$,WP,WRB,DT,PRP
3,CC,N,DT,EX,R,PRP,UH
4,FW,PRP$,CD,N,CC,CD,CD
5,SYM,WDT,RP,PRP,FW,RP,R
6,LS,LS,CC,CD,WDT,CC,RP
7,POS,SYM,POS,SYM,SYM,FW,SYM
8,PRP$,FW,WDT,UH,UH,SYM,LS
9,WDT,POS,R,LS,POS,LS,FW


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,"(J, 26.23)","(J, 21.69)","(N, 16.79)","(V, 20.19)","(N, 45.71)","(TO, 12.45)","(N, 46.26)"
1,"(R, 21.3)","(V, 18.29)","(IN, 12.3)","(WRB, 11.4)","(CD, 11.69)","(N, 12.39)","(J, 36.34)"
2,"(PRP, 7.76)","(TO, 9.74)","(PRP$, 4.87)","(WP, 9.61)","(WRB, 3.36)","(DT, 4.38)","(PRP, 18.76)"
3,"(CC, 1.96)","(N, 1.48)","(DT, 3.01)","(EX, 8.73)","(R, 2.58)","(PRP, 1.68)","(UH, 3.52)"
4,"(FW, 0.11)","(PRP$, 0.84)","(CD, 2.34)","(N, 7.71)","(CC, 2.47)","(CD, 1.52)","(CD, 3.32)"
5,┇,┇,┇,┇,┇,┇,┇


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,ADJP,VP,NP,WHADVP,NP,NP,ADJP
1,ADVP,ADJP,PP,WHNP,WHADVP,PP,NP
2,SBAR,SBAR,ADVP,NP,ADVP,PRT,ADVP
3,VP,QP,PRT,VP,NP-TMP,QP,INTJ
4,SBARQ,X,QP,NP-TMP,X,X,NP-TMP
5,S,SBARQ,NP-TMP,INTJ,INTJ,SBARQ,QP
6,PRN,S,INTJ,X,SBARQ,S,PRT
7,FRAG,PRN,PRN,S,S,PRN,X
8,X,FRAG,FRAG,SBARQ,PRN,FRAG,SBARQ
9,QP,INTJ,SBARQ,PRN,FRAG,NP-TMP,S


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,"(ADJP, 26.16)","(VP, 22.66)","(NP, 21.25)","(WHADVP, 11.4)","(NP, 45.22)","(NP, 23.56)","(ADJP, 34.11)"
1,"(ADVP, 7.03)","(ADJP, 16.6)","(PP, 14.02)","(WHNP, 8.44)","(WHADVP, 3.36)","(PP, 0.98)","(NP, 30.49)"
2,"(SBAR, 6.06)","(SBAR, 6.79)","(ADVP, 5.34)","(NP, 7.77)","(ADVP, 2.43)","(PRT, 0.56)","(ADVP, 3.3)"
3,"(VP, 1.8)","(QP, 0.52)","(PRT, 0.9)","(VP, 7.13)","(NP-TMP, 1.07)","(QP, 0.39)","(INTJ, 2.96)"
4,"(SBARQ, 0.0)","(X, 0.06)","(QP, 0.55)","(NP-TMP, 0.45)","(X, 0.09)","(X, 0.29)","(NP-TMP, 2.35)"
5,┇,┇,┇,┇,┇,┇,┇


### Without stopwords

In [None]:
# @title TFIDF (w/out stopwords): top-10 words for each category + overlap matrix {display-mode: "form"}
# Q: What are the documents and what is the corpus?
# A: There are a few options.
#   One option is to consider each category as a document and the corpus as the
# set of categories. That is to say that we combine each category's sentences
# into a single string (document), thus obtaining 7 documents which form our
# corpus.
#   Another option is to ignore category boundaries. Each sentence is its own
# document and the corpus is the set of sentences. The TFIDF score should then
# indicate the importance of a given word (e.g., 'the') relative to the sentence
# rather than the category. Then, for each word which appears at least once in
# a given category's sentences, we would obtain a score which could be something
# like the mean of all of that word's tfidf scores (or the max). The issue with
# this analysis is that it is unclear how to combine the scores or what they
# mean.
#   Finally, an option that one could consider would be to set the documents to
# be the sentences of a category and the corpus to be only those sentences
# belonging to that category. In that case, however, it is unclear what the
# TFIDF represents. For example, let's say "Thank you" appears once in "Solution
# proposal." "Thank" may get a (decently) high score for that category and would
# consequently be considered (decently) meaningful. However, we know that the
# "others" category has many occurrences of the word "thank," meaning that it
# isn't diagnostic of "solution proposal" (even though it would get a decent
# score).

# The difference between the first and second analysis is the difference between
# calculating tfidf scores for a list of sentences and calculating them for a
# single document composed of those same sentences.

import warnings
import pandas as pd
import numpy as np

text_by_categories = GetTextByCategories(
  word="lemma",
  show_pos=False,
  remove_punctuation=True,
  remove_stopwords=True,
)

# (1) Documents = category; Corpus = all categories
#   All the sentences of a given category are merged into a single document.
# Tfidf then is just the column corresponding to each category.

# Start every category with an empty document, then append the tokens of each
# of its sentences in order.
documents = {cat: [] for cat in text_by_categories}
for cat, sentences in text_by_categories.items():
  for s in sentences:
    documents[cat] += s.tokens

words_by_tfidf_wout, tfidf, _ = GetWordsByTfidf(documents)
pd.set_option("display.max_rows", 100)
# Re-wrap every token so that only its lemma is shown in the top-10 table.
lemma_only = words_by_tfidf_wout.applymap(lambda t: Token("{lemma}", t.properties))
DisplayTopN(lemma_only, 10, False)

# Overlap in top-100
over = CalculateOverlap(words_by_tfidf_wout, 100)
DisplayOverlapMatrix(over)


def _label_and_score(entry):
  """Turn a (score, label) cell into a (label, score rounded to 2 decimals) pair."""
  return (entry[1], round(entry[0], 2))


tag_by_tfidf = WordByScore2TagByScore(words_by_tfidf_wout, pos_key="pos")
display(tag_by_tfidf.applymap(_label_and_score))

ancestor_by_tfidf = WordByScore2TagByScore(words_by_tfidf_wout, pos_key="ancestor")
display(ancestor_by_tfidf.applymap(_label_and_score))

Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,think,would,use,use,_,_,thanks
1,would,add,docker,could,error,use,thank
2,like,like,code,would,problem,work,sorry
3,use,+1,_,_,issue,fix,help
4,seem,could,run,wonder,use,solution,hope
5,make,need,build,know,get,would,appreciate
6,good,nice,work,anyone,work,workaround,great
7,_,use,file,version,fail,add,reply
8,agree,support,see,file,try,qt,would
9,one,great,support,way,still,file,look


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,1.0,0.54,0.5,0.46,0.35,0.4,0.25
feature request,,1.0,0.53,0.5,0.43,0.44,0.21
information giving,,,1.0,0.56,0.47,0.53,0.21
information seeking,,,,1.0,0.45,0.47,0.27
problem discovery,,,,,1.0,0.5,0.18
solution proposal,,,,,,1.0,0.2
others,,,,,,,1.0


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,"(N, 9.71)","(N, 7.76)","(N, 15.54)","(N, 11.88)","(N, 10.37)","(N, 10.56)","(N, 3.67)"
1,"(V, 5.36)","(V, 4.45)","(V, 6.38)","(V, 6.39)","(V, 4.08)","(V, 5.31)","(V, 2.35)"
2,"(J, 3.96)","(J, 2.59)","(J, 3.59)","(J, 2.3)","(J, 1.88)","(J, 2.43)","(J, 1.41)"
3,"(R, 1.85)","(R, 0.84)","(R, 1.6)","(R, 0.87)","(CD, 1.2)","(R, 0.92)","(R, 0.56)"
4,"(IN, 0.56)","(CD, 0.4)","(CD, 1.18)","(CD, 0.86)","(R, 0.98)","(CD, 0.79)","(UH, 0.12)"
5,"(CD, 0.32)","(IN, 0.35)","(IN, 0.44)","(IN, 0.29)","(IN, 0.26)","(IN, 0.28)","(IN, 0.06)"
6,"(CC, 0.1)","(CC, 0.04)","(CC, 0.1)","(FW, 0.06)","(FW, 0.06)","(FW, 0.07)","(CD, 0.05)"
7,"(FW, 0.06)","(FW, 0.03)","(FW, 0.1)","(UH, 0.06)","(CC, 0.05)","(DT, 0.06)","(SYM, 0.01)"
8,"(DT, 0.06)","(DT, 0.03)","(POS, 0.09)","(CC, 0.05)","(DT, 0.05)","(CC, 0.06)","(CC, 0.01)"
9,"(POS, 0.05)","(POS, 0.03)","(UH, 0.06)","(DT, 0.03)","(POS, 0.04)","(RP, 0.04)","(POS, 0.01)"


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,"(NP, 12.06)","(NP, 9.68)","(NP, 19.49)","(NP, 14.33)","(NP, 12.93)","(NP, 13.45)","(NP, 4.1)"
1,"(VP, 5.15)","(VP, 4.27)","(VP, 5.97)","(VP, 6.02)","(VP, 3.82)","(VP, 4.84)","(VP, 2.24)"
2,"(ADJP, 2.56)","(ADJP, 1.39)","(ADVP, 1.49)","(ADJP, 1.03)","(ADVP, 0.88)","(ADJP, 0.95)","(ADJP, 1.17)"
3,"(ADVP, 1.53)","(ADVP, 0.72)","(ADJP, 1.41)","(ADVP, 0.77)","(ADJP, 0.82)","(ADVP, 0.77)","(ADVP, 0.45)"
4,"(SBAR, 0.42)","(SBAR, 0.25)","(PP, 0.27)","(PP, 0.3)","(PP, 0.18)","(PP, 0.24)","(INTJ, 0.1)"
5,"(PP, 0.24)","(PP, 0.17)","(SBAR, 0.18)","(SBAR, 0.16)","(SBAR, 0.16)","(SBAR, 0.13)","(NP-TMP, 0.07)"
6,"(QP, 0.05)","(QP, 0.04)","(NP-TMP, 0.14)","(NP-TMP, 0.13)","(NP-TMP, 0.11)","(NP-TMP, 0.07)","(SBAR, 0.05)"
7,"(NP-TMP, 0.03)","(NP-TMP, 0.03)","(QP, 0.1)","(INTJ, 0.04)","(QP, 0.08)","(QP, 0.06)","(PP, 0.03)"
8,"(WHNP, 0.02)","(X, 0.01)","(INTJ, 0.04)","(QP, 0.04)","(INTJ, 0.02)","(PRT, 0.04)","(X, 0.01)"
9,"(X, 0.01)","(INTJ, 0.01)","(PRT, 0.02)","(X, 0.03)","(X, 0.02)","(X, 0.04)","(WHNP, 0.01)"


In [None]:
# @title In-Out (w/out stopwords): top-10 words for each category + overlap matrix {display-mode: "form"}
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

text_by_categories = GetTextByCategories(word="lemma", show_pos=False, remove_punctuation=True, remove_stopwords=True)

# One document per category: the concatenation of the tokens of every sentence
# labelled with that category.
documents = {
  cat: [token for s in sentences for token in s.tokens]
  for cat, sentences in text_by_categories.items()
}

words_by_zinout_wout, _, _ = GetWordsByZInOut(documents)
# Re-wrap every token so that only its lemma is shown in the top-10 table.
DisplayTopN(words_by_zinout_wout.applymap(lambda t: Token("{lemma}", t.properties)), 10, False)

over = CalculateOverlap(words_by_zinout_wout, 100)
DisplayOverlapMatrix(over)
# Entries strictly above the diagonal, in row-major order.
upper = [over.iloc[ix, jx] for ix in range(over.shape[0]) for jx in range(ix + 1, over.shape[1])]
print(upper)


def _display_tag_ranking(tag_by_score, n_rows=5):
  """Display a tag ranking twice: labels only, then a truncated (label, score) preview.

  Args:
    tag_by_score: DataFrame whose cells are indexable pairs holding a numeric
      score at index 0 and a label at index 1.
    n_rows: Number of ranked rows shown in the preview before the filler row.
  """
  display(tag_by_score.applymap(lambda e: e[1]))
  # Take one row more than needed and overwrite it with a vertical ellipsis
  # (U+2507) to signal that the table continues.
  preview = tag_by_score[:n_rows + 1].applymap(lambda e: (e[1], round(e[0], 2)))
  preview.iloc[-1, :] = "\u2507"
  display(preview)


tag_by_zinout = WordByScore2TagByScore(words_by_zinout_wout, pos_key="pos")
_display_tag_ranking(tag_by_zinout)

ancestor_by_zinout = WordByScore2TagByScore(words_by_zinout_wout, pos_key="ancestor")
_display_tag_ranking(ancestor_by_zinout)

Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,think,would,use,anyone,error,solution,thanks
1,like,nice,code,could,problem,fix,thank
2,seem,+1,currently,know,issue,workaround,help
3,idea,great,see,wonder,_,_,sorry
4,good,add,docker,way,get,work,hope
5,agree,feature,note,please,fail,solve,appreciate
6,sure,need,release,mean,still,qt,great
7,really,could,build,reason,try,add,reply
8,probably,support,test,version,crash,remove,suggestion
9,guess,useful,support,update,bug,instead,lot


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,1.0,0.08,0.03,0.03,0.0,0.02,0.09
feature request,,1.0,0.05,0.04,0.02,0.04,0.06
information giving,,,1.0,0.17,0.05,0.09,0.01
information seeking,,,,1.0,0.04,0.1,0.06
problem discovery,,,,,1.0,0.05,0.0
solution proposal,,,,,,1.0,0.0
others,,,,,,,1.0


[0.08, 0.03, 0.03, 0.0, 0.02, 0.09, 0.05, 0.04, 0.02, 0.04, 0.06, 0.17, 0.05, 0.09, 0.01, 0.04, 0.1, 0.06, 0.05, 0.0, 0.0]


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,J,J,N,N,N,N,J
1,R,CD,POS,V,CD,CD,N
2,IN,SYM,R,CD,FW,J,CD
3,CC,FW,CC,UH,SYM,RP,UH
4,DT,LS,UH,SYM,PRP,FW,SYM
5,WDT,TO,FW,LS,WRB,SYM,FW
6,WRB,WDT,TO,TO,WDT,DT,WRB
7,TO,WRB,RP,WRB,DT,LS,PRP
8,FW,PRP,WRB,PRP,TO,WDT,TO
9,SYM,RP,PRP,FW,LS,WRB,LS


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,"(J, 99.56)","(J, 93.92)","(N, 48.1)","(N, 39.87)","(N, 123.28)","(N, 41.18)","(J, 112.38)"
1,"(R, 48.08)","(CD, 20.27)","(POS, 4.19)","(V, 34.25)","(CD, 54.24)","(CD, 23.75)","(N, 57.89)"
2,"(IN, 17.47)","(SYM, 1.3)","(R, 3.5)","(CD, 9.98)","(FW, 1.36)","(J, 5.63)","(CD, 35.96)"
3,"(CC, 2.35)","(FW, 0.57)","(CC, 1.91)","(UH, 0.92)","(SYM, 1.08)","(RP, 2.64)","(UH, 6.04)"
4,"(DT, 0.68)","(LS, 0.54)","(UH, 1.14)","(SYM, 0.57)","(PRP, 0.22)","(FW, 1.83)","(SYM, 2.15)"
5,┇,┇,┇,┇,┇,┇,┇


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,ADJP,ADJP,NP,NP,NP,NP,NP
1,ADVP,NP,ADVP,VP,QP,PRT,ADJP
2,SBAR,QP,NP-TMP,PP,NP-TMP,X,INTJ
3,QP,X,INTJ,NP-TMP,X,QP,NP-TMP
4,WHNP,SBAR,PRT,INTJ,WHNP,PP,QP
5,WHADVP,WHADVP,PRN,X,WHADVP,NP-TMP,X
6,SBARQ,SBARQ,FRAG,S,SBARQ,WHADVP,SBARQ
7,S,S,QP,WHADVP,S,SBARQ,WHADVP
8,PRN,PRN,WHADVP,SBARQ,PRN,S,S
9,FRAG,FRAG,SBARQ,PRN,FRAG,PRN,PRN


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
0,"(ADJP, 87.07)","(ADJP, 45.2)","(NP, 57.86)","(NP, 32.4)","(NP, 170.73)","(NP, 106.08)","(NP, 112.1)"
1,"(ADVP, 36.09)","(NP, 34.18)","(ADVP, 10.78)","(VP, 31.49)","(QP, 3.87)","(PRT, 2.62)","(ADJP, 71.3)"
2,"(SBAR, 18.83)","(QP, 3.19)","(NP-TMP, 1.64)","(PP, 4.83)","(NP-TMP, 3.55)","(X, 1.55)","(INTJ, 5.56)"
3,"(QP, 0.44)","(X, 0.89)","(INTJ, 0.8)","(NP-TMP, 2.07)","(X, 0.78)","(QP, 1.37)","(NP-TMP, 5.01)"
4,"(WHNP, 0.32)","(SBAR, 0.47)","(PRT, 0.34)","(INTJ, 1.43)","(WHNP, 0.17)","(PP, 0.42)","(QP, 3.95)"
5,┇,┇,┇,┇,┇,┇,┇


In [None]:
#@title correlations

import joblib
import numpy as np
import os

#joblib.load(drive_folder
def _upper_triangle(over):
  """Return the entries strictly above the diagonal of `over` as a list.

  Entries are read with `over.iloc` in row-major order (row 0 first), so two
  matrices with the same shape yield element-wise comparable lists.
  """
  n_rows, n_cols = over.shape
  return [over.iloc[ix, jx] for ix in range(n_rows) for jx in range(ix + 1, n_cols)]


# Top-100 overlap matrices for each (scoring method, stopword setting) pair,
# together with their flattened upper triangles.
tfidf_over_with = CalculateOverlap(words_by_tfidf_with, 100)
tfidf_upper_with = _upper_triangle(tfidf_over_with)
tfidf_over_wout = CalculateOverlap(words_by_tfidf_wout, 100)
tfidf_upper_wout = _upper_triangle(tfidf_over_wout)
inout_over_with = CalculateOverlap(words_by_zinout_with, 100)
inout_upper_with = _upper_triangle(inout_over_with)
inout_over_wout = CalculateOverlap(words_by_zinout_wout, 100)
inout_upper_wout = _upper_triangle(inout_over_wout)

print(
  'Correlation between TFIDF w/out stopwords and InOut w/out stopwords: %.4f\n' %  np.corrcoef(tfidf_upper_wout, inout_upper_wout)[0,1]
  + 'Correlation between TFIDF w/out stopwords and Inout with stopwords: %.4f\n' %  np.corrcoef(tfidf_upper_wout, inout_upper_with)[0,1]
)

# joblib.dump does not create missing directories, so make sure "out" exists.
overlap_path = os.path.join(drive_folder, "out", "tfidf_inout_overlap.joblib")
os.makedirs(os.path.dirname(overlap_path), exist_ok=True)
joblib.dump({
    'TFIDF overlap matrix w/ stopwords': tfidf_over_with, 
    'InOut overlap matrix w/ stopwords': inout_over_with,
    'TFIDF overlap matrix w/out stopwords': tfidf_over_wout, 
    'InOut overlap matrix w/out stopwords': inout_over_wout
}, overlap_path)

# mat = [0.92, 0.918, 0.922, 0.878, 0.818, 0.896, 0.838, 0.842, 0.764, 0.776, 0.826, 0.87, 0.91, 0.86, 0.814, 0.886, 0.848, 0.858, 0.786, 0.854, 0.786]
# print(tfidf_upper_wout, '\n', inout_upper_wout, '\n', mat)
# print(np.corrcoef(tfidf_upper_wout, mat))
# print(np.corrcoef(inout_upper_wout, mat))

Correlation between TFIDF w/out stopwords and InOut w/out stopwords: 0.4593
Correlation between TFIDF w/out stopwords and Inout with stopwords: 0.2316



['/gdrive/My Drive/COMP762_IntentionMining/out/tfidf_inout_overlap.joblib']

The results presented above suggest that TFIDF without stopwords and InOut with stopwords measure different things (or, at least, things that differ more than TFIDF and InOut both without stopwords do). Important question: what are the ranks of stopwords in InOut? If the ranks are high, it may be necessary to remove stopwords from both InOut and TFIDF.

In [None]:
#@title Proportion of TFIDF and InOut top-100 words which are stopwords

# Build the stopword lookup once: the original re-evaluated
# stopwords.words('english') and scanned the resulting list for every table cell.
english_stopwords = set(stopwords.words('english'))
top_n = 100


def _stopword_proportions(words_by_score):
  """Return, per column, the share of the first `top_n` rows that are stopwords.

  A cell counts as a stopword when its `str()` form is in `english_stopwords`.
  The returned array has one entry per column followed by the mean of those
  entries (the "overall" value).
  """
  is_stopword = words_by_score[:top_n].applymap(lambda tok: 1 if str(tok) in english_stopwords else 0)
  by_cat = is_stopword.sum().to_numpy() / top_n
  return np.append(by_cat, by_cat.sum() / by_cat.size)


tfidf_by_cat = _stopword_proportions(words_by_tfidf_with)
inout_by_cat = _stopword_proportions(words_by_zinout_with)

columns = list(words_by_zinout_with.columns)
# The zero-width spaces pad the header of the extra "overall" column.
columns.append("\u200b \u200b \u200b \u200b \u200b \u200b \u200b \u200b overall")
prop = pd.DataFrame(np.stack((tfidf_by_cat, inout_by_cat)), index=("TFIDF", "InOut"), columns=columns)
display(prop)

Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others,​ ​ ​ ​ ​ ​ ​ ​ overall
TFIDF,0.5,0.44,0.49,0.47,0.47,0.45,0.34,0.451429
InOut,0.18,0.14,0.3,0.19,0.16,0.23,0.11,0.187143


Conclusion: the proportion of stopwords in TFIDF (~45%) is more than sufficient to justify using TFIDF without stopwords. For InOut, it is sufficiently high (19%) to consider the analysis without stopwords as well.

# Garbage

In [None]:
# @title 
# title Download datasets from Google Drive, and define Sentence, Token, GetAllText() and GetTextByCategories() {display-mode: "form"}

# import os
# import gdown
# import json
# import re
# import nltk
# nltk.download('stopwords')

# class Token:
#   def __init__(self, default_format, properties):
#     if not callable(default_format):
#       self.default_format = lambda: default_format
#     else:
#       self.default_format = default_format
#     self.properties = {**properties}
  
#   def __getitem__(self, ix):
#     return self.properties.__getitem__(ix)

#   def __setitem__(self, ix, val):
#     return self.properties.__setitem__(ix, val)

#   def to_string(self, format_):
#     return format_.format(**self.properties)

#   def __string_rep(self):
#     return self.to_string(self.default_format())

#   def __str__(self):
#     return self.__string_rep()
  
#   def __repr__(self):
#     return self.__str__() + ": " + self.properties.__repr__()

#   def __hash__(self):
#     return self.__string_rep().__hash__()

#   def __eq__(self, other):
#     return self.__string_rep() == other.__string_rep()
  
#   def __gt__(self, other):
#     return self.__string_rep() > other.__string_rep()
    
#   def __lt__(self, other):
#     return self.__string_rep() < other.__string_rep()


# class Sentence:
#   def __init__(self, json_, fmt, get_ancestor):
#     self.json = {**json_}
#     self.fmt = fmt
#     # Lowercase lemmata (plural of lemma)
#     for t in self.json['tokens']:
#       t['lemma'] = t['lemma'].lower()
#     # Create tokens
#     self.tokens = \
#     [
#       Token(self.getFormat, token) 
#       for token in self.json['tokens']
#     ]
#     # Find each token's ancestor
#     if get_ancestor:
#       tree = self.makeParseTree()
#       for ix, token in enumerate(self.tokens):
#         tree_ix = tree.leaf_treeposition(ix)
#         token["ancestor"] = tree[tree_ix[:-2]].label()
  
#   def __getitem__(self, ix):
#     return self.tokens[ix]
  
#   def __setitem__(self, ix, val):
#     self.tokens[ix] = val

#   def getFormat(self):
#     return self.fmt
  
#   def setFormat(self, newFmt):
#     self.fmt = newFmt
  
#   def withoutPunctuation(self):
#     without = Sentence({**self.json}, self.getFormat(), False)
#     # Remove punctuation from parse
#     parse = self.json["parse"]
#     punct_ix = list(re.finditer("\([^a-zA-Z0-9(]\S* \S*[^a-zA-Z0-9)]\)", parse))
#     for m in reversed(punct_ix):
#       parse = parse[:m.start()]+ parse[m.end():]
#     without.json["parse"] = parse
#     # Remove punctuation from tokens
#     tokens = []
#     for ix, t in enumerate(self.tokens):
#       if t["pos"][0].isalpha():
#         tokens.append(Token(without.getFormat, {**t.properties}))
#       elif 0 < len(tokens):
#         tokens[-1]["after"] += t["after"]
#     without.tokens = tokens
#     # Done, leave the other fields untouched
#     return without

#   def withoutStopWords(self):
#     without = Sentence(self.json, self.getFormat(), False)
#     # Remove stopwords from tokens
#     tokens = []
#     for ix, t in enumerate(self.tokens):
#       if t["lemma"].lower() not in stopwords.words('english'):
#         tokens.append(Token(without.getFormat, {**t.properties}))
#     without.tokens = tokens
#     # Done, leave the other fields untouched
#     return without

#   def makeParseTree(self):
#     from nltk.tree import Tree
#     parse = self.json['parse']
#     # Replace words with indices in parse string
#     indices = list(enumerate(re.finditer('\s[^ )]+\)', parse)))
#     for ix, match in reversed(indices):
#       parse = parse[:match.start() + 1] + str(ix) + parse[match.end()-1:]
#     # Use parse string to create a tree
#     tree = Tree.fromstring(parse)
#     # Replace indices in tree with tokens
#     for lix, leaf in enumerate([leaf for leaf in tree.leaves() if leaf.isnumeric()]):
#       tree[tree.leaf_treeposition(lix)] = self.tokens[int(leaf)]
#     return tree
  
#   def to_string(self, fmt=None, after=None):
#     if fmt is None:
#       fmt = self.fmt
#     out = ''
#     for t in self.tokens:
#       out += t.to_string(fmt) + (t["after"] if after is None else after)
#     return out

#   def __str__(self):
#     return self.to_string()
  
#   def __repr__(self):
#     return self.__str__().strip() + ": " + repr(self.json)
  
#   def __len__(self):
#     return len(self.tokens)

#   @staticmethod
#   def __findLabelInTree(tree, label):
#     return [p for p in tree.treepositions() if isinstance(tree[p], nltk.tree.Tree) and label == tree[p].label()]
  
#   @staticmethod
#   def __getParseStringFromTreeWithTokens(tree):
#     tree_copy = tree.copy(deep=True)
#     for lix in range(len(tree_copy.leaves())):
#       tree_copy[tree_copy.leaf_treeposition(lix)] = \
#         tree_copy[tree_copy.leaf_treeposition(lix)].to_string("{originalText}")
#     return str(tree_copy)
  
#   @staticmethod
#   def __treeToSentence(tree, fmt):
#     return Sentence(
#       {
#         "parse": Sentence.__getParseStringFromTreeWithTokens(tree),
#         "tokens": [token.properties for token in tree.leaves()]
#       },
#       fmt, False
#     )
  
#   @staticmethod
#   def SwapPhrases(sentence_1, sentence_2, label, validate_phrase=None, prefix=None, suffix=None):
#     import numpy
    
#     # Build a tree from each sentence
#     tree_1 = sentence_1.makeParseTree()
#     tree_2 = sentence_2.makeParseTree()
#     # Find phrase in both trees
#     tree_1_phrases = Sentence.__findLabelInTree(tree_1, label)
#     tree_2_phrases = Sentence.__findLabelInTree(tree_2, label)
#     # If there is nothing to swap, return
#     if 0 == len(tree_1_phrases) or 0 == len(tree_2_phrases):
#       return (None, None)
#     # Pick a phrase at random from each sentence
#     tree_1_index = tree_1_phrases[np.random.randint(len(tree_1_phrases))]
#     tree_2_index = tree_2_phrases[np.random.randint(len(tree_2_phrases))]
#     # Validate the phrases to swap
#     if validate_phrase is not None \
#       and not validate_phrase(tree_1, tree_1_index, tree_2, tree_2_index):
#         return (None, None)
#     # DEBUG #
#     if prefix is not None:
#       if isinstance(prefix, str):
#         pprefix = [prefix, prefix]
#       ttt = tree_1[tree_1_index].leaves()[0]
#       tok_1_index = tree_1[tree_1_index].leaf_treeposition(0)
#       tree_1[tree_1_index][tok_1_index] = Token(
#           tree_1[tree_1_index][tok_1_index].default_format,
#           tree_1[tree_1_index][tok_1_index].properties
#       )
#       tok_2_index = tree_2[tree_2_index].leaf_treeposition(0)
#       tree_2[tree_2_index][tok_2_index] = Token(
#           tree_2[tree_2_index][tok_2_index].default_format,
#           tree_2[tree_2_index][tok_2_index].properties
#       )
#       keys = ["originalText", "lemma"]
#       for k in keys:
#         tree_1[tree_1_index].leaves()[0][k] = \
#           prefix[0] + tree_1[tree_1_index][tok_1_index][k]
#         tree_2[tree_2_index].leaves()[0][k] = \
#           prefix[1] + tree_2[tree_2_index][tok_2_index][k]
#     if suffix is not None:
#       if isinstance(suffix, str):
#         suffix = [suffix, suffix]
#       tok_1_index = tree_1[tree_1_index].leaf_treeposition(len(tree_1[tree_1_index].leaves())-1)
#       tree_1[tree_1_index][tok_1_index] = Token(
#           tree_1[tree_1_index][tok_1_index].default_format,
#           tree_1[tree_1_index][tok_1_index].properties
#       )
#       tok_2_index = tree_2[tree_2_index].leaf_treeposition(len(tree_2[tree_2_index].leaves())-1)
#       tree_2[tree_2_index][tok_2_index] = Token(
#           tree_2[tree_2_index][tok_2_index].default_format,
#           tree_2[tree_2_index][tok_2_index].properties
#       )
#       keys = ["originalText", "lemma"]
#       for k in keys:
#         tree_1[tree_1_index].leaves()[-1][k] = \
#           tree_1[tree_1_index][tok_1_index][k] + suffix[0]
#         tree_2[tree_2_index].leaves()[-1][k] = \
#           tree_2[tree_2_index][tok_2_index][k] + suffix[1]
#     #########
#     # Swap
#     swap = tree_1[tree_1_index]
#     tree_1[tree_1_index] = tree_2[tree_2_index]
#     tree_2[tree_2_index] = swap
#     # Create Sentences
#     #   Create copy, convert leaves to strings (lemma), and get parse string
#     return \
#     (
#       Sentence.__treeToSentence(tree_1, sentence_1.fmt),
#       Sentence.__treeToSentence(tree_2, sentence_2.fmt)
#     )


# archive = "Automating-Intention-Mining-parsed-data.tar.gz"
# url = "https://drive.google.com/uc?id=1MYR04EN9wyEw5C-RhpAmX5Xnat-jiBSy"
# print("Downloading {}: ".format(archive), end="")
# if not os.path.isfile(archive):
#   gdown.download(url, archive, 0)
# else:
#   print('file already exists. Skipping download.')
# print("done")

# # Remove old paths
# parsed_folder = 'parsed'
# if os.path.exists(parsed_folder):
#   !rm -r parsed

# print("Extracting files... ")
# !tar xvf Automating-Intention-Mining-parsed-data.tar.gz
# print("Extracting files... done")

# # Load data
# projects = ['DECA', 'bootstrap', 'docker', 'tensorflow', 'vscode']
# categories = [
#   'aspect evaluation', 'feature request', 'information giving',
#   'information seeking', 'problem discovery', 'solution proposal', 'others'
# ]

# _parsed_cat_proj = {}
# for c in categories:
#   _parsed_cat_proj[c] = {}
#   for p in projects:
#     with open(os.path.join(parsed_folder, p, c + ".json"), 'r', encoding='latin-1') \
#       as f:
#       j = json.load(f)
#       assert c == j["docId"]
#       _parsed_cat_proj[c][p] = j["sentences"]
#       # for s in j["sentences"]:
#       #   _parsed_cat_proj[c][p].append(Sentence(s, "{lemma}/{pos}"))

# def GetAllText(
#   word="word", show_pos=False, remove_punctuation=False, remove_stopwords=False,
#   get_ancestors=True, projects_to_exclude=None
# ):

#   if word == "word":
#     fmt = "{originalText}"
#   elif word == "lemma":
#     fmt = "{lemma}"
#   else:
#     raise Exception("Value (\"{}\") for @word not recognized.")
#   if show_pos:
#     fmt += "/{pos}"

#   if remove_stopwords:
#     constructor1 = lambda *args: Sentence(*args).withoutStopWords()
#   else:
#     constructor1 = lambda *args: Sentence(*args)

#   if remove_punctuation:
#     constructor2 = lambda *args: constructor1(*args).withoutPunctuation()
#   else:
#     constructor2 = constructor1

#   if projects_to_exclude is None:
#     projects_to_exclude = []
#   elif isinstance(projects_to_exclude, str):
#     projects_to_exclude = [projects_to_exclude]

#   projects_to_exclude = [p.lower() for p in projects_to_exclude]

#   return \
#   [
#     constructor2(sentence, fmt, get_ancestors)
#     for category_name, projects in _parsed_cat_proj.items()
#     for project_name, project_text in projects.items()
#     for sentence in project_text
#     if project_name.lower() not in projects_to_exclude
#   ]


# def GetTextByCategories(
#   word="word", show_pos=False, remove_punctuation=False, remove_stopwords=False,
#   get_ancestors=True, projects_to_exclude=None
# ):

#   if word == "word":
#     fmt = "{originalText}"
#   elif word == "lemma":
#     fmt = "{lemma}"
#   else:
#     raise Exception("Value (\"{}\") for @word not recognized.")
#   if show_pos:
#     fmt += "/{pos}"

#   if remove_stopwords:
#     constructor1 = lambda *args: Sentence(*args).withoutStopWords()
#   else:
#     constructor1 = lambda *args: Sentence(*args)

#   if remove_punctuation:
#     constructor2 = lambda *args: constructor1(*args).withoutPunctuation()
#   else:
#     constructor2 = constructor1

#   if projects_to_exclude is None:
#     projects_to_exclude = []
#   elif isinstance(projects_to_exclude, str):
#     projects_to_exclude = [projects_to_exclude]

#   projects_to_exclude = [p.lower() for p in projects_to_exclude]

#   return \
#   {
#     category_name:
#     [
#       constructor2(sentence, fmt, get_ancestors)
#       for project_name, project_text in projects.items()
#       for sentence in project_text
#       if project_name.lower() not in projects_to_exclude
#     ]
#     for category_name, projects in _parsed_cat_proj.items()
#   }


# print(
#   '\n\n=======================================\n'
#   + 'Use GetAllText() to get all text as a list.\n'
#   + 'Use GetTextByCategories() to get a dictionary with category names as\n'
#   + '  keys and lists of sentences belonging to that category as values.\n'
#   + '=======================================\n'
# )

In [None]:
# @title
# title Download datasets from Google Drive, and define Sentence, Token, GetAllText() and GetTextByCategories() {display-mode: "form"}

# import os
# import gdown
# import json
# import re
# import nltk
# nltk.download('stopwords')

# class Token:
#   def __init__(self, default_format, properties):
#     if not callable(default_format):
#       self.default_format = lambda: default_format
#     else:
#       self.default_format = default_format
#     self.properties = {**properties}
  
#   def __getitem__(self, ix):
#     return self.properties.__getitem__(ix)

#   def __setitem__(self, ix, val):
#     return self.properties.__setitem__(ix, val)

#   def to_string(self, format_):
#     return format_.format(**self.properties)

#   def __string_rep(self):
#     return self.to_string(self.default_format())

#   def __str__(self):
#     return self.__string_rep()
  
#   def __repr__(self):
#     return self.__str__() + ": " + self.properties.__repr__()

#   def __hash__(self):
#     return self.__string_rep().__hash__()

#   def __eq__(self, other):
#     return self.__string_rep() == other.__string_rep()
  
#   def __gt__(self, other):
#     return self.__string_rep() > other.__string_rep()
    
#   def __lt__(self, other):
#     return self.__string_rep() < other.__string_rep()


# class Sentence:
#   def __init__(self, json_, fmt, get_ancestor):
#     self.json = {**json_}
#     self.fmt = fmt
#     # Lowercase lemmata (plural of lemma)
#     for t in self.json['tokens']:
#       t['lemma'] = t['lemma'].lower()
#     # Create tokens
#     self.tokens = \
#     [
#       Token(self.getFormat, token) 
#       for token in self.json['tokens']
#     ]
#     # Find each token's ancestor
#     if get_ancestor:
#       tree = self.makeParseTree()
#       for ix, token in enumerate(self.tokens):
#         tree_ix = tree.leaf_treeposition(ix)
#         token["ancestor"] = tree[tree_ix[:-2]].label()
  
#   def __getitem__(self, ix):
#     return self.tokens[ix]
  
#   def __setitem__(self, ix, val):
#     self.tokens[ix] = val

#   def getFormat(self):
#     return self.fmt
  
#   def setFormat(self, newFmt):
#     self.fmt = newFmt
  
#   def withoutPunctuation(self):
#     without = Sentence({**self.json}, self.getFormat(), False)
#     # Remove punctuation from parse
#     parse = self.json["parse"]
#     punct_ix = list(re.finditer("\([^a-zA-Z0-9(]\S* \S*[^a-zA-Z0-9)]\)", parse))
#     for m in reversed(punct_ix):
#       parse = parse[:m.start()]+ parse[m.end():]
#     without.json["parse"] = parse
#     # Remove punctuation from tokens
#     tokens = []
#     for ix, t in enumerate(self.tokens):
#       if t["pos"][0].isalpha():
#         tokens.append(Token(without.getFormat, {**t.properties}))
#       elif 0 < len(tokens):
#         tokens[-1]["after"] += t["after"]
#     without.tokens = tokens
#     # Done, leave the other fields untouched
#     return without

#   def withoutStopWords(self):
#     without = Sentence(self.json, self.getFormat(), False)
#     # Remove stopwords from tokens
#     tokens = []
#     for ix, t in enumerate(self.tokens):
#       if t["lemma"].lower() not in stopwords.words('english'):
#         tokens.append(Token(without.getFormat, {**t.properties}))
#     without.tokens = tokens
#     # Done, leave the other fields untouched
#     return without

#   def makeParseTree(self):
#     from nltk.tree import Tree
#     parse = self.json['parse']
#     # Replace words with indices in parse string
#     indices = list(enumerate(re.finditer('\s[^ )]+\)', parse)))
#     for ix, match in reversed(indices):
#       parse = parse[:match.start() + 1] + str(ix) + parse[match.end()-1:]
#     # Use parse string to create a tree
#     tree = Tree.fromstring(parse)
#     # Replace indices in tree with tokens
#     for lix, leaf in enumerate([leaf for leaf in tree.leaves() if leaf.isnumeric()]):
#       tree[tree.leaf_treeposition(lix)] = self.tokens[int(leaf)]
#     return tree
  
#   def to_string(self, fmt=None, after=None):
#     if fmt is None:
#       fmt = self.fmt
#     out = ''
#     for t in self.tokens:
#       out += t.to_string(fmt) + (t["after"] if after is None else after)
#     return out

#   def __str__(self):
#     return self.to_string()
  
#   def __repr__(self):
#     return self.__str__().strip() + ": " + repr(self.json)
  
#   def __len__(self):
#     return len(self.tokens)

#   @staticmethod
#   def __findLabelInTree(tree, label):
#     return [p for p in tree.treepositions() if isinstance(tree[p], nltk.tree.Tree) and label == tree[p].label()]
  
#   @staticmethod
#   def __getParseStringFromTreeWithTokens(tree):
#     tree_copy = tree.copy(deep=True)
#     for lix in range(len(tree_copy.leaves())):
#       tree_copy[tree_copy.leaf_treeposition(lix)] = \
#         tree_copy[tree_copy.leaf_treeposition(lix)].to_string("{originalText}")
#     return str(tree_copy)
  
#   @staticmethod
#   def __treeToSentence(tree, fmt):
#     return Sentence(
#       {
#         "parse": Sentence.__getParseStringFromTreeWithTokens(tree),
#         "tokens": [token.properties for token in tree.leaves()]
#       },
#       fmt, False
#     )
  
#   @staticmethod
#   def SwapPhrases(sentence_1, sentence_2, label, validate_phrase=None, prefix=None, suffix=None):
#     import numpy
    
#     # Build a tree from each sentence
#     tree_1 = sentence_1.makeParseTree()
#     tree_2 = sentence_2.makeParseTree()
#     # Find phrase in both trees
#     tree_1_phrases = Sentence.__findLabelInTree(tree_1, label)
#     tree_2_phrases = Sentence.__findLabelInTree(tree_2, label)
#     # If there is nothing to swap, return
#     if 0 == len(tree_1_phrases) or 0 == len(tree_2_phrases):
#       return (None, None)
#     # Pick a phrase at random from each sentence
#     tree_1_index = tree_1_phrases[np.random.randint(len(tree_1_phrases))]
#     tree_2_index = tree_2_phrases[np.random.randint(len(tree_2_phrases))]
#     # Validate the phrases to swap
#     if validate_phrase is not None \
#       and not validate_phrase(tree_1, tree_1_index, tree_2, tree_2_index):
#         return (None, None)
#     # DEBUG #
#     if prefix is not None:
#       if isinstance(prefix, str):
#         pprefix = [prefix, prefix]
#       ttt = tree_1[tree_1_index].leaves()[0]
#       tok_1_index = tree_1[tree_1_index].leaf_treeposition(0)
#       tree_1[tree_1_index][tok_1_index] = Token(
#           tree_1[tree_1_index][tok_1_index].default_format,
#           tree_1[tree_1_index][tok_1_index].properties
#       )
#       tok_2_index = tree_2[tree_2_index].leaf_treeposition(0)
#       tree_2[tree_2_index][tok_2_index] = Token(
#           tree_2[tree_2_index][tok_2_index].default_format,
#           tree_2[tree_2_index][tok_2_index].properties
#       )
#       keys = ["originalText", "lemma"]
#       for k in keys:
#         tree_1[tree_1_index].leaves()[0][k] = \
#           prefix[0] + tree_1[tree_1_index][tok_1_index][k]
#         tree_2[tree_2_index].leaves()[0][k] = \
#           prefix[1] + tree_2[tree_2_index][tok_2_index][k]
#     if suffix is not None:
#       if isinstance(suffix, str):
#         suffix = [suffix, suffix]
#       tok_1_index = tree_1[tree_1_index].leaf_treeposition(len(tree_1[tree_1_index].leaves())-1)
#       tree_1[tree_1_index][tok_1_index] = Token(
#           tree_1[tree_1_index][tok_1_index].default_format,
#           tree_1[tree_1_index][tok_1_index].properties
#       )
#       tok_2_index = tree_2[tree_2_index].leaf_treeposition(len(tree_2[tree_2_index].leaves())-1)
#       tree_2[tree_2_index][tok_2_index] = Token(
#           tree_2[tree_2_index][tok_2_index].default_format,
#           tree_2[tree_2_index][tok_2_index].properties
#       )
#       keys = ["originalText", "lemma"]
#       for k in keys:
#         tree_1[tree_1_index].leaves()[-1][k] = \
#           tree_1[tree_1_index][tok_1_index][k] + suffix[0]
#         tree_2[tree_2_index].leaves()[-1][k] = \
#           tree_2[tree_2_index][tok_2_index][k] + suffix[1]
#     #########
#     # Swap
#     swap = tree_1[tree_1_index]
#     tree_1[tree_1_index] = tree_2[tree_2_index]
#     tree_2[tree_2_index] = swap
#     # Create Sentences
#     #   Create copy, convert leaves to strings (lemma), and get parse string
#     return \
#     (
#       Sentence.__treeToSentence(tree_1, sentence_1.fmt),
#       Sentence.__treeToSentence(tree_2, sentence_2.fmt)
#     )


# archive = "Automating-Intention-Mining-parsed-data.tar.gz"
# url = "https://drive.google.com/uc?id=1MYR04EN9wyEw5C-RhpAmX5Xnat-jiBSy"
# print("Downloading {}: ".format(archive), end="")
# if not os.path.isfile(archive):
#   gdown.download(url, archive, 0)
# else:
#   print('file already exists. Skipping download.')
# print("done")

# # Remove old paths
# parsed_folder = 'parsed'
# if os.path.exists(parsed_folder):
#   !rm -r parsed

# print("Extracting files... ")
# !tar xvf Automating-Intention-Mining-parsed-data.tar.gz
# print("Extracting files... done")

# # Load data
# projects = ['DECA', 'bootstrap', 'docker', 'tensorflow', 'vscode']
# categories = [
#   'aspect evaluation', 'feature request', 'information giving',
#   'information seeking', 'problem discovery', 'solution proposal', 'others'
# ]

# _parsed_cat_proj = {}
# for c in categories:
#   _parsed_cat_proj[c] = {}
#   for p in projects:
#     with open(os.path.join(parsed_folder, p, c + ".json"), 'r', encoding='latin-1') \
#       as f:
#       j = json.load(f)
#       assert c == j["docId"]
#       _parsed_cat_proj[c][p] = j["sentences"]
#       # for s in j["sentences"]:
#       #   _parsed_cat_proj[c][p].append(Sentence(s, "{lemma}/{pos}"))

# def GetAllText(
#   word="word", show_pos=False, remove_punctuation=False, remove_stopwords=False,
#   get_ancestors=True, projects_to_exclude=None
# ):

#   if word == "word":
#     fmt = "{originalText}"
#   elif word == "lemma":
#     fmt = "{lemma}"
#   else:
#     raise Exception("Value (\"{}\") for @word not recognized.")
#   if show_pos:
#     fmt += "/{pos}"

#   if remove_stopwords:
#     constructor1 = lambda *args: Sentence(*args).withoutStopWords()
#   else:
#     constructor1 = lambda *args: Sentence(*args)

#   if remove_punctuation:
#     constructor2 = lambda *args: constructor1(*args).withoutPunctuation()
#   else:
#     constructor2 = constructor1

#   if projects_to_exclude is None:
#     projects_to_exclude = []
#   elif isinstance(projects_to_exclude, str):
#     projects_to_exclude = [projects_to_exclude]

#   projects_to_exclude = [p.lower() for p in projects_to_exclude]

#   return \
#   [
#     constructor2(sentence, fmt, get_ancestors)
#     for category_name, projects in _parsed_cat_proj.items()
#     for project_name, project_text in projects.items()
#     for sentence in project_text
#     if project_name.lower() not in projects_to_exclude
#   ]


# def GetTextByCategories(
#   word="word", show_pos=False, remove_punctuation=False, remove_stopwords=False,
#   get_ancestors=True, projects_to_exclude=None
# ):

#   if word == "word":
#     fmt = "{originalText}"
#   elif word == "lemma":
#     fmt = "{lemma}"
#   else:
#     raise Exception("Value (\"{}\") for @word not recognized.")
#   if show_pos:
#     fmt += "/{pos}"

#   if remove_stopwords:
#     constructor1 = lambda *args: Sentence(*args).withoutStopWords()
#   else:
#     constructor1 = lambda *args: Sentence(*args)

#   if remove_punctuation:
#     constructor2 = lambda *args: constructor1(*args).withoutPunctuation()
#   else:
#     constructor2 = constructor1

#   if projects_to_exclude is None:
#     projects_to_exclude = []
#   elif isinstance(projects_to_exclude, str):
#     projects_to_exclude = [projects_to_exclude]

#   projects_to_exclude = [p.lower() for p in projects_to_exclude]

#   return \
#   {
#     category_name:
#     [
#       constructor2(sentence, fmt, get_ancestors)
#       for project_name, project_text in projects.items()
#       for sentence in project_text
#       if project_name.lower() not in projects_to_exclude
#     ]
#     for category_name, projects in _parsed_cat_proj.items()
#   }


# print(
#   '\n\n=======================================\n'
#   + 'Use GetAllText() to get all text as a list.\n'
#   + 'Use GetTextByCategories() to get a dictionary with category names as\n'
#   + '  keys and lists of sentences belonging to that category as values.\n'
#   + '=======================================\n'
# )

In [None]:
# @title
from nltk.tree import Tree
import numpy as np

def FindLabelInTree(tree, label):
  """Return the tree positions of every subtree of `tree` labelled `label`.

  Positions are collected in the order produced by `tree.treepositions()`.
  Leaves are skipped: only nodes that are `nltk.tree.Tree` instances are
  compared against `label`.
  """
  matches = []
  for position in tree.treepositions():
    node = tree[position]
    if isinstance(node, nltk.tree.Tree) and label == node.label():
      matches.append(position)
  return matches

def GetParseString(tree):
  """Return the bracketed string form of `tree` with leaves rendered as text.

  The tree is deep-copied first so the caller's tree keeps its original
  leaves; in the copy every leaf is replaced by the result of its
  `to_string("{originalText}")` call before the copy is stringified.
  """
  plain_tree = tree.copy(deep=True)
  # Replacing a leaf with a string keeps the leaf count unchanged, so the
  # indices obtained up front stay valid for the whole loop.
  for leaf_ix, _ in enumerate(plain_tree.leaves()):
    position = plain_tree.leaf_treeposition(leaf_ix)
    plain_tree[position] = plain_tree[position].to_string("{originalText}")
  return str(plain_tree)

def SwapPhrases(sentence_1, sentence_2, label):
  """Swap one randomly chosen `label` phrase between two sentences.

  A parse tree is built from each sentence with `makeParseTree()`, one
  subtree labelled `label` is picked at random (via `np.random.randint`) in
  each tree, and the two subtrees are exchanged.

  Args:
    sentence_1: first Sentence.
    sentence_2: second Sentence.
    label: constituent label to look for (e.g. "NP").

  Returns:
    `(None, None)` when either sentence has no phrase with that label;
    otherwise a pair of new Sentence objects built from the swapped trees,
    each keeping the format string of the sentence it was derived from.
  """
  # Build a tree from each sentence
  tree_1 = sentence_1.makeParseTree()
  tree_2 = sentence_2.makeParseTree()

  # Find phrase in both trees
  tree_1_phrases = FindLabelInTree(tree_1, label)
  tree_2_phrases = FindLabelInTree(tree_2, label)

  # If there is nothing to swap, return
  if not tree_1_phrases or not tree_2_phrases:
    return (None, None)

  # Pick a phrase at random from each sentence
  tree_1_index = tree_1_phrases[np.random.randint(len(tree_1_phrases))]
  tree_2_index = tree_2_phrases[np.random.randint(len(tree_2_phrases))]

  # Swap (both right-hand sides are read before either assignment happens)
  tree_1[tree_1_index], tree_2[tree_2_index] = \
    tree_2[tree_2_index], tree_1[tree_1_index]

  def _tree_to_sentence(tree, fmt):
    # Sentence.__init__ iterates json["tokens"], so the token data must be
    # supplied alongside the parse string. The property dicts are copied so
    # the new sentence does not share them with the source sentences.
    # get_ancestor=False: ancestors are not recomputed for the new sentence.
    return Sentence(
      {
        "parse": GetParseString(tree),
        "tokens": [{**token.properties} for token in tree.leaves()]
      },
      fmt, False
    )

  # Create Sentences
  return \
  (
    _tree_to_sentence(tree_1, sentence_1.fmt),
    _tree_to_sentence(tree_2, sentence_2.fmt)
  )

# Scratch experiment: swap a random NP between two sentences by hand and
# rebuild a Sentence from the resulting tree.

# NOTE(review): the GetTextByCategories() signatures visible in this notebook
# take `remove_punctuation`, not `keep_punctuation` -- confirm against the live
# definition; this call may raise TypeError.
text = GetTextByCategories(word="word", show_pos=False, keep_punctuation=False)

s1 = text['aspect evaluation'][0]
s1.fmt

# Parse trees of the first sentence of two different categories
t1 = text['aspect evaluation'][0].makeParseTree()
t2 = text['feature request'][0].makeParseTree()

t1.pretty_print()
t2.pretty_print()

# Positions of all noun phrases in each tree
t1_np = FindLabelInTree(t1, "NP")
t2_np = FindLabelInTree(t2, "NP")

# Pick one noun phrase at random from each tree
t1_np_ix = t1_np[np.random.randint(len(t1_np))]
t2_np_ix = t2_np[np.random.randint(len(t2_np))]

t2[t2_np_ix].pretty_print()

# Exchange the two chosen subtrees
swap = t1[t1_np_ix]
t1[t1_np_ix] = t2[t2_np_ix]
t2[t2_np_ix] = swap

str(t1)

t1.pretty_print()
t2.pretty_print()

# Check that editing a leaf of a deep copy leaves the original tree unchanged
t3 = t1.copy(deep=True)
t3[t3.leaf_treeposition(0)] = 'a'
t3.pretty_print()
t1.pretty_print()
# t1_tokens = t1.leaves()
# print(str(t1))
# print([token.to_string("{lemma}") for token in t1.leaves()])
# print([token.to_string("{lemma}") for token in t2.leaves()])
# print(str(t1))
# print(GetParseString(t1))
nltk.tree.Tree.fromstring(GetParseString(t1)).pretty_print()
token = t1.leaves()[0]
parse = GetParseString(t1)
tokens = [t.properties for t in t1.leaves()]
# Sentence.__init__ takes (json_, fmt, get_ancestor); pass False so the
# ancestors are not recomputed for the rebuilt sentence.
s = Sentence({"parse": parse, "tokens": tokens}, "{word}", False)
print(s)

# swap = t1[t1_np_ix]
# t1.insert(t1_np_ix, t2[t2_np_ix])
# t2.insert(t2_np_ix, t1[t1_np_ix])

# t1.pretty_print()
# t2.pretty_print()