# Utility functions, parameters, and Google Drive

In [None]:
#@title Google Drive {run: "auto"}
from google.colab import drive
import os

# Mount Google Drive at /gdrive, asking for a forced remount.
drive.mount('/gdrive', force_remount=True)
# Folder name inside the drive; the trailing `@param` marker presumably exposes
# it as an editable Colab form field (confirm in the Colab UI).
drive_folder = "COMP762_IntentionMining" # @param {type:"string"}
# Replace the bare folder name with its full path under the mounted drive.
drive_folder = os.path.join("/gdrive/My Drive/", drive_folder)


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [None]:
# @title Download text files from Google Drive

import os
import gdown
import zipfile
import shutil

zfile = "Intention-Mining-data.zip"
url = "https://drive.google.com/uc?id=1T4aW9vwu1A01-XUZ5AugoHhBR2IVrame"
print("Downloading {}:".format(zfile))
# Only hit the network when the archive is not already in the working directory.
if not os.path.isfile(zfile):
    gdown.download(url, zfile, quiet=False)
else:
    print('File already exists. Continuing.')


print("Extracting data...")
# Remove old paths so that stale files from a previous extraction cannot
# survive. Done in Python rather than with a shell escape so the cell stays
# valid Python and does not depend on `rm` being available.
if os.path.isdir('data') and not os.path.islink('data'):
    shutil.rmtree('data')
elif os.path.lexists('data'):
    os.remove('data')
# Unzip data and runs. The archive is opened exactly once, inside a `with`
# block, so the file handle is always closed.
with zipfile.ZipFile(zfile, 'r') as z:
    for filename in z.namelist():
        # Only the "data/" subtree of the archive is extracted.
        if filename.startswith("data/"):
            z.extract(filename)
print("done")

Downloading Intention-Mining-data.zip:


Downloading...
From: https://drive.google.com/uc?id=1T4aW9vwu1A01-XUZ5AugoHhBR2IVrame
To: /content/Intention-Mining-data.zip
100%|██████████| 238k/238k [00:00<00:00, 93.7MB/s]

Extracting data...
done





In [None]:
# @title Download Token, Sentence, StanfordFiles, GetAllText(), and GetTextByCategories()
import importlib
import sys

import gdown

# Fetch the helper module next to the notebook so that it can be imported.
gdown.download("https://drive.google.com/uc?id=1lsi836Dfkx0S_UUAJwTPRdqolW2UREl0", 'aim_parsed_text.py', False)

# Reload just in case: a plain `import` is a no-op when the module is already
# loaded, so re-running this cell would keep using the old code even though a
# fresh copy of the file was just downloaded. On a fresh kernel this is an
# ordinary import. Note that reloading re-executes the module's top-level code.
if 'aim_parsed_text' in sys.modules:
    aim_parsed_text = importlib.reload(sys.modules['aim_parsed_text'])
else:
    import aim_parsed_text

# Short aliases used by the rest of the notebook.
GetAllText = aim_parsed_text.GetAllText
GetTextByCategories = aim_parsed_text.GetTextByCategories
Token = aim_parsed_text.Token
Sentence = aim_parsed_text.Sentence


Downloading...
From: https://drive.google.com/uc?id=1lsi836Dfkx0S_UUAJwTPRdqolW2UREl0
To: /content/aim_parsed_text.py
100%|██████████| 25.5k/25.5k [00:00<00:00, 30.1MB/s]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Downloading Automating-Intention-Mining-parsed-data.tar.gz: 

Downloading...
From: https://drive.google.com/uc?id=1MYR04EN9wyEw5C-RhpAmX5Xnat-jiBSy
To: /content/Automating-Intention-Mining-parsed-data.tar.gz
5.81MB [00:00, 17.4MB/s]


done
Extracting files... 
Extracting files... done


functions:
  def GetAllText(
    word, show_pos,
    remove_punctuation, remove_stopwords,
    get_ancestors, projects_to_exclude
  )
    Returns all text as a list.
    (See parameter descriptions below)

  def GetTextByCategories(
    word, show_pos,
    remove_punctuation, remove_stopwords,
    get_ancestors, projects_to_exclude
)
    Returns a dictionary with category names as keys and lists of
    sentences belonging to that category as values.

    Parameters:
      word: "word" (original text) or lemma.
      show_pos: show Part-Of-Speech tag?
      remove_punctuation: self-explanatory.
      remove_stopwords: self-explanatory.
      get_ancestors: if True, get each token's ancestor's label (chunk).
      projects_to_exclude: self-explanatory. E.g., ["DECA", "vscode"]


classes:
  class Token
    Token(default_format, properties)
      default_format: function returning format string (e.g.,
        "{{lemma}}")
      propertie

In [None]:
# @title Download models from Google Drive

import os
import gdown

# Saved model file name -> Google Drive download URL.
model_filenames_urls = \
{
  "AIM_NB_LG_SVM.joblib": "https://drive.google.com/uc?id=14pbOMjrMnGWpZo82DzIPQZ0OEMuacinF",
  "AIM_LSTM.joblib": "https://drive.google.com/uc?id=1rr9OXO22qjbAzsyl1dIGjLKO5JeloIrZ",
  "AIM_CNN.joblib": "https://drive.google.com/uc?id=1mImAql1KXWL9OWtxhpUuOq3duWzrSrif"
}
for filename, url in model_filenames_urls.items():
  # Same guard as the data-download cell: do not fetch a file that is already
  # present. Delete the local file to force a fresh download.
  if os.path.isfile(filename):
    print('{} already exists. Continuing.'.format(filename))
    continue
  gdown.download(url, filename, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=14pbOMjrMnGWpZo82DzIPQZ0OEMuacinF
To: /content/AIM_NB_LG_SVM.joblib
2.86MB [00:00, 261MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rr9OXO22qjbAzsyl1dIGjLKO5JeloIrZ
To: /content/AIM_LSTM.joblib
119MB [00:00, 164MB/s]  
Downloading...
From: https://drive.google.com/uc?id=1mImAql1KXWL9OWtxhpUuOq3duWzrSrif
To: /content/AIM_CNN.joblib
75.5MB [00:00, 202MB/s] 


In [None]:
#@title Utility functions

def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while updating a notebook progress bar.

    This is a generator: the widget is created and displayed when the first
    item is requested.

    Parameters:
      sequence: iterable to wrap. If it has no ``len()`` and ``size`` is not
                given, the total is unknown and an indeterminate bar is shown.
      every:    refresh the widget every ``every`` items. Defaults to 1 for up
                to 200 items and ``int(size / 200)`` otherwise. Must be given
                when the total is unknown.
      size:     total number of items, overriding ``len(sequence)``.
      name:     label shown in front of the counter.

    The bar turns to the 'success' style when the sequence is exhausted and to
    'danger' when an exception escapes the loop. Stopping early (the consumer
    breaks out of its loop or closes the generator) is not treated as an error.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown total: show a full bar in the 'info' style instead of a ratio.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            # Always refresh on the first item so the label is never empty.
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )

            yield record
    except GeneratorExit:
        # Raised at the `yield` when the consumer stops iterating early. That is
        # not a failure, so leave the bar as it is instead of painting it red.
        raise
    except BaseException:
        # Anything else (including KeyboardInterrupt) is flagged on the widget
        # and then propagated unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

def try_makedirs(d):
    """Create directory ``d`` (and missing parents), ignoring "already exists".

    A ``FileExistsError`` raised by the creation call is swallowed; any other
    error propagates. Always returns ``None``.
    """
    from os import makedirs

    try:
        makedirs(d)
    except FileExistsError:
        # The path is already there: nothing to do.
        return None
    return None

    
def load_image(filename):
    """Open the image stored at ``filename`` and return it converted to 'RGB'.

    PIL is imported locally, like the other helpers in this cell do for their
    dependencies, so the function does not rely on an import made elsewhere in
    the notebook. The file is opened in a ``with`` block so the underlying file
    handle is released as soon as the converted copy has been produced.
    """
    import PIL.Image

    with PIL.Image.open(filename) as image:
        return image.convert('RGB')

    
# def load_image(filename):
#     return \
#         PIL.Image.open(filename)\
#           .resize((64,64), _interpolation_method)\
#           .convert('RGB')

# def load_image(filename):
#     return np.array(
#         PIL.Image.open(filename)\
#           .resize((64,64), _interpolation_method)\
#           .convert('RGB'),
#         dtype=np.float32
#     )/255


# def plot_results(
#     results_1, model_1_name, model_1_color,
#     # results_2, model_2_name, model_2_color
# ):
    
    
def plot_results_to_grid(
    grid, where,
    results, model_name, model_color,
    learning_rate, lr_color
):
    """Plot per-epoch accuracy and learning-rate changes into one grid cell.

    Parameters:
      grid:          object providing ``output_to(row, col)`` (a context
                     manager) and ``clear_cell()`` -- presumably a Colab
                     widgets grid; confirm against the caller.
      where:         (row, column) of the cell to draw into.
      results:       1-D sequence of accuracies, one entry per epoch; NaN
                     entries are skipped.
      model_name:    legend label for the accuracy curve.
      model_color:   matplotlib colour code of the accuracy curve (e.g. 'b').
      learning_rate: sequence of learning rates; a dashed vertical line and a
                     "lr = ..." label are drawn wherever two consecutive
                     non-NaN values differ, plus a label for the first value.
      lr_color:      matplotlib colour code for the learning-rate markers.
    """
    import matplotlib.pyplot as plt
    # Imported locally (like matplotlib) so the helper does not depend on some
    # other cell having put `np` in the global namespace.
    import numpy as np

    results = np.asarray(results)

    with grid.output_to(where[0], where[1]):
        # Keep only the entries that have a result; x positions are 1-based.
        res_y = results
        res_x = np.where(np.invert(np.isnan(res_y)))[0]
        res_y = res_y[res_x]
        res_x = res_x + 1

        # Positions at which the learning rate changes between two neighbours.
        learning_rate = np.array(learning_rate)
        lr_x = np.where((learning_rate[0:-1] != learning_rate[1:]) * np.invert(np.isnan(learning_rate[0:-1])) * np.invert(np.isnan(learning_rate[1:])))
        # One column per change; the two rows are the ends of a vertical line.
        lr_x = np.repeat(2+np.reshape(lr_x[0], (1, -1)), 2, 0)
        lr_y = np.repeat(np.array([[0], [100]], ndmin=2), lr_x.shape[1], 1)
        lr_r = learning_rate[lr_x[0, :] -1]

        # Show original lr (NaN y values: a label is placed but no line drawn)
        lr_x = np.concatenate((np.array([[0.5],[0.5]], ndmin=2), lr_x), 1)
        lr_y = np.concatenate((np.array([[np.nan],[np.nan]], ndmin=2), lr_y), 1)
        lr_r = np.concatenate((np.array(learning_rate[0], ndmin=1), lr_r))

        # Alternate text y position to make it more readable.
        # The repeat count must be an integer: true division produced a float,
        # which np.repeat rejects. Rounding up gives at least one height per
        # label; the zip below drops any extra one.
        lr_t = np.repeat([[80], [20]], (1+lr_r.size)//2, 1).T.flatten()

        grid.clear_cell()

        res_plt = plt.plot(res_x, res_y, model_color + '-', res_x, res_y, model_color + 'o')
        lr_plt = plt.plot(lr_x, lr_y, lr_color + '--')
        for x, r, t in zip(lr_x.T, lr_r, lr_t):
            # Nudge the label slightly to the right of its line.
            plt.text(x[0] + results.shape[0]/100, t, "lr = {}".format(r), color=lr_color)

        plt.legend([res_plt[0]], [model_name])
        plt.title('Model accuracy improvement over time')
        plt.xlabel('# of epochs')
        plt.xlim(0.5, 0.5 + results.shape[0])
        plt.xticks(np.floor(np.linspace(1, results.shape[0], 11)))
        plt.ylabel('Accuracy (%)')
        plt.ylim(0, 100)
        plt.yticks(np.linspace(0, 100, 11))
        plt.draw()

        
def elapsed_time(model_name, i):
    """Start a timer when ``i`` is 0; otherwise print the time since the start.

    The start time is kept on the function object itself
    (``elapsed_time.start``), so a call with ``i == 0`` must come first.
    """
    from time import time as now

    if i == 0:
        # (Re)start the clock.
        elapsed_time.start = now()
        return

    message = "{{{}}} The last training epoch took {} seconds.\n\n".format(
        model_name, now() - elapsed_time.start
    )
    print(message)
        

class CatchIO:
    """Context manager that captures everything written to ``sys.stdout``.

    Usage::

        with CatchIO() as caught:
            print("hello")
        caught.buffer  # -> "hello\\n"

    After the block, ``buffer`` holds the captured text and ``sys.stdout`` is
    restored to what it was on entry. Exceptions raised inside the block
    propagate to the caller.
    """

    def __init__(self):
        self._stdout = None   # stdout object to restore on exit
        self.buffer = None    # captured text, filled in by __exit__

    def __enter__(self):
        import sys
        import io

        self._stdout = sys.stdout
        sys.stdout = io.StringIO()
        # Return the manager so that `with CatchIO() as c:` binds something
        # useful instead of None.
        return self

    def __exit__(self, type_, value, traceback):
        import sys

        self.buffer = sys.stdout.getvalue()
        sys.stdout = self._stdout
        # A false return value tells Python to re-raise any in-flight exception
        # itself, with its original traceback.
        return False


# Models and datasets

In [None]:
# @title Automating Intention Mining CNN {display-mode: "form"}

import torch
import torch.nn as nn
import torch.nn.functional as F


class AIM_CNN(nn.Module):
    """Convolutional sentence classifier over word embeddings.

    One Conv2d -> BatchNorm2d -> ReLU branch is built per entry of
    ``filter_sizes``. Each branch is max-pooled over the sequence axis, the
    pooled features are concatenated, passed through dropout and fed to a
    linear classifier.

    Parameters:
      sequence_length: accepted for interface compatibility; not used here.
      num_classes:     number of output classes.
      vocab_size:      number of rows of the embedding table.
      embedding_size:  dimension of each word embedding.
      filter_sizes:    iterable of convolution heights (in words).
      num_filter:      number of output channels of every convolution.
    """

    def __init__(self,
        sequence_length, num_classes, vocab_size, embedding_size,
        filter_sizes, num_filter, #, l2_reg_lambda
    ):
        super(AIM_CNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=None)
        self.CBRs = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(1, num_filter, (fsize, embedding_size), stride=1),
                # Batch norm acts on the convolution's output channels, so its
                # size is num_filter (it used to be embedding_size, which only
                # works when the two happen to be equal).
                nn.BatchNorm2d(num_filter),
                nn.ReLU()
            )
            for fsize in filter_sizes
        ])
        self.drop = nn.Dropout(0.5)
        self.classifier = nn.Linear(len(filter_sizes) * num_filter, num_classes)

    def forward_plus(self, x):
        """Return ``(log_probabilities, logits)`` for a batch of index tensors.

        ``x`` holds word indices, one row per sentence (batch, words).
        """
        # Word index to embedded format: (batch, words, embedding)
        x = self.embed(x)
        # Add channel dimension: (batch, 1, words, embedding)
        x = torch.unsqueeze(x, 1)
        # Seed with an empty (batch, 0, 1) tensor so the concatenation below is
        # well defined even without any convolution branch.
        pooled = [x.new_zeros((x.shape[0], 0, 1))]
        for cbr in self.CBRs:
            # (batch, num_filter, positions, 1): the kernel spans the whole
            # embedding, so the last dimension has size 1.
            y = cbr(x)
            # Remove last dimension (size 1). Only one squeeze: a second one
            # would also drop the positions axis when it has length 1.
            y = torch.squeeze(y, -1)
            # Max over all positions -> (batch, num_filter, 1)
            y = F.max_pool1d(y, y.size(2))
            pooled.append(y)
        z = torch.cat(pooled, 1)

        z = self.drop(z)
        z = z.squeeze(-1)
        z = self.classifier(z)
        return (F.log_softmax(z, dim=1), z)

    def forward(self, x):
        """Return only the log-probabilities (see ``forward_plus``)."""
        return self.forward_plus(x)[0]

In [None]:
# @title LSTM

import torch.nn as nn
import torch.nn.functional as F

class AIM_LSTM(nn.Module):
    """Bidirectional LSTM sentence classifier over word embeddings.

    The final hidden states of the last layer (both directions) are
    concatenated, passed through dropout and fed to a linear classifier.
    ``sequence_length`` is accepted but not used by this class.
    """

    def __init__(self,
        sequence_length, num_classes, vocab_size, embedding_size,
        hidden_dim, n_layers
    ):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=None)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=True,
            dropout=0.5,
        )
        self.drop = nn.Dropout(0.5)
        # Two directions -> twice the hidden size.
        self.classifier = nn.Linear(hidden_dim * 2, num_classes)

    def forward_plus(self, text):
        """Return ``(log_probabilities, features)`` for a batch of sentences.

        ``features`` is the dropped-out concatenation of the last two entries
        of the LSTM's final hidden state.
        """
        # The LSTM is not batch-first, so swap the two axes of the input.
        tokens = text.t()
        embedded = self.drop(self.embed(tokens))
        _, (hidden, _) = self.lstm(embedded)
        last_two = (hidden[-2], hidden[-1])
        features = self.drop(torch.cat(last_two, dim=1))
        log_probs = F.log_softmax(self.classifier(features), dim=1)
        return (log_probs, features)

    def forward(self, text):
        """Return only the log-probabilities (see ``forward_plus``)."""
        log_probs, _ = self.forward_plus(text)
        return log_probs

In [None]:
# @title TextFiles(Dataset) {display-mode: "form"} [COMMENTED OUT]
# from torch.utils.data import Dataset
# import torchtext
# import collections
# import numpy as np
# import torch
# import warnings
# import copy
# import re
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords 
# from nltk.stem.wordnet import WordNetLemmatizer as lem
# stop_words = set(stopwords.words('english')) 
# stop_words.add('would')

# class TextFiles(Dataset):

#     def __init__(self, paths, encoding='utf-8',
#                  line_split_f=None, label_f=None):
        
#         if paths is None:
#             paths = []

#         self.text_vocab = None
#         self.label_vocab = None
#         self.text = []
#         self.processed = None
#         self.vec = None
#         self.labels = []
#         tmp = []
#         for p in paths:
#             with open(p, 'r', encoding=encoding) as f:
#                 tmp = list(f)
        
#             # If a split function is specified, join text and let the function
#             # handle it. Otherwise, use newlines to split.
#             if line_split_f is not None:
#                 self.text.extend(line_split_f("".join(tmp)))
#             else:
#                 self.text.extend(tmp)
        
#             # Get true label for each entity
#             if label_f is not None:
#                 self.labels.extend([label_f(t, p, ix) for ix, t in enumerate(tmp)])
#             else:
#                 self.labels = None

#         self.processed = copy.deepcopy(self.text)

#     @staticmethod
#     def Preprocess(dataset):
#         for ix, (sent, label) in enumerate(dataset):
#             sent = word_tokenize(sent)
#             sent = [w.lower() for w in sent if re.fullmatch('[a-zA-Z.][a-zA-Z.]+', w)]
#             lemmatizer = lem()
#             sent = [lemmatizer.lemmatize(w) for w in sent]
#             # Removing stop words seems to hurt the model's performance.
#             # sent = [w for w in sent if w not in stop_words]
#             dataset[ix] = (sent, label)

#     def build_vocabulary(self):
#         # Warning: assumes that text has already been preprocessed and split!
#         # Create vocabularies from text and label

#         self.text_vocab = torchtext.vocab.Vocab(
#             collections.Counter(
#                 [word for (sentence, label) in self for word in sentence]
#             ), specials=['<unk>', '<pad>'], specials_first=True
#         )
#         self.label_vocab = torchtext.vocab.Vocab(
#             collections.Counter([label for (sentence, label) in self]) , specials=[]
#         )

#     def word2tensor(self, pad_length, dataset=None):
#         # Transforms text and labels to numerical indices using vocabulary built
#         # using this dataset.
#         #
#         #   pad_length      Length to pad to (e.g., pad_length = 100, but
#         #                   sentence is 75 characters, then 25 <pad> characters
#         #                   will be added.
#         #   dataset         Dataset to which to apply this. Iterating dataset
#         #                   should return a (text, label) tuple. Default: self 
#         #                   (if dataset=None, use self).
#         #
#         # Warning: assumes that text has already been preprocessed and split!

#         if dataset is None:
#             dataset = self
        
#         if isinstance(dataset, TextFiles):
#             vec = [[]] * len(self.processed)
#             lab = [[]] * len(self.labels)

#         # Text to numerical indices (tensors)
#         for ix in range(len(dataset)):
#             sentence = [self.text_vocab.stoi[word] for word in dataset[ix][0]]
#             label = self.label_vocab.stoi[dataset[ix][1]]

#             if pad_length is not None and len(sentence) > pad_length:
#                 warnings.warn(
#                     'The following sentence has {} characters which is longer '\
#                     'than your padding length ({}).\nSentence = "{}"'\
#                     .format(len(sentence), pad_length, sentence)
#                 )
#             elif pad_length is not None:
#                 sentence = sentence + [self.text_vocab.stoi['<pad>']]*(pad_length-len(sentence))

#             if isinstance(dataset, TextFiles):
#                 vec[ix] = torch.tensor(sentence)
#                 lab[ix] = torch.tensor(label)
#             else:
#                 dataset[ix] = (torch.tensor(sentence), torch.tensor(label))
        
#         if isinstance(dataset, TextFiles):
#             dataset.vec = vec
#             dataset.labels = lab

#     def shuffle(self):
#         # Order the data such that the classes are ordered and proportionally
#         # represented (e.g., if for each element from categories 1 and 2 there
#         # are two from category 3, then [1, 2, 3, 3, 1, 2, 3, 3, ...])
#         # but such that the instances of within the categories are random. 
#         old_text = self.text
#         old_processed = self.processed
#         old_vec = self.vec
#         old_labels = self.labels
#         L = np.array([{lab: ix for ix, lab in enumerate(set(self.labels))}.get(lab) for lab in self.labels])

#         index = [np.where(L == ii)[0] for ii in set(L)]
#         permuted = [np.random.permutation(ii.size) for ii in index]

#         # CATegory ID's Sorted by category Size (number of elements in the category)
#         cat_id_ss = np.argsort([c.size for c in index])

#         ix = 0;
#         ct = [0]*len(index)
#         self.text = [[]]*len(self.text)
#         self.processed = [[]]*len(self.processed)
#         if self.vec is not None:
#             self.vec = [[]]*len(self.vec)
#         self.labels = [[]]*len(self.labels)
#         for ii in range(len(L)):
#             # Current category
#             cix = cat_id_ss[ix]
#             # Next category
#             nix = cat_id_ss[(1 + ix) % len(cat_id_ss)]
#             # Finished with a category? Remove it from the list of ID's
#             while ct[cix] >= len(permuted[cix]):
#                 # remove category
#                 cat_id_ss = np.delete(cat_id_ss, ix)
#                 # recalculate ix
#                 ix = ix % len(cat_id_ss)
#                 # recalculate current and next category
#                 cix = cat_id_ss[ix]
#                 nix = cat_id_ss[(1 + ix) % len(cat_id_ss)]
                
#             # Array index (within array index [see below])
#             ax = permuted[cix][ct[cix]]
#             # Permuted array index
#             px = index[cix][ax]

#             # Label = OldLabel[permuted_array_index]
#             self.labels[ii] = old_labels[px]
#             # Text = OldText[permuted_array_index]
#             self.text[ii] = old_text[px]
#             self.processed[ii] = old_processed[px]
#             if self.vec is not None:
#                 self.vec[ii] = old_vec[px]
#             # Count this as one more instance of this category
#             ct[cix] = 1 + ct[cix]
#             # How do the category proportions compare? If the category has a higher proportion
#             # of elements in the resulting array, move on to the next category.
#             if ct[cix] / len(index[cix]) >= ct[nix] / len(index[nix]) :
#                 ix = (1 + ix) % len(cat_id_ss)


#     def split(self, training_validation_split):
#         idx = int(np.ceil(len(self) * training_validation_split))

#         train = TextFiles(paths=None, encoding=None, line_split_f=None, label_f=None)
#         train.text = self.text[:idx]
#         train.processed = self.processed[:idx]
#         train.labels = self.labels[:idx]

#         validation = TextFiles(paths=None, encoding=None, line_split_f=None, label_f=None)
#         validation.text = self.text[idx:]
#         validation.processed = self.processed[idx:]
#         validation.labels = self.labels[idx:]

#         if self.vec is not None:
#             train.vec = self.vec[:idx]
#             validation.vec = self.vec[idx:]

#         return train, validation


#     def __len__(self):
#         return len(self.text)


#     def __getitem__(self, idx):
#         if 0 > idx or idx >= len(self):
#             raise IndexError

#         if self.vec is not None:
#             t = self.vec[idx]
#         else:
#             t = self.processed[idx]

#         if self.labels is None:
#             label = idx
#         else:
#             label = self.labels[idx]

#         return (t, label)


#     def __setitem__(self, idx, value):
#         if not isinstance(value, tuple):
#             raise TypeError

#         if isinstance(value[0], torch.Tensor) and self.vec is not None:
#             self.vec[idx] = value[0];
#         else:
#             self.processed[idx] = value[0];

#         if self.labels is not None:
#             self.labels[idx] = value[1]


# # def Text2Index(text, pad_length, vvv = None):
# #     vocab = torchtext.vocab.Vocab(
# #         collections.Counter(
# #             " ".join([t.strip() for t in text]).split()
# #         ), specials=['<unk>', '<pad>'], specials_first=True
# #     )
# #     text = [[vocab.stoi[w] for w in sentence.split()] for sentence in text]
# #     for ix,sentence in enumerate(text):
# #         if len(sentence) > pad_length:
# #             warnings.warn('The following sentence has {} characters which is longer than your padding length ({}).\nSentence = "{}"'\
# #                 .format(len(sentence), pad_length, sentence)
# #             )
# #         else:
# #             text[ix] = sentence + [vocab.stoi['<pad>']]*(pad_length-len(sentence))
# #     if vvv is not None:
# #         vvv.extend([vocab])
# #     return text

# # def Label2Index(labels):
# #     vocab = torchtext.vocab.Vocab(collections.Counter(labels), specials=['<unk>', '<pad>'], specials_first=True)
# #     return [vocab.stoi[lab] for lab in labels]

In [None]:
# @title PreprocessForPredict() {display-mode: "form"} [COMMENTED OUT]
# import re
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords 
# from nltk.stem.wordnet import WordNetLemmatizer as lem
# stop_words = set(stopwords.words('english')) 
# stop_words.add('would')

# def PreprocessForPredict(dataset):
#     for ix, (sent, label) in enumerate(dataset):
#         sent = word_tokenize(sent)
#         sent = [w.lower() for w in sent if re.fullmatch('[a-zA-Z.][a-zA-Z.]+', w)]
#         lemmatizer = lem()
#         sent = [lemmatizer.lemmatize(w) for w in sent]
#         # Removing stop words seems to hurt the model's performance.
#         # sent = [w for w in sent if w not in stop_words]
#         dataset[ix] = (sent, label)

# # def preprocess(data):
# #   filtered = []
# #   for i in data:
# #     sent = re.sub(r'[^a-zA-Z ]+', '', i).lower()
# #     sent = word_tokenize(sent)
# #     tok_sent = []
# #     for w in sent:
# #       if not w in stop_words:
# #         lemmatized = lem().lemmatize(w)
# #         tok_sent.append(lemmatized)
# #     filtered.append(tok_sent)
# #   return filtered


In [None]:
# @title word2tensor(dataset__or__list_of_tuples, pad_length)
import torch

def word2tensor(dataset, text_vocab, label_vocab, pad_length):
    """Convert (tokens, label) pairs into (index tensor, label tensor) pairs.

    Parameters:
      dataset:     indexable collection with ``len()``; ``dataset[i]`` must be
                   a ``(tokens, label)`` pair where ``tokens`` is an iterable
                   of vocabulary entries.
      text_vocab:  vocabulary whose ``stoi`` maps a token to its index; it must
                   contain ``'<pad>'`` when padding is applied.
      label_vocab: vocabulary whose ``stoi`` maps a label to its index, or
                   ``None`` to skip labels (the label tensor is then empty).
      pad_length:  length to pad each sentence to with ``'<pad>'`` indices
                   (e.g. a 75-token sentence gets 25 pads for pad_length=100),
                   or ``None`` for no padding.

    Returns a list with one ``(sentence_tensor, label_tensor)`` tuple per item.
    A sentence longer than ``pad_length`` triggers a warning and is returned
    at its original length (it is NOT truncated).

    Warning: assumes that text has already been preprocessed and split!
    """
    # Imported here because no cell of the notebook imports it globally; without
    # it the over-length branch below failed with a NameError.
    import warnings

    out = []

    # Text to numerical indices (tensors)
    for ix in range(len(dataset)):
        tokens, raw_label = dataset[ix][0], dataset[ix][1]
        sentence = [text_vocab.stoi[word] for word in tokens]

        if label_vocab is not None:
            label = label_vocab.stoi[raw_label]
        else:
            label = []

        if pad_length is not None and len(sentence) > pad_length:
            warnings.warn(
                'The following sentence has {} characters which is longer '\
                'than your padding length ({}).\nSentence = "{}"'\
                .format(len(sentence), pad_length, sentence)
            )
        elif pad_length is not None:
            sentence = sentence + [text_vocab.stoi['<pad>']]*(pad_length-len(sentence))

        out.append((torch.tensor(sentence), torch.tensor(label)))

    return out

In [None]:
#@title train(), validate(), and test() {display-mode: "form"}
import torch.nn.functional as F
import torch

def train(model, model_name, device, train_loader, optimizer, epoch):
    """Run one training epoch of ``model`` over ``train_loader``.

    Each batch is moved to ``device``, the negative log-likelihood loss is
    back-propagated and ``optimizer`` takes one step. A progress line is
    printed every ``_log_interval`` batches (a notebook-level setting).
    """
    progress_line = '{{{}}} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'

    model.train()
    batch_idx = 0
    for data, target in train_loader:
        data = data.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()

        if batch_idx % _log_interval == 0:
            samples_seen = batch_idx * len(data)
            percent_done = 100. * batch_idx / len(train_loader)
            print(progress_line.format(
                model_name, epoch, samples_seen,
                len(train_loader.dataset),
                percent_done, loss.item()
            ))
        batch_idx += 1


def validate(model, model_name, device, validation_loader):
    model.eval()
    validation_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in validation_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)

            # sum up batch loss
            validation_loss += F.nll_loss(output, target, reduction='sum').item()
            # validation_loss += F.mse_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    validation_loss /= len(validation_loader.dataset)

    print(
        '\n{{{}}} Validation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
        .format(
            model_name, validation_loss, correct,
            len(validation_loader.dataset),
            100. * correct / len(validation_loader.dataset)
        )
    )

    return validation_loss, 100. * correct / len(validation_loader.dataset)


def test(model, model_name, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    predictions = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)

            # sum up batch loss
            # test_loss += F.mse_loss(output, target, reduction='sum').item()
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print(
        '\n{{{}}} Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
        .format(
            model_name, test_loss, correct, len(test_loader.dataset),
            100. * correct / len(test_loader.dataset)
        )
    )

    return test_loss, 100. * correct / len(test_loader.dataset)

torch.manual_seed(1)

<torch._C.Generator at 0x7f8a755b2e30>

In [None]:
# @title Predict(model, text_vocab, text) {display-mode: "form"} [COMMENTED OUT]

# def Predict(model, text_vocab, label_vocab, text):
#     # Not learning
#     model.eval()
#     # For strings
#     if not isinstance(text, list):
#         text = [text]
#     # Create "dataset"
#     text_labels = [(sentence, '') for sentence in text]
#     PreprocessForPredict(text_labels)
#     # Word 2 tensor + padding
#     text_labels = word2tensor(text_labels, text_vocab, None, saved['params']['_padded_string_length'])
#     # (text,label)->[text,text,...]
#     text = torch.stack([sentence for sentence, label in text_labels], dim=0)
#     # CUDA?
#     text = text.to(next(model.parameters()).device)
#     # Prediction
#     return torch.argmax(model(text).cpu(), 1)

In [None]:
# @title NNPredictStanford()

def NNPredictStanford(model, text_vocab, label_vocab, pad_size, processed_text):
    """Predict a label for each already-preprocessed sentence.

    Parameters:
      model:          network returning one score vector per sentence.
      text_vocab:     vocabulary handed to ``word2tensor`` for the tokens.
      label_vocab:    vocabulary whose ``itos`` maps a class index to a label.
      pad_size:       length every sentence is padded to.
      processed_text: one sentence or a list of sentences, each an iterable of
                      vocabulary entries.

    Returns the list of predicted labels, one per sentence (an empty list for
    empty input).
    """
    # Not learning
    model.eval()
    # Make list if not a list
    if not isinstance(processed_text, list):
        processed_text = [processed_text]
    # Nothing to predict; torch.stack would fail on an empty list.
    if not processed_text:
        return []
    # Make dataset (the labels are placeholders; word2tensor ignores them
    # because no label vocabulary is passed)
    text_labels = [(sentence, '') for sentence in processed_text]
    # Word 2 tensor + padding
    text_labels = word2tensor(text_labels, text_vocab, None, pad_size)
    # (text,label)->[text,text,...]
    text = torch.stack([sentence for sentence, _ in text_labels], dim=0)
    # CUDA? Use whatever device the model's parameters live on.
    text = text.to(next(model.parameters()).device)
    # Prediction. Gradients are not needed for inference, so do not track them
    # (this matches validate()/test()).
    with torch.no_grad():
        predicted = torch.argmax(model(text), 1).cpu()
    # Plain ints index lists and dictionaries alike.
    return [label_vocab.itos[c] for c in predicted.tolist()]

# Analyses

In [None]:
# @title Generate Augmented NP-swapped sentences
from IPython.core.display import display, HTML
import numpy as np

def validate_noun_phrase(tree_1, subtree_1_index, tree_2, subtree_2_index):
  """Tell whether both candidate phrases are acceptable for swapping.

  A phrase (``tree[index]``) is acceptable when it has at least two leaves
  but fewer leaves than the tree it comes from.
  """
  trees = (tree_1, tree_2)
  phrases = (tree_1[subtree_1_index], tree_2[subtree_2_index])
  phrase_sizes = [len(phrase.leaves()) for phrase in phrases]

  # At least two words, but not the whole sentence
  return all(
    1 < size < len(tree.leaves())
    for size, tree in zip(phrase_sizes, trees)
  )

text = GetTextByCategories(
  word="word", show_pos=False, remove_punctuation=False, remove_stopwords=False
)
categories = list(text.keys())

# Number of random draws tried for one pair of swapped sentences before giving
# up on the current category pair. Without a limit the search could spin
# forever once no new (non-duplicate) swap can be produced.
MAX_SWAP_ATTEMPTS = 10000

# (category 1, category 2) -> list of augmented sentences (HTML stripped).
swapped = {}
for i in range(len(categories)):
  cat_1 = categories[i]
  # Start at i: unordered pairs, including a category paired with itself.
  for j in range(i, len(categories)):
    cat_2 = categories[j]
    swapped[cat_1, cat_2] = []
    # Each iteration adds two sentences, hence the step of 2 (500 in total).
    for k in range(0, 500, 2):
      swap_1 = swap_2 = None
      for _attempt in range(MAX_SWAP_ATTEMPTS):
        sent_1 = text[cat_1][np.random.randint(len(text[cat_1]))]
        sent_2 = text[cat_2][np.random.randint(len(text[cat_2]))]
        swap_1, swap_2, orig_1, orig_2 = Sentence.SwapPhrases(
          sent_1, sent_2, "NP", validate_phrase=validate_noun_phrase,
          prefix=['<span style="color: red">', '<span style="color: blue">'],
          suffix="</span>"
        )
        if swap_1 is None or swap_2 is None:
          # No valid noun-phrase swap for this pair of sentences: draw again.
          continue
        if swap_1.withoutHTML() in swapped[cat_1, cat_2] \
        or swap_2.withoutHTML() in swapped[cat_1, cat_2]:
          # Already generated: discard the result so that it is really
          # rejected. (Previously the loop condition was already satisfied at
          # this point, so duplicates were appended anyway.)
          swap_1 = swap_2 = None
          continue
        break
      if swap_1 is None or swap_2 is None:
        print("Could not find a new swap for ({}, {}) after {} attempts; "
              "stopping at {} sentences.".format(
                cat_1, cat_2, MAX_SWAP_ATTEMPTS, len(swapped[cat_1, cat_2])))
        break
      swapped[cat_1, cat_2].extend([swap_1.withoutHTML(), swap_2.withoutHTML()])
      # Show the first examples of each pair (k = 0 and k = 2).
      if k < 3:
        display(HTML("<b>O1:</b> " + str(orig_1)))
        display(HTML("<b>O2:</b> " + str(orig_2)))
        display(HTML("<b>S1:</b> " + str(swap_1)))
        display(HTML("<b>S2:</b> " + str(swap_2)))
        print()
    print("\n")






























































































































































































In [None]:
import matplotlib.pyplot as plt
import joblib
import nltk
from nltk.corpus import stopwords
stopwords.words('english')

# Fitted NaiveBayes / LogisticRegression / SVM pipelines, their parameters and
# the label -> integer mapping, all stored in a single joblib file.
models, nb_lg_svm_params, label_to_int = joblib.load("AIM_NB_LG_SVM.joblib")

train_valid_set = aim_parsed_text.StanfordFiles(["bootstrap", "docker", "tensorflow", "vscode"])
train_valid_set.shuffle()
train_valid_set.Preprocess(train_valid_set)

# The held-out project is preprocessed through the training set object.
test_set = aim_parsed_text.StanfordFiles(["DECA"])
train_valid_set.Preprocess(test_set)

# First 20 test sentences, used to eyeball raw predictions.
ttt = [sent for sent, _ in test_set][:20]

for model_name, pipeline in models.items():
  # The inputs are sentence objects: skip string preprocessing and build the
  # token list from their `tokens` attribute.
  pipeline['tfidf'].preprocessor = lambda x: x
  pipeline['tfidf'].tokenizer = lambda sent: [str(t) for t in sent.tokens]

  prediction = pipeline.predict(train_valid_set.processed)
  print(pipeline.predict(ttt))

  # Correct predictions on the training/validation projects
  train_count = sum(
    1 for lab, pre in zip(train_valid_set.labels, prediction)
    if label_to_int[lab] == pre
  )

  # Correct predictions on the test project
  prediction = pipeline.predict(test_set.processed)
  test_count = sum(
    1 for lab, pre in zip(test_set.labels, prediction)
    if label_to_int[lab] == pre
  )

  # Accuracy on train/valid, then on test
  print(train_count / len(train_valid_set.processed), test_count / len(test_set.processed))

#prediction = models['NaiveBayes'].predict(test_set.processed)



[0 6 5 3 5 5 1 1 1 5 1 5 5 5 5 5 5 1 5 0]
0.9036434251895691 0.25413223140495866
[0 6 5 3 5 5 1 4 1 5 1 5 1 0 1 5 5 6 0 0]
0.963935638986499 0.365702479338843
[0 6 5 3 2 5 1 0 1 5 1 5 5 2 5 5 5 1 0 5]
0.9750323654521916 0.36363636363636365


In [None]:
# @title Define var models = {'LSTM', lstm_predict(...), ...} 
import joblib

#### NOTE
# Building the predictor through this factory is *crucial*. A closure created
# directly inside a loop looks its variables up when it is called, not when it
# is created, and a loop body has no scope of its own: every closure would end
# up using the pipeline of the last iteration. Passing the pipeline as an
# argument gives each predictor its own binding.
def create_model_predict_func(pipeline, int_to_label):
  """Wrap `pipeline` into a function that returns label names.

  Args:
    pipeline: object whose `predict(sent)` returns an iterable of keys/indices
      into `int_to_label`.
    int_to_label: lookup (indexed with each prediction) giving the label.

  Returns:
    A function taking the sentences to classify and returning the list of
    predicted labels, one per prediction.
  """
  def predict_labels(sent):
    return [int_to_label[cat] for cat in pipeline.predict(sent)]
  return predict_labels

# models[name] is a function: preprocessed sentences -> predicted labels.
models = {}

# NaiveBayes, LogisticRegression, and SVM
nb_lg_svm, nb_lg_svm_params, nb_lg_svm_label_vocab = joblib.load('AIM_NB_LG_SVM.joblib')
# Turn the label -> index mapping into the list of labels ordered by index.
nb_lg_svm_label_vocab = sorted(nb_lg_svm_label_vocab, key=nb_lg_svm_label_vocab.get)


def _keep_sentence(sent):
  """Preprocessor that hands the sentence object through unchanged."""
  return sent


def _sentence_tokens(sent):
  """Tokenizer that returns the sentence's tokens as strings."""
  return [str(t) for t in sent.tokens]


for model_name in nb_lg_svm:
  pipeline = nb_lg_svm[model_name]
  pipeline['tfidf'].preprocessor = _keep_sentence
  pipeline['tfidf'].tokenizer = _sentence_tokens
  models[model_name] = create_model_predict_func(pipeline, nb_lg_svm_label_vocab)

# LSTM (only the entry stored under 'DECA' is used)
lstm_params, lstm_model = joblib.load('AIM_LSTM.joblib')
lstm_model = lstm_model['DECA']


def _lstm_predict(processed_text):
  """Run NNPredictStanford with the LSTM model, vocabularies and padding."""
  return NNPredictStanford(
    lstm_model['model'], lstm_model['text_vocab'], lstm_model['label_vocab'],
    lstm_params['_padded_string_length'], processed_text
  )


models['LSTM'] = _lstm_predict

# CNN (only the entry stored under 'DECA' is used)
cnn_params, cnn_model = joblib.load('AIM_CNN.joblib')
cnn_model = cnn_model['DECA']


def _cnn_predict(processed_text):
  """Run NNPredictStanford with the CNN model, vocabularies and padding."""
  return NNPredictStanford(
    cnn_model['model'], cnn_model['text_vocab'], cnn_model['label_vocab'],
    cnn_params['_padded_string_length'], processed_text
  )


models['CNN'] = _cnn_predict
# Make sure that all models are working as expected
test_set = aim_parsed_text.StanfordFiles(["DECA"])
test_set.Preprocess(test_set)
sentences = [sent for sent, _ in test_set]
labels = [lab for _, lab in test_set]

# Predictions of every model on the same sentences, in this fixed order:
# 0 NaiveBayes, 1 LogisticRegression, 2 SVM, 3 LSTM, 4 CNN
ttt = [
  models[name](sentences)
  for name in ('NaiveBayes', 'LogisticRegression', 'SVM', 'LSTM', 'CNN')
]


def _match_rate(first, second):
  """Fraction of positions where `first` and `second` hold equal values.

  The denominator is the number of NaiveBayes predictions (`ttt[0]`), which
  is shared by every comparison.
  """
  return np.sum([a == b for a, b in zip(first, second)]) / len(ttt[0])


five_floats = "{:.6f} {:.6f} {:.6f} {:.6f} {:.6f}"

# Accuracy of each model against the gold labels
print(five_floats.format(*[_match_rate(predicted, labels) for predicted in ttt]))

# Agreement between models: NB/LR, NB/SVM, LR/SVM, NB/LSTM, LSTM/CNN
print(
  five_floats.format(
    *[
      _match_rate(ttt[left], ttt[right])
      for left, right in ((0, 1), (0, 2), (1, 2), (0, 3), (3, 4))
    ]
  )
)




0.254132 0.365702 0.363636 0.274793 0.271694
0.621901 0.561983 0.720041 0.258264 0.457645


In [None]:
# @title Confusion matrices

# TODO:
#  * Change code so that augmented sentences are created *within* project
#  * Models are trained using other data (include DECA?)
#     e.g., For bootstrap, models are trained using tensorflow, docker, vscode,
#           and DECA. Augmented sentences are created using bootstrap sentences
#           and "tested" using model described above.

import pandas as pd
import numpy as np

# acc[model]       : boolean array (category, category, sentence) - True when
#                    the model's prediction is one of the two source categories
# array[model]     : DataFrame with the mean of acc[model] over the sentences
# upper_tri[model] : display copy of array[model] with a blank lower triangle
acc = {}
array = {}
upper_tri = {}
# Not used in this cell; left in place in case another cell relies on it.
count = np.zeros((len(categories), len(categories)))
cat2int = {cat:ix for ix, cat in enumerate(categories)}
# The third axis is sized from the first category pair; the assignment below
# needs every pair to hold that many swapped sentences.
n_swapped = len(next(iter(swapped.values())))
for model_name, model in models.items():
  print(model_name)

  # Plain `bool`: the `np.bool` alias no longer exists in current NumPy.
  acc[model_name] = np.zeros((len(categories), len(categories), n_swapped), dtype=bool)

  for target_categories, swapped_sentences in swapped.items():
    res = [c in target_categories for c in model(swapped_sentences)]
    tcix = cat2int[target_categories[0]]
    tcjx = cat2int[target_categories[1]]
    # Fill both (i, j) and (j, i) so the matrix is symmetric.
    acc[model_name][tcix, tcjx, :] = acc[model_name][tcjx, tcix, :] = np.array(res)

  array[model_name] = pd.DataFrame(np.mean(acc[model_name], axis=2), categories, categories)

  # Blank the lower triangle on an object-dtype *copy*: array[model_name]
  # keeps its numeric values, and no string is written into a float column.
  upper_tri[model_name] = array[model_name].astype(object)
  for ix in range(upper_tri[model_name].shape[0]):
    for jx in range(ix):
      upper_tri[model_name].iloc[ix,jx] = ''
  
  display(upper_tri[model_name])

import joblib
import os

# Bare expression: it is not the last statement of the cell, so nothing is shown.
array

# Uncomment to overwrite the cached matrices with the ones computed above:
#joblib.dump(array, os.path.join(drive_folder, "out", "Augmented_swapped_matrices.joblib"), compress=9)
matrices_path = os.path.join(drive_folder, "out", "Augmented_swapped_matrices.joblib")
array = joblib.load(matrices_path)
mat = array['LSTM']
# Entries strictly above the diagonal, read row by row.
n_rows, n_cols = mat.shape
upper = [mat.iloc[row, col] for row in range(n_rows) for col in range(row + 1, n_cols)]
print(upper)

NaiveBayes


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,0.9,0.814,0.906,0.788,0.796,0.772,0.71
feature request,,0.648,0.808,0.592,0.54,0.582,0.52
information giving,,,0.878,0.878,0.858,0.806,0.748
information seeking,,,,0.59,0.704,0.66,0.546
problem discovery,,,,,0.702,0.678,0.604
solution proposal,,,,,,0.664,0.47
others,,,,,,,0.664


LogisticRegression


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,0.862,0.808,0.88,0.774,0.746,0.73,0.624
feature request,,0.66,0.798,0.62,0.594,0.614,0.518
information giving,,,0.822,0.874,0.834,0.764,0.714
information seeking,,,,0.686,0.736,0.668,0.648
problem discovery,,,,,0.666,0.676,0.574
solution proposal,,,,,,0.662,0.484
others,,,,,,,0.556


SVM


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,0.872,0.82,0.932,0.674,0.744,0.714,0.608
feature request,,0.664,0.828,0.544,0.598,0.61,0.496
information giving,,,0.894,0.864,0.886,0.81,0.798
information seeking,,,,0.51,0.664,0.588,0.466
problem discovery,,,,,0.682,0.674,0.564
solution proposal,,,,,,0.63,0.416
others,,,,,,,0.484


LSTM


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,0.916,0.872,0.916,0.916,0.864,0.82,0.838
feature request,,0.776,0.88,0.862,0.748,0.776,0.8
information giving,,,0.868,0.9,0.928,0.902,0.874
information seeking,,,,0.992,0.878,0.868,0.852
problem discovery,,,,,0.854,0.828,0.83
solution proposal,,,,,,0.798,0.748
others,,,,,,,0.87


CNN


Unnamed: 0,aspect evaluation,feature request,information giving,information seeking,problem discovery,solution proposal,others
aspect evaluation,0.932,0.886,0.958,0.936,0.88,0.852,0.838
feature request,,0.716,0.89,0.836,0.738,0.792,0.77
information giving,,,0.93,0.922,0.942,0.876,0.922
information seeking,,,,0.982,0.856,0.84,0.826
problem discovery,,,,,0.858,0.798,0.728
solution proposal,,,,,,0.768,0.776
others,,,,,,,0.948


[0.92, 0.918, 0.922, 0.878, 0.818, 0.896, 0.838, 0.842, 0.764, 0.776, 0.826, 0.87, 0.91, 0.86, 0.814, 0.886, 0.848, 0.858, 0.786, 0.854, 0.786]


In [None]:
# @title X
import pandas as pd
import numpy as np
import joblib

import os  # used for the paths below; this cell did not import it itself


def _upper_triangle(mat):
  """Return the entries strictly above the diagonal of `mat`, row by row.

  Args:
    mat: a pandas DataFrame (accessed through `.shape` and `.iloc`).

  Returns:
    A list with mat.iloc[i, j] for every i < j.
  """
  n_rows, n_cols = mat.shape
  return [mat.iloc[ix, jx] for ix in range(n_rows) for jx in range(ix + 1, n_cols)]


array = joblib.load(os.path.join(drive_folder, "out", "Augmented_swapped_matrices.joblib"))
over = joblib.load(os.path.join(drive_folder, "out", "tfidf_inout_overlap.joblib"))

# One matrix per row/column of the correlation table, in display order.
matrices = {
  "SVM": array['SVM'],
  "CNN": array['CNN'],
  "LSTM": array['LSTM'],
  "tfidf": over[0],
  "inout": over[1],
}
names = list(matrices)

# Correlation between the upper triangles of the matrices (one row each).
corr = np.corrcoef(np.vstack([_upper_triangle(mat) for mat in matrices.values()]))

# Object dtype so that the formatted strings below can be stored without
# writing text into float columns.
results = pd.DataFrame(corr, index=names, columns=names).astype(object)
for ii in range(len(names)):
  for jj in range(len(names)):
    # Keep only the cells above the diagonal, with two decimals.
    results.iloc[ii, jj] = "{:.2f}".format(corr[ii, jj]) if ii < jj else ""
display(results)

In [None]:
# @title Bag of words MDS - computations
import sklearn.manifold
import matplotlib.pyplot as plt
import numpy as np
import joblib
import os

array = joblib.load(os.path.join(drive_folder, "out", "Augmented_swapped_matrices.joblib"))
mat = array['CNN']
# Force a unit diagonal and mirror the upper triangle into the lower one so
# that the matrix is symmetric.
for ii in range(len(mat.index)):
  for jj in range(len(mat.columns)):
    if ii == jj:
      mat.iloc[ii,jj] = 1.0
    elif ii > jj:
      mat.iloc[ii, jj] = mat.iloc[jj, ii]

print(mat)

# Each row of `mat` is embedded as one 2-D point. The seed makes the layout
# the same on every run.
mds = sklearn.manifold.MDS(dissimilarity="euclidean", metric=True, random_state=0)
mds_r = mds.fit_transform(mat)
plt.figure(figsize=(7,7))
colors = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 1], [.75, .75, 0], [1, 0, 1], [0.25,0.25, 1]]
# Plot labels live under their own name so the global `categories` list,
# which the confusion-matrix cell uses for lookups, is not overwritten.
category_labels = [
  'aspect evaluation', 'feature request', 'information giving',
  'information seeking', 'problem discovery', 'solution proposal', 'others'
]
# One word per line keeps the labels narrow.
category_labels = [c.replace(' ', '\n') for c in category_labels]
text_labels = []
plt.rcParams.update({'font.size':16})
for ix, (color, cat) in enumerate(zip(colors, category_labels)):
  plt.scatter(mds_r[ix,0], mds_r[ix,1], c=np.array(color, ndmin=2))
  text_labels.append(plt.text(mds_r[ix, 0]+.005, mds_r[ix, 1], cat))


def _nudge(text, dx=0.0, dy=0.0):
  """Shift a matplotlib Text by (dx, dy) in data coordinates."""
  x, y = text.get_position()
  text.set_position((x + dx, y + dy))


# Hand-tuned offsets for three of the labels.
# NOTE(review): they were chosen for one particular layout - re-check them
# against the seeded layout.
_nudge(text_labels[0], dy=-0.03)
_nudge(text_labels[1], dy=-0.02)
_nudge(text_labels[4], dx=-0.1, dy=-0.02)
plt.show()

In [None]:
import sklearn.manifold
import matplotlib.pyplot as plt
import numpy as np
import joblib
 
# plt.figure(figsize=(7,7))
# _, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,4))

# Scatter the points of `mds_r`, one colour per class index.
# NOTE(review): `label_names` and `label` are not defined in this cell -
# confirm which earlier cell provides them.
plt.figure(figsize=(7,7))
ax = [plt]
colors = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 1], [1, 1, 0], [1, 0, 1], [0.25,0.25, 1]]
for class_ix in range(len(label_names)):
    rgba = colors[class_ix]
    # Fourth colour component (0.5) appended to the RGB triple, in place.
    rgba.append(0.5)
    # Rows of mds_r whose entry in `label` equals this class index.
    members = [point for point, lab in zip(mds_r, label) if class_ix == lab]
    ax[0].scatter(
        [point[0] for point in members],
        [point[1] for point in members],
        c = np.array(rgba, ndmin=2)
    )

# predicted = []
# for target_project in set(project):
#     predicted.extend([int(predictions[target_project][t]) for t,p in zip(text, project) if p == target_project])

# predictions = Predict(aim.cpu(), text)
# colors = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 1], [1, 1, 0], [1, 0, 1], [0.25,0.25, 1]]
# ct = 0
# for ii in range(len(label_names)):
#     colors[ii].extend([0.5])
#     ct += len([x for x,pre in zip(mds_r[:,0],predicted) if ii == pre])
#     ax[1].scatter(
#         [x for x,pre in zip(mds_r[:,0],predicted) if ii == pre],
#         [y for y,pre in zip(mds_r[:,1],predicted) if ii == pre],
#         c = np.array(colors[ii], ndmin=2)
#     )
# plt.show();
