# Library

In [1]:
# Colab-only setup: mount Google Drive at /content/drive so the project
# folder used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
import tensorflow as tf
import matplotlib.pyplot as plt
import datetime
import gc

# Download the NLTK data packages, then import the tokenizer helper.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize

# NOTE(review): tensorflow is already imported as `tf` above; this repeat is redundant.
import tensorflow as tf
import tensorflow.keras as keras

# NOTE(review): this import resolves the top-level `keras` package, not the
# `keras` alias bound on the line above — confirm both refer to the same Keras build.
from keras.preprocessing.sequence import pad_sequences

# Seed TensorFlow's global random generator.
tf.random.set_seed(123)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
cd /content/drive/MyDrive/AISIA/Jira recommendation/

/content/drive/MyDrive/AISIA/Jira recommendation


# Load dataset

In [4]:
def encode_graph(row):
  """Encode every entry of ``row`` as a two-element one-hot list.

  An entry equal to 0 becomes ``[1, 0]``; any other entry becomes ``[0, 1]``.
  Returns a new list holding one pair per entry, in the original order.
  """
  return [[1, 0] if value == 0 else [0, 1] for value in row]

In [5]:
def load_project(project_name):
  """Read the attribute table and the encoded graph of one project.

  Reads ``data/<project_name>/attribute_preprocess.csv`` (missing values are
  replaced with empty strings) and ``data/<project_name>/graph.csv`` (passed
  through ``DataFrame.apply(encode_graph)``).

  Returns:
    (df, graph): the attribute DataFrame and the encoded graph as a NumPy array.
  """
  # Attributes
  attributes = pd.read_csv('data/{}/attribute_preprocess.csv'.format(project_name)).fillna('')

  # Graph
  raw_graph = pd.read_csv('data/{}/graph.csv'.format(project_name), delimiter=',')
  encoded_graph = raw_graph.apply(encode_graph).values

  return attributes, encoded_graph

## GloVe

In [None]:
def load_word_embeddings(fname):
    """Parse a GloVe-style text file into a ``{token: vector}`` dictionary.

    Every non-empty line is expected to contain a token followed by its
    space-separated vector components.

    Args:
        fname: Path to the embeddings text file (read as UTF-8).

    Returns:
        dict mapping each token to a 1-D ``np.float32`` array, in file order.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    wordvecs = {}
    # Iterate over the file object instead of calling readlines(): the latter
    # keeps every raw line of a multi-gigabyte file in memory alongside the
    # parsed vectors.
    with open(fname, 'r', encoding='utf-8') as file:
        for line in file:
            tokens = line.rstrip('\n').split(' ')
            if len(tokens) < 2:
                # Blank (or vector-less) line: storing it would add an empty
                # vector and make the later matrix construction ragged.
                continue
            wordvecs[tokens[0]] = np.array(tokens[1:], dtype=np.float32)

    return wordvecs

# Load the GloVe table once; the following cells split it into a vocabulary
# and a vector list and then delete this dictionary name.
wordvecs = load_word_embeddings("embedding/glove.42B.300d.txt")

In [None]:
# Reserved embedding rows; real vocabulary words are numbered from 2 upward.
# Row 0 is reserved for padding (the Embedding layer is built with mask_zero=True).
__PADDED_INDEX__ = 0 
# Index that glove_tokenizer assigns to tokens missing from the vocabulary.
__UNKNOWN_WORD__ = 1

In [None]:
# Split the dictionary into a vocabulary and a list of vectors that share the
# same (insertion) order, then drop the `wordvecs` name.
# NOTE(review): `vocab` is a keys view that still references the dictionary,
# so the dictionary itself is presumably only released once `vocab` is deleted.
vocab = wordvecs.keys()
matrix = list(wordvecs.values())
del wordvecs
gc.collect()

In [None]:
# Number the vocabulary words 2, 3, ... in vocabulary order; indices 0 and 1
# stay reserved for padding and unknown words.
word_to_index = dict(zip(vocab, range(2, len(vocab) + 2)))
del vocab
gc.collect()

In [None]:
# Prepend two all-zero rows (indices 0 and 1, reserved for padding and unknown
# words) so that row numbers line up with the values stored in word_to_index.
embedding_matrix = np.pad(matrix, [[2,0],[0,0]], mode='constant', constant_values =0.0)
# The vector list is no longer needed once the padded matrix exists.
del matrix
gc.collect()

In [None]:
# Sanity check: the matrix should have exactly two more rows than there are
# vocabulary words (the two reserved zero rows).
len(word_to_index), len(embedding_matrix)

# Model

### Get features

In [None]:
def glove_tokenizer(sentences):
  """Turn raw texts into lists of GloVe vocabulary indices.

  Each text is split with nltk's ``word_tokenize``. A token present in
  ``word_to_index`` is replaced by its index; any other token is replaced by
  ``__UNKNOWN_WORD__``. The lookup is an exact, case-sensitive match.

  Returns:
    A list containing one list of integer indices per input text.
  """
  return [
      [word_to_index.get(token, __UNKNOWN_WORD__)
       for token in nltk.tokenize.word_tokenize(text)]
      for text in sentences
  ]

In [None]:
def get_textual_features(df):
  """Extract whitespace-normalised title, description and summary texts.

  Runs of spaces are collapsed to a single space and leading/trailing
  whitespace is stripped. The input DataFrame is left untouched: the cleaned
  values are computed on the side instead of being written back into ``df``.

  Args:
    df: DataFrame with string columns ``title``, ``description`` and ``summary``.

  Returns:
    (titles, descriptions, summaries): three NumPy arrays with one cleaned
    string per row of ``df``.

  Raises:
    KeyError: if one of the three columns is missing.
  """
  def _clean(column):
    # Same normalisation for every column: collapse space runs, then strip.
    return df[column].str.replace("[ ]+", " ", regex=True).str.strip().values

  titles = _clean("title")
  descriptions = _clean("description")
  summaries = _clean("summary")

  return titles, descriptions, summaries

In [None]:
def return_model(value_maxlen):
  """Build a model that maps padded index sequences to frozen GloVe vectors.

  The model consists of a single non-trainable Embedding layer initialised
  from the global ``embedding_matrix``; index 0 is treated as padding
  (``mask_zero=True``). The model summary is printed as a side effect.

  Args:
    value_maxlen: Length of the (padded) input index sequences.

  Returns:
    A compiled ``keras.Model`` whose output holds one embedding vector per
    input position.
  """
  # The shape must be a tuple: ``(value_maxlen)`` is just an int, which newer
  # Keras versions refuse to interpret as a shape.
  inputs_A = keras.Input(shape=(value_maxlen,), name="input_a")

  embedding_layer = keras.layers.Embedding(
      input_dim=embedding_matrix.shape[0],
      output_dim=embedding_matrix.shape[1],
      embeddings_initializer=tf.keras.initializers.Constant(value=embedding_matrix),
      trainable=False,
      mask_zero=True)

  # Embedding
  emb_A = embedding_layer(inputs_A)

  model = keras.Model(inputs=[inputs_A], outputs=emb_A)
  # Nothing is trained here (the only layer is frozen); compile() is kept so
  # the returned model is configured exactly as before.
  model.compile(optimizer="Adam", loss="mse", metrics=["categorical_accuracy"])

  model.summary()

  return model

### Start

In [None]:
# One tuple per project: (project_name, time_split, steps_per_epoch, epochs, batch_size).
list_project_names = [('FLUME', 1577, 5, 200, 256), ('MDLSITE', 4100, 12, 200, 256)]

In [None]:
def _join_text_fields(*fields):
  """Concatenate parallel text fields row by row, separated by single spaces."""
  return [' '.join(parts) for parts in zip(*fields)]


def _export_glove_features(texts, value_maxlen, save_path):
  """Embed ``texts`` with the frozen GloVe model and save the result.

  Args:
    texts: Sequence of raw strings to embed.
    value_maxlen: Number of token positions kept per text; shorter texts are
      post-padded and longer ones post-truncated.
    save_path: Output directory, ending with ``/``. It is created if missing
      and receives ``textual_features.npy``.

  Raises:
    OSError: if the output directory cannot be created.
  """
  # makedirs also creates missing parents and is a no-op for an existing
  # directory, so real failures surface here instead of being swallowed.
  os.makedirs(save_path, exist_ok=True)

  model = return_model(value_maxlen)
  tokenized = glove_tokenizer(texts)
  padded = pad_sequences(tokenized, maxlen=value_maxlen, padding='post', truncating="post")
  del tokenized
  gc.collect()

  feature = model.predict(padded)
  del padded
  gc.collect()

  np.save(save_path + 'textual_features.npy', feature)
  # Release the large intermediates before the next feature set is built.
  del feature
  del model
  gc.collect()


for project in list_project_names:
  # Only the project name is needed to export embeddings; the remaining tuple
  # fields (time_split, steps_per_epoch, epochs, batch_size) are not used here.
  project_name = project[0]

  # Load dataset
  df, graph = load_project(project_name)

  # Get features
  titles, descriptions, summaries = get_textual_features(df)
  del df
  del graph
  gc.collect()

  # Save path
  path = 'embedding/glove/{}/'.format(project_name)

  # All textual features
  _export_glove_features(_join_text_fields(descriptions, titles, summaries), 540, path)

  # Only title
  _export_glove_features(titles, 20, path + "title/")

  # Only summary
  _export_glove_features(summaries, 20, path + "summary/")

  # Only description
  _export_glove_features(descriptions, 500, path + "description/")

  # description + title
  _export_glove_features(_join_text_fields(descriptions, titles), 520, path + "description_title/")

  # description + summary
  _export_glove_features(_join_text_fields(descriptions, summaries), 520, path + "description_summary/")

  # title + summary
  # The length is 40 (20 + 20). The original assigned it to a misspelled
  # variable after the model was built, so this set was padded to 520.
  _export_glove_features(_join_text_fields(titles, summaries), 40, path + "title_summary/")