# Library

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
import tensorflow as tf
import matplotlib.pyplot as plt
import datetime
import gc

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize

import tensorflow as tf
import tensorflow.keras as keras

from keras.utils import pad_sequences

tf.random.set_seed(123)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Work from the project folder on the mounted Drive: the relative 'data/...' and
# 'embedding/...' paths used by the cells below are resolved against this directory.
cd /content/drive/MyDrive/TaDeR-A-New-Task-Dependency-Recommendation-for-Project-Management-Platform/

/content/drive/MyDrive/TaDeR-A-New-Task-Dependency-Recommendation-for-Project-Management-Platform


# Load dataset

In [None]:
def encode_graph(row):
  """One-hot encode every entry of `row`.

  An entry equal to 0 becomes [1, 0]; any other value becomes [0, 1].
  Returns a new list with one two-element list per entry of `row`.
  """
  return [[1, 0] if cell == 0 else [0, 1] for cell in row]

In [None]:
def load_project(project_name):
  """Read one project's attribute table and graph from `data/<project_name>/`.

  Returns:
    (df, graph) where `df` is attribute_preprocess.csv with missing values
    replaced by empty strings, and `graph` is the `.values` of graph.csv
    after `encode_graph` has been applied to it through `DataFrame.apply`.
  """
  project_dir = 'data/{}/'.format(project_name)

  # Attributes
  attributes = pd.read_csv(project_dir + 'attribute_preprocess.csv').fillna('')

  # Graph
  encoded = pd.read_csv(project_dir + 'graph.csv', delimiter=',').apply(encode_graph)

  return attributes, encoded.values

# FastText

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip

In [None]:
!unzip crawl-300d-2M-subword.zip

In [None]:
def load_word_embeddings(fname):
    """Load word vectors from a fastText-style text (.vec) file.

    Each data line is expected to look like ``<word> <v1> <v2> ... <vd>`` with
    single spaces as separators. If the first line consists of exactly two
    integers it is treated as the ``<num_words> <dim>`` header and skipped
    instead of being stored as a (wrongly shaped) word vector.

    Args:
        fname: path to the vector file (read as UTF-8).

    Returns:
        dict mapping each word to a 1-D ``np.float32`` array, in file order.
    """
    wordvecs = {}
    # Explicit encoding so the result does not depend on the platform locale.
    with open(fname, 'r', encoding='utf-8') as file:
        first_line = file.readline()
        header = first_line.split()
        has_header = len(header) == 2 and all(tok.isdigit() for tok in header)
        if not has_header:
            # No header: the first line is real data, so start over.
            file.seek(0)
        # The header's word count lets tqdm show a proper progress bar while streaming.
        total = int(header[0]) if has_header else None

        # Iterate the file lazily rather than readlines(), which would hold every
        # line of the file in memory at once next to the parsed vectors.
        for line in tqdm(file, total=total):
            # rstrip() drops the newline and any trailing space, which would
            # otherwise produce an empty last token that cannot be parsed as float.
            tokens = line.rstrip().split(' ')
            if len(tokens) < 2:
                # Blank or malformed line without any vector components.
                continue
            wordvecs[tokens[0]] = np.array(tokens[1:], dtype=np.float32)

    return wordvecs

# Parse the fastText vectors into a {word: vector} dict
# (presumably the .vec file extracted from the zip above — verify the file name).
wordvecs = load_word_embeddings('crawl-300d-2M-subword.vec')

100%|██████████| 2000001/2000001 [01:49<00:00, 18298.12it/s]


In [None]:
# Rows 0 and 1 of the embedding matrix are reserved: 0 for padding, 1 for
# out-of-vocabulary words. Both are all-zero vectors.
__PADDED_INDEX__ = 0
__UNKNOWN_WORD__ = 1

vocab = wordvecs.keys()
matrix = list(wordvecs.values())

# Real words start right after the two reserved rows.
word_to_index = {
    word: row for row, word in enumerate(vocab, start=__UNKNOWN_WORD__ + 1)
}

# Prepend the two reserved zero rows so that row i holds the vector of index i.
embedding_matrix = np.pad(
    matrix,
    ((__UNKNOWN_WORD__ + 1, 0), (0, 0)),
    mode='constant',
    constant_values=0.0,
)

In [None]:
# Sanity check: the matrix should have two more rows than the vocabulary
# (the reserved padding and unknown-word rows).
len(vocab), len(embedding_matrix)

(2000000, 2000002)

# Get features

In [None]:
def FastText_tokenizer(sentences):
  """Turn raw sentences into lists of embedding-matrix row indices.

  Every sentence is split with NLTK's `word_tokenize`; each token is looked up
  in `word_to_index`, and tokens that are not in the vocabulary are mapped to
  `__UNKNOWN_WORD__`.

  Returns:
    A list with one list of integer indices per input sentence.
  """
  return [
      [word_to_index.get(token, __UNKNOWN_WORD__)
       for token in nltk.tokenize.word_tokenize(sentence)]
      for sentence in sentences
  ]

In [None]:
def get_textual_features(df):
  """Normalise whitespace in the text columns and return them as arrays.

  Runs of spaces in "title", "description" and "summary" are collapsed to a
  single space and leading/trailing whitespace is stripped. The cleaned
  columns are written back into `df` (the frame is modified in place).

  Returns:
    (titles, descriptions, summaries): the `.values` of the three columns.
  """
  for column in ("title", "description", "summary"):
    df[column] = df[column].str.replace("[ ]+", " ", regex=True).str.strip()

  # Extract data from dataframe
  return df['title'].values, df['description'].values, df['summary'].values

In [None]:
def return_model(value_maxlen):
  """Build a frozen embedding-lookup model for sequences of length `value_maxlen`.

  The model consists of a single non-trainable Embedding layer initialised from
  the global `embedding_matrix`, so `model.predict(padded)` maps an integer
  array of shape (n, value_maxlen) to the corresponding word vectors.

  Args:
    value_maxlen: length every input index sequence is padded/truncated to.

  Returns:
    A compiled `keras.Model` (its summary is printed as a side effect).
  """
  # `shape` must be a tuple; `(value_maxlen)` without the trailing comma is just
  # an int, which Keras does not guarantee to accept.
  inputs_A = keras.Input(shape=(value_maxlen,), name="input_a")

  embedding_layer = keras.layers.Embedding(
      input_dim=embedding_matrix.shape[0],
      output_dim=embedding_matrix.shape[1],
      embeddings_initializer=tf.keras.initializers.Constant(value=embedding_matrix),
      trainable=False,   # pure lookup: the pre-trained vectors are never updated
      mask_zero=True)    # index 0 is the padding index

  # Embedding
  emb_A = embedding_layer(inputs_A)

  model = keras.Model(inputs=[inputs_A], outputs=emb_A)
  # The model has no trainable weights and is only used with predict(); the
  # optimizer/loss/metric are kept so the returned model is compiled exactly as before.
  model.compile(optimizer="Adam", loss="mse", metrics=["categorical_accuracy"])

  model.summary()

  return model

### Start

In [None]:
# The per-project loops below write to embedding/FastText/<project>/, so the
# nested directory is what has to exist (a top-level "FastText" folder is never
# used). makedirs creates missing parents and is a no-op when re-run.
os.makedirs("embedding/FastText", exist_ok=True)

In [None]:
# One tuple per project, in the order the loops below read them:
# (project_name, time_split, steps_per_epoch, epochs, batch_size).
list_project_names = [('FLUME', 1577, 5, 200, 256), ('MDLSITE', 4100, 12, 200, 256)]

In [None]:
def _save_text_embeddings(texts, value_maxlen, save_path):
  """Embed `texts` with the frozen FastText lookup model and save the result.

  Args:
    texts: sequence of strings, one per task.
    value_maxlen: each tokenized text is padded / truncated (at the end) to
      this many tokens.
    save_path: output directory, ending in '/'. Created (with parents) if
      missing. The array is written to `<save_path>textual_features.npy`.
  """
  os.makedirs(save_path, exist_ok=True)

  tokenized = FastText_tokenizer(texts)
  padded = pad_sequences(tokenized, maxlen=value_maxlen, padding='post', truncating="post")
  del tokenized
  gc.collect()

  # Build the model only once the input is ready, and drop it right after
  # predict(): every model carries its own 600M-parameter embedding layer.
  model = return_model(value_maxlen)
  feature = model.predict(padded)
  del padded
  del model
  gc.collect()

  np.save(save_path + 'textual_features.npy', feature)
  del feature
  gc.collect()


for project in list_project_names:
  # Only project_name is needed here; the remaining fields are unpacked to keep
  # the same names bound as before.
  project_name, time_split, steps_per_epoch, epochs, batch_size = project

  # Load dataset
  df, graph = load_project(project_name)

  # Get features
  titles, descriptions, summaries = get_textual_features(df)
  del df
  del graph
  gc.collect()

  # Save path
  path = 'embedding/FastText/{}/'.format(project_name)

  # (sub-directory, max token length, texts) for every feature variant.
  variants = [
      # All textual features
      ('', 540,
       [d + ' ' + t + ' ' + s for d, t, s in zip(descriptions, titles, summaries)]),
      # Only title
      ('title/', 20, titles),
      # Only summary
      ('summary/', 20, summaries),
      # Only description
      ('description/', 500, descriptions),
      # description + title
      ('description_title/', 520,
       [d + ' ' + t for d, t in zip(descriptions, titles)]),
  ]

  for subdir, value_maxlen, texts in variants:
    _save_text_embeddings(texts, value_maxlen, path + subdir)

  del variants
  gc.collect()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_a (InputLayer)        [(None, 540)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 540, 300)          600000600 
                                                                 
Total params: 600,000,600
Trainable params: 0
Non-trainable params: 600,000,600
_________________________________________________________________
Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_a (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding_6 (Embedding)     (None, 20, 300)           600000600 
                                                                 
Total params: 600,000,600
Tr

In [None]:
for project in list_project_names:
  # Only project_name is needed here; the remaining fields are unpacked to keep
  # the same names bound as before.
  project_name, time_split, steps_per_epoch, epochs, batch_size = project

  # Load dataset
  df, graph = load_project(project_name)

  # Get features
  titles, descriptions, summaries = get_textual_features(df)
  del df
  del graph
  gc.collect()

  # Save path
  path = 'embedding/FastText/{}/'.format(project_name)

  # description + summary
  value_maxlen = 520
  save_path = path + "description_summary/"
  # Creates the project folder and the variant folder in one go. Unlike the
  # previous bare `except`, an already existing folder is silent and a real
  # failure (e.g. permissions) surfaces here instead of being swallowed.
  os.makedirs(save_path, exist_ok=True)

  model = return_model(value_maxlen)
  all_text = [d + ' ' + s for d, s in zip(descriptions, summaries)]
  tokenized = FastText_tokenizer(all_text)

  padded = pad_sequences(tokenized, maxlen=value_maxlen, padding='post', truncating="post")
  del tokenized
  gc.collect()

  feature = model.predict(padded)
  del padded
  gc.collect()

  np.save(save_path + 'textual_features.npy', feature)
  del all_text
  del model
  gc.collect()

In [None]:
for project in list_project_names:
  # Only project_name is needed here; the remaining fields are unpacked to keep
  # the same names bound as before.
  project_name, time_split, steps_per_epoch, epochs, batch_size = project

  # Load dataset
  df, graph = load_project(project_name)

  # Get features
  titles, descriptions, summaries = get_textual_features(df)
  del df
  del graph
  gc.collect()

  # Save path
  path = 'embedding/FastText/{}/'.format(project_name)

  # title + summary
  # value_maxlen has to be set BEFORE the model is built: the model's input
  # length must match the length the sequences are padded to below.
  value_maxlen = 40
  save_path = path + "title_summary/"
  # Creates the project folder and the variant folder in one go. Unlike the
  # previous bare `except`, an already existing folder is silent and a real
  # failure (e.g. permissions) surfaces here instead of being swallowed.
  os.makedirs(save_path, exist_ok=True)

  model = return_model(value_maxlen)
  all_text = [t + ' ' + s for t, s in zip(titles, summaries)]
  tokenized = FastText_tokenizer(all_text)
  padded = pad_sequences(tokenized, maxlen=value_maxlen, padding='post', truncating="post")
  del tokenized
  gc.collect()

  feature = model.predict(padded)
  del padded
  gc.collect()

  np.save(save_path + 'textual_features.npy', feature)
  del all_text
  del model
  gc.collect()