### Library

In [1]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import math
import time

In [3]:
# Change the working directory to the project folder; the relative 'data/...'
# and 'results/...' paths used further down are resolved against it.
cd /content/drive/MyDrive/AISIA/Jira recommendation/

/content/drive/MyDrive/AISIA/Jira recommendation


# Load dataset

In [4]:
def encode_graph(row):
  """One-hot encode every entry of `row`.

  An entry equal to 0 becomes [1, 0]; any other entry becomes [0, 1].
  Returns a new list with one two-element list per input entry.
  """
  return [[1, 0] if value == 0 else [0, 1] for value in row]

In [5]:
def load_project(project_name):
  """Load the attribute table and the encoded link graph of one project.

  Reads 'data/<project_name>/attribute_preprocess.csv' (missing values are
  replaced by empty strings) and 'data/<project_name>/graph.csv' (every
  column is passed through `encode_graph`).

  Returns:
    (df, graph): the attribute DataFrame and the `.values` array of the
    encoded graph.
  """
  # Attributes
  attributes = pd.read_csv('data/{}/attribute_preprocess.csv'.format(project_name)).fillna('')

  # Graph
  raw_graph = pd.read_csv('data/{}/graph.csv'.format(project_name), delimiter=',')
  encoded_graph = raw_graph.apply(encode_graph).values

  return attributes, encoded_graph

# Model

### Get features

In [6]:
def get_string_feature(df):
  """Normalise whitespace in the text columns and return their values.

  For each of "title", "description" and "summary", runs of spaces are
  collapsed to a single space and leading/trailing whitespace is stripped.
  The cleaned text is written back into `df` (the frame is modified in place).

  Returns:
    (title, description, summary): the `.values` arrays of the cleaned columns.
  """
  text_columns = ("title", "description", "summary")

  for column in text_columns:
    collapsed = df[column].str.replace("[ ]+", " ", regex=True)
    df[column] = collapsed.str.strip()

  # Extract data from dataframe
  title, description, summary = (df[column].values for column in text_columns)
  return title, description, summary

In [7]:
def get_time_features(df):
  """Parse the 'created' and 'updated' columns of `df` as datetimes.

  Returns a two-element list: [created timestamps, updated timestamps].
  """
  return [pd.to_datetime(df[column]) for column in ('created', 'updated')]

### Split data

In [8]:
def split_data(createds, graph, time_split):
  """Split node indices into train and test sets by creation time.

  The cut-off is `createds[0]` plus |time_split| days. Nodes created on or
  before the cut-off are train nodes; later nodes are test candidates. A test
  candidate is kept only if at least one entry in its row of `graph` has a
  non-zero second component (i.e. it is linked to something).

  Returns:
    (train_nodes, test_nodes, all_nodes) where all_nodes is
    train_nodes followed by the kept test_nodes.
  """
  # Get date to split data
  cutoff = createds[0] + abs(datetime.timedelta(time_split))

  train_nodes = []
  candidate_nodes = []
  for node in range(len(createds)):
    bucket = train_nodes if createds[node] <= cutoff else candidate_nodes
    bucket.append(node)

  def has_link(node):
    # True as soon as one edge of the node's row is marked as linked.
    return any(edge[1] != 0 for edge in graph[node, :])

  # Delete all lonely nodes in test
  test_nodes = [node for node in candidate_nodes if has_link(node)]

  return train_nodes, test_nodes, train_nodes + test_nodes

## Training

### Pairing

In [9]:
def get_pairs(graph, list_nodes_1, list_nodes_2):
  """Build every (u, v) pair with u from list_nodes_1 and v from list_nodes_2.

  Pairs with u == v are skipped. For each kept pair the label is
  `graph[u][v]`.

  Args:
    graph: indexable as graph[u][v], yielding a two-element label.
    list_nodes_1: list of nodes in input.
    list_nodes_2: list of nodes in dataset.

  Returns:
    (pairs, labels): two float arrays with one row of two values per kept pair.
  """
  n_inputs = len(list_nodes_1)
  n_dataset = len(list_nodes_2)
  capacity = n_inputs * n_dataset

  # Pre-allocate for the worst case; trimmed to the filled part at the end.
  pairs = np.empty((capacity, 2))
  labels = np.empty((capacity, 2))

  filled = 0
  for row in tqdm(range(0, n_inputs)):
    u = list_nodes_1[row]
    for col in range(0, n_dataset):
      v = list_nodes_2[col]
      if u == v:
        continue
      pairs[filled] = [u, v]
      labels[filled] = graph[u][v]
      filled += 1

  return pairs[:filled], labels[:filled]

In [10]:
def get_data_pairs(pair, all_vectors):
  """Compute the Euclidean distance between the two vectors of every pair.

  Args:
    pair: sequence of (u, v) rows; both entries are cast to int and used as
      indices into `all_vectors`.
    all_vectors: indexable collection of feature vectors.

  Returns:
    A list with one `euclidean_distances` result per pair, in input order.
  """
  from sklearn.metrics.pairwise import euclidean_distances

  distances = []
  for row in tqdm(range(len(pair))):
    u, v = int(pair[row][0]), int(pair[row][1])
    distances.append(euclidean_distances([all_vectors[u]], [all_vectors[v]]))

  return distances

## Recommend

In [11]:
def recommend_function(createds, test_nodes, test_pair, pred_proba, max_day_gap=60):
  """Collect, for every test node, its candidate nodes with their scores.

  A pair (u, v) is considered only if the two nodes were created at most
  `max_day_gap` days apart. For a considered pair, (v, score) is appended to
  u's list if u is a test node, and (u, score) is appended to v's list if v
  is a test node.

  NOTE: nothing is de-duplicated here. If `test_pair` contains both (u, v)
  and (v, u) for two test nodes, each of their lists receives the same
  candidate twice.

  Args:
    createds: creation timestamps, indexable by node id.
    test_nodes: node ids to build recommendation lists for.
    test_pair: sequence of (u, v) rows; entries are cast to int.
    pred_proba: one score per row of `test_pair`.
    max_day_gap: largest allowed difference, in whole days, between the
      creation dates of the two nodes of a pair (default 60).

  Returns:
    A list parallel to `test_nodes`; element k is a list of
    (candidate_node, score) tuples for test_nodes[k].
  """
  total_size = len(test_nodes)
  recommend_s = [[] for _ in range(total_size)]

  # Make dictionary of test_nodes and position of test_nodes in list
  index_dictionary = dict(zip(test_nodes, range(total_size)))

  for position in tqdm(range(len(test_pair))):
    pair = test_pair[position]
    u = int(pair[0])
    v = int(pair[1])
    if abs((createds[u]-createds[v]).days) <= max_day_gap:
      proba = pred_proba[position]
      if u in index_dictionary:
        recommend_s[index_dictionary[u]].append((v, proba))

      if v in index_dictionary:
        recommend_s[index_dictionary[v]].append((u, proba))

  return recommend_s

In [12]:
def Acc(pred, gt):
  """Hit indicator: 1.0 if any item of `pred` is in `gt`, otherwise 0."""
  for item in pred:
    if item in gt:
      return 1.0
  return 0

In [13]:
def MRR(pred, gt):
  """Reciprocal rank of the first item of `pred` found in `gt`.

  Ranks start at 1. Returns 0 when no item of `pred` is in `gt`.
  """
  reciprocal_ranks = (
      1.0/rank for rank, item in enumerate(pred, start=1) if item in gt
  )
  return next(reciprocal_ranks, 0)

In [14]:
def Precision_Recall(pred, gt):
  """Precision and recall of a recommendation list against the ground truth.

  `right` counts the ground-truth items that appear in `pred`.

  Args:
    pred: recommended items.
    gt: ground-truth (relevant) items.

  Returns:
    (precision, recall): right/len(pred) and right/len(gt). Either value is 0
    when its denominator would be zero, so an empty `pred` or an empty `gt`
    never raises ZeroDivisionError.
  """
  right = sum(1 for item in gt if item in pred)  # relevant items retrieved

  precision = right/len(pred) if len(pred) != 0 else 0
  # Guard recall the same way precision is guarded: empty ground truth -> 0.
  recall = right/len(gt) if len(gt) != 0 else 0

  return precision, recall

In [15]:
def NDCG_score(pred, gt):
  """Binary-relevance NDCG of `pred` against `gt`.

  Every item of `pred` that is in `gt` counts as a hit with gain 1.0,
  discounted by log2(rank + 1) with ranks starting at 1. The ideal DCG is
  computed as if the same number of hits occupied ranks 1..number_of_hits
  (it depends on the hit count, not on len(gt)).

  Returns 0.0 when there is no hit.
  """
  # Get positions (1-based) of right results
  hit_ranks = [rank for rank, item in enumerate(pred, start=1) if item in gt]

  if not hit_ranks:
    return 0.0

  dcg = 0
  idcg = 0
  for ideal_rank, rank in enumerate(hit_ranks, start=1):
    # 1.0 because all true results are the same relevant
    dcg += 1.0/math.log(rank+1, 2)
    idcg += 1.0/math.log(ideal_rank+1, 2)
  return dcg/idcg

In [16]:
def metrics(recommend, label):
  """Average accuracy, MRR, precision and recall over all recommendation lists.

  Entries whose ground-truth list `label[i]` is empty contribute nothing to
  the sums, but the averages are still taken over len(recommend).

  Args:
    recommend: one recommendation list per evaluated node.
    label: one ground-truth list per evaluated node, parallel to `recommend`.

  Returns:
    (accuracy, mrr, precision, recall) as floats. All four are 0.0 when
    `recommend` is empty, instead of raising ZeroDivisionError.
  """
  total = len(recommend)
  if total == 0:
    # Nothing to evaluate; avoid dividing by zero below.
    return 0.0, 0.0, 0.0, 0.0

  acc = 0.0
  mrr = 0.0
  precision = 0.0
  recall = 0.0
  for i in range(total):
    if len(label[i]) != 0:
      acc += Acc(recommend[i], label[i])
      mrr += MRR(recommend[i], label[i])
      node_precision, node_recall = Precision_Recall(recommend[i], label[i])
      precision += node_precision
      recall += node_recall
  return acc/total, mrr/total, precision/total, recall/total

### List of recommendations

In [17]:
def get_result(input):
  """Rank the candidates of every test node and append top-k metrics to a file.

  Args:
    input: a six-element sequence
      (project_name, createds, test_nodes, all_nodes, test_pair, pred_proba).
      All but `project_name` and `all_nodes` are forwarded to
      `recommend_function`.

  Side effects:
    Appends a report to "results/euclidean.txt": the project name followed by
    Accuracy, MRR and Recall for the top 1, 2, 3, 5, 10, 20, 30 and 50
    candidates.

  NOTE: the ground truth is read from the module-level variable `graph`,
  which is not part of `input`; it must already hold the graph of the same
  project when this function is called.
  """
  project_name, createds, test_nodes, all_nodes, test_pair, pred_proba = input
  recommend_s = recommend_function(createds, test_nodes, test_pair, pred_proba)

  # Sort the candidates of each test node by ascending score and keep only
  # the node ids. One array per test node is built directly, so no ragged
  # list is ever handed to np.array (recent NumPy versions reject that).
  ranked_nodes = []
  for candidates in recommend_s:
    ordered = sorted(candidates, key=lambda item: item[1], reverse=False)
    ranked_nodes.append(np.array([node for node, _ in ordered], dtype=int))

  # Ground truth: every node of all_nodes that the test node is linked to.
  y_test = []
  for i in tqdm(range(len(test_nodes))):
    test_node = test_nodes[i]
    linked = [node for node in all_nodes if graph[test_node, node][1] != 0]
    y_test.append(linked)

  # `with` guarantees the file is closed even if a metric computation fails.
  with open("results/euclidean.txt", "a") as f:
    f.write(project_name + '\n')

    for top in (1, 2, 3, 5, 10, 20, 30, 50):
      top_candidates = [nodes[:top] for nodes in ranked_nodes]

      f.write('Top {}:'.format(top))
      f.write('\n')
      metric = metrics(top_candidates, y_test)
      f.write('Accuracy = ' + repr(metric[0]))
      f.write('\n')
      f.write('MRR = ' + repr(metric[1]))
      f.write('\n')
      f.write('Recall = ' + repr(metric[3]))
      f.write('\n')

# Main program

### Start

In [18]:
# One tuple per project. The main loop below reads only the first two fields:
#   [0] project name (the folder under data/),
#   [1] time_split, the number of days after the first issue's creation date
#       that bounds the training set (see split_data).
# The remaining three values are not read by the main loop.
list_project_names = [('FLUME', 1577, 5, 200, 256), ('MDLSITE', 4100, 12, 200, 256)]

In [19]:
# Run the whole pipeline once per project: load, split by time, build test
# pairs, fit TF-IDF on the training text, score pairs by Euclidean distance
# and append the top-k metrics to the results file.
for project in list_project_names:
  project_name = project[0]
  time_split = project[1]

  # Load dataset (get_result reads `graph` as a module-level variable)
  df, graph = load_project(project_name)

  # Split data
  createds, updateds = get_time_features(df)
  train_nodes, test_nodes, all_nodes = split_data(createds, graph, time_split)

  # Pairing
  test_pair, test_label = get_pairs(graph, test_nodes, all_nodes)

  # Get string features
  title, description, summary = get_string_feature(df)

  train_title = [title[i] for i in train_nodes]
  train_description = [description[i] for i in train_nodes]
  train_summary = [summary[i] for i in train_nodes]

  # Tfidf: vocabularies are fitted on the training nodes only
  tfidf_title = TfidfVectorizer()
  tfidf_title.fit(train_title)

  tfidf_description = TfidfVectorizer()
  tfidf_description.fit(train_description)

  tfidf_summary = TfidfVectorizer()
  tfidf_summary.fit(train_summary)

  # tf_idf all texts
  all_title = tfidf_title.transform(title).toarray()
  all_description = tfidf_description.transform(description).toarray()
  all_summary = tfidf_summary.transform(summary).toarray()

  # Named `all_features` rather than `all` so the builtin all() is not
  # shadowed for the rest of the kernel session.
  all_features = np.concatenate((all_description, all_title, all_summary), axis=1)

  # Calculate distance
  test_data = get_data_pairs(test_pair, all_features)
  get_result([project_name, createds, test_nodes, all_nodes, test_pair, test_data])

100%|██████████| 227/227 [00:02<00:00, 90.36it/s]
100%|██████████| 619256/619256 [03:07<00:00, 3294.27it/s]
100%|██████████| 619256/619256 [00:25<00:00, 24409.31it/s]
  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 227/227 [00:00<00:00, 622.48it/s]
100%|██████████| 795/795 [00:16<00:00, 46.85it/s]
100%|██████████| 4120485/4120485 [23:06<00:00, 2970.94it/s]
100%|██████████| 4120485/4120485 [02:50<00:00, 24181.15it/s]
  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 795/795 [00:04<00:00, 181.37it/s]
