# Library

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import networkx as nx
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import datetime
import re
import os
import torch
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore")

In [3]:
cd /content/drive/MyDrive/AISIA/Jira recommendation/

/content/drive/MyDrive/AISIA/Jira recommendation


In [4]:
!pip install html2text



# Load dataset

In [5]:
def encode_graph(row):
  new_row = []
  for i in row:
    if i==0:
      new_row.append([1,0])
    else:
      new_row.append([0,1])
  return new_row

In [6]:
def load_project(project_name):
  # Attributes
  df = pd.read_csv('data/{}/attribute.csv'.format(project_name))
  df = df.fillna('')
  # Graph
  graph = pd.read_csv('data/{}/graph.csv'.format(project_name), delimiter=',')
  graph = graph.apply(encode_graph)
  graph = graph.values
  return df, graph

In [7]:
def load_project_2(project_name):
  # Attributes
  df = pd.read_csv('data/{}/attribute_preprocess.csv'.format(project_name))
  df = df.fillna('')
  # Graph
  graph = pd.read_csv('data/{}/graph.csv'.format(project_name), delimiter=',')
  graph = graph.apply(encode_graph)
  graph = graph.values
  return df, graph

## Preprocessing

In [8]:
# ---- from nltk.stem import LancasterStemmer
import nltk
nltk.download('wordnet')
from nltk import word_tokenize
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

from nltk.corpus import stopwords
nltk.download('stopwords')

# Init stop_words list
stop_words = set(stopwords.words('english')) 
stop_words.add('e.g')
stop_words.add('i.e')
stop_words.add('https')
stop_words.add('http')
stop_words.add('org')
stop_words.add('www')
stop_words.add('href')
stop_words.remove('all')

from bs4 import BeautifulSoup

def stemmer_sentence(sentence):
    result = []
    for word in sentence.split(" "):
        result.append(wordnet_lemmatizer.lemmatize(word))
    return " ".join(result)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def remove_html(text):
  soup = BeautifulSoup(text, 'html.parser')
  return soup.get_text()

def delete_number(text):
  text = text.split()
  text = [text[i] for i in range(len(text)) if not re.findall('\d',text[i])]
  text = [text[i] for i in range(len(text)) if len(text[i])>1]
  text = " ".join(text)
  return text

def get_links(text):
  text = re.findall(r'(https?://\S+)', text)
  text = " ".join(text)
  text = text.replace('"','')
  text = text.replace(')','')
  text = text.replace('(','')
  text = text.replace("'",'')
  text = text.replace(">",'')
  text = text.replace("<",'')
  text = text.replace(",",'')
  text = text.replace("a",'')
  return text

def remove_link(text):
  links = re.findall(r'(https?://\S+)', text)
  links = '|'.join(links)
  text = text.replace(links,'')
  return text

In [10]:
def preprocessing(df):
  # Choose necessary features
  columns = ["title", "description", "summary", "key", "created", "updated"]
  processing_df = df.loc[:, columns]

  # Lowercase all texts
  processing_df["title"] = processing_df["title"].str.lower()
  processing_df["description"] = processing_df["description"].str.lower()
  processing_df["summary"] = processing_df["summary"].str.lower()
  processing_df["key"] = processing_df["key"].str.lower()

  # Get http links
  processing_df["http_links"] = processing_df["description"].apply(get_links)
  processing_df["description"] = processing_df["description"].apply(remove_link)

  # Remove all number 
  processing_df["title"] = processing_df["title"].apply(delete_number)
  processing_df["description"] = processing_df["description"].apply(delete_number)
  processing_df["summary"] = processing_df["summary"].apply(delete_number)

  # Remove html special elements
  processing_df["http_links"] = processing_df["http_links"].apply(remove_html)
  processing_df["description"] = processing_df["description"].apply(remove_html)

  # Remove stopwords
  pat = r'\b(?:{})\b'.format('|'.join(stop_words))
  processing_df["title"] = processing_df["title"].str.replace(pat, '')
  processing_df["description"] = processing_df["description"].str.replace(pat, '')
  processing_df["summary"] = processing_df["summary"].str.replace(pat, '')

  # Remove punctuation and space
  processing_df["title"] = processing_df["title"].str.replace("[^\w]", " ", regex=True).str.replace("[ ]+", " ", regex=True).str.strip()
  processing_df["description"] = processing_df["description"].str.replace("[^\w]", " ", regex=True).str.replace("[ ]+", " ", regex=True).str.strip()
  processing_df["summary"] = processing_df["summary"].str.replace("[^\w]", " ", regex=True).str.replace("[ ]+", " ", regex=True).str.strip()

  # Stemming
  processing_df["title"] = processing_df["title"].apply(stemmer_sentence)
  processing_df["description"] = processing_df["description"].apply(stemmer_sentence)
  processing_df["summary"] = processing_df["summary"].apply(stemmer_sentence)

  return processing_df

# Main program

In [11]:
list_project_names = ['FLUME','MDLSITE']
for project_name in list_project_names:
  # Load dataset
  df_1, graph = load_project(project_name)

  # Preprocessing
  df = preprocessing(df_1)
  display(df.head())
  df.to_csv('data/{}/attribute_preprocess.csv'.format(project_name))

Unnamed: 0,title,description,summary,key,created,updated,http_links
0,jsp web page title tag set,webapps jsp title tag set updated give informa...,jsp web page title tag set,flume-1,2010-06-29 02:44:45,2010-12-13 14:38:01,
1,flume node tail source report error go error s...,rpm install run user flume file var log messag...,flume node tail source report error go error s...,flume-2,2010-06-29 23:29:38,2010-12-13 14:37:45,
2,ant failed due old thrift jar,ant default task failed complete compilation n...,ant failed due old thrift jar,flume-3,2010-06-30 01:33:04,2010-12-13 14:37:48,
3,testconcurrentdfoman test logicalnodes fails i...,see trace belows logical node would checked em...,testconcurrentdfoman test logicalnodes fails i...,flume-4,2010-06-30 07:34:00,2010-12-13 14:37:58,
4,add post sink,web dashboard useful flume fire post event dat...,add post sink,flume-5,2010-06-30 08:36:01,2014-11-05 07:03:46,


Unnamed: 0,title,description,summary,key,created,updated,http_links
0,discussion forum scheduler module,martin could get discussion forum scheduler mo...,discussion forum scheduler module,mdlsite-360,2005-08-27 10:22:52,2013-11-25 15:53:17,
1,add email obfuscation moodle doc,rather writing doc moodle dot would nice mailt...,add email obfuscation moodle doc,mdlsite-129,2006-03-01 03:50:52,2007-06-02 07:38:53,
2,moodle doc wiki seach short word like faq,search faq doc moodle wiki return result note ...,moodle doc wiki seach short word like faq,mdlsite-32,2006-03-30 08:20:08,2008-10-02 23:37:23,http://mil.wikipedi.org/pipermil/mediwiki-l/20...
3,google site map information moodle,google provides lot information site admins tu...,google site map information moodle,mdlsite-1781,2006-04-29 02:43:31,2012-05-15 07:07:13,http://www.google.com/webmsters/sitemps/ http:...
4,add date field plugins database module,would helpful database moodle plugins moodle i...,add date field plugins database module,mdl-6395,2006-06-16 09:11:46,2015-10-08 18:30:25,
