In [82]:
%config IPCompleter.greedy=True
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [83]:
def parse_tag(msg):
    """Split a leading tag off a commit message.

    Two prefix shapes are recognised, both anchored at the start of ``msg``:

    * a bracketed tag such as ``[Fix]`` -- returned lower-cased, brackets kept;
    * a purely alphabetic word shorter than 10 characters followed by ``:``
      (``Docs:``) -- returned lower-cased and wrapped in brackets.

    Returns:
        ``(tag, rest)``. ``tag`` is ``''`` when no prefix is recognised, in
        which case ``rest`` is ``msg`` unchanged. When a prefix is found,
        ``rest`` is the text after it up to (not including) the first newline,
        because ``.`` does not match newlines.
    """
    bracketed = re.match(r'(\[.+?\])(.*)', msg)
    if bracketed is not None:
        label, remainder = bracketed.groups()
        return label.lower(), remainder

    prefixed = re.match(r'([A-Za-z]+?):(.*)', msg)
    # Long "word:" prefixes are treated as ordinary text, not as tags.
    if prefixed is None or len(prefixed.group(1)) >= 10:
        return '', msg

    word, remainder = prefixed.groups()
    return '[{}]'.format(word.lower()), remainder

def handle_tags(line):
    """Split a commit message into leading tag, issue-tracker keys and remaining text.

    The leading tag is extracted with ``parse_tag``. The remaining text is then
    scanned for issue references of the form ``KEY-123`` or ``KEY-CR-123``
    (``KEY`` being upper-case ASCII letters).

    Returns:
        ``(tag, issue_tags, rest)`` where

        * ``tag`` is whatever ``parse_tag`` returned as the tag;
        * ``issue_tags`` is a space-separated string of the distinct project
          keys, lower-cased and wrapped in braces (``'{abc} {xyz}'``), or ``''``
          when none were found;
        * ``rest`` is the text with every issue reference removed, together with
          at most one adjacent non-space character on each side of it (the
          pattern optionally consumes e.g. a leading ``#`` or trailing ``:``).
    """
    # One pattern shared by findall and sub so extraction and removal cannot drift apart.
    issue_pattern = r'[^\n ]??([A-Z]+)-(?:CR-)?\d+[^ ]?'

    tag, rest = parse_tag(line)

    # Sort the de-duplicated keys: iterating a set of str yields a different
    # order from run to run (hash randomisation), which would make the generated
    # message files non-reproducible.
    issue_keys = sorted({key.lower() for key in re.findall(issue_pattern, rest)})
    issue_tags = ' '.join('{{{}}}'.format(key) for key in issue_keys)

    rest = re.sub(issue_pattern, r'', rest)
    return tag, issue_tags, rest

def remove_numbers(line):
    """Delete every stand-alone run of digits from ``line``.

    A run of digits is stand-alone when it is preceded by a space or the start
    of the string and followed by a space or the end of the string. Only the
    digits are deleted; the surrounding spaces are left in place.

    The boundaries are checked with zero-width lookarounds rather than consumed.
    Consuming the trailing space (as a ``( |^)(\\d+)( |$)`` pattern does) leaves
    the next number without a leading space to match, so every second number in
    a run such as ``'1 2 3'`` would survive.
    """
    # (?<![^ ]) : previous char is a space, or there is no previous char.
    # (?= |$)   : next char is a space, or we are at the end of the string.
    return re.sub(r'(?<![^ ])\d+(?= |$)', '', line)

In [84]:
def join_msg(tag, issue_tags, rest):
    """Assemble the final message line from its three parts.

    ``tag`` and ``issue_tags`` are included only when non-empty. ``rest`` is
    always passed through ``tokenize`` and always included, even if the result
    is an empty string. Parts are joined with single spaces.
    """
    parts = [piece for piece in (tag, issue_tags) if piece]
    parts.append(tokenize(rest))
    return ' '.join(parts)

def tokenize(line):
    """Space-separate words and symbols, drop stand-alone numbers, normalise whitespace.

    A space is inserted at every boundary between a word character and a
    "symbol", and between two adjacent symbols. Here a symbol is any character
    outside ``[a-zA-Z0-9_ ]``; note that this class also covers non-ASCII
    letters, which ``\\w`` treats as word characters. Afterwards stand-alone
    numbers are removed via ``remove_numbers`` and all whitespace runs collapse
    to single spaces with no leading or trailing space.
    """
    # Applied in this order; each rule appends a space after the captured character.
    spacing_rules = (
        r'(\w)(?=[^a-zA-Z0-9_ ])',                # word char followed by a symbol
        r'([^a-zA-Z0-9_ ])(?=\w)',                # symbol followed by a word char
        r'([^a-zA-Z0-9_ ])(?=[^a-zA-Z0-9_ ])',    # symbol followed by a symbol
    )
    for rule in spacing_rules:
        line = re.sub(rule, r'\1 ', line)

    return ' '.join(remove_numbers(line).split())

In [85]:
# NOTE(review): nothing in the visible notebook code reads this constant.
# insert_placeholders masks every token that is present in the per-diff TF-IDF
# dict, whatever its score. TODO confirm whether a `score > tfidf_threshold`
# filter was intended, or whether this value is used elsewhere.
tfidf_threshold = 0.05

def insert_placeholders(line, diff_tfidf):
    """Replace every token of ``line`` that occurs in ``diff_tfidf`` with ``ENTITY``.

    Args:
        line: text that is split on whitespace into tokens.
        diff_tfidf: container supporting ``in``; only membership is tested, any
            values stored in it are ignored.

    Returns:
        The tokens re-joined with single spaces, matching tokens replaced by
        the literal ``ENTITY``.
    """
    masked = [
        'ENTITY' if word in diff_tfidf else word
        for word in line.split()
    ]
    return ' '.join(masked)

In [86]:
def process_messages(msgs, tfidf=None, offset=0, placeholder=True):
    """Normalise every message of ``msgs`` in place.

    Each message is split with ``handle_tags``, its remaining text is tokenized
    and, when ``placeholder`` is true, lower-cased and passed through
    ``insert_placeholders``. The parts are then recombined with ``join_msg``
    (which tokenizes the text once more) and written back to the same index.

    Args:
        msgs: mutable sequence of message strings; modified in place.
        tfidf: sequence indexed by ``i + offset`` for message ``i``. It is only
            accessed when ``placeholder`` is true, so it must be provided in
            that case (the default ``None`` cannot be indexed).
        offset: index into ``tfidf`` that corresponds to ``msgs[0]``.
        placeholder: whether to lower-case the text and insert placeholders.

    Returns:
        None.
    """
    for position, raw_msg in enumerate(msgs):
        tag, issue_tags, body = handle_tags(raw_msg)
        body = tokenize(body)
        if placeholder:
            body = insert_placeholders(body.lower(), tfidf[position + offset])
        msgs[position] = join_msg(tag, issue_tags, body)

In [87]:
# Load diffs: test-set lines first, then training-set lines, in a single list.
# Each line is stripped of surrounding whitespace.

diffs = []
with open('generated_data/test.diff', 'r', encoding='utf8') as inf:
    diffs.extend(raw_line.strip() for raw_line in inf)

# Entries [0, testset_len) come from the test file; the training entries follow.
testset_len = len(diffs)

with open('generated_data/train.diff', 'r', encoding='utf8') as inf:
    diffs.extend(raw_line.strip() for raw_line in inf)

In [88]:
%%time
# Build one {term: tf-idf score} dict per diff.

# Tokens are maximal runs of non-whitespace characters; '<nl>' is excluded as a
# stop word. An integer max_df is an absolute document count, so terms that
# occur in more than 100 diffs are dropped from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\S+', stop_words=['<nl>'], max_df=100)
tfidf_matrix = tfidf_vectorizer.fit_transform(diffs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when the installed version has it, and fall back to
# the old name so the notebook still runs on older installations.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    index = tfidf_vectorizer.get_feature_names_out()
else:
    index = tfidf_vectorizer.get_feature_names()

# tfidf[k] describes diffs[k]; only terms with a stored (non-zero) score appear.
tfidf = [
    {index[word_id]: score for word_id, score in zip(row.indices, row.data)}
    for row in tfidf_matrix
]

CPU times: user 7.82 s, sys: 277 ms, total: 8.1 s
Wall time: 8.3 s


In [89]:
# Normalise the test and train message files, overwriting each file in place.
# The offset is the index in `tfidf` of the diff that belongs to the file's
# first message (test diffs were loaded first, train diffs after them).
# NOTE: because input and output paths are the same, re-running this cell
# processes the already-processed messages again.
for msg_path, first_row in (
    ('generated_data/test.msg', 0),
    ('generated_data/train.msg', testset_len),
):
    with open(msg_path, 'r') as inf:
        msgs = list(inf)

    process_messages(msgs, tfidf=tfidf, offset=first_row, placeholder=True)

    with open(msg_path, 'w') as ouf:
        ouf.writelines(msg + '\n' for msg in msgs)

In [93]:
def extract_tag(msg):
    """Return the content of the leading ``[...]`` or ``{...}`` group of ``msg``.

    A bracketed group at the very start of the message is tried first, then a
    braced group. The text between the delimiters is returned with surrounding
    whitespace stripped. If the message starts with neither, ``'no-tag'`` is
    returned.
    """
    for opening_group in (r'\[(.*?)\]', r'\{(.*?)\}'):
        found = re.match(opening_group, msg)
        if found:
            return found.group(1).strip()
    return 'no-tag'

def extract_pure_tag(msg):
    """Return only a leading bracketed tag of ``msg``.

    * Message starts with ``[...]``: the stripped text between the brackets.
    * Message starts with ``{...}`` instead: the empty string.
    * Otherwise: ``'no-tag'``.
    """
    bracketed = re.match(r'\[(.*?)\]', msg)
    if bracketed:
        return bracketed.group(1).strip()
    return '' if re.match(r'\{(.*?)\}', msg) else 'no-tag'

In [91]:
# Write one extract_tag() result per message line, for the test and train sets.
# NOTE: extract_tag must already be defined when this cell runs; its defining
# cell carries a later execution count (In [93]) than this one (In [91]).
for msg_path, tags_path in (
    ('generated_data/test.msg', 'generated_data/test_tags.msg'),
    ('generated_data/train.msg', 'generated_data/train_tags.msg'),
):
    with open(msg_path, 'r') as inf:
        # list(inf) reads every line before the file closes; map itself is lazy.
        msgs = map(extract_tag, list(inf))

    with open(tags_path, 'w') as ouf:
        ouf.writelines(tag + '\n' for tag in msgs)

In [96]:
# Write one extract_pure_tag() result per message line, for the test and train sets.
for msg_path, pure_tags_path in (
    ('generated_data/test.msg', 'generated_data/test_pure_tags.msg'),
    ('generated_data/train.msg', 'generated_data/train_pure_tags.msg'),
):
    with open(msg_path, 'r') as inf:
        # list(inf) reads every line before the file closes; map itself is lazy.
        msgs = map(extract_pure_tag, list(inf))

    with open(pure_tags_path, 'w') as ouf:
        ouf.writelines(tag + '\n' for tag in msgs)