In [135]:
import json
import pandas as pd

In [136]:
def load_from_macsum(file_path):
    """load macdial_flatten jsonl data"""

    with open(file_path, "r") as f:
        raw_data = f.read()
        data = json.loads(raw_data)

    data_length = len(data)

    id_list = [idx for idx in range(data_length)]
    dialogue_list = [sample["article"].replace("</s>", "\n") for sample in data]

    if "summary" in data[0]:
        summary_list = [sample["summary"] for sample in data]
        topic_list = [sample["topic"] for sample in data]

    data_dict = {
        "id": id_list,
        "dialogue": dialogue_list,
        "summary": summary_list,
        "topic": topic_list,
    }

    return data_dict

In [137]:
train_data = load_from_macsum("./data/macdial_flatten/train.json")

In [138]:
train_data.keys()

dict_keys(['id', 'dialogue', 'summary', 'topic'])

In [139]:
train_df = pd.DataFrame.from_dict(train_data)
train_df.head()

Unnamed: 0,id,dialogue,summary,topic
0,0,Industrial Designer : Okay well um . So our de...,Industrial Designer showed what the remote con...,"remote control , detailed design"
1,1,Industrial Designer : Okay well um . So our de...,Industrial Designer; our design looks somethin...,"remote control , detailed design"
2,2,Industrial Designer : Okay well um . So our de...,Marketing said that they have a presentation o...,evaluation criteria
3,3,Industrial Designer : Okay well um . So our de...,Marketing said they had a presentation of eval...,evaluation criteria
4,4,Industrial Designer : Okay well um . So our de...,Industrial Designer showed what the remote con...,trendy fruit


In [140]:
# for dialogue in train_df['dialogue']:
#     print(dialogue)
#     break

In [141]:
group_train_df = train_df.groupby(['dialogue']).agg({'dialogue': 'count', 'id': tuple, 'topic': tuple}).reset_index(drop=True)
group_train_df.head()

Unnamed: 0,dialogue,id,topic
0,8,"(1760, 1761, 1762, 1763, 1764, 1765, 1766, 1767)","(efficacy of the law, efficacy of the law, leg..."
1,8,"(90, 91, 92, 93, 94, 95, 96, 97)","(employers , understanding , meaning of Welsh ..."
2,4,"(478, 479, 480, 481)","(local authorities, local authorities, great s..."
3,6,"(1592, 1593, 1594, 1595, 1596, 1597)","(opinions , specialist teacher-training qualif..."
4,10,"(760, 761, 762, 763, 764, 765, 766, 767, 768, ...","(money , supporting teachers , preparation , i..."


In [142]:
group_train_df.describe()

Unnamed: 0,dialogue
count,321.0
mean,7.283489
std,4.510821
min,2.0
25%,4.0
50%,6.0
75%,10.0
max,20.0


In [143]:
# group_train_df.to_excel("group_train_df.xlsx")

In [144]:
column_name_group = "group_id"
column_name_total_topic = "total_topic"
column_name_topic_list = "topic_list"

train_df[column_name_group] = 0
train_df[column_name_total_topic] = 0
train_df[column_name_topic_list] = None

for (index,  ids, total_topic, topic_list) in zip(group_train_df.index, group_train_df['id'], group_train_df['dialogue'], group_train_df['topic']):
    for id in ids:
        train_df.at[id, column_name_group] = index
        train_df.at[id, column_name_total_topic] = len(set(topic_list))
        train_df.at[id, column_name_topic_list] = set(topic_list)

In [145]:
train_df.head()

Unnamed: 0,id,dialogue,summary,topic,group_id,total_topic,topic_list
0,0,Industrial Designer : Okay well um . So our de...,Industrial Designer showed what the remote con...,"remote control , detailed design",36,6,"{evaluation criteria, remote control , detaile..."
1,1,Industrial Designer : Okay well um . So our de...,Industrial Designer; our design looks somethin...,"remote control , detailed design",36,6,"{evaluation criteria, remote control , detaile..."
2,2,Industrial Designer : Okay well um . So our de...,Marketing said that they have a presentation o...,evaluation criteria,36,6,"{evaluation criteria, remote control , detaile..."
3,3,Industrial Designer : Okay well um . So our de...,Marketing said they had a presentation of eval...,evaluation criteria,36,6,"{evaluation criteria, remote control , detaile..."
4,4,Industrial Designer : Okay well um . So our de...,Industrial Designer showed what the remote con...,trendy fruit,36,6,"{evaluation criteria, remote control , detaile..."


In [146]:
train_df.describe()

Unnamed: 0,id,group_id,total_topic
count,2338.0,2338.0,2338.0
mean,1168.5,157.380667,5.157399
std,675.066787,88.318353,2.635955
min,0.0,0.0,1.0
25%,584.25,85.0,3.0
50%,1168.5,156.0,5.0
75%,1752.75,230.0,7.0
max,2337.0,320.0,13.0


In [147]:
train_df.apply(lambda row: row['topic_list'].remove(row['topic']), axis=1)

0       None
1       None
2       None
3       None
4       None
        ... 
2333    None
2334    None
2335    None
2336    None
2337    None
Length: 2338, dtype: object

In [148]:
train_df.head()

Unnamed: 0,id,dialogue,summary,topic,group_id,total_topic,topic_list
0,0,Industrial Designer : Okay well um . So our de...,Industrial Designer showed what the remote con...,"remote control , detailed design",36,6,"{evaluation criteria, button, remote control, ..."
1,1,Industrial Designer : Okay well um . So our de...,Industrial Designer; our design looks somethin...,"remote control , detailed design",36,6,"{evaluation criteria, button, remote control, ..."
2,2,Industrial Designer : Okay well um . So our de...,Marketing said that they have a presentation o...,evaluation criteria,36,6,"{remote control , detailed design, button, rem..."
3,3,Industrial Designer : Okay well um . So our de...,Marketing said they had a presentation of eval...,evaluation criteria,36,6,"{remote control , detailed design, button, rem..."
4,4,Industrial Designer : Okay well um . So our de...,Industrial Designer showed what the remote con...,trendy fruit,36,6,"{evaluation criteria, remote control , detaile..."


In [149]:
only_one_topic = 0

for id in train_df.index:
    if train_df['topic'][id] in train_df['topic_list'][id]:
        print("duplicate")
    if train_df['total_topic'][id] == 1:
        only_one_topic += 1

print("There one topic: ", only_one_topic, 'from total topic :', train_df.shape[0])

There one topic:  150 from total topic : 2338


In [150]:
# train_df.to_excel("train_df.xlsx")

# KeyWords

In [151]:
# let's store the document text into an object 'text':

text = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep learning architectures such as deep neural networks, deep belief networks, recurrent neural networks and convolutional neural networks have been applied to fields including computer vision, machine vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
Artificial neural networks (ANNs) were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, neural networks tend to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) and analog.
The adjective "deep" in deep learning comes from the use of multiple layers in the network. Early work showed that a linear perceptron cannot be a universal classifier, and then that a network with a nonpolynomial activation function with one hidden layer of unbounded width can on the other hand so be. Deep learning is a modern variation which is concerned with an unbounded number of layers of bounded size, which permits practical application and optimized implementation, while retaining theoretical universality under mild conditions. In deep learning the layers are also permitted to be heterogeneous and to deviate widely from biologically informed connectionist models, for the sake of efficiency, trainability and understandability, whence the "structured" part. 
"""
print(text)

Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.
Deep learning architectures such as deep neural networks, deep belief networks, recurrent neural networks and convolutional neural networks have been applied to fields including computer vision, machine vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.
Artificial neural networks (ANNs) were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, neural networks tend to be static a

In [152]:
# !pip install sentence-transformers

In [153]:
# Import the packages

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

from sklearn.metrics.pairwise import cosine_similarity

In [154]:
# We can use 'nGramRange' to set the size of the resulting KeyWords or KeyPhrase candidates that we require. 
# For example, if we set 'nGramRange' to (5, 5) then the result would create 5 Phrases that include 5 keywords.
nGramRange = (5, 5)

# what type of stop words we need to exclude? let's use only English ones
stop_words = "english"

# extract candidate keywords and/or phrases from 'text'
count = CountVectorizer(ngram_range=nGramRange, stop_words=stop_words).fit([text])

# let's collect a list of strings made out of above candidate keywords/keyphrases
candidates = count.get_feature_names_out()

We will use 'Distilbert' as it has been known to show great performances in similarity tasks, which we are also aiming for keyword/keyphrase extraction from the text.

Please read here:

https://www.sbert.net/docs/pretrained_models.html

Although we may find many pre-trained BERT-based models, which we can exploit for keyword extraction in our projects. For example, some of the best available models for Semantic Textual Similarity (STS) are:

1. roberta-large-nli-stsb-mean-tokens
2. roberta-base-nli-stsb-mean-tokens
3. bert-large-nli-stsb-mean-tokens
4. distilbert-base-nli-stsb-mean-tokens

And for Semantic Similarity, some of the models are:

1. distiluse-base-multilingual-cased-v2
2. xlm-r-distilroberta-base-paraphrase-v1
3. xlm-r-bert-base-nli-stsb-mean-tokens
4. distilbert-multilingual-nli-stsb-quora-ranking


However, in this particular case (our tutorial), we will rather stick to either

'distilbert-base-nli-stsb-mean-tokens' or

'xlm-r-distilroberta-base-paraphrase-v1'

simply because these are known to have shown great performances in semantic similarity and paraphrase identification, respectively.

In [155]:
# Let's try 'distilbert-base-nli-stsb-mean-tokens'
# use pre-trained model
# model = SentenceTransformer('distilbert-base-nli-mean-tokens')
model = SentenceTransformer('bert-large-nli-stsb-mean-tokens')

# transform our 'document' 
doc_embedding = model.encode([text])

# and 'candidates' into embedding vectors
candidate_embeddings = model.encode(candidates)

# calculate distances between vectors and document
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# select only top 5 keyphrases from most similar candidates to the input document
top_n = 5
keyphrases = [candidates[index] for index in distances.argsort()[0][-top_n:]]
for i in keyphrases:
    print(i)

.gitattributes:   0%|          | 0.00/744 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
[Errno 28] No space left on device: '/tmp/tmpxvga4msd'

In [18]:
import json
import pandas as pd
# train_df_cadidate = pd.read_excel('./data/dialogsum_topic.xlsx', index_col=0)
train_df_cadidate = pd.read_excel('./data/dialogsum_topic.xlsx', usecols=['top_keyphrases[T-K]', 'tail_keyphrases[T-K]'])

In [19]:
train_df_cadidate.head()

Unnamed: 0,top_keyphrases[T-K],tail_keyphrases[T-K]
0,classes medications help,cancer heart disease
1,vaccination,chickenpox
2,set keys,thank guys
3,didn tell girlfriend,person1 tell love
4,moves,woods


In [20]:
train_df_cadidate.describe()

Unnamed: 0,top_keyphrases[T-K],tail_keyphrases[T-K]
count,12460,12460
unique,10070,9750
top,buy,tomorrow
freq,30,78
