##### Creating the training dataset for fine-tuning DialoGPT from the PersonaChat dataset

In [None]:
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
# Load spaCy's "en_core_web_sm" pipeline once; later cells call `nlp(...)` to read
# each token's part-of-speech tag (`pos_`) and lemma (`lemma_`).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Google Colab's Drive helper.
from google.colab import drive

# Mount Google Drive at /content/drive; the input text file and the output CSV
# used by later cells are both addressed under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
def remove_starting_integer(input_string):
    """Split a leading run of digits off the front of ``input_string``.

    Returns ``(digits, rest)``, where ``digits`` is the leading digit run as a
    string and ``rest`` is the remainder with its leading whitespace stripped.
    When the string does not start with a digit, the input is echoed to stdout
    (prefixed with "-1 ") and ``(-1, input_string)`` is returned.
    """
    prefix = re.match(r'^\d+', input_string)

    # Guard clause: no numeric prefix, so report it and hand the input back.
    if prefix is None:
        print("-1 ", input_string)
        return -1, input_string

    # The match is anchored at position 0, so its end offset is the prefix length.
    remainder = input_string[prefix.end():].lstrip()
    return prefix.group(), remainder

In [None]:
# loading personachat text file
#
# Each line is split on tabs. Field 0 is passed through remove_starting_integer
# to separate a leading number from the utterance text; field 1 is appended as
# the following utterance. Whenever the leading number is "1", an empty string
# is appended first, so each such group of turns in `all_utterances` is preceded
# by a "" entry (presumably marking the start of a new dialogue -- confirm
# against the dataset format).

all_utterances = []

# `with` guarantees the handle is closed even if parsing raises midway.
with open("/content/drive/MyDrive/BTech_Project/train_none_original2.txt", "r") as file:
    for line in file:
        # Drop the line terminator so it can never end up inside an utterance.
        fields = line.rstrip("\n").split("\t")

        # A blank or tab-less line has no second field; skip it instead of
        # crashing with IndexError on fields[1].
        if len(fields) < 2:
            continue

        num, u1 = remove_starting_integer(fields[0])

        if num == "1":
            all_utterances.append("")

        all_utterances.append(u1)
        all_utterances.append(fields[1])

In [None]:
# all_utterances

['',
 'hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .',
 'you must be very fast . hunting is one of my favorite hobbies .',
 'i am ! for my hobby i like to do canning or some whittling .',
 'i also remodel homes when i am not out bow hunting .',
 'that is neat . when i was in high school i placed 6th in 100m dash !',
 'that is awesome . do you have a favorite season or time of year ?',
 'i do not . but i do have a favorite meat since that is all i eat exclusively .',
 'what is your favorite meat to eat ?',
 'i would have to say its prime rib . do you have any favorite foods ?',
 'i like chicken or macaroni and cheese .',
 'do you have anything planned for today ? i think i am going to do some canning .',
 'i am going to watch football . what are you canning ?',
 'i think i will can some jam . do you also play footfall for fun ?',
 'if i have time outside of hunting and remodeling homes . which is not much !',
 '',
 'hi , how are you doing toda

In [None]:
def get_pos_weights(word_pos):
    """Return the weight associated with a part-of-speech tag.

    "NOUN" and "PROPN" map to 2.0, "VERB" to 1.0 and "ADJ" to 0.5; every other
    value maps to 0.0.
    """
    if word_pos in ("NOUN", "PROPN"):
        return 2.0
    if word_pos == "VERB":
        return 1.0
    if word_pos == "ADJ":
        return 0.5
    return 0.0

# The token pattern matches any run of one or more word characters, so
# single-character tokens are kept as features.
vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')

# fit_transform learns the vocabulary and IDF weights from the corpus and then
# scores it. Calling transform() on a vectorizer that has never been fitted
# raises NotFittedError.
tfidf_matrix = vectorizer.fit_transform(all_utterances)

# Get the feature names (words) and their corresponding TF-IDF scores
feature_names = vectorizer.get_feature_names_out()
# NOTE: this densifies the (utterances x vocabulary) matrix, which can be very
# large. It is kept dense because the keyword-extraction cell indexes
# tfidf_scores row by row.
tfidf_scores = tfidf_matrix.toarray()

In [None]:
# extracting keywords for each utterance using term frequency-inverse document frequency (TF-IDF) scores
#
# For every utterance two keywords are recorded:
#   tfidf_keywords      - the token with the highest TF-IDF score
#   tfidf_pos_keywords  - the token with the highest TF-IDF score after scaling
#                         by get_pos_weights() ("" when every weighted score is 0)
# Utterances that yield no tokens get "" in both lists, so both lists stay
# index-aligned with all_utterances.

tfidf_keywords = []
tfidf_pos_keywords = []

# Raw string: "\w" inside a normal string literal is an invalid escape sequence.
punctuations = r"[^\w]"

# Map each vocabulary word to its column once, instead of scanning the whole
# feature_names array with np.where for every token.
feature_column = {word: col for col, word in enumerate(feature_names)}

for row, utterance in enumerate(all_utterances):
    utterance = re.sub(punctuations, " ", utterance)

    doc = nlp(utterance)
    word_pos_tags = [(token.text, token.pos_) for token in doc if token.pos_ != 'SPACE']

    if not word_pos_tags:
        tfidf_keywords.append("")
        tfidf_pos_keywords.append("")
        continue

    # spaCy and the vectorizer tokenise independently, so a spaCy token may not
    # be a vectorizer feature. Score such tokens 0.0 instead of raising
    # IndexError on an empty lookup.
    row_scores = tfidf_scores[row]
    tfidf_values = []
    for word, _pos in word_pos_tags:
        col = feature_column.get(word.lower())
        tfidf_values.append(row_scores[col] if col is not None else 0.0)

    # list.index(max(...)) keeps the first token when several share the top score.
    keyword_index = tfidf_values.index(max(tfidf_values))
    tfidf_keywords.append(word_pos_tags[keyword_index][0])

    tfidf_pos_values = [
        score * get_pos_weights(pos)
        for score, (_word, pos) in zip(tfidf_values, word_pos_tags)
    ]

    # No noun/verb/adjective carries any weight here: record "no keyword".
    if max(tfidf_pos_values) == 0:
        tfidf_pos_keywords.append("")
        continue

    keyword_index = tfidf_pos_values.index(max(tfidf_pos_values))
    tfidf_pos_keywords.append(word_pos_tags[keyword_index][0])

# Preview the first rows (fewer when the corpus holds less than 10 utterances).
for i in range(min(10, len(all_utterances))):
    print(all_utterances[i], tfidf_keywords[i], tfidf_pos_keywords[i])

# ~35min

In [None]:
def lemmatize(text):
    """Run ``text`` through the spaCy pipeline and return its token lemmas joined by single spaces."""
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

# Lemmatize every POS-weighted keyword, one list entry per keyword (including "" entries).
lemmatized_keywords = [lemmatize(keyword) for keyword in tfidf_pos_keywords]

In [None]:
# writing training data to keywords.csv file
# One row per utterance; each column is one of the lists built in the cells above.

keyword_columns = {
    'original_utterances': all_utterances,
    'tfidf_keywords': tfidf_keywords,
    "tfidf_pos_keywords": tfidf_pos_keywords,
    "lemmatized_keywords": lemmatized_keywords,
}
df = pd.DataFrame(keyword_columns)
df.to_csv("/content/drive/MyDrive/BTech_Project/keywords.csv", index=False)