In [None]:
# !pip3 install pyspellchecker
# !pip install -U textblob

In [None]:
import os

import pandas as pd

In [None]:
# Load the three input tables from CSV files in the current working directory.
topics = pd.read_csv('topics.csv')
content = pd.read_csv('content.csv')
correlations = pd.read_csv('correlations.csv')

In [None]:
# Replace missing titles/descriptions with empty strings.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on a column selection: that form is chained assignment, which newer pandas
# versions warn about and which does not update the parent frame once
# copy-on-write is active.
topics['title'] = topics['title'].fillna("")
content['title'] = content['title'].fillna("")
topics['description'] = topics['description'].fillna("")
content['description'] = content['description'].fillna("")
# Sort by title length to make inference faster
topics['length'] = topics['title'].map(len)
content['length'] = content['title'].map(len)
topics = topics.sort_values('length')
content = content.sort_values('length')

In [None]:
# Report the shape of each loaded table, one table per output line.
print(
    f"topics.shape: {topics.shape}",
    f"content.shape: {content.shape}",
    f"correlations.shape: {correlations.shape}",
    sep="\n",
)

In [None]:
# Display the topics table (after NaN filling and sorting by title length).
topics

In [None]:
# Display the content table (after NaN filling and sorting by title length).
content

In [None]:
# clean text
from textblob import TextBlob
import re
import string


def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular forms "won't" and "can't" are rewritten first, then the
    generic suffix rules (n't, 're, 's, 'd, 'll, 't, 've, 'm) are applied in
    that order. Only the ASCII apostrophe is recognised.

    Args:
        phrase: Text to expand.

    Returns:
        The text with the matched contractions replaced by their long forms.
    """
    # Specific: irregular forms. The first letter is captured so a
    # capitalised "Won't"/"Can't" becomes "Will not"/"Can not"; without this
    # they would fall through to the generic n't rule and yield "Wo not" /
    # "Ca not".
    phrase = re.sub(r"([Ww])on't", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an't", r"\1an not", phrase)

    # General: suffix rules, applied in this fixed order.
    general_rules = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),  # note: also rewrites possessive 's
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in general_rules:
        phrase = re.sub(pattern, replacement, phrase)

    return phrase

def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A deletion-only translation table removes all punctuation in one pass.
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)

def clean_number(text):
    """Normalise how numbers are written inside ``text``.

    Three rewrites are applied in order:
      1. A space is inserted between a digit run and a directly following
         ASCII letter ("3kg" -> "3 kg").
      2. Ordinal suffixes split by step 1 (or already written apart) are
         re-attached ("5 th" -> "5th").
      3. Commas that sit between two digits are removed ("1,000" -> "1000").

    Args:
        text: Text to normalise.

    Returns:
        The rewritten text.
    """
    # Replacement templates are raw strings: "\g" in a normal string literal
    # is an invalid escape sequence.
    text = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', text)
    # A word boundary (instead of a required trailing space) also re-joins an
    # ordinal at the very end of the text or before non-space whitespace.
    text = re.sub(r'(\d+) (th|st|nd|rd)\b', r'\g<1>\g<2>', text)
    # Lookarounds consume only the comma, so every separator in a number such
    # as "1,000,000" is removed rather than just the first.
    text = re.sub(r'(?<=\d),(?=\d)', '', text)
    return text

def clean_whitespace(text):
    """Trim ``text`` and squeeze every internal whitespace run to one space."""
    trimmed = text.strip()
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", trimmed)

def clean_repeat_words(text):
    """Drop one character from a doubled character inside each word.

    Despite the name, this works on characters, not words: in every run of
    word characters the last place where a character is immediately repeated
    loses one copy ("hellooo" -> "helloo", "book" -> "bok"). Runs with no
    doubled character are left unchanged.
    """
    doubled_char = r"(\w*)(\w)\2(\w*)"
    keep_single = r"\1\2\3"
    return re.sub(doubled_char, keep_single, text)

def clean_text(text):
    """Run the full cleaning pipeline on one value and return the result.

    The value is coerced with ``str()`` and then passed, in order, through
    ``decontracted``, ``remove_punctuations``, ``clean_number`` and
    ``clean_whitespace``.
    """
    # Spell correction with TextBlob is left disabled:
    # text = str(TextBlob(text).correct())
    cleaned = str(text)
    pipeline = (decontracted, remove_punctuations, clean_number, clean_whitespace)
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [None]:
# Run clean_text over both free-text topic columns.
for column in ("title", "description"):
    topics[column] = topics[column].apply(clean_text)

In [None]:
# Display the topics table after its title/description were cleaned.
topics

In [None]:
# Run clean_text over the free-text content columns.
content["title"] = content["title"].apply(clean_text)
content["description"] = content["description"].apply(clean_text)
# Unlike title/description, "text" is not NaN-filled in the cells above.
# clean_text coerces with str(), so a missing value would turn into the
# literal string "nan" and end up inside <s_text>...</s_text>. Fill first.
content["text"] = content["text"].fillna("").apply(clean_text)

In [None]:
# Display the content table after its title/description/text were cleaned.
content

In [None]:
# Preview the inner join of topics with correlations (topics.id == correlations.topic_id).
# The result is only displayed, not stored.
pd.merge(
    topics,
    correlations,
    how="inner",
    left_on=["id"],
    right_on=["topic_id"],
)

In [None]:
# Draft of the tagged title/description format (kept for reference, not executed):
# "<s_title>" + topics["title"] + "</s_title>" + "<s_description>" + topics["description"] + "</s_description>"
# Display the correlations table.
correlations

In [None]:
# Load the pair table from train.csv (obtained from Kaggle).
# NOTE(review): the original note says this file adds "not correlations",
# presumably non-matching topic/content pairs; confirm against the file.
all_correlations = pd.read_csv("train.csv")

In [None]:
# Display the pair table loaded from train.csv.
all_correlations

In [None]:
# List the <|kind_...|> tag for every distinct content kind.
# unique() keeps first-appearance order, so the listing is the same on every
# run; iterating a set of strings gives an order that changes between Python
# processes.
[f"<|kind_{kind}|>" for kind in content["kind"].unique()]

In [None]:
from tqdm import tqdm

# Build the tagged text for every topic, keyed by topic id:
# type/language/category/level tags followed by the wrapped title and description.
topic_dict = {}
# iterrows() is a generator with no length, so the row count is passed as
# `total` for tqdm to show a progress fraction; the unused enumerate counter
# is dropped.
for index, row in tqdm(topics.iterrows(), total=len(topics)):
    text = "<|topic|>" + f"<|lang_{row['language']}|>" + f"<|category_{row['category']}|>" + f"<|level_{row['level']}|>"
    text += "<s_title>" + row["title"] + "</s_title>" + "<s_description>" + row["description"] + "</s_description>"
    topic_dict[row["id"]] = text

In [None]:
# Build the tagged text for every content item, keyed by content id:
# type/language/kind tags followed by the wrapped title, description and text.
content_dict = {}
# iterrows() is a generator with no length, so the row count is passed as
# `total` for tqdm to show a progress fraction; the unused enumerate counter
# is dropped.
for index, row in tqdm(content.iterrows(), total=len(content)):
    text = "<|content|>" + f"<|lang_{row['language']}|>" + f"<|kind_{row['kind']}|>"
    text += "<s_title>" + row["title"] + "</s_title>" + "<s_description>" + row["description"] + "</s_description>" + "<s_text>" + row["text"] + "</s_text>"
    content_dict[row["id"]] = text

In [None]:
# Attach the serialized topic/content text to every pair, keeping only the
# first 2048 characters of each. An id missing from the lookup dict raises KeyError.
all_correlations["topic_text"] = [
    topic_dict[topic_id][:2048] for topic_id in all_correlations["topics_ids"]
]
all_correlations["content_text"] = [
    content_dict[content_id][:2048] for content_id in all_correlations["content_ids"]
]

In [None]:
# Display the pair table with the added topic_text/content_text columns.
all_correlations

In [None]:
# Distinct topic ids, in sorted order.
# Iterating a set of strings gives a different order in every Python process
# (string hashing is randomised), so a plain list(set(...)) would feed the
# seeded shuffle below a different starting order after each kernel restart.
all_topic_ids = sorted(set(all_correlations.topics_ids))
len(all_topic_ids)

In [None]:
import math
import random

random.seed(42)

# Number of folds; the per-fold size N is derived from the data so that every
# topic id lands in a fold. (A fixed N = 6152 only covers 10 * 6152 ids; any
# id beyond that would get no fold and map to NaN later.) For id counts from
# 61511 to 61520 the derived N is still 6152.
N_FOLDS = 10
N = math.ceil(len(all_topic_ids) / N_FOLDS)

# Sort before shuffling so the seeded shuffle always starts from the same
# order: this makes the cell give the same folds when re-run and after a
# kernel restart. The list is still shuffled in place.
all_topic_ids.sort()
random.shuffle(all_topic_ids)

# Consecutive slices of the shuffled ids become folds 0 .. N_FOLDS - 1.
fold_dict = {}
for i in range(N_FOLDS):
    keys = all_topic_ids[i * N : (i + 1) * N]
    for k in keys:
        fold_dict[k] = i

In [None]:
# Give every pair the fold of its topic; ids absent from fold_dict map to NaN.
all_correlations["fold"] = all_correlations["topics_ids"].map(fold_dict)

In [None]:
# all_correlations = all_correlations.drop(columns=["title1", "title2"])

In [None]:
# Write the final pair table to siamese_train.csv in the working directory.
# NOTE(review): no index=False is passed, so the frame index is written as an
# extra unnamed first column; confirm the consumer of this file expects that.
all_correlations.to_csv("siamese_train.csv")