In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import math
import string
import regex as re
from gensim.utils import tokenize

import fasttext

from gensim.parsing.preprocessing import lower_to_unicode, preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric

from tqdm.auto import tqdm
tqdm.pandas()

# Functions

In [2]:
# class for fasttext language detection
class LanguageIdentification:
    """Small wrapper around a pretrained fastText language-identification model."""

    def __init__(self, pretrained_lang_model='../../../fasttext/lid.176.bin'):
        """Load the fastText model.

        Parameters
        ----------
        pretrained_lang_model : str
            Path to the pretrained model file; defaults to the location
            used throughout this notebook.
        """
        self.model = fasttext.load_model(pretrained_lang_model)

    def predict_lang(self, text, k=5):
        """Return the model's prediction for the `k` best matching languages.

        Parameters
        ----------
        text : str
            Text to classify.
        k : int
            Number of top matching languages to return (default 5).

        Returns
        -------
        The value returned by the model's ``predict``; callers in this file
        read element 0 as the labels and element 1 as the probabilities.
        """
        # fastText's predict works on a single line and rejects embedded newlines,
        # so they are replaced by spaces before the text is handed to the model
        single_line = text.replace('\n', ' ')
        return self.model.predict(single_line, k=k)

# function to apply the fasttext model and allow at most 4 distinct non-latin chars
def assign_language(text):
    """Decide whether `text` should be treated as English.

    The text is lower-cased and passed through the gensim filters for tags,
    punctuation, repeated whitespace and numerics, then classified with the
    module-level ``LANGUAGE`` detector.

    Returns
    -------
    bool
        True only if 'en' is among the predicted languages, has the highest
        probability, and the cleaned text contains at most four distinct
        characters matching the non-Latin script pattern below. False in
        every other case, including when nothing is left after cleaning.
    """
    filters = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric]
    cleaned = ' '.join(preprocess_string(lower_to_unicode(text), filters))

    # nothing left to classify
    if not cleaned.strip():
        return False

    prediction = LANGUAGE.predict_lang(cleaned)
    langs = [label.split('__')[-1] for label in prediction[0]]
    probs = prediction[1]

    # English must be one of the candidates ...
    if 'en' not in langs:
        return False
    # ... and it must be the best-scoring one
    if not (probs[langs.index('en')] == max(probs)):
        return False

    # distinct characters belonging to any of the listed non-Latin scripts
    foreign_chars = set(re.findall(r'(\p{IsArabic}|\p{IsArmenian}|\p{IsBengali}|\p{IsBopomofo}|\p{IsBraille}|\p{IsBuhid}|\p{IsCanadian_Aboriginal}|\p{IsCherokee}|\p{IsCyrillic}|\p{IsDevanagari}|\p{IsEthiopic}|\p{IsGeorgian}|\p{IsGreek}|\p{IsGujarati}|\p{IsGurmukhi}|\p{IsHan}|\p{IsHangul}|\p{IsHanunoo}|\p{IsHebrew}|\p{IsHiragana}|\p{IsKannada}|\p{IsKatakana}|\p{IsKhmer}|\p{IsLao}|\p{IsLimbu}|\p{IsMalayalam}|\p{IsMongolian}|\p{IsMyanmar}|\p{IsOgham}|\p{IsOriya}|\p{IsRunic}|\p{IsSinhala}|\p{IsSyriac}|\p{IsTagalog}|\p{IsTagbanwa}|\p{IsTaiLe}|\p{IsTamil}|\p{IsTelugu}|\p{IsThaana}|\p{IsThai}|\p{IsTibetan}|\p{IsYi})', cleaned))
    return len(foreign_chars) <= 4

# function to flag titles with fewer than 5 words
def cleanse_by_length(text):
    """Tell whether a title is long enough to keep.

    The title is lower-cased and passed through the gensim filters for tags,
    punctuation and repeated whitespace before its tokens are counted.

    Returns
    -------
    bool
        True if at least five tokens remain, False otherwise.
    """
    filters = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces]
    tokens = preprocess_string(lower_to_unicode(text), filters)
    return len(tokens) >= 5

# function for cluster-based cleansing
def cleanse_by_mainentity_heuristic(cur_cluster):
    """Find offers whose title shares too few words with the rest of their cluster.

    Parameters
    ----------
    cur_cluster : pandas.DataFrame
        Rows of one cluster. Must contain an 'id' column and a
        'title_processed' column holding a sized iterable of words per row.

    Returns
    -------
    list
        The 'id' values of the rows to remove. A row is listed when the number
        of its words that occur in at least ``ceil(cluster size / 3)`` rows is
        not larger than ``floor(average words per row / 4)``.
    """
    cur_cluster = cur_cluster.set_index('id', drop=False)
    titles = cur_cluster['title_processed']
    cluster_len = len(cur_cluster)

    # an empty cluster has nothing to remove (and no average length to compute)
    if cluster_len == 0:
        return []

    # count in how many rows each word occurs and sum up the title lengths
    word_dict = {}
    total_words = 0
    for words in titles:
        total_words += len(words)
        for word in words:
            word_dict[word] = word_dict.get(word, 0) + 1

    # both thresholds are the same for every row, so compute them once
    min_word_count = math.ceil(cluster_len / 3)
    max_overlap = math.floor((total_words / cluster_len) / 4)

    to_remove = []
    # Series.items() replaces Series.iteritems(), which no longer exists in pandas 2.x
    for offer_id, words in titles.items():
        overlap = sum(1 for word in words if word_dict[word] >= min_word_count)
        if overlap <= max_overlap:
            to_remove.append(offer_id)

    return to_remove
    
# shared detector instance read by assign_language; constructing it loads the fasttext model
LANGUAGE = LanguageIdentification()



# Language Identification and Deduplication

In [None]:
# load the preprocessed corpus
corpus = pd.read_pickle('../../../data/interim/wdc-lspc/corpus/preprocessed_lspcV2020.pkl.gz')

# language detection runs on title and description together; missing values count as empty text
title_and_description = corpus['title'].fillna('') + ' ' + corpus['description'].fillna('')
corpus['is_en'] = title_and_description.progress_apply(assign_language)

# keep the offer attributes plus the new flag and store the intermediate result
columns_to_keep = [
    'id', 'brand', 'title', 'description', 'price', 'priceCurrency', 'specTableContent', 'cluster_id',
    'sku', 'mpn',
    'gtin', 'gtin8', 'gtin12', 'gtin13', 'gtin14', 'productID', 'identifier', 'is_en',
]
corpus = corpus[columns_to_keep]
corpus.to_pickle('../../../data/interim/wdc-lspc/corpus/preprocessed_lspcV2020_only_en_strict.pkl.gz')

  0%|          | 0/98900648 [00:00<?, ?it/s]

# Deduplication on title, description, brand and specTableContent

In [None]:
# keep only offers flagged as English that have a title
corpus = corpus[corpus['is_en'] == True]
corpus = corpus.dropna(subset=['title'])

# deduplication key: title, description, brand and spec table joined by spaces
# (missing values count as empty text), lower-cased with whitespace collapsed
dedup_key = corpus['title']
for column in ['description', 'brand', 'specTableContent']:
    dedup_key = dedup_key + ' ' + corpus[column].fillna('')
dedup_key = dedup_key.apply(lambda x: ' '.join(x.lower().split()))

# keep the first offer for every key
corpus = corpus[~dedup_key.duplicated()]

columns_to_keep = [
    'id', 'brand', 'title', 'description', 'price', 'priceCurrency', 'specTableContent', 'cluster_id',
    'sku', 'mpn',
    'gtin', 'gtin8', 'gtin12', 'gtin13', 'gtin14', 'productID', 'identifier', 'is_en',
]
corpus = corpus[columns_to_keep]

corpus.to_pickle('../../../data/interim/wdc-lspc/corpus/dedup_preprocessed_lspcV2020_only_en_strict.pkl.gz')

# Remove short titles

In [None]:
# flag offers whose title passes the length check, then keep only those
corpus['has_long_title'] = corpus['title'].progress_apply(cleanse_by_length)

columns_to_keep = [
    'id', 'brand', 'title', 'description', 'price', 'priceCurrency', 'specTableContent', 'cluster_id',
    'sku', 'mpn',
    'gtin', 'gtin8', 'gtin12', 'gtin13', 'gtin14', 'productID', 'identifier', 'is_en', 'has_long_title',
]
corpus = corpus.loc[corpus['has_long_title'] == True, columns_to_keep]
corpus.to_pickle('../../../data/interim/wdc-lspc/corpus/dedup_preprocessed_lspcV2020_only_en_strict_only_long_title.pkl.gz')

# Apply cluster-based cleansing

In [None]:
import multiprocessing as mp
from tqdm.contrib.concurrent import process_map

# only clusters with more than one offer are checked by the heuristic
corpus_selection = corpus[['title', 'cluster_id', 'id']].copy()
counts = corpus_selection['cluster_id'].value_counts()
counts = counts[counts > 1]
corpus_selection = corpus_selection[corpus_selection['cluster_id'].isin(counts.index)]

clusters = list(set(corpus_selection['cluster_id']))
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces]

# turn every title into the set of its preprocessed tokens
corpus_selection['title_processed'] = corpus_selection['title'].apply(lower_to_unicode)
corpus_selection['title_processed'] = corpus_selection['title_processed'].apply(preprocess_string, args=(CUSTOM_FILTERS,))
corpus_selection['title_processed'] = corpus_selection['title_processed'].apply(set)
corpus_selection = corpus_selection[['title_processed', 'cluster_id', 'id']]

# index by cluster id and sort so the rows of one cluster can be looked up with .loc
corpus_selection = corpus_selection.set_index('cluster_id', drop=False)
corpus_selection = corpus_selection.sort_index()

# every selected cluster has at least two rows, so each lookup yields a DataFrame
clusters_data = [corpus_selection.loc[cluster] for cluster in tqdm(clusters)]

batch = 10000
n_workers = 20

# run the heuristic per cluster in parallel; each result is a list of offer ids to drop
results = process_map(cleanse_by_mainentity_heuristic,clusters_data, max_workers=n_workers, chunksize=batch)

to_remove = set()

for result in tqdm(results):
    to_remove.update(result)

# The heuristic returns values of the 'id' column, so match on that column rather than
# on the index labels of corpus. This also avoids handing a set to .loc, which
# pandas >= 2.0 rejects with a TypeError.
corpus['is_mainentity'] = ~corpus['id'].isin(to_remove)

corpus = corpus[corpus['is_mainentity'] == True]
corpus = corpus[['id', 'brand', 'title', 'description', 'price', 'priceCurrency', 'specTableContent', 'cluster_id', 
                  'sku', 'mpn', 
                  'gtin', 'gtin8', 'gtin12', 'gtin13', 'gtin14', 'productID', 'identifier', 'is_en', 'has_long_title', 'is_mainentity']]
corpus.to_pickle('../../../data/interim/wdc-lspc/corpus/dedup_preprocessed_lspcV2020_only_en_strict_only_long_title_only_mainentity.pkl.gz')