In [None]:
import pandas as pd
import numpy as np
np.random.seed(42)
import random
random.seed(42)

from pdb import set_trace

from statistics import median, mean

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import CountVectorizer

from gensim.parsing.preprocessing import lower_to_unicode, preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric

from tqdm.auto import tqdm
tqdm.pandas()

# Load cleansed PDC2020 corpus

In [None]:
# Read the gzip-compressed pickle of the deduplicated, preprocessed corpus into a
# DataFrame and preview its first rows.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
corpus_path = '../../../data/interim/wdc-lspc/corpus/dedup_preprocessed_lspcV2020_only_en_strict_only_long_title_only_mainentity.pkl.gz'
corpus = pd.read_pickle(corpus_path)
corpus.head()

In [None]:
# Rows per cluster_id, keeping only the cluster_ids that occur more than 3 times.
counts = corpus['cluster_id'].value_counts().loc[lambda sizes: sizes > 3]

# Apply DBSCAN clustering and save the resulting clusters for manual labeling

In [None]:
# Hyper-parameter grid for DBSCAN (currently a single combination).
eps_list = [0.35]
min_samples_list = [1]

# Directory and base name shared by every file written by this cell.
CORPUS_DIR = '../../../data/interim/wdc-lspc/corpus'
CORPUS_NAME = 'dedup_preprocessed_lspcV2020_only_en_strict_only_long_title_only_mainentity'

# Columns (in this order) written to the spreadsheets used for manual labeling.
EXPORT_COLUMNS = ['dbscan_cluster', 'brand', 'title', 'description', 'price', 'priceCurrency',
                  'specTableContent', 'id', 'cluster_id', 'sku', 'mpn', 'gtin', 'gtin8',
                  'gtin12', 'gtin13', 'gtin14', 'productID', 'identifier']


def _print_cluster_stats(heading, labels, suffix=''):
    """Print the number of distinct DBSCAN labels in `labels` and their mean/median size.

    Parameters
    ----------
    heading : str
        Text printed in front of the number of distinct labels.
    labels : pandas.Series
        DBSCAN label of every row under consideration.
    suffix : str
        Text appended to the size line (used to emit an extra blank line).
    """
    sizes = labels.value_counts()
    # statistics.mean/median raise StatisticsError on empty input; report NaN
    # instead so an empty split does not abort the cell before the exports.
    has_clusters = len(sizes) > 0
    mean_size = mean(sizes) if has_clusters else float('nan')
    median_size = median(sizes) if has_clusters else float('nan')
    print(f'{heading}: {len(sizes)}')
    print(f'Mean cluster size: {mean_size}, Median cluster_size: {median_size}{suffix}')


def _export_split(frame, cluster_ids, split, eps, min_sample):
    """Report and export the DBSCAN clusters of one split of the clustered corpus.

    Parameters
    ----------
    frame : pandas.DataFrame
        Clustered rows; must contain 'cluster_id', 'dbscan_cluster' and EXPORT_COLUMNS.
    cluster_ids : iterable
        cluster_id values that belong to this split.
    split : str
        File-name prefix of the split ('seen' or 'unseen'); its capitalised
        form is used as the printed heading.
    eps, min_sample
        DBSCAN parameters, only used to build the spreadsheet file name.

    Returns
    -------
    pandas.DataFrame
        The exported rows: DBSCAN clusters with more than 2 members, sorted by
        'dbscan_cluster' and restricted to EXPORT_COLUMNS.
    """
    print(f'{split.capitalize()} data:')
    selection = frame[frame['cluster_id'].isin(cluster_ids)].copy()
    # -1 is the label DBSCAN assigns to noise points.
    selection = selection[selection['dbscan_cluster'] != -1]
    _print_cluster_stats('Clusters found', selection['dbscan_cluster'])

    # Keep only DBSCAN clusters that have more than 2 members within this split.
    sizes = selection['dbscan_cluster'].value_counts()
    selection = selection[selection['dbscan_cluster'].isin(sizes[sizes > 2].index)]
    selection = selection.sort_values('dbscan_cluster')
    _print_cluster_stats('Clusters >2 found', selection['dbscan_cluster'], suffix='\n')
    selection = selection[EXPORT_COLUMNS]

    selection.to_excel(
        f'{CORPUS_DIR}/{split}_dbscan_eps{eps}_minsamples{min_sample}_{CORPUS_NAME}.xlsx',
        header=True, index=False)

    # NOTE: the two CSV names below do not contain eps/min_sample, so with a
    # larger grid every iteration overwrites the files of the previous one.
    mapping = selection[['cluster_id', 'dbscan_cluster']].drop_duplicates('cluster_id')
    mapping.to_csv(f'{CORPUS_DIR}/{split}_dbscan_mapping.csv', header=True, index=False)
    cluster_labels = selection['dbscan_cluster'].drop_duplicates().sort_values()
    cluster_labels.to_csv(f'{CORPUS_DIR}/{split}_dbscan_clusters.csv', header=True, index=False)
    return selection


# --- Grid-independent preparation (computed once, not per parameter combination) ---

# One row per cluster_id, restricted to the cluster_ids present in `counts`.
corpus_selection = corpus[corpus['cluster_id'].isin(counts.index)].copy()
corpus_selection = corpus_selection.drop_duplicates('cluster_id')

# Normalise titles: lowercase, strip tags and punctuation, collapse whitespace.
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, strip_multiple_whitespaces]
corpus_selection['title_processed'] = corpus_selection['title'].apply(lower_to_unicode)
corpus_selection['title_processed'] = corpus_selection['title_processed'].apply(preprocess_string, args=(CUSTOM_FILTERS,))
corpus_selection['title_processed'] = corpus_selection['title_processed'].apply(lambda x: ' '.join(x))

# Binary bag-of-words over tokens that occur in at least 4 titles.
vectorizer = CountVectorizer(strip_accents='unicode', binary=True, min_df=4)
matrix = vectorizer.fit_transform(corpus_selection['title_processed'])

# Split cluster_ids by their number of rows in the full corpus:
# "unseen" = 4 to 6 rows, "seen" = 7 to 80 rows.
counts_relevant = corpus['cluster_id'].value_counts()

counts_relevant_unseen = counts_relevant[counts_relevant > 3]
counts_relevant_unseen = counts_relevant_unseen[counts_relevant_unseen < 7]

counts_relevant_seen = counts_relevant[counts_relevant > 6]
counts_relevant_seen = counts_relevant_seen[counts_relevant_seen < 81]

# --- Cluster and export for every parameter combination ---

for eps in eps_list:
    for min_sample in min_samples_list:
        print(f'eps: {eps}, min_samples: {min_sample}')

        dbscan = DBSCAN(metric='cosine', eps=eps, min_samples=min_sample)
        clustering = dbscan.fit(matrix)
        corpus_selection['dbscan_cluster'] = clustering.labels_

        corpus_selection_seen = _export_split(
            corpus_selection, counts_relevant_seen.index, 'seen', eps, min_sample)
        corpus_selection_unseen = _export_split(
            corpus_selection, counts_relevant_unseen.index, 'unseen', eps, min_sample)

        print('-------------------------------------------------------------------------')