In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

In [2]:
from collections import Counter
from itertools import chain
from deep_translator import GoogleTranslator
from ftlangdetect import detect
import re
import random

Считывание корпуса хэштегов

In [319]:
with open(r"C:full_tag_list.txt", "r", encoding="utf-8") as file:
    hashtags = file.readlines()

In [320]:
hashtags_all=[]
for st in hashtags:
    hashtags_all.append(st.strip().split())

### Обработка

In [321]:
# определение языка
def get_lang(string_obj):
    result = detect(text=string_obj, low_memory=False)
    return result['lang']

In [322]:
# Отбрасываем хэштеги, если гиперонимы из списка
banned_hypernyms = ['emotion','feeling', 'quality', 'temperature']

In [323]:
def get_hypernyms(word, stop_hyp):
    current_hypernyms = ''
    flag_stop_word = False
    flag_empty = False
    first_meaning = ''
    for i,j in enumerate(wn.synsets(word)):
        if i == 0:
            first_meaning = j.name().split('.')[0]
        current_hypernyms += ', ' + ", ".join(list(chain(*[l.lemma_names() for l in j.hypernyms()])))
    for w in stop_hyp:
        if w in current_hypernyms:
            flag_stop_word = True
    if current_hypernyms == '':
        flag_empty = True
    return first_meaning, flag_stop_word, flag_empty

In [None]:
data_remade = []
for row in hashtags_all:
    row_remade = []
    for tag in row:
        # автоматическая чистка основаных мусорных хэштегов
        if tag in ['instagram', 'instapic', 'instagrammers', 'instagramers', 'instagramhub']:
            tag = ''
        elif 'like' in tag:
            tag = ''
        elif 'ofinstagram' in tag:
            tag = re.sub(r'ofinstagram', '', tag)
        elif 'instagram' in tag:
            tag = re.sub(r'instagram', '', tag)
        elif 'insta' in tag:
            tag = re.sub(r'insta', '', tag)
        
        if tag != '':
            lang = get_lang(tag)
            if lang != 'en':
                try:
                    # перевод иноязычных хэштегов, только после FastText модели!
                    tag = GoogleTranslator(source=lang, target='en').translate(tag)
                except:
                    tag = GoogleTranslator(source='auto', target='en').translate(tag)
            _, flag_stop_word, _ = get_hypernyms(tag, banned_hypernyms)
            if flag_stop_word == True:
                tag = ''

        if tag != '':
            row_remade.append(tag)
            
    data_remade.append(row_remade)


Дополнение последовательностей хэштегами из того же кластера

- Векторизация FastText
- Кластеризация DBSCAN

In [91]:
import fasttext
from sklearn.cluster import DBSCAN
import sklearn
import numpy as np

In [108]:
with open('hashtags_dataset.txt', 'r', encoding='utf-8') as file:
    checked_hashtags = file.readlines()

In [109]:
new_hashtags=[]
for st in checked_hashtags:
    new_hashtags.append(list(set(st.strip().split())))

In [93]:
ft_model = fasttext.load_model(r"wiki.en.bin")



In [110]:
dict_keys = []
dict_vals = []
for row in new_hashtags:
    for obj in row:
        if obj not in dict_keys:
            dict_keys.append(obj)
            dict_vals.append(ft_model[obj])

In [111]:
X_data = np.array(dict_vals)

In [112]:
clustering = DBSCAN(eps=0.3826, min_samples=3, metric='cosine').fit(X_data)

In [113]:
len(set(clustering.labels_))

39

In [114]:
clusters = {}
for cl in list(set(clustering.labels_)):
    if cl not in clusters:
        clusters[cl] = []

for i in range(len(dict_keys)):
    clusters[clustering.labels_[i]].append(dict_keys[i])


In [115]:
clusters[34]

['nailsonfleek', 'nailsta', 'nailsart', 'nailsalon']

In [143]:
agglo  = sklearn.cluster.FeatureAgglomeration(n_clusters=60, metric='cosine', linkage='average')
agglo.fit(X_data.T)

In [144]:
agglo_clusters = {}
for cl in list(set(agglo.labels_)):
    if cl not in agglo_clusters:
        agglo_clusters[cl] = []


In [145]:
for i in range(len(dict_keys)):
    agglo_clusters[agglo.labels_[i]].append(dict_keys[i])

In [149]:
agglo_clusters[45]

['urban', 'city', 'tourist', 'hub', 'tourism']

In [None]:
groups = clusters

In [176]:
additional_data = []

for row in new_hashtags:
    random.shuffle(groups)
    new_row = row
    addition = []
    if len(row) < 6:
        for var in groups:
            if len(set(row) & set(var)) > 0:
                temp_diff = list(set(var) - set(row))
                if len(temp_diff) == 0:
                    continue
                else:
                    num = random.randint(1, len(temp_diff))
                    addition = random.sample(temp_diff, num)
                    print(row)
                    print('>>>', addition)
                    break
    new_row += addition
    additional_data.append(new_row)

['travel', 'sea', 'trip', 'wanderlust', 'vacation']
>>> ['travelphotography', 'adventure', 'travelblogger']
['travel', 'sea', 'traveling', 'travelblogger', 'wanderlust']
>>> ['travelphotography', 'adventure']
['travel', 'sea', 'travelphotography', 'wanderlust', 'adventure']
>>> ['travelblogger', 'vacation']
['sea', 'art', 'artistzone', 'painting', 'color']
>>> ['colors', 'artwork', 'drawing', 'creative']
['travel', 'sea', 'travelblogger', 'wanderlust']
>>> ['vacation']
['travel', 'sea', 'travelphotography', 'travelblogger', 'vacation']
>>> ['wanderlust']
['travel', 'sea', 'adventure', 'travelblogger', 'travelphotography']
>>> ['vacation', 'wanderlust']
['style', 'home', 'interior', 'streetstyle', 'styleblogger']
>>> ['outfitoftheday', 'outfit']
['inspiration', 'home', 'quotes', 'goodvibes', 'selflove']
>>> ['quote']
['home', 'lifeisgood', 'adventure', 'travelblogger', 'travel']
>>> ['wanderlust', 'travelphotography']
['home', 'pet', 'cutepetclub', 'cuteanimals', 'cutepet']
>>> ['petsof

In [178]:
with open('fuller_hashtags.txt', 'w', encoding='utf-8') as file:
    for row in additional_data:
        file.write(' '.join(row)+'\n')