In [None]:
from bertopic import BERTopic
import pandas as pd
from translate import Translator
from tqdm import tqdm

In [None]:
tqdm.pandas()

In [None]:
df = pd.read_csv("../data/preprocessed.csv", low_memory=False)

In [None]:
df.head(20)

In [None]:
# Tokens dropped from every tweet before topic modelling; remove_stopwords
# compares each hazm token against this list by exact string equality.
words_to_omit = ["9dojplyjpl", "1995", "1986", "1989", "1980", "1981", "1987", "93", "۹۳", "می", "سلام", "ها", "های", "آها"] 
from hazm import word_tokenize
tqdm.pandas()
def remove_stopwords(text):
    """Return ``text`` with every token found in ``words_to_omit`` removed.

    The text is split with hazm's ``word_tokenize`` and the surviving tokens
    are joined back with single spaces, so the original spacing is not kept.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in words_to_omit:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def clean_data(data_df, col_name):
    """Strip the ``words_to_omit`` tokens from one text column, in place.

    Args:
        data_df: DataFrame to modify; ``data_df[col_name]`` is overwritten.
        col_name: Name of the column that holds the text to clean.

    Returns:
        None. The frame passed in is mutated rather than copied.
    """
    # The previous version called `omit_specific_words`, a name that is not
    # defined anywhere in this notebook (NameError on a fresh kernel);
    # `remove_stopwords` is the helper that implements the filtering.
    # `progress_apply` is only available after `tqdm.pandas()` has run.
    data_df[col_name] = data_df[col_name].progress_apply(remove_stopwords)

clean_data(df, 'tweet')

In [None]:
df.head(20)

In [None]:
tweets = df["tweet"].to_list()

## language="multilingual", nr_topics=10

In [None]:
# Fit BERTopic on the cleaned tweet list; `topics` and `probs` are the two
# values returned by fit_transform and are reused by later cells.
# NOTE(review): `nr_topics=10` presumably asks BERTopic to reduce the result to
# about ten topics, and no random seed is fixed here, so topic ids and sizes may
# change between runs — confirm against the BERTopic documentation.
topic_model = BERTopic(language="multilingual", nr_topics=10)
topics, probs = topic_model.fit_transform(tweets)

In [None]:
res_df = topic_model.get_topic_info()
res_df

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_barchart()

#### English translation of the topic words

In [None]:
# Hand-written Persian -> English translations for topic words; these take
# precedence over the machine translation used for words not listed here.
# An empty string means the term is intentionally shown without a label.
# Fixes relative to the earlier version:
#   * "پیشنهاد" was listed twice; only the value that was effective at runtime
#     ("suggest", the later entry) is kept.
#   * "iranestekhdam " carried a trailing space and so could not match a token.
#   * "Femail" was a typo for "Female".
manual_dict = {
    "دورکاری": "Remote Work",
    "خونه": "Home",
    "کرونا": "Coronavirus",
    "کارا": "Things",
    "سیستم": "System",
    "مجبور": "Have to",
    "یاد": "Learn",
    "خوبه": "Good",
    "سرکار": "At Work",
    "زن": "Woman",
    "قرنطینه": "Quarantine",
    "کارمند": "Employee",
    "کارفرما": "Employer",
    "کار منزل": "Work from home",
    "صورت دورکاری": "",
    "وقت": "Time",
    "تجربه": "Experience",
    "خانم": "Female",
    "حقوق": "Salary",
    "دورکاری دورکاری": "",
    "ریموت": "Remote",
    "منزل": "Home",
    "بابا": "",
    "مدیریت": "management",
    "تموم": "finish",
    "خانگی": "homemade",
    "بخوابم": "sleep",
    "پیشنهاد": "suggest",
    "قطعی": "outage",
    "شبکه": "network",
    "قطع": "went",
    "iranestekhdam": "recruitment",
    "برون": "Outsourcing",
    "سپاری": "Outsourcing",
    "حضوری": "in person",
    "کاری": "job-related",
    "بازارکار": "job market",
    "پردرآمد": "high income",
    "دورکار": "remote",
    "برنامه": "program"
}

translator = Translator(to_lang="en", from_lang="fa")
# Seed the lookup with the hand-written translations, then memoise every
# machine translation so a word shared by several topics is sent to the
# translator only once. Copying keeps `manual_dict` itself unmodified.
known_translations = dict(manual_dict)
translated_dict = {}  # topic id -> [(English word, score), ...]
for topic_id, word_scores in topic_model.get_topics().items():
    english_word_scores = []
    for word, score in word_scores:
        if word not in known_translations:
            known_translations[word] = translator.translate(word)
            # Log each machine translation once so doubtful ones can be
            # reviewed and moved into manual_dict.
            print(word + " -> " + known_translations[word])
        english_word_scores.append((known_translations[word], score))
    translated_dict[topic_id] = english_word_scores

In [None]:
topic_model.get_topics()

In [None]:
translated_dict

In [None]:
# Create a second BERTopic instance that is never fitted; the assignments
# below attach the translated words plus the sizes and probabilities from the
# fitted `topic_model` so the English labels can be inspected and plotted.
temp_topic_model = BERTopic(language="english", calculate_probabilities=False)
# NOTE(review): these are plain attribute assignments on the instance; whether
# BERTopic reads exactly `topics`, `topic_sizes` and `probs` depends on the
# installed version — confirm before relying on the plots.
temp_topic_model.topics = translated_dict
temp_topic_model.topic_sizes = topic_model.topic_sizes.copy()
temp_topic_model.probs = probs
temp_topic_model.get_topics()

In [None]:
temp_topic_model.visualize_barchart()

In [None]:
import matplotlib.pyplot as plt

# Draw the translated topic words and their scores, one bar series per topic.
# The loop variables are deliberately not called `topics` / `probs`: those
# names hold the fit_transform results and are read again when the
# temp_topic_model cell is re-run, so a plotting cell must not rebind them.
fig, ax = plt.subplots(figsize=(10, 6))
for topic_id, word_scores in translated_dict.items():
    if not word_scores:
        # zip(*[]) produces nothing to unpack, so skip a topic with no words.
        continue
    words, scores = zip(*word_scores)
    ax.bar(words, scores, label=f"Topic {topic_id}")

ax.set_xlabel('Words')
ax.set_ylabel('Probabilities')
ax.set_title('Topic Model - Barchart')
ax.legend()
# Word labels are long, so rotate them to keep the x axis readable.
ax.tick_params(axis="x", labelrotation=90)
fig.tight_layout()
plt.show()

In [10]:
topic_model.visualize_heatmap()

In [11]:
topic_model.get_topic(3)

[('اینترنت', 0.17471219714227523),
 ('اینترنتی', 0.05075170917774581),
 ('کار', 0.04652929709414984),
 ('برق', 0.04625067256430334),
 ('منزل', 0.04501514662462472),
 ('دورکاری', 0.04243604993061482),
 ('درآمد', 0.04093379191193606),
 ('سرعت', 0.04087407388628696),
 ('قطع', 0.03945912710513787),
 ('کسب', 0.036131762524161164)]

## language="persian", nr_topics="auto"

In [12]:
# Re-fit on the same `tweets`, rebinding `topic_model`, `topics` and `probs`
# from the earlier run.
# NOTE(review): which embedding model BERTopic selects for language="persian",
# and how nr_topics="auto" chooses the final topic count, is not visible in
# this notebook — confirm against the BERTopic documentation.
topic_model = BERTopic(language="persian", nr_topics="auto")
topics, probs = topic_model.fit_transform(tweets)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [13]:
res_df = topic_model.get_topic_info()
res_df

Unnamed: 0,Topic,Count,Name
0,-1,22657,-1_دورکاری_کار_می_خونه
1,0,18063,0_چاپ_کار_دورکاری_منزل
2,1,1111,1_نمی_خونه_دورکاری_دورکار
3,2,465,2_قرنطینه_خونه_دورکاری_کرونا
4,3,375,3_واکسن_واکسیناسیون_واکسینه_دوز
5,4,322,4_خواب_خوابم_بخوابم_صبح
6,5,156,5_ترافیک_کاهش_تاکسی_حمل
7,6,136,6_ویروس_شیوع_coronavirus_ویروسکرونا
8,7,134,7_توییتر_توییت_توئیت_توئیتر
9,8,128,8_آلمان_یورو_سوئد_آلمانی


In [14]:
res_df.to_csv("res__nr_topics_auto_after_reply_deletion.csv")

In [15]:
topic_model.visualize_topics()

In [16]:
topic_model.visualize_barchart()

In [17]:
topic_model.visualize_heatmap()

In [18]:
topic_model.get_topic(4)

[('خواب', 0.09559486619029868),
 ('خوابم', 0.07206880840207146),
 ('بخوابم', 0.05380682545213605),
 ('صبح', 0.04787970658537198),
 ('بیدار', 0.04287943660089377),
 ('شب', 0.039177020434651724),
 ('میخوابم', 0.03301651588086282),
 ('خوابیدم', 0.028168516213682968),
 ('خوابی', 0.024710713726117443),
 ('دورکاری', 0.020285529974733624)]

## language="multilingual", nr_topics=10

In [19]:
# Fit again with language="multilingual" and nr_topics=10, the same settings
# as the first model in this notebook; this rebinds `topic_model`, `topics`
# and `probs`, replacing the language="persian" run.
topic_model = BERTopic(language="multilingual", nr_topics=10)
topics, probs = topic_model.fit_transform(tweets)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [20]:
res_df = topic_model.get_topic_info()
res_df

Unnamed: 0,Topic,Count,Name
0,-1,36074,-1_دورکاری_کار_می_خونه
1,0,2725,0_دورکاری_تهران_کارمندان_کار
2,1,1497,1_چاپ_خانگی_سیلک_شغل
3,2,880,2_اینترنت_کار_دورکاری_اینترنتی
4,3,768,3_ریموت_کار_ایران_دلار
5,4,602,4_پروژه_بازارکار_توضیحات_نمایید
6,5,600,5_زنگ_دورکاری_می_خونه
7,6,585,6_دورکاری_خر_سرپرستیتو_شراط
8,7,554,7_دورکار_حضوری_سرکار_دورکاری
9,8,538,8_زن_زنان_خانه_کار


In [21]:
topic_model.visualize_topics()

In [22]:
topic_model.visualize_barchart()

In [23]:
topic_model.visualize_heatmap()

In [24]:
topic_model.get_topic(2)

[('اینترنت', 0.1543509751773434),
 ('کار', 0.04882747855866737),
 ('دورکاری', 0.044262509830732065),
 ('اینترنتی', 0.04218945691443545),
 ('سرعت', 0.040807228971435155),
 ('منزل', 0.039506739846216866),
 ('برق', 0.039500227816458476),
 ('درآمد', 0.034873044765885625),
 ('قطع', 0.031909622902046),
 ('آنلاین', 0.031248609321333488)]