In [15]:
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess
import re
import string
from nltk.corpus import stopwords
import os
from wordcloud import WordCloud
import shutil
stopwords = stopwords.words('english')


In [16]:
df = pd.read_csv("../../data/extracted/TRAINING/training.csv", delimiter='\t')
print(df.shape)
df.head(3)

(10000, 7)


Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,1.jpg,0,0,0,0,0,Milk Milk.zip
1,10.jpg,1,0,0,0,1,"ROSES ARE RED, VIOLETS ARE BLUE IF YOU DON'T S..."
2,1000.jpg,0,0,0,0,0,BREAKING NEWS: Russia releases photo of DONALD...


In [17]:
def preprocess(text):
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords])
    text = ' '.join([word for word in text.split() if len(word) >= 3])
    text = ' '.join([word for word in text.split() if not re.match(r'\b\w+\.(com|org|net)\b', word)])
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ' '.join([word for word in text.split() if len(word) >= 3])
    return text

texts = df['Text Transcription'].tolist()
texts = list(map(preprocess, texts))

In [18]:
num_topics = 6

In [19]:
docs = [text.split() for text in texts]

dictionary = gensim.corpora.Dictionary(docs)

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

vocab_length = len(dictionary)

bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

gsdmm = MovieGroupProcess(K=num_topics, alpha=0.1, beta=0.3, n_iters=15)

y = gsdmm.fit(docs, vocab_length)

In stage 0: transferred 7401 clusters with 6 clusters populated
In stage 1: transferred 4845 clusters with 6 clusters populated
In stage 2: transferred 3906 clusters with 6 clusters populated
In stage 3: transferred 3455 clusters with 6 clusters populated
In stage 4: transferred 3172 clusters with 6 clusters populated
In stage 5: transferred 2983 clusters with 6 clusters populated
In stage 6: transferred 2745 clusters with 6 clusters populated
In stage 7: transferred 2668 clusters with 6 clusters populated
In stage 8: transferred 2583 clusters with 6 clusters populated
In stage 9: transferred 2461 clusters with 6 clusters populated
In stage 10: transferred 2488 clusters with 6 clusters populated
In stage 11: transferred 2493 clusters with 6 clusters populated
In stage 12: transferred 2389 clusters with 6 clusters populated
In stage 13: transferred 2400 clusters with 6 clusters populated
In stage 14: transferred 2410 clusters with 6 clusters populated


In [20]:
import joblib

In [21]:
joblib.dump(gsdmm, 'gsdmm.joblib')

['gsdmm.joblib']

In [22]:
gsdmm = joblib.load('gsdmm.joblib')

In [23]:
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

top_index = doc_count.argsort()[-num_topics:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))


Number of documents per topic : [1137 2742 1955 1898 1356  912]
Most important clusters (by number of docs inside): [1 2 3 4 0 5]


In [24]:
num_words = 20

for topic_num in top_index:
    topic_dict = dict(sorted(gsdmm.cluster_word_distribution[topic_num].items(), key=lambda k: k[1], reverse=True)[:num_words])
    print(f"{topic_num}: {topic_dict}")
    

1: {'girlfriend': 335, 'wife': 330, 'get': 228, 'kitchen': 166, 'like': 166, 'house': 152, 'one': 135, 'woman': 130, 'mom': 130, 'hooker': 124, 'make': 120, 'prostitute': 114, 'know': 112, 'time': 108, 'girl': 107, 'you': 105, 'got': 105, 'want': 104, 'day': 102, 'home': 94}
2: {'like': 354, 'girls': 233, 'girl': 175, 'women': 163, 'kitchen': 138, 'meme': 114, 'look': 111, 'one': 100, 'woman': 98, 'good': 86, 'know': 81, 'fat': 67, 'get': 66, 'girlfriend': 63, 'guy': 62, 'center': 62, 'see': 61, 'ass': 59, 'made': 58, 'new': 58}
3: {'women': 672, 'men': 324, 'woman': 278, 'feminists': 157, 'man': 150, 'feminism': 145, 'feminist': 143, 'like': 138, 'want': 120, 'rights': 95, 'rape': 94, 'get': 91, 'equal': 88, 'people': 84, 'make': 81, 'womens': 75, 'female': 73, 'right': 65, 'girl': 63, 'think': 63}
4: {'call': 154, 'house': 142, 'cooking': 79, 'people': 76, 'one': 70, 'cheat': 70, 'clean': 66, 'witch': 63, 'gold': 59, 'like': 56, 'get': 56, 'toilet': 53, 'paper': 48, 'new': 47, 'man':

In [96]:
path_to_font = "milky_coffee/Milky Coffee.ttf"


output_directory = "outputs_gsdmm"
shutil.rmtree(output_directory)
os.mkdir(output_directory)



cluster_word_distribution = gsdmm.cluster_word_distribution
topic_num = 0
num_words = 20

for topic_num in top_index:

    topic_dict = dict(sorted(cluster_word_distribution[topic_num].items(), key=lambda k: k[1], reverse=True)[:num_words])

    # Generate a word cloud image
    wordcloud = WordCloud(background_color='#fcf2ed', 
                                width=1800,
                                height=700,
                                font_path=path_to_font,
                                colormap='flag').generate_from_frequencies(topic_dict)

    wordcloud.to_file(os.path.join(output_directory, f"topic_{topic_num}.png"))

In [100]:
gsdmm.choose_best_label(doc = ['feminist', 'cooking', 'kitchen'])

(4, 0.40652958046338517)