In [56]:
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess
import re
import nltk
import string
from nltk.corpus import stopwords
import os
import matplotlib.pyplot as plt
stopwords = stopwords.words('english')


In [51]:
df = pd.read_csv("../../data/extracted/TRAINING/training.csv", delimiter='\t')
print(df.shape)
df.head(3)

(10000, 7)


Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,1.jpg,0,0,0,0,0,Milk Milk.zip
1,10.jpg,1,0,0,0,1,"ROSES ARE RED, VIOLETS ARE BLUE IF YOU DON'T S..."
2,1000.jpg,0,0,0,0,0,BREAKING NEWS: Russia releases photo of DONALD...


In [52]:
def preprocess(text):
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords])
    text = ' '.join([word for word in text.split() if len(word) >= 3])
    text = ' '.join([word for word in text.split() if not re.match(r'\b\w+\.(com|org|net)\b', word)])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

texts = df['Text Transcription'].tolist()
texts = list(map(preprocess, texts))

In [53]:
# cast tweets to numpy array
docs = [text.split() for text in texts]

# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(docs)

# filter extreme cases out of dictionary
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# create variable containing length of dictionary/vocab
vocab_length = len(dictionary)

# create BOW dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# initialize GSDMM
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)

# fit GSDMM model
y = gsdmm.fit(docs, vocab_length)

In stage 0: transferred 8495 clusters with 15 clusters populated
In stage 1: transferred 5667 clusters with 15 clusters populated
In stage 2: transferred 4579 clusters with 15 clusters populated
In stage 3: transferred 4030 clusters with 15 clusters populated
In stage 4: transferred 3752 clusters with 15 clusters populated
In stage 5: transferred 3533 clusters with 15 clusters populated
In stage 6: transferred 3393 clusters with 15 clusters populated
In stage 7: transferred 3391 clusters with 15 clusters populated
In stage 8: transferred 3282 clusters with 15 clusters populated
In stage 9: transferred 3243 clusters with 15 clusters populated
In stage 10: transferred 3064 clusters with 15 clusters populated
In stage 11: transferred 3115 clusters with 15 clusters populated
In stage 12: transferred 3111 clusters with 15 clusters populated
In stage 13: transferred 3031 clusters with 15 clusters populated
In stage 14: transferred 3008 clusters with 15 clusters populated


In [58]:
# print number of documents per topic
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
# top_words(gsdmm.cluster_word_distribution, top_index, 20)

Number of documents per topic : [ 596  527  313  375  806  732  945  605  443 1245  830 1017  761  560
  245]
Most important clusters (by number of docs inside): [ 9 11  6 10  4 12  5  7  0 13  1  8  3  2 14]


In [66]:
# Import wordcloud library
from wordcloud import WordCloud
path_to_font = "milky_coffee/Milky Coffee.ttf"
output_directory = "outputs"

# Get topic word distributions from gsdmm model
cluster_word_distribution = gsdmm.cluster_word_distribution
topic_num = 0
num_words = 20

for topic_num in top_index:
    # Select topic you want to output as dictionary (using topic_number)
    topic_dict = dict(sorted(cluster_word_distribution[topic_num].items(), key=lambda k: k[1], reverse=True)[:num_words])

    # Generate a word cloud image
    wordcloud = WordCloud(background_color='#fcf2ed', 
                                width=1800,
                                height=700,
                                font_path=path_to_font,
                                colormap='flag').generate_from_frequencies(topic_dict)

    # Print to screen
    # fig, ax = plt.subplots(figsize=[20,10])
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off");

    # Save to disk
    wordcloud.to_file(os.path.join(output_directory, f"topic_{topic_num}.png"))

<wordcloud.wordcloud.WordCloud at 0x152a762bb310>