In [None]:
from datasets import load_dataset

# Pull only the first 30% of the train split of the Reddit jokes corpus
# (the slice is expressed in the split string itself).
dataset = load_dataset(
    "SocialGrep/one-million-reddit-jokes",
    split="train[:30%]",
)

In [None]:

# Display the loaded dataset object as this cell's output.
dataset

In [None]:
import pandas as pd
# Convert the loaded dataset to a pandas DataFrame using the dataset's own
# converter. Passing the dataset object to pd.DataFrame() makes pandas iterate
# it one row-dict at a time, which is needlessly slow for this many rows.
df = dataset.to_pandas()

In [None]:
import numpy as np
# Coerce scores to numbers so they sort and compare numerically.
df['score'] = pd.to_numeric(df['score'])
data_df = df.sort_values(by="score", ascending=True, ignore_index=True)

# Number of posts per distinct score, walked in ascending score order.
# value_counts() orders by frequency by default, which would make the running
# total below meaningless as a cumulative distribution, hence sort_index().
distribution = data_df['score'].value_counts().sort_index()

# Use the real row count instead of a hard-coded 300000 so the targets stay
# correct if the size of the loaded split changes.
length = len(data_df)

# we want 4 different subcategories -> three cut points at 25%, 50% and 75%
value1 = round(length * (1/4))
value2 = round(length * (2/4))
value3 = round(length * (3/4))
targets = (value1, value2, value3)

# For each target, remember the score whose cumulative post count lands
# closest to it. np.inf is used because the np.Inf alias no longer exists in
# NumPy 2.0.
closest_gap = [np.inf] * len(targets)
cut_scores = [0] * len(targets)
total = 0

for value, count in distribution.items():
    total += count
    for slot, target in enumerate(targets):
        gap = abs(target - total)
        if gap < closest_gap[slot]:
            closest_gap[slot] = gap
            cut_scores[slot] = value

end, end2, end3 = cut_scores
print([end, end2, end3])
# Bounds noted from an earlier run: (0, 0), (1, 2), (3, 8), (8, INF)
len(data_df)

In [None]:
from sklearn import feature_extraction
from sklearn.cluster import KMeans
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
import cluster_functions as cp

In [None]:
# Keep only posts scoring above 8, i.e. the (8, INF) bucket from the bounds
# noted earlier in the notebook. The explicit .copy() makes the result an
# independent frame, so later column assignments on data_df do not write into
# a slice of the sorted frame (which triggers SettingWithCopyWarning).
data_df = data_df[data_df['score'] > 8].copy()

# Learn multi-word phrases from the whitespace-tokenised titles.
# Scoring is NPMI with a 0.7 threshold; connector words come from gensim's
# built-in English list.
phrase_model = Phrases([doc.split() for doc in data_df.loc[:, "title"].values],
                       min_count=2,
                       threshold=0.7,
                       connector_words=ENGLISH_CONNECTOR_WORDS,
                       scoring="npmi",
                       )
len(data_df)

In [None]:
# Print the keys of the mapping returned by export_phrases(), i.e. the
# phrases the fitted model exports (presumably mapped to their scores -- confirm
# against the installed gensim version).
print(phrase_model.export_phrases().keys())

In [None]:
# Run every title through the phrase model and store the re-joined token
# sequence back in the "title" column.
data_df.loc[:, "title"] = [
    " ".join(phrase_model[title_text.split()])
    for title_text in data_df.loc[:, "title"]
]


In [None]:
# Word-level unigram counter whose vocabulary is capped at 500 terms; the
# vocabulary it learns is later reused as the stopword list.
features = feature_extraction.text.CountVectorizer(
    # How raw documents are read and decoded
    input='content',
    encoding='utf-8',
    decode_error='ignore',
    # How text is split into terms
    analyzer='word',
    tokenizer=None,
    lowercase=True,
    ngram_range=(1, 1),
    # Vocabulary size cap == number of future stopwords
    max_features=500,
)

In [None]:
# Fit only: the learned vocabulary is all that is needed here, so no
# transform step is run.
features.fit(data_df["title"].values)

# The capped vocabulary holds the most frequent words; treat them as stopwords.
stopwords = list(features.vocabulary_)
print(stopwords)
print("ABOVE: Frequent words to exclude")

# State for the iterative clustering loop, which keeps re-clustering the
# largest category until it is no longer too big.
starting_length = len(data_df)
counter = 0
holder = []
cluster_prefix = "Topic"     # Start with root topics
main_topic = data_df         # Initialize main topic (same object, not a copy)

In [None]:
import importlib
# Pick up any on-disk edits to cluster_functions before clustering.
importlib.reload(cp)

# Repeatedly cluster the current main topic. Each round returns the new main
# topic, the remaining topics, and a label that becomes the next round's prefix.
while True:
    counter += 1
    main_topic, other_topics, most_frequent = cp.cluster(stopwords, main_topic, cluster_prefix)
    cluster_prefix = str(most_frequent)

    # Fraction of all documents still sitting in the main topic.
    main_share = len(main_topic) / len(data_df)

    # The side topics are kept on every round, whether or not we stop.
    holder.append(other_topics)

    # Stop once no topic holds 20% or more of the documents.
    if main_share < 0.20:
        holder.append(main_topic)
        break

    print("Continuing after round " + str(counter), "Current: ", len(main_topic), "Total: ", starting_length)

In [None]:
# Display the list of clustering results accumulated in `holder`.
holder

In [None]:
# Stitch the per-round results together, order rows by topic label and keep
# only the columns of interest, in this order.
data_df = (
    pd.concat(holder)
    .sort_values("Topic")
    .loc[:, ["title", "score", "Topic"]]
)

# How many jokes ended up in each topic.
print(data_df["Topic"].value_counts())

# Persist the labelled jokes (the index is written as the first column).
data_df.to_csv("Jokes3_by_topic.csv")

In [None]:
# Reload the cluster_functions module (imported as `cp`) so edits made to it
# on disk take effect without restarting the kernel. Relies on `importlib`
# having been imported in an earlier cell.
importlib.reload(cp)

In [None]:
# Second pass: cluster the same jokes again, this time under "Syntax" labels,
# starting from just the title and score columns.
data_df2 = data_df.loc[:, ["title", "score"]]
starting_length2 = len(data_df2)
counter = 0
holder2 = []
cluster_prefix = "Syntax"     # Start with root topics
main_topic2 = data_df2        # Initialize main topic

while True:
    counter += 1
    main_topic2, other_topics, most_frequent = cp.cluster2(main_topic2, cluster_prefix)
    cluster_prefix = str(most_frequent)

    # Fraction of all documents still sitting in the main group.
    main_share = len(main_topic2) / len(data_df)

    # The side groups are kept on every round, whether or not we stop.
    holder2.append(other_topics)

    # Stop once the main group holds less than 35% of the documents.
    if main_share < 0.35:
        holder2.append(main_topic2)
        break

    print("Continuing after round " + str(counter), "Current: ", len(main_topic2), "Total: ", starting_length2)
        

In [None]:
# Stitch the per-round results together, order rows by syntax label and keep
# only the columns of interest, in this order.
data_df2 = (
    pd.concat(holder2)
    .sort_values("Syntax")
    .loc[:, ["title", "score", "Syntax"]]
)

# How many jokes ended up in each syntax group.
print(data_df2["Syntax"].value_counts())

# Persist the labelled jokes (the index is written as the first column).
data_df2.to_csv("Jokes3_by_structure.csv")