In [44]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Azalea: Journal Of Korean Literature & Culture, V.8 3 stories
# Pre-tokenized description of the work, one token per whitespace-separated word.
tokenized_text = """
story translation essay follow serve purpose span year story offer glimpse
development accomplished writer short fiction modern korea story distinct sick
butterfly pyŏngdŭn nabi spring 1942 date composition tell limited person
perspective hwang perfect school blind mute maengawŏn esŏ 1953 reflect
considerable insight human psychology tale bamboo wife na chukpuin chŏn july
1985 hint hwang gift storyteller similarity author imagination border surreal
work precision composition ear spoken word
""".split()

# Controlled vocabularies, keyed by set name. Each value is the ordered list of
# lowercase terms (single words or space-separated phrases) to look for.
controlled_vocabularies = {
    # Genre / subject terms
    'BISG': [
        "absurdist", "action", "adaptation", "adventure", "african american",
        "alternative history", "amish", "animal", "anthology", "arab american",
        "asian american", "biographical", "black", "buddhist", "christian",
        "citylife", "classic", "coming of age", "crime", "cultural heritage",
        "detective", "disability", "disaster", "diversity", "dystopian",
        "environment", "epistolary", "erotica", "fairytale", "familylife",
        "fantasy", "feminist", "friendship", "ghost", "gothic", "graphic novel",
        "hispanic", "historical", "holiday", "horror", "humorous", "immigration",
        "indigenous", "jewish", "latino", "legal", "lgbtq+", "literary", "litrpg",
        "magical realism", "mashup", "media tie in", "medical", "mennonite",
        "metaphysical", "middle eastern", "military", "multicultural",
        "multiple timeline", "muslim", "mystery", "mythology", "native american",
        "nature", "neurodiversity", "noir", "occult", "own voice",
        "pacific islander", "pastiche", "performing art", "political",
        "psychological", "religious", "romance", "rural", "sagas", "satire",
        "science fiction", "seastorie", "short story", "small town", "southern",
        "sport", "streetlit", "suburban", "superhero", "supernatural", "thriller",
        "turtleisland", "urban", "visionary", "war", "western", "woman",
    ],
    # Years, events and places
    'LC-history': [
        "1905", "1910", "1919", "1945", "1948", "1950", "1953", "1960", "1961",
        "1979", "1980", "1988", "2002", "2010", "20th century",
        "allied occupation", "april revolution", "bombardment", "buma",
        "buma uprising", "chejudo", "chejudo rebellion", "chosen", "chōsen",
        "collaborationists", "declaration independence", "independence movement",
        "invasions", "japanese occupation", "korean war", "kwangju", "kwangju si",
        "kwangju uprising", "mansei movement", "may revolution",
        "operation blacklist", "partition", "pictorial works", "puma",
        "puma uprising", "resistance movements", "reunification",
        "reunification question", "sunchon", "sunchʻon", "territorial",
        "twentieth century", "war", "war and intervention", "yonp'yong",
        "yŏnp'yŏng island", "yŏsu sunchʻŏn rebellion", "yosun", "yŏsun rebellion",
    ],
    # Fiction form / genre terms
    'LC-fiction': [
        "adventure", "adventure storie", "autobiographical",
        "autobiographical fiction", "bildungsroman", "biographical",
        "biographical fiction", "buddhist", "buddhist storie", "children's storie",
        "christian", "christian fiction", "detective", "detective storie",
        "didactic", "didactic fiction", "domestic", "domestic fiction", "erotic",
        "erotic storie", "fantasy", "fantasy fiction", "feminist",
        "feminist fiction", "ghost", "ghost storie", "hanmun", "hanmun sosŏl",
        "hanmun sosol", "historical", "historical fiction", "horror",
        "horror tale", "hypertext", "hypertext fiction", "legal", "legal storie",
        "martial arts", "mystery", "mystery storie", "political",
        "political fiction", "romance", "romance fiction", "science",
        "science fiction", "short storie", "sports", "sports storie", "spy",
        "spy storie", "suspense", "suspense fiction", "war", "war storie",
        "young adult",
    ],
    # Emotion nouns
    'emotions': [
        "anger", "annoyance", "displeasure", "entrapment", "frustration",
        "disgust", "dislike", "captivation", "curiosity", "engagement",
        "reflection", "confusion", "difficulty", "disorientation", "fear",
        "stress", "uncertainty", "happiness", "anticipation", "excitement", "joy",
        "pleasure", "love", "admiration", "attraction", "empathy", "enchantment",
        "lust", "sensitivity", "apathy", "boredom", "depression", "disaffection",
        "disappointment", "lassitude", "sadness", "shame", "wistfulness",
        "astonishment", "bemusement", "disbelief", "surprise", "uncategorized",
        "caution", "gratitude", "patience", "perseverance",
    ],
    # Tone adjectives
    'tones': [
        "cerebral", "dignified", "erudite", "formal", "insightful", "intelligent",
        "lofty", "philosophical", "profound", "reflective", "beautiful",
        "charming", "elegant", "enchanting", "engaging", "entertaining",
        "haunting", "lush", "pleasurable", "sensitive", "sympathetic", "tender",
        "touching", "complex", "controversial", "fragmented", "labyrinthine",
        "nuanced", "picaresque", "psychological", "rambling", "subtle", "wordy",
        "conventional", "bland", "cliched", "comfortable", "contrived", "familiar",
        "formulaic", "insipid", "juvenile", "light", "maudlin", "melodramatic",
        "one dimensional", "repetitive", "stereotypical", "unrealistic",
        "dramatic", "emotional", "exciting", "fascinating", "heartfelt",
        "intriguing", "moving", "nostalgic", "powerful", "sensational",
        "surprising", "suspenseful", "thrilling", "zesty", "frightening",
        "angst ridden", "chilling", "claustrophobic", "cruel", "daunting",
        "disturbing", "gory", "graphic", "gruesome", "harsh", "horrifying",
        "perverse", "psychopathic", "scary", "shocking", "uncomfortable",
        "unnerving", "violent", "volatile", "humourous", "dark humour", "ironical",
        "satirical", "imaginative", "adventurous", "allegorical", "creative",
        "descriptive", "eccentric", "edgy", "evocative", "fantastical",
        "innovative", "lyrical", "magical", "mysterious", "mythical", "original",
        "stylish", "surreal", "unique", "wistful", "assured", "hopeful",
        "innocent", "inspirational", "optimistic", "resilient", "respectful",
        "triumphant", "uplifting", "abrupt", "authentic", "character driven",
        "cohesive", "compelling", "gritty", "historical", "poignant", "precise",
        "prosaic", "readable", "realistic", "resonant", "sad", "barren", "bleak",
        "dark", "depressing", "desolate", "devastating", "grim", "heavy",
        "melancholic", "painful",
    ],
}

# Create one CountVectorizer per controlled vocabulary set.
#
# * token_pattern: the document is built from already-tokenized words joined by
#   single spaces, so tokens are split on whitespace only. The default pattern
#   drops one-character tokens and splits on punctuation, which makes entries
#   such as "lgbtq+", "yonp'yong" or "children's storie" impossible to match.
# * ngram_range: the upper bound is the word count of the longest term in the
#   set, so every phrase in the vocabulary can be produced. A fixed (1, 2) left
#   three-word terms such as "war and intervention" permanently at zero.
vectorizers = {
    key: CountVectorizer(
        vocabulary=vocab,
        token_pattern=r"\S+",
        ngram_range=(1, max((len(term.split()) for term in vocab), default=1)),
    )
    for key, vocab in controlled_vocabularies.items()
}

# Count how often each controlled-vocabulary term occurs in the text, print the
# result per vocabulary set, then print the matched terms and a combined table.
document = " ".join(tokenized_text)  # preprocessed tokens, joined once for every vectorizer
all_feature_names = set()    # terms from any set that occur at least once in the text
all_term_frequencies = {}    # set name -> one-row DataFrame holding only the non-zero counts

for key, vectorizer in vectorizers.items():
    # The vocabulary is fixed at construction time, so this only counts known terms.
    X = vectorizer.fit_transform([document])
    feature_names = vectorizer.get_feature_names_out()
    term_frequencies = pd.DataFrame(X.toarray(), columns=feature_names)

    # Display the term frequencies for the current set
    print(f"\nControlled Vocabulary Set: {key}")
    print(term_frequencies)

    # Extract words with frequencies greater than or equal to 1
    filtered_term_frequencies = term_frequencies.loc[:, (term_frequencies != 0).any(axis=0)]
    print("\nFiltered Term Frequencies:")
    print(filtered_term_frequencies)

    # Record only the terms that actually occurred. feature_names is the whole
    # vocabulary, so storing it would report every term as having appeared.
    all_feature_names.update(filtered_term_frequencies.columns)
    all_term_frequencies[key] = filtered_term_frequencies

# Display the unique words that appeared in the text (meaningful words in terms of TF);
# sorted because set iteration order is not stable between runs.
print("\nWords Appeared in the Text:")
print(", ".join(sorted(all_feature_names)))

# Display the combined term frequencies for all sets (columns grouped by set name)
print("\nCombined Term Frequencies:")
combined_term_frequencies = pd.concat(all_term_frequencies.values(), axis=1, keys=all_term_frequencies.keys())
print(combined_term_frequencies)



Controlled Vocabulary Set: BISG
   absurdist  action  adaptation  adventure  african american  \
0          0       0           0          0                 0   

   alternative history  amish  animal  anthology  arab american  ...  \
0                    0      0       0          0              0  ...   

   suburban  superhero  supernatural  thriller  turtleisland  urban  \
0         0          0             0         0             0      0   

   visionary  war  western  woman  
0          0    0        0      0  

[1 rows x 95 columns]

Filtered Term Frequencies:
Empty DataFrame
Columns: []
Index: [0]

Controlled Vocabulary Set: LC-history
   1905  1910  1919  1945  1948  1950  1953  1960  1961  1979  ...  sunchʻon  \
0     0     0     0     0     0     0     1     0     0     0  ...         0   

   territorial  twentieth century  war  war and intervention  yonp'yong  \
0            0                  0    0                     0          0   

   yŏnp'yŏng island  yŏsu sunchʻ

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Readymade Bodhisattva
# Pre-tokenized description of the work, one token per whitespace-separated word.
tokenized_text = """
readymade bodhisattva kaya anthology south korean science fiction presents book
length english language translation science speculative fiction south korea
bringing 13 classic contemporary stories 1960s 2010s reimagining asimovian robot
walls buddhist temple postapocalyptic showdown south north korean refugees
faraway planet fictional recollection disabled woman struggle join international
space mission stories showcase thematic stylistic versatility south korean
science fiction writers wide array conversant global science fiction tradition
thick local historical specificities works resonate popular cultural products
south k pop k drama videogames owe appeal pulsating technocultural edge ability
play familiar tropes unexpected ways country renowned hi tech industry
ultraspeed broadband mired unfinished cold war south korean science fiction
offers fresh perspectives global technoindustrial modernity human consequences
book features critical introduction essay sf fandom south korea contextualizing
information annotations story
""".split()

# Controlled vocabularies, keyed by set name. Each value is the ordered list of
# lowercase terms (single words or space-separated phrases) to look for.
controlled_vocabularies = {
    # Genre / subject terms
    'BISG': [
        "absurdist", "action", "adaptation", "adventure", "african american",
        "alternative history", "amish", "animal", "anthology", "arab american",
        "asian american", "biographical", "black", "buddhist", "christian",
        "citylife", "classic", "coming of age", "crime", "cultural heritage",
        "detective", "disability", "disaster", "diversity", "dystopian",
        "environment", "epistolary", "erotica", "fairytale", "familylife",
        "fantasy", "feminist", "friendship", "ghost", "gothic", "graphic novel",
        "hispanic", "historical", "holiday", "horror", "humorous", "immigration",
        "indigenous", "jewish", "latino", "legal", "lgbtq+", "literary", "litrpg",
        "magical realism", "mashup", "media tie in", "medical", "mennonite",
        "metaphysical", "middle eastern", "military", "multicultural",
        "multiple timeline", "muslim", "mystery", "mythology", "native american",
        "nature", "neurodiversity", "noir", "occult", "own voice",
        "pacific islander", "pastiche", "performing art", "political",
        "psychological", "religious", "romance", "rural", "sagas", "satire",
        "science fiction", "seastorie", "short story", "small town", "southern",
        "sport", "streetlit", "suburban", "superhero", "supernatural", "thriller",
        "turtleisland", "urban", "visionary", "war", "western", "woman",
    ],
    # Years, events and places
    'LC-history': [
        "1905", "1910", "1919", "1945", "1948", "1950", "1953", "1960", "1961",
        "1979", "1980", "1988", "2002", "2010", "20th century",
        "allied occupation", "april revolution", "bombardment", "buma",
        "buma uprising", "chejudo", "chejudo rebellion", "chosen", "chōsen",
        "collaborationists", "declaration independence", "independence movement",
        "invasions", "japanese occupation", "korean war", "kwangju", "kwangju si",
        "kwangju uprising", "mansei movement", "may revolution",
        "operation blacklist", "partition", "pictorial works", "puma",
        "puma uprising", "resistance movements", "reunification",
        "reunification question", "sunchon", "sunchʻon", "territorial",
        "twentieth century", "war", "war and intervention", "yonp'yong",
        "yŏnp'yŏng island", "yŏsu sunchʻŏn rebellion", "yosun", "yŏsun rebellion",
    ],
    # Fiction form / genre terms
    'LC-fiction': [
        "adventure", "adventure storie", "autobiographical",
        "autobiographical fiction", "bildungsroman", "biographical",
        "biographical fiction", "buddhist", "buddhist storie", "children's storie",
        "christian", "christian fiction", "detective", "detective storie",
        "didactic", "didactic fiction", "domestic", "domestic fiction", "erotic",
        "erotic storie", "fantasy", "fantasy fiction", "feminist",
        "feminist fiction", "ghost", "ghost storie", "hanmun", "hanmun sosŏl",
        "hanmun sosol", "historical", "historical fiction", "horror",
        "horror tale", "hypertext", "hypertext fiction", "legal", "legal storie",
        "martial arts", "mystery", "mystery storie", "political",
        "political fiction", "romance", "romance fiction", "science",
        "science fiction", "short storie", "sports", "sports storie", "spy",
        "spy storie", "suspense", "suspense fiction", "war", "war storie",
        "young adult",
    ],
    # Emotion nouns
    'emotions': [
        "anger", "annoyance", "displeasure", "entrapment", "frustration",
        "disgust", "dislike", "captivation", "curiosity", "engagement",
        "reflection", "confusion", "difficulty", "disorientation", "fear",
        "stress", "uncertainty", "happiness", "anticipation", "excitement", "joy",
        "pleasure", "love", "admiration", "attraction", "empathy", "enchantment",
        "lust", "sensitivity", "apathy", "boredom", "depression", "disaffection",
        "disappointment", "lassitude", "sadness", "shame", "wistfulness",
        "astonishment", "bemusement", "disbelief", "surprise", "uncategorized",
        "caution", "gratitude", "patience", "perseverance",
    ],
    # Tone adjectives
    'tones': [
        "cerebral", "dignified", "erudite", "formal", "insightful", "intelligent",
        "lofty", "philosophical", "profound", "reflective", "beautiful",
        "charming", "elegant", "enchanting", "engaging", "entertaining",
        "haunting", "lush", "pleasurable", "sensitive", "sympathetic", "tender",
        "touching", "complex", "controversial", "fragmented", "labyrinthine",
        "nuanced", "picaresque", "psychological", "rambling", "subtle", "wordy",
        "conventional", "bland", "cliched", "comfortable", "contrived", "familiar",
        "formulaic", "insipid", "juvenile", "light", "maudlin", "melodramatic",
        "one dimensional", "repetitive", "stereotypical", "unrealistic",
        "dramatic", "emotional", "exciting", "fascinating", "heartfelt",
        "intriguing", "moving", "nostalgic", "powerful", "sensational",
        "surprising", "suspenseful", "thrilling", "zesty", "frightening",
        "angst ridden", "chilling", "claustrophobic", "cruel", "daunting",
        "disturbing", "gory", "graphic", "gruesome", "harsh", "horrifying",
        "perverse", "psychopathic", "scary", "shocking", "uncomfortable",
        "unnerving", "violent", "volatile", "humourous", "dark humour", "ironical",
        "satirical", "imaginative", "adventurous", "allegorical", "creative",
        "descriptive", "eccentric", "edgy", "evocative", "fantastical",
        "innovative", "lyrical", "magical", "mysterious", "mythical", "original",
        "stylish", "surreal", "unique", "wistful", "assured", "hopeful",
        "innocent", "inspirational", "optimistic", "resilient", "respectful",
        "triumphant", "uplifting", "abrupt", "authentic", "character driven",
        "cohesive", "compelling", "gritty", "historical", "poignant", "precise",
        "prosaic", "readable", "realistic", "resonant", "sad", "barren", "bleak",
        "dark", "depressing", "desolate", "devastating", "grim", "heavy",
        "melancholic", "painful",
    ],
}

# Create one CountVectorizer per controlled vocabulary set.
#
# * token_pattern: the document is built from already-tokenized words joined by
#   single spaces, so tokens are split on whitespace only. The default pattern
#   drops one-character tokens and splits on punctuation, which makes entries
#   such as "lgbtq+", "yonp'yong" or "children's storie" impossible to match.
# * ngram_range: the upper bound is the word count of the longest term in the
#   set, so every phrase in the vocabulary can be produced. A fixed (1, 2) left
#   three-word terms such as "war and intervention" permanently at zero.
vectorizers = {
    key: CountVectorizer(
        vocabulary=vocab,
        token_pattern=r"\S+",
        ngram_range=(1, max((len(term.split()) for term in vocab), default=1)),
    )
    for key, vocab in controlled_vocabularies.items()
}

# Count how often each controlled-vocabulary term occurs in the text, print the
# result per vocabulary set, then print the matched terms and a combined table.
document = " ".join(tokenized_text)  # preprocessed tokens, joined once for every vectorizer
all_feature_names = set()    # terms from any set that occur at least once in the text
all_term_frequencies = {}    # set name -> one-row DataFrame holding only the non-zero counts

for key, vectorizer in vectorizers.items():
    # The vocabulary is fixed at construction time, so this only counts known terms.
    X = vectorizer.fit_transform([document])
    feature_names = vectorizer.get_feature_names_out()
    term_frequencies = pd.DataFrame(X.toarray(), columns=feature_names)

    # Display the term frequencies for the current set
    print(f"\nControlled Vocabulary Set: {key}")
    print(term_frequencies)

    # Extract words with frequencies greater than or equal to 1
    filtered_term_frequencies = term_frequencies.loc[:, (term_frequencies != 0).any(axis=0)]
    print("\nFiltered Term Frequencies:")
    print(filtered_term_frequencies)

    # Record only the terms that actually occurred. feature_names is the whole
    # vocabulary, so storing it would report every term as having appeared.
    all_feature_names.update(filtered_term_frequencies.columns)
    all_term_frequencies[key] = filtered_term_frequencies

# Display the unique words that appeared in the text (meaningful words in terms of TF);
# sorted because set iteration order is not stable between runs.
print("\nWords Appeared in the Text:")
print(", ".join(sorted(all_feature_names)))

# Display the combined term frequencies for all sets (columns grouped by set name)
print("\nCombined Term Frequencies:")
combined_term_frequencies = pd.concat(all_term_frequencies.values(), axis=1, keys=all_term_frequencies.keys())
print(combined_term_frequencies)



Controlled Vocabulary Set: BISG
   absurdist  action  adaptation  adventure  african american  \
0          0       0           0          0                 0   

   alternative history  amish  animal  anthology  arab american  ...  \
0                    0      0       0          1              0  ...   

   suburban  superhero  supernatural  thriller  turtleisland  urban  \
0         0          0             0         0             0      0   

   visionary  war  western  woman  
0          0    1        0      1  

[1 rows x 95 columns]

Filtered Term Frequencies:
   anthology  buddhist  classic  historical  science fiction  war  woman
0          1         1        1           1                4    1      1

Controlled Vocabulary Set: LC-history
   1905  1910  1919  1945  1948  1950  1953  1960  1961  1979  ...  sunchʻon  \
0     0     0     0     0     0     0     0     0     0     0  ...         0   

   territorial  twentieth century  war  war and intervention  yonp'yong  \
0   

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# The Penguin Book of Korean Short Stories
# Pre-tokenized description of the work, one token per whitespace-separated word.
tokenized_text = """
eclectic moving enjoyable collection essential introduction korean literature
journeying korea dramatic recent past japanese occupation colonial era
devastating war north south rapid disorienting urbanization later decades
penguin book korean short stories captures years vivid storytelling
""".split()

# Controlled vocabularies, keyed by set name. Each value is the ordered list of
# lowercase terms (single words or space-separated phrases) to look for.
controlled_vocabularies = {
    # Genre / subject terms
    'BISG': [
        "absurdist", "action", "adaptation", "adventure", "african american",
        "alternative history", "amish", "animal", "anthology", "arab american",
        "asian american", "biographical", "black", "buddhist", "christian",
        "citylife", "classic", "coming of age", "crime", "cultural heritage",
        "detective", "disability", "disaster", "diversity", "dystopian",
        "environment", "epistolary", "erotica", "fairytale", "familylife",
        "fantasy", "feminist", "friendship", "ghost", "gothic", "graphic novel",
        "hispanic", "historical", "holiday", "horror", "humorous", "immigration",
        "indigenous", "jewish", "latino", "legal", "lgbtq+", "literary", "litrpg",
        "magical realism", "mashup", "media tie in", "medical", "mennonite",
        "metaphysical", "middle eastern", "military", "multicultural",
        "multiple timeline", "muslim", "mystery", "mythology", "native american",
        "nature", "neurodiversity", "noir", "occult", "own voice",
        "pacific islander", "pastiche", "performing art", "political",
        "psychological", "religious", "romance", "rural", "sagas", "satire",
        "science fiction", "seastorie", "short story", "small town", "southern",
        "sport", "streetlit", "suburban", "superhero", "supernatural", "thriller",
        "turtleisland", "urban", "visionary", "war", "western", "woman",
    ],
    # Years, events and places
    'LC-history': [
        "1905", "1910", "1919", "1945", "1948", "1950", "1953", "1960", "1961",
        "1979", "1980", "1988", "2002", "2010", "20th century",
        "allied occupation", "april revolution", "bombardment", "buma",
        "buma uprising", "chejudo", "chejudo rebellion", "chosen", "chōsen",
        "collaborationists", "declaration independence", "independence movement",
        "invasions", "japanese occupation", "korean war", "kwangju", "kwangju si",
        "kwangju uprising", "mansei movement", "may revolution",
        "operation blacklist", "partition", "pictorial works", "puma",
        "puma uprising", "resistance movements", "reunification",
        "reunification question", "sunchon", "sunchʻon", "territorial",
        "twentieth century", "war", "war and intervention", "yonp'yong",
        "yŏnp'yŏng island", "yŏsu sunchʻŏn rebellion", "yosun", "yŏsun rebellion",
    ],
    # Fiction form / genre terms
    'LC-fiction': [
        "adventure", "adventure storie", "autobiographical",
        "autobiographical fiction", "bildungsroman", "biographical",
        "biographical fiction", "buddhist", "buddhist storie", "children's storie",
        "christian", "christian fiction", "detective", "detective storie",
        "didactic", "didactic fiction", "domestic", "domestic fiction", "erotic",
        "erotic storie", "fantasy", "fantasy fiction", "feminist",
        "feminist fiction", "ghost", "ghost storie", "hanmun", "hanmun sosŏl",
        "hanmun sosol", "historical", "historical fiction", "horror",
        "horror tale", "hypertext", "hypertext fiction", "legal", "legal storie",
        "martial arts", "mystery", "mystery storie", "political",
        "political fiction", "romance", "romance fiction", "science",
        "science fiction", "short storie", "sports", "sports storie", "spy",
        "spy storie", "suspense", "suspense fiction", "war", "war storie",
        "young adult",
    ],
    # Emotion nouns
    'emotions': [
        "anger", "annoyance", "displeasure", "entrapment", "frustration",
        "disgust", "dislike", "captivation", "curiosity", "engagement",
        "reflection", "confusion", "difficulty", "disorientation", "fear",
        "stress", "uncertainty", "happiness", "anticipation", "excitement", "joy",
        "pleasure", "love", "admiration", "attraction", "empathy", "enchantment",
        "lust", "sensitivity", "apathy", "boredom", "depression", "disaffection",
        "disappointment", "lassitude", "sadness", "shame", "wistfulness",
        "astonishment", "bemusement", "disbelief", "surprise", "uncategorized",
        "caution", "gratitude", "patience", "perseverance",
    ],
    # Tone adjectives
    'tones': [
        "cerebral", "dignified", "erudite", "formal", "insightful", "intelligent",
        "lofty", "philosophical", "profound", "reflective", "beautiful",
        "charming", "elegant", "enchanting", "engaging", "entertaining",
        "haunting", "lush", "pleasurable", "sensitive", "sympathetic", "tender",
        "touching", "complex", "controversial", "fragmented", "labyrinthine",
        "nuanced", "picaresque", "psychological", "rambling", "subtle", "wordy",
        "conventional", "bland", "cliched", "comfortable", "contrived", "familiar",
        "formulaic", "insipid", "juvenile", "light", "maudlin", "melodramatic",
        "one dimensional", "repetitive", "stereotypical", "unrealistic",
        "dramatic", "emotional", "exciting", "fascinating", "heartfelt",
        "intriguing", "moving", "nostalgic", "powerful", "sensational",
        "surprising", "suspenseful", "thrilling", "zesty", "frightening",
        "angst ridden", "chilling", "claustrophobic", "cruel", "daunting",
        "disturbing", "gory", "graphic", "gruesome", "harsh", "horrifying",
        "perverse", "psychopathic", "scary", "shocking", "uncomfortable",
        "unnerving", "violent", "volatile", "humourous", "dark humour", "ironical",
        "satirical", "imaginative", "adventurous", "allegorical", "creative",
        "descriptive", "eccentric", "edgy", "evocative", "fantastical",
        "innovative", "lyrical", "magical", "mysterious", "mythical", "original",
        "stylish", "surreal", "unique", "wistful", "assured", "hopeful",
        "innocent", "inspirational", "optimistic", "resilient", "respectful",
        "triumphant", "uplifting", "abrupt", "authentic", "character driven",
        "cohesive", "compelling", "gritty", "historical", "poignant", "precise",
        "prosaic", "readable", "realistic", "resonant", "sad", "barren", "bleak",
        "dark", "depressing", "desolate", "devastating", "grim", "heavy",
        "melancholic", "painful",
    ],
}

# Create one CountVectorizer per controlled vocabulary set.
#
# * token_pattern: the document is built from already-tokenized words joined by
#   single spaces, so tokens are split on whitespace only. The default pattern
#   drops one-character tokens and splits on punctuation, which makes entries
#   such as "lgbtq+", "yonp'yong" or "children's storie" impossible to match.
# * ngram_range: the upper bound is the word count of the longest term in the
#   set, so every phrase in the vocabulary can be produced. A fixed (1, 2) left
#   three-word terms such as "war and intervention" permanently at zero.
vectorizers = {
    key: CountVectorizer(
        vocabulary=vocab,
        token_pattern=r"\S+",
        ngram_range=(1, max((len(term.split()) for term in vocab), default=1)),
    )
    for key, vocab in controlled_vocabularies.items()
}

# Count how often each controlled-vocabulary term occurs in the text, print the
# result per vocabulary set, then print the matched terms and a combined table.
document = " ".join(tokenized_text)  # preprocessed tokens, joined once for every vectorizer
all_feature_names = set()    # terms from any set that occur at least once in the text
all_term_frequencies = {}    # set name -> one-row DataFrame holding only the non-zero counts

for key, vectorizer in vectorizers.items():
    # The vocabulary is fixed at construction time, so this only counts known terms.
    X = vectorizer.fit_transform([document])
    feature_names = vectorizer.get_feature_names_out()
    term_frequencies = pd.DataFrame(X.toarray(), columns=feature_names)

    # Display the term frequencies for the current set
    print(f"\nControlled Vocabulary Set: {key}")
    print(term_frequencies)

    # Extract words with frequencies greater than or equal to 1
    filtered_term_frequencies = term_frequencies.loc[:, (term_frequencies != 0).any(axis=0)]
    print("\nFiltered Term Frequencies:")
    print(filtered_term_frequencies)

    # Record only the terms that actually occurred. feature_names is the whole
    # vocabulary, so storing it would report every term as having appeared.
    all_feature_names.update(filtered_term_frequencies.columns)
    all_term_frequencies[key] = filtered_term_frequencies

# Display the unique words that appeared in the text (meaningful words in terms of TF);
# sorted because set iteration order is not stable between runs.
print("\nWords Appeared in the Text:")
print(", ".join(sorted(all_feature_names)))

# Display the combined term frequencies for all sets (columns grouped by set name)
print("\nCombined Term Frequencies:")
combined_term_frequencies = pd.concat(all_term_frequencies.values(), axis=1, keys=all_term_frequencies.keys())
print(combined_term_frequencies)


Controlled Vocabulary Set: BISG
   absurdist  action  adaptation  adventure  african american  \
0          0       0           0          0                 0   

   alternative history  amish  animal  anthology  arab american  ...  \
0                    0      0       0          0              0  ...   

   suburban  superhero  supernatural  thriller  turtleisland  urban  \
0         0          0             0         0             0      0   

   visionary  war  western  woman  
0          0    1        0      0  

[1 rows x 95 columns]

Filtered Term Frequencies:
   war
0    1

Controlled Vocabulary Set: LC-history
   1905  1910  1919  1945  1948  1950  1953  1960  1961  1979  ...  sunchʻon  \
0     0     0     0     0     0     0     0     0     0     0  ...         0   

   territorial  twentieth century  war  war and intervention  yonp'yong  \
0            0                  0    1                     0          0   

   yŏnp'yŏng island  yŏsu sunchʻŏn rebellion  yosun  yŏ

In [47]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Cursed Bunny
tokenized_text = ['shortlisted', 'international', 'booker', 'prize', 'winner', 'pen', 'heim', 'translation', 'grant', 'cool', 'demented', 'k', 'horror', 'way', 'like', 'ed', 'park', 'author', 'personal', 'days', 'stunning', 'original', 'debut', 'rising', 'star', 'korean', 'literature', 'surreal', 'chilling', 'fables', 'patriarchy', 'capitalism', 'reign', 'big', 'tech', 'absurdist', 'humor', 'literal', 'bite', 'author', 'published', 'united', 'states', 'cursed', 'bunny', 'unique', 'imaginative', 'blending', 'horror', 'sci', 'fi', 'fairytales', 'speculative', 'fiction', 'stories', 'defy', 'categorization', 'turns', 'thought', 'provoking', 'stomach', 'turning', 'monsters', 'shapes', 'furry', 'woodland', 'creatures', 'danger', 'lurks', 'unexpected', 'corners', 'everyday', 'apartment', 'buildings', 'unforgettable', 'collection', 'translated', 'acclaimed', 'anton', 'hur', 'chung', 'absurd', 'haunting', 'universe', 'illuminating', 'ills', 'contemporary', 'society', 'head', 'follows', 'woman', 'haunted', 'bodily', 'waste', 'embodiment', 'takes', 'dystopian', 'gynecology', 'office', 'pregnant', 'woman', 'told', 'find', 'father', 'baby', 'face', 'horrific', 'consequences', 'story', 'follows', 'young', 'monster', 'forced', 'underground', 'fight', 'rings', 'knowing', 'force', 'power', 'titular', 'fable', 'centers', 'cursed', 'lamp', 'approachable', 'shape', 'rabbit', 'fit', 'child', 'bedroom', 'sinister', 'capabilities', 'stories', 'readers', 'torn', 'race', 'savor', 'chung', 'wit', 'frenetic', 'energy', 'page', 'cursed', 'bunny', 'book', 'screams', 'read', 'night', 'passed', 'nearest', 'set', 'hands', 'day']

# Define controlled vocabularies for each set
controlled_vocabularies = {
    'BISG': ["absurdist", "action", "adaptation", "adventure", "african american", "alternative history", "amish", "animal", "anthology", "arab american", "asian american", "biographical", "black", "buddhist", "christian", "citylife", "classic", "coming of age", "crime", "cultural heritage", "detective", "disability", "disaster", "diversity", "dystopian", "environment", "epistolary", "erotica", "fairytale", "familylife", "fantasy", "feminist", "friendship", "ghost", "gothic", "graphic novel", "hispanic", "historical", "holiday", "horror", "humorous", "immigration", "indigenous", "jewish", "latino", "legal", "lgbtq+", "literary", "litrpg", "magical realism", "mashup", "media tie in", "medical", "mennonite", "metaphysical", "middle eastern", "military", "multicultural", "multiple timeline", "muslim", "mystery", "mythology", "native american", "nature", "neurodiversity", "noir", "occult", "own voice", "pacific islander", "pastiche", "performing art", "political", "psychological", "religious", "romance", "rural", "sagas", "satire", "science fiction", "seastorie", "short story", "small town", "southern", "sport", "streetlit", "suburban", "superhero", "supernatural", "thriller", "turtleisland", "urban", "visionary", "war", "western", "woman"],
    'LC-history': ["1905", "1910", "1919", "1945", "1948", "1950", "1953", "1960", "1961", "1979", "1980", "1988", "2002", "2010", "20th century", "allied occupation", "april revolution", "bombardment", "buma", "buma uprising", "chejudo", "chejudo rebellion", "chosen", "chōsen", "collaborationists", "declaration independence", "independence movement", "invasions", "japanese occupation", "korean war", "kwangju", "kwangju si", "kwangju uprising", "mansei movement", "may revolution", "operation blacklist", "partition", "pictorial works", "puma", "puma uprising", "resistance movements", "reunification", "reunification question", "sunchon", "sunchʻon", "territorial", "twentieth century", "war", "war and intervention", "yonp'yong", "yŏnp'yŏng island", "yŏsu sunchʻŏn rebellion", "yosun", "yŏsun rebellion"],
    'LC-fiction': ["adventure", "adventure storie", "autobiographical", "autobiographical fiction", "bildungsroman", "biographical", "biographical fiction", "buddhist", "buddhist storie", "children's storie", "christian", "christian fiction", "detective", "detective storie", "didactic", "didactic fiction", "domestic", "domestic fiction", "erotic", "erotic storie", "fantasy", "fantasy fiction", "feminist", "feminist fiction", "ghost", "ghost storie", "hanmun", "hanmun sosŏl", "hanmun sosol", "historical", "historical fiction", "horror", "horror tale", "hypertext", "hypertext fiction", "legal", "legal storie", "martial arts", "mystery", "mystery storie", "political", "political fiction", "romance", "romance fiction", "science", "science fiction", "short storie", "sports", "sports storie", "spy", "spy storie", "suspense", "suspense fiction", "war", "war storie", "young adult"],
    'emotions': ["anger", "annoyance", "displeasure", "entrapment", "frustration", "disgust", "dislike", "captivation", "curiosity", "engagement", "reflection", "confusion", "difficulty", "disorientation", "fear", "stress", "uncertainty", "happiness", "anticipation", "excitement", "joy", "pleasure", "love", "admiration", "attraction", "empathy", "enchantment", "lust", "sensitivity", "apathy", "boredom", "depression", "disaffection", "disappointment", "lassitude", "sadness", "shame", "wistfulness", "astonishment", "bemusement", "disbelief", "surprise", "uncategorized", "caution", "gratitude", "patience", "perseverance"],
    'tones': ["cerebral", "dignified", "erudite", "formal", "insightful", "intelligent", "lofty", "philosophical", "profound", "reflective", "beautiful", "charming", "elegant", "enchanting", "engaging", "entertaining", "haunting", "lush", "pleasurable", "sensitive", "sympathetic", "tender", "touching", "complex", "controversial", "fragmented", "labyrinthine", "nuanced", "picaresque", "psychological", "rambling", "subtle", "wordy", "conventional", "bland", "cliched", "comfortable", "contrived", "familiar", "formulaic", "insipid", "juvenile", "light", "maudlin", "melodramatic", "one dimensional", "repetitive", "stereotypical", "unrealistic", "dramatic", "emotional", "exciting", "fascinating", "heartfelt", "intriguing", "moving", "nostalgic", "powerful", "sensational", "surprising", "suspenseful", "thrilling", "zesty", "frightening", "angst ridden", "chilling", "claustrophobic", "cruel", "daunting", "disturbing", "gory", "graphic", "gruesome", "harsh", "horrifying", "perverse", "psychopathic", "scary", "shocking", "uncomfortable", "unnerving", "violent", "volatile", "humourous", "dark humour", "ironical", "satirical", "imaginative", "adventurous", "allegorical", "creative", "descriptive", "eccentric", "edgy", "evocative", "fantastical", "innovative", "lyrical", "magical", "mysterious", "mythical", "original", "stylish", "surreal", "unique", "wistful", "assured", "hopeful", "innocent", "inspirational", "optimistic", "resilient", "respectful", "triumphant", "uplifting", "abrupt", "authentic", "character driven", "cohesive", "compelling", "gritty", "historical", "poignant", "precise", "prosaic", "readable", "realistic", "resonant", "sad", "barren", "bleak", "dark", "depressing", "desolate", "devastating", "grim", "heavy", "melancholic", "painful"]
}

# Create one CountVectorizer per controlled vocabulary.
# The n-gram upper bound is derived from the longest term (in words) of each
# vocabulary instead of being hard-coded: with a fixed (1, 2) range, three-word
# terms such as "war and intervention" in LC-history could never be generated
# by the analyzer and therefore always counted 0. Because the vocabulary is
# fixed, widening or narrowing the range does not change the counts of the
# shorter terms.
# NOTE(review): the default token_pattern only keeps runs of 2+ word
# characters, so vocabulary entries containing "'" or "+" (e.g. "yonp'yong",
# "children's storie", "lgbtq+") still cannot match -- confirm whether a
# custom token_pattern is wanted.
vectorizers = {
    key: CountVectorizer(
        vocabulary=vocab,
        ngram_range=(1, max((len(term.split()) for term in vocab), default=1)),
    )
    for key, vocab in controlled_vocabularies.items()
}

# Accumulators filled while scoring the text against each controlled vocabulary
all_feature_names = set()   # vocabulary terms that occur at least once in the text
all_term_frequencies = {}   # vocabulary set name -> DataFrame of non-zero counts

# The vectorizers take one string per document, so the preprocessed tokens are
# joined back into a single space-separated document (once, outside the loop).
document = " ".join(tokenized_text)

for key, vectorizer in vectorizers.items():
    # The vocabulary is fixed, so this only counts occurrences of its terms
    X = vectorizer.fit_transform([document])
    feature_names = vectorizer.get_feature_names_out()
    term_frequencies = pd.DataFrame(X.toarray(), columns=feature_names)

    # Display the term frequencies for the current set
    print(f"\nControlled Vocabulary Set: {key}")
    print(term_frequencies)

    # Keep only the terms whose frequency is greater than or equal to 1
    filtered_term_frequencies = term_frequencies.loc[:, (term_frequencies != 0).any(axis=0)]
    print("\nFiltered Term Frequencies:")
    print(filtered_term_frequencies)

    # Store the matched terms and their frequencies for later use.
    # Only the filtered columns are recorded: adding every feature name would
    # put the entire controlled vocabulary into the "words that appeared" set,
    # whether or not a term occurs in the text.
    all_feature_names.update(filtered_term_frequencies.columns)
    all_term_frequencies[key] = filtered_term_frequencies

# Display the unique terms collected in all_feature_names.
# Sorted because the iteration order of a set of strings is not guaranteed to
# be the same from one interpreter run to the next, which made this line of
# output differ between otherwise identical runs.
print("\nWords Appeared in the Text:")
print(", ".join(sorted(all_feature_names)))

# Display the combined term frequencies for all sets: the per-set frames are
# placed side by side, with the set name as the outer column level.
print("\nCombined Term Frequencies:")
combined_term_frequencies = pd.concat(all_term_frequencies.values(), axis=1, keys=all_term_frequencies.keys())
print(combined_term_frequencies)


Controlled Vocabulary Set: BISG
   absurdist  action  adaptation  adventure  african american  \
0          1       0           0          0                 0   

   alternative history  amish  animal  anthology  arab american  ...  \
0                    0      0       0          0              0  ...   

   suburban  superhero  supernatural  thriller  turtleisland  urban  \
0         0          0             0         0             0      0   

   visionary  war  western  woman  
0          0    0        0      2  

[1 rows x 95 columns]

Filtered Term Frequencies:
   absurdist  dystopian  horror  woman
0          1          1       2      2

Controlled Vocabulary Set: LC-history
   1905  1910  1919  1945  1948  1950  1953  1960  1961  1979  ...  sunchʻon  \
0     0     0     0     0     0     0     0     0     0     0  ...         0   

   territorial  twentieth century  war  war and intervention  yonp'yong  \
0            0                  0    0                     0          0 

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Kim Jiyoung, Born 1982
tokenized_text = ['kim', 'jiyoung', 'born', '1982', 'south', 'korean', 'sensation', 'got', 'world', 'talking', 'life', 'story', 'young', 'woman', 'born', 'end', 'twentieth', 'century', 'raises', 'questions', 'endemic', 'misogyny', 'institutional', 'oppression', 'relevant', 'guardian', "'", 'look', '2020', 'red', 'magazine', "can't", 'wait', 'read', 'book', '2020', 'riveting', 'original', 'uncompromising', 'important', 'book', 'emerged', 'south', 'korea', 'sincehan', 'kang', 'vegetarian', 'book', 'life', 'woman', 'living', 'korea', 'despair', 'ordinary', 'woman', 'takes', 'granted', 'fact', 'special', 'shocking', 'relatable', 'sayaka', 'murata', 'author', 'convenience', 'store', 'woman', 'kim', 'jiyoung', 'girl', 'born', 'mother', 'laws', 'wanted', 'boy', 'kim', 'jiyoung', 'sister', 'share', 'room', 'brother', 'gets', 'kim', 'jiyoung', 'female', 'preyed', 'male', 'teachers', 'school', 'kim', 'jiyoung', 'daughter', 'father', 'blames', 'harassed', 'night', 'kim', 'jiyoung', 'good', 'student', 'internships', 'kim', 'jiyoung', 'model', 'employee', 'overlooked', 'promotion', 'kim', 'jiyoung', 'wife', 'gives', 'career', 'independence', 'life', 'domesticity', 'kim', 'jiyoung', 'started', 'acting', 'kim', 'jiyoung', 'depressed', 'kim', 'jiyoung', 'mad', 'kim', 'jiyoung', 'woman', 'kim', 'jiyoung', 'woman', 'witty', 'disturbing', 'book', 'deals', 'sexism', 'mental', 'health', 'issues', 'hypocrisy', 'country', 'young', 'women', 'popping', 'caffeine', 'pills', 'turning', 'jaundiced', 'slave', 'factories', 'helping', 'fund', 'higher', 'education', 'male', 'siblings', 'independent', 'treatise', 'howl', 'anger', 'describes', 'experiences', 'recognisable', 'slim', 'unadorned', 'narrative', 'distils', 'lifetime', 'iniquities', 'sharp', 'punch', 'books', 'demonstrates', 'unfairness', 'female', 'experience', 'sheer', 'difficulty', 'improving', 'sunday', 'times']

# Define controlled vocabularies for each set
controlled_vocabularies = {
    'BISG': ["absurdist", "action", "adaptation", "adventure", "african american", "alternative history", "amish", "animal", "anthology", "arab american", "asian american", "biographical", "black", "buddhist", "christian", "citylife", "classic", "coming of age", "crime", "cultural heritage", "detective", "disability", "disaster", "diversity", "dystopian", "environment", "epistolary", "erotica", "fairytale", "familylife", "fantasy", "feminist", "friendship", "ghost", "gothic", "graphic novel", "hispanic", "historical", "holiday", "horror", "humorous", "immigration", "indigenous", "jewish", "latino", "legal", "lgbtq+", "literary", "litrpg", "magical realism", "mashup", "media tie in", "medical", "mennonite", "metaphysical", "middle eastern", "military", "multicultural", "multiple timeline", "muslim", "mystery", "mythology", "native american", "nature", "neurodiversity", "noir", "occult", "own voice", "pacific islander", "pastiche", "performing art", "political", "psychological", "religious", "romance", "rural", "sagas", "satire", "science fiction", "seastorie", "short story", "small town", "southern", "sport", "streetlit", "suburban", "superhero", "supernatural", "thriller", "turtleisland", "urban", "visionary", "war", "western", "woman"],
    'LC-history': ["1905", "1910", "1919", "1945", "1948", "1950", "1953", "1960", "1961", "1979", "1980", "1988", "2002", "2010", "20th century", "allied occupation", "april revolution", "bombardment", "buma", "buma uprising", "chejudo", "chejudo rebellion", "chosen", "chōsen", "collaborationists", "declaration independence", "independence movement", "invasions", "japanese occupation", "korean war", "kwangju", "kwangju si", "kwangju uprising", "mansei movement", "may revolution", "operation blacklist", "partition", "pictorial works", "puma", "puma uprising", "resistance movements", "reunification", "reunification question", "sunchon", "sunchʻon", "territorial", "twentieth century", "war", "war and intervention", "yonp'yong", "yŏnp'yŏng island", "yŏsu sunchʻŏn rebellion", "yosun", "yŏsun rebellion"],
    'LC-fiction': ["adventure", "adventure storie", "autobiographical", "autobiographical fiction", "bildungsroman", "biographical", "biographical fiction", "buddhist", "buddhist storie", "children's storie", "christian", "christian fiction", "detective", "detective storie", "didactic", "didactic fiction", "domestic", "domestic fiction", "erotic", "erotic storie", "fantasy", "fantasy fiction", "feminist", "feminist fiction", "ghost", "ghost storie", "hanmun", "hanmun sosŏl", "hanmun sosol", "historical", "historical fiction", "horror", "horror tale", "hypertext", "hypertext fiction", "legal", "legal storie", "martial arts", "mystery", "mystery storie", "political", "political fiction", "romance", "romance fiction", "science", "science fiction", "short storie", "sports", "sports storie", "spy", "spy storie", "suspense", "suspense fiction", "war", "war storie", "young adult"],
    'emotions': ["anger", "annoyance", "displeasure", "entrapment", "frustration", "disgust", "dislike", "captivation", "curiosity", "engagement", "reflection", "confusion", "difficulty", "disorientation", "fear", "stress", "uncertainty", "happiness", "anticipation", "excitement", "joy", "pleasure", "love", "admiration", "attraction", "empathy", "enchantment", "lust", "sensitivity", "apathy", "boredom", "depression", "disaffection", "disappointment", "lassitude", "sadness", "shame", "wistfulness", "astonishment", "bemusement", "disbelief", "surprise", "uncategorized", "caution", "gratitude", "patience", "perseverance"],
    'tones': ["cerebral", "dignified", "erudite", "formal", "insightful", "intelligent", "lofty", "philosophical", "profound", "reflective", "beautiful", "charming", "elegant", "enchanting", "engaging", "entertaining", "haunting", "lush", "pleasurable", "sensitive", "sympathetic", "tender", "touching", "complex", "controversial", "fragmented", "labyrinthine", "nuanced", "picaresque", "psychological", "rambling", "subtle", "wordy", "conventional", "bland", "cliched", "comfortable", "contrived", "familiar", "formulaic", "insipid", "juvenile", "light", "maudlin", "melodramatic", "one dimensional", "repetitive", "stereotypical", "unrealistic", "dramatic", "emotional", "exciting", "fascinating", "heartfelt", "intriguing", "moving", "nostalgic", "powerful", "sensational", "surprising", "suspenseful", "thrilling", "zesty", "frightening", "angst ridden", "chilling", "claustrophobic", "cruel", "daunting", "disturbing", "gory", "graphic", "gruesome", "harsh", "horrifying", "perverse", "psychopathic", "scary", "shocking", "uncomfortable", "unnerving", "violent", "volatile", "humourous", "dark humour", "ironical", "satirical", "imaginative", "adventurous", "allegorical", "creative", "descriptive", "eccentric", "edgy", "evocative", "fantastical", "innovative", "lyrical", "magical", "mysterious", "mythical", "original", "stylish", "surreal", "unique", "wistful", "assured", "hopeful", "innocent", "inspirational", "optimistic", "resilient", "respectful", "triumphant", "uplifting", "abrupt", "authentic", "character driven", "cohesive", "compelling", "gritty", "historical", "poignant", "precise", "prosaic", "readable", "realistic", "resonant", "sad", "barren", "bleak", "dark", "depressing", "desolate", "devastating", "grim", "heavy", "melancholic", "painful"]
}

# Create one CountVectorizer per controlled vocabulary.
# The n-gram upper bound is derived from the longest term (in words) of each
# vocabulary instead of being hard-coded: with a fixed (1, 2) range, three-word
# terms such as "war and intervention" in LC-history could never be generated
# by the analyzer and therefore always counted 0. Because the vocabulary is
# fixed, widening or narrowing the range does not change the counts of the
# shorter terms.
# NOTE(review): the default token_pattern only keeps runs of 2+ word
# characters, so vocabulary entries containing "'" or "+" (e.g. "yonp'yong",
# "children's storie", "lgbtq+") still cannot match -- confirm whether a
# custom token_pattern is wanted.
vectorizers = {
    key: CountVectorizer(
        vocabulary=vocab,
        ngram_range=(1, max((len(term.split()) for term in vocab), default=1)),
    )
    for key, vocab in controlled_vocabularies.items()
}

# Accumulators filled while scoring the text against each controlled vocabulary
all_feature_names = set()   # vocabulary terms that occur at least once in the text
all_term_frequencies = {}   # vocabulary set name -> DataFrame of non-zero counts

# The vectorizers take one string per document, so the preprocessed tokens are
# joined back into a single space-separated document (once, outside the loop).
document = " ".join(tokenized_text)

for key, vectorizer in vectorizers.items():
    # The vocabulary is fixed, so this only counts occurrences of its terms
    X = vectorizer.fit_transform([document])
    feature_names = vectorizer.get_feature_names_out()
    term_frequencies = pd.DataFrame(X.toarray(), columns=feature_names)

    # Display the term frequencies for the current set
    print(f"\nControlled Vocabulary Set: {key}")
    print(term_frequencies)

    # Keep only the terms whose frequency is greater than or equal to 1
    filtered_term_frequencies = term_frequencies.loc[:, (term_frequencies != 0).any(axis=0)]
    print("\nFiltered Term Frequencies:")
    print(filtered_term_frequencies)

    # Store the matched terms and their frequencies for later use.
    # Only the filtered columns are recorded: adding every feature name would
    # put the entire controlled vocabulary into the "words that appeared" set,
    # whether or not a term occurs in the text.
    all_feature_names.update(filtered_term_frequencies.columns)
    all_term_frequencies[key] = filtered_term_frequencies

# Display the unique terms collected in all_feature_names.
# Sorted because the iteration order of a set of strings is not guaranteed to
# be the same from one interpreter run to the next, which made this line of
# output differ between otherwise identical runs.
print("\nWords Appeared in the Text:")
print(", ".join(sorted(all_feature_names)))

# Display the combined term frequencies for all sets: the per-set frames are
# placed side by side, with the set name as the outer column level.
print("\nCombined Term Frequencies:")
combined_term_frequencies = pd.concat(all_term_frequencies.values(), axis=1, keys=all_term_frequencies.keys())
print(combined_term_frequencies)


Controlled Vocabulary Set: BISG
   absurdist  action  adaptation  adventure  african american  \
0          0       0           0          0                 0   

   alternative history  amish  animal  anthology  arab american  ...  \
0                    0      0       0          0              0  ...   

   suburban  superhero  supernatural  thriller  turtleisland  urban  \
0         0          0             0         0             0      0   

   visionary  war  western  woman  
0          0    0        0      6  

[1 rows x 95 columns]

Filtered Term Frequencies:
   woman
0      6

Controlled Vocabulary Set: LC-history
   1905  1910  1919  1945  1948  1950  1953  1960  1961  1979  ...  sunchʻon  \
0     0     0     0     0     0     0     0     0     0     0  ...         0   

   territorial  twentieth century  war  war and intervention  yonp'yong  \
0            0                  1    0                     0          0   

   yŏnp'yŏng island  yŏsu sunchʻŏn rebellion  yosun 