In [None]:
import os
import re
import markovify
from markovify.text import ParamError 
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

from newsapi import NewsApiClient
import praw
from pytrends.request import TrendReq

from datasets import load_dataset

from TTS.api import TTS
import IPython.display as ipd
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)


from dotenv import load_dotenv
load_dotenv("config.env")

NEWSAPI_KEY      = os.getenv("NEWSAPI_KEY")
REDDIT_ID        = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET    = os.getenv("REDDIT_CLIENT_SECRET")
REDDIT_AGENT     = os.getenv("REDDIT_USER_AGENT")

newsapi = NewsApiClient(api_key=NEWSAPI_KEY)
reddit  = praw.Reddit(client_id=REDDIT_ID,
                      client_secret=REDDIT_SECRET,
                      user_agent=REDDIT_AGENT)
pytrends = TrendReq()


import gradio as gr
import json
from datetime import datetime


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zainr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Zainr\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Zainr\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


## Loading the datasets

### Gutenberg Corpus


In [2]:
with open("fairy_tales.txt", "r", encoding="utf-8") as f:
    text = f.read()

start = text.find("*** START OF THE PROJECT GUTENBERG EBOOK GRIMMS' FAIRY TALES ***")
end   = text.find("*** END OF THE PROJECT GUTENBERG EBOOK GRIMMS' FAIRY TALES ***")
corpus_gutenberg = text[start:end]

print("Gutenberg corpus length:", len(corpus_gutenberg), "characters")

Gutenberg corpus length: 520886 characters


### HuggingFace Gryphe ChatGPT-4o Writing Prompts



In [3]:
ds_gryphe = load_dataset("Gryphe/ChatGPT-4o-Writing-Prompts", split="train")

texts_gryphe = []
for row in ds_gryphe:
    msgs = row["conversations"]

    human = next((m["value"] for m in msgs if m["from"] == "human"), "")
    gpt   = next((m["value"] for m in msgs if m["from"] == "gpt"), "")
    if not gpt:
        continue

    include_human = False
    combined = (human + "\n\n" + gpt) if include_human else gpt
    texts_gryphe.append(combined.strip())

corpus_gryphe = "\n".join(texts_gryphe[:5000])
print(f"Prepared Gryphe corpus with {len(texts_gryphe)} entries, using first 5000.")
print("Sample:\n", corpus_gryphe[:500])


Prepared Gryphe corpus with 3741 entries, using first 5000.
Sample:
 The smell in the kitchen was somewhere between charred wood and sulfur, a sharp, acrid tang that set the back of Mina's throat on fire. That was the first indication something had gone terribly, terribly wrong.

She flipped frantically through the pages of her grandma’s old cookbook, the fragile, yellowed edges crumbling between her fingers as she scanned for something—anything—that could explain the colossal, horned creature standing in the middle of her kitchen. Flour dust still hung in the ai


### Kaggle WritingPrompts

In [4]:
def load_kaggle_corpus(split="train", max_samples=5000, max_tokens=1000, base_path="writingprompts"):
    """
    Load Kaggle Writing Prompts split (train/test/valid) from the given folder,
    combine source + target per line, truncate to max_tokens tokens,
    and return up to max_samples combined entries.
    """
    src_path = os.path.join(base_path, f"{split}.wp_source")
    tgt_path = os.path.join(base_path, f"{split}.wp_target")
    texts = []
    with open(src_path, "r", encoding="utf-8") as src_f, \
         open(tgt_path, "r", encoding="utf-8") as tgt_f:
        for i, (src, tgt) in enumerate(zip(src_f, tgt_f)):
            if i >= max_samples:
                break
            # Combine prompt (source) + story (target)
            combined = src.strip() + " " + tgt.strip()

            tokens = combined.split()
            truncated = " ".join(tokens[:max_tokens])
            texts.append(truncated)
    return texts

kaggle_texts = load_kaggle_corpus(split="train", max_samples=5000, max_tokens=1000)
corpus_kg = "\n".join(kaggle_texts)
print(f"Loaded {len(kaggle_texts)} samples from 'writingprompts/train' (total chars: {len(corpus_kg)})")
print("Example snippet:\n", kaggle_texts[0][:300], "…")


Loaded 5000 samples from 'writingprompts/train' (total chars: 15034991)
Example snippet:
 [ WP ] You 've finally managed to discover the secret to immortality . Suddenly , Death appears before you , hands you a business card , and says , `` When you realize living forever sucks , call this number , I 've got a job offer for you . '' So many times have I walked on ruins , the remainings o …


## Combine Corpora & Build Markov Model


In [5]:
combined_corpus = "\n".join([corpus_gutenberg, corpus_kg, corpus_gryphe])

model = markovify.Text(combined_corpus, state_size=2)

print("Markov model built. Sample sentence:")
print(model.make_sentence())


Markov model built. Sample sentence:
Dane had always been like this—not a moment ago, *fallen through the gaseous layers of strategy, dice rolls…luck.


## Fetching a Trending Topic


In [None]:
def fetch_newsapi_topic():
    resp = newsapi.get_top_headlines(country="us", page_size=1)
    return resp["articles"][0]["title"]

def fetch_reddit_topic():
    return next(reddit.subreddit("all").hot(limit=1)).title

def fetch_pytrends_topic():
    return pytrends.trending_searches(pn="united_states")[0]

trend = fetch_newsapi_topic()
print("Trending topic:", trend)


Trending topic: Trump has said Canada should be the 51st state. Today, he meets its prime minister - NPR


## Generate a Story by Seeding the Markov Model


In [None]:
# def build_topic_model(topic, state_size=2):

#     sentences = re.split(r'(?<=[\.\?\!])\s+', combined_corpus)
#     keywords = [w.lower() for w in topic.split() if len(w) > 3]
#     topic_sentences = [
#         s for s in sentences
#         if any(kw in s.lower() for kw in keywords)
#     ]
#     if len(topic_sentences) < 5:
#         return None 
#     return markovify.Text("\n".join(topic_sentences), state_size=state_size)

# topic_model = build_topic_model(trend, state_size=2)

# if topic_model:
#     combined_model = markovify.combine(
#     [topic_model, model], [0.7, 0.3]
#     )
#     combined_model = combined_model.compile()
# else:
#     combined_model = model

# story_sentences = []
# for _ in range(5):
#     s = combined_model.make_short_sentence(
#         max_chars=100,
#         min_chars=50,
#         tries=30,
#         max_overlap_ratio=0.7,
#         max_overlap_total=15
#     )
#     if s and re.match(r'^[A-Z]', s):
#         story_sentences.append(s)

# if not story_sentences:
#     template = f"This story is about {trend}. "
#     fallback = model.make_short_sentence(
#         max_chars=100, min_chars=50, tries=30
#     ) or ""
#     story_sentences = [template + fallback]

# story = " ".join(story_sentences)
# print("Generated Story:\n", story)



Generated Story:
 Jerry III cleared his throat like a curtain, incomplete yet painfully familiar. Maybe you stood still for too long, trained himself not far off—but there was no mistake. You opened your eyes, you glanced down at the sky, surrounded by books and glowing high-rises. Moments later, the ship had held us tethered in the same end in tears...or blood. Instead, she laughed—a laugh so carefree it seemed to pause, caught in reflections?


## Convert Story to Audio (Coqui TTS)


In [None]:
# output_path = "trend_story.wav"
# tts.tts_to_file(text=story, file_path=output_path)
# print("Saved audio to", output_path)

# ipd.display(ipd.Audio(output_path))

maybe you stood still for too long, trained himself not far off—but there was no mistake.
Character '—' not found in the vocabulary. Discarding it.


Saved audio to trend_story.wav


In [None]:
# result = {
#     "timestamp": datetime.utcnow().isoformat() + "Z",
#     "topic": trend,
#     "story": story,
#     "audio_path": output_path
# }

# with open("result.json", "w", encoding="utf-8") as f:
#     json.dump(result, f, indent=2, ensure_ascii=False)

# print("Saved result to result.json")

Saved result to result.json


## Gradio Interface


In [None]:
def trendstory_pipeline(source: str):
    try:
        if source == "reddit":
            topic = fetch_reddit_topic()
        else:
            topic = fetch_newsapi_topic()

        seed = topic.split()[0]
        sentences = []
        for _ in range(5):
            try:
                s = model.make_sentence_with_start(seed, strict=False)
            except ParamError:
                s = None
            if s:
                sentences.append(s)

        if not sentences:
            for _ in range(5):
                s = model.make_short_sentence(
                    max_chars=120, min_chars=50,
                    tries=30, max_overlap_ratio=0.7,
                    max_overlap_total=15
                )
                if s:
                    sentences.append(s)

        story = " ".join(sentences)

        audio_path = "trend_story.wav"
        tts.tts_to_file(text=story, file_path=audio_path)

        result = {
            "timestamp": datetime.utcnow().isoformat() + "Z",
            "source": source,
            "topic": topic,
            "story": story,
            "audio_path": audio_path
        }

        with open("result.json", "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        return topic, story, audio_path, result

    except Exception as e:
        err = f"Error: {e}"
        return err, err, None, {"error": err}

gr.Interface(
    fn=trendstory_pipeline,
    inputs=gr.Radio(["newsapi", "reddit"], label="Source"),
    outputs=[
        gr.Textbox(label="Topic"),
        gr.Textbox(label="Story"),
        gr.Audio(label="Story Audio"),
        gr.JSON(label="Full JSON Output")
    ],
    title="TrendStory (Markovify + Coqui TTS)",
    description="Pick a source to fetch a trend, generate a Markov-based story, play it aloud, and see the JSON output."
).launch()


* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.




but here… at hogwarts… it felt like a thousand different versions of myself from asking.
Character '…' not found in the vocabulary. Discarding it.
