In [1]:
import os
import re
import markovify
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Data APIs
from newsapi import NewsApiClient
import praw
from pytrends.request import TrendReq

# Datasets
from datasets import load_dataset

# TTS
from TTS.api import TTS
import IPython.display as ipd

# Load keys from config.env
from dotenv import load_dotenv
load_dotenv("config.env")

NEWSAPI_KEY      = os.getenv("NEWSAPI_KEY")
REDDIT_ID        = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET    = os.getenv("REDDIT_CLIENT_SECRET")
REDDIT_AGENT     = os.getenv("REDDIT_USER_AGENT")

# Initialize clients
newsapi = NewsApiClient(api_key=NEWSAPI_KEY)
reddit  = praw.Reddit(client_id=REDDIT_ID,
                      client_secret=REDDIT_SECRET,
                      user_agent=REDDIT_AGENT)
pytrends = TrendReq()
# Coqui TTS model
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC",
          progress_bar=False, gpu=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zainr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Zainr\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Zainr\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


## Loading the datasets

### Gutenberg Corpus


In [5]:
with open("fairy_tales.txt", "r", encoding="utf-8") as f:
    text = f.read()

start = text.find("*** START OF THE PROJECT GUTENBERG EBOOK GRIMMS' FAIRY TALES ***")
end   = text.find("*** END OF THE PROJECT GUTENBERG EBOOK GRIMMS' FAIRY TALES ***")
corpus_gutenberg = text[start:end]

print("Gutenberg corpus length:", len(corpus_gutenberg), "characters")

Gutenberg corpus length: 520886 characters


### HuggingFace Gryphe ChatGPT-4o Writing Prompts



In [6]:
ds_gryphe = load_dataset("Gryphe/ChatGPT-4o-Writing-Prompts", split="train")

texts_gryphe = []
for row in ds_gryphe:
    msgs = row["conversations"]

    human = next((m["value"] for m in msgs if m["from"] == "human"), "")
    gpt   = next((m["value"] for m in msgs if m["from"] == "gpt"), "")
    if not gpt:
        continue

    include_human = False
    combined = (human + "\n\n" + gpt) if include_human else gpt
    texts_gryphe.append(combined.strip())

corpus_gryphe = "\n".join(texts_gryphe[:5000])
print(f"Prepared Gryphe corpus with {len(texts_gryphe)} entries, using first 5000.")
print("Sample:\n", corpus_gryphe[:500])


Prepared Gryphe corpus with 3741 entries, using first 5000.
Sample:
 The smell in the kitchen was somewhere between charred wood and sulfur, a sharp, acrid tang that set the back of Mina's throat on fire. That was the first indication something had gone terribly, terribly wrong.

She flipped frantically through the pages of her grandma’s old cookbook, the fragile, yellowed edges crumbling between her fingers as she scanned for something—anything—that could explain the colossal, horned creature standing in the middle of her kitchen. Flour dust still hung in the ai


### Kaggle WritingPrompts

In [7]:
def load_kaggle_corpus(split="train", max_samples=5000, max_tokens=1000, base_path="writingprompts"):
    """
    Load Kaggle Writing Prompts split (train/test/valid) from the given folder,
    combine source + target per line, truncate to max_tokens tokens,
    and return up to max_samples combined entries.
    """
    src_path = os.path.join(base_path, f"{split}.wp_source")
    tgt_path = os.path.join(base_path, f"{split}.wp_target")
    texts = []
    with open(src_path, "r", encoding="utf-8") as src_f, \
         open(tgt_path, "r", encoding="utf-8") as tgt_f:
        for i, (src, tgt) in enumerate(zip(src_f, tgt_f)):
            if i >= max_samples:
                break
            # Combine prompt (source) + story (target)
            combined = src.strip() + " " + tgt.strip()

            tokens = combined.split()
            truncated = " ".join(tokens[:max_tokens])
            texts.append(truncated)
    return texts

kaggle_texts = load_kaggle_corpus(split="train", max_samples=5000, max_tokens=1000)
corpus_kg = "\n".join(kaggle_texts)
print(f"Loaded {len(kaggle_texts)} samples from 'writingprompts/train' (total chars: {len(corpus_kg)})")
print("Example snippet:\n", kaggle_texts[0][:300], "…")


Loaded 5000 samples from 'writingprompts/train' (total chars: 15034991)
Example snippet:
 [ WP ] You 've finally managed to discover the secret to immortality . Suddenly , Death appears before you , hands you a business card , and says , `` When you realize living forever sucks , call this number , I 've got a job offer for you . '' So many times have I walked on ruins , the remainings o …


## Combine Corpora & Build Markov Model


In [8]:
combined_corpus = "\n".join([corpus_gutenberg, corpus_kg, corpus_gryphe])

model = markovify.Text(combined_corpus, state_size=2)

print("Markov model built. Sample sentence:")
print(model.make_sentence())


Markov model built. Sample sentence:
Reyes was silent, save for the others.


## Fetching a Trending Topic


In [10]:
def fetch_newsapi_topic():
    resp = newsapi.get_top_headlines(country="us", page_size=1)
    return resp["articles"][0]["title"]

def fetch_reddit_topic():
    return next(reddit.subreddit("all").hot(limit=1)).title

def fetch_pytrends_topic():
    return pytrends.trending_searches(pn="united_states")[0]

trend = fetch_newsapi_topic()
print("Trending topic:", trend)


Trending topic: Israel calling up tens of thousands of reservists to expand war on Gaza - Al Jazeera


## Generate a Story by Seeding the Markov Model


In [15]:
def build_topic_model(topic, state_size=2):

    sentences = re.split(r'(?<=[\.\?\!])\s+', combined_corpus)
    keywords = [w.lower() for w in topic.split() if len(w) > 3]
    topic_sentences = [
        s for s in sentences
        if any(kw in s.lower() for kw in keywords)
    ]
    if len(topic_sentences) < 5:
        return None 
    return markovify.Text("\n".join(topic_sentences), state_size=state_size)

topic_model = build_topic_model(trend, state_size=2)

if topic_model:
    combined_model = markovify.combine(
    [topic_model, model], [0.7, 0.3]
    )
    combined_model = combined_model.compile()
else:
    combined_model = model

story_sentences = []
for _ in range(5):
    s = combined_model.make_short_sentence(
        max_chars=100,
        min_chars=50,
        tries=30,
        max_overlap_ratio=0.7,
        max_overlap_total=15
    )
    if s and re.match(r'^[A-Z]', s):
        story_sentences.append(s)

if not story_sentences:
    template = f"This story is about {trend}. "
    fallback = model.make_short_sentence(
        max_chars=100, min_chars=50, tries=30
    ) or ""
    story_sentences = [template + fallback]

story = " ".join(story_sentences)
print("Generated Story:\n", story)



Generated Story:
 We stood before the first time that evening, something changed. She paused, those wide eyes rimmed red from the concussive blasts from their home planets. Its surface bears marks of wear, dug into her eyes, pinching the bridge walls. He turned to dust as they tapped through to its bones.


## Convert Story to Audio (Coqui TTS)


In [16]:
output_path = "trend_story.wav"
tts.tts_to_file(text=story, file_path=output_path)
print("Saved audio to", output_path)

ipd.display(ipd.Audio(output_path))

Saved audio to trend_story.wav


## Gradio Interface


In [31]:
import gradio as gr

def trendstory_pipeline(source: str):
    # 1) fetch
    if source == "reddit":
        topic = fetch_reddit_topic()
    elif source == "pytrends":
        topic = fetch_pytrends_topic()
    else:
        topic = fetch_newsapi_topic()
    # 2) generate
    seed = topic.split()[0]
    sentences = [model.make_sentence_with_start(seed, strict=False) for _ in range(5)]
    if not any(sentences):
        sentences = [model.make_sentence() for _ in range(5)]
    story = " ".join(s for s in sentences if s)
    # 3) audio
    out = "trend_story.wav"
    tts.tts_to_file(text=story, file_path=out)
    return topic, story, out

gr.Interface(
    fn=trendstory_pipeline,
    inputs=gr.Radio(["newsapi","reddit","pytrends"], label="Source"),
    outputs=["text","text","audio"],
    title="TrendStory (Markovify + Coqui TTS)",
    description="Pick a source to fetch a trend, generate a Markov-based story, and play it."
).launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




In [33]:

gr.close_all()

Closing server running on port: 7863
Closing server running on port: 7862
Closing server running on port: 7860
Closing server running on port: 7862
