In [272]:
import sys
sys.path.append('..')

import os
import random
import uuid
import pandas as pd
from pydantic import BaseModel, Field
from openai import OpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv

from model.utils.OpenRouter import OpenRouter
from model.utils.JsonExtractor import JsonExtractor

In [273]:
# Load environment variables via python-dotenv; OPENROUTER_API_KEY is read with os.getenv further down.
load_dotenv()

True

In [274]:
# OpenRouter model identifiers; one is drawn at random for every generated sample.
# Every entry needs its own trailing comma: adjacent string literals without a comma
# are silently concatenated by Python into a single (invalid) model id.
models = [
  'google/gemini-2.0-flash-001',
  'openai/gpt-4.1',
  'openai/gpt-4o-mini',
  'deepseek/deepseek-chat-v3-0324',
  'anthropic/claude-3.7-sonnet',
  'mistralai/mistral-small-24b-instruct-2501',
]

In [275]:
# Pool of prompt topics; one is drawn at random for every generated sample and
# substituted into the prompt's Topic line. Entries mix essay-style subjects with
# broader, assistant-style request categories.
topics = [
    'The ethics of artificial intelligence in decision-making',
    'The future of space exploration and colonization',
    'The impact of climate change on global food security',
    'The rise of decentralized finance (DeFi) and its potential',
    'The history and cultural significance of the Silk Road',
    'The psychology of misinformation and its spread',
    'The benefits and drawbacks of remote work',
    'The role of renewable energy in achieving carbon neutrality',
    'The evolution of video games as an art form',
    'The challenges of global mental health awareness and treatment',
    'The science behind sleep and its importance for well-being',
    'The influence of social media on political discourse',
    'The conservation of endangered species and biodiversity',
    'The history of pandemics and lessons learned',
    'The future of transportation (e.g., autonomous vehicles, hyperloop)',
    'The impact of fast fashion on the environment and labor',
    'The exploration of deep-sea ecosystems',
    'The role of art in social and political movements',
    'The development and implications of quantum computing',
    'The history of democracy and its modern challenges',
    'The benefits of learning a second language',
    'The philosophy of Stoicism and its relevance today',
    'The rise of the gig economy and its effects on workers',
    'The importance of critical thinking in the digital age',
    'The history of jazz music and its key figures',
    'The challenges of sustainable tourism',
    'The process and implications of gene editing technologies',
    'The history and architecture of ancient Rome',
    'The psychological effects of long-term isolation',
    'The future of personalized medicine',
    'Effective study techniques for different subjects',
    'How to write a strong thesis statement for an essay',
    'Managing stress and maintaining mental well-being in college',
    'Budgeting and managing finances as a student',
    'Dealing with homesickness or adjusting to a new environment',
    'The impact of social media on student mental health and focus',
    'Strategies for improving critical thinking and analytical skills',
    'How to effectively participate in group projects',
    'Preparing for standardized tests (e.g., SAT, ACT, GRE, GMAT)',
    'The future job market and in-demand skills',
    'New smartphones, laptops, wearables, smart home devices',
    'Debugging code, writing functions in Python/JavaScript/etc., understanding algorithms, web development, app development',
    'How to stay safe online, protect data, avoid scams, understand VPNs, data breaches',
    'Quantum computing, Web3, Metaverse, blockchain applications beyond crypto',
    'Best productivity apps, photo/video editing software, how to use specific software features',
    'What is AI, machine learning, deep learning, neural networks? How do they work?',
    'How does ChatGPT work? Its capabilities, limitations, and ethical implications. Prompts for ChatGPT. The future of LLMs',
    'Specific recipes, cooking techniques, ingredient substitutions, meal planning',
    'How to fix things around the house, home organization, gardening tips',
    'Destination ideas, itinerary planning, budget travel, travel restrictions, packing tips',
    'Budgeting, saving money, understanding investing basics (stocks, crypto – though AI is cautious here), explaining economic concepts',
    'Explanations of common medical conditions (AI will always advise seeing a doctor), fitness routines, nutrition basics, mental well-being strategies (stress management, mindfulness)',
    'Drafting emails, writing articles/blog posts, creating stories, poems, song lyrics, scripts. Summarizing text, rephrasing content',
    'Ideas for business, creative projects, party themes, gift ideas',
    'Social media post ideas, ad copy, product descriptions, business plan outlines',
    'Understanding major current events, political situations, scientific breakthroughs',
    'Explaining scientific theories, historical events, philosophical concepts in easy-to-understand terms',
    'Resume/cover letter writing, job interview preparation, career change advice',
    'Time management techniques, focus strategies',
    'Communication tips, understanding different perspectives',
    'Explaining homework concepts, help with study strategies',
    'Tell me a joke',
    'Exploring philosophical perspectives',
    'Discussing hypothetical moral situations',
]

In [276]:
# Writing styles; one is drawn at random per sample and substituted into the prompt's Style line.
# NOTE(review): some entries look near-duplicate ('News Reportorial' / 'News',
# 'Informal Blog Post' / 'Blogpost'), which makes those styles more likely under
# uniform random.choice sampling — confirm this weighting is intended.
styles = [
    'Academic/Scholarly',
    'News Reportorial',
    'Informal Blog Post',
    'Persuasive/Argumentative',
    'Descriptive',
    'Narrative',
    'Instructional/How-To',
    'Humorous/Satirical',
    'Technical',
    'Poetic/Lyrical',
    'News',
    'Blogpost',
    'Blogpost or Video Comment'
]

In [277]:
# Target languages for the generated text; the prompt's language line is filled
# from a random pick of this list (currently a single option).
languages = ['russian']

In [None]:
# Optional extra prompt line; one entry is drawn at random per sample and substituted
# for {additional_instruction} in the template.
additional_instructions = [
    # Bare newline: no extra instruction is added for this sample.
    '\n',
    'Write text in a human-like manner to bypass AI detection checks\n',
    'Make some natural grammatical or punctuation mistakes\n',
]

In [278]:
# (min_length, max_length) pairs; one pair is drawn at random per sample and
# substituted into the prompt's Length line.
lengths = [(80, 250), (250, 800), (800, 1500)]

In [279]:
# Structured output expected back from the model: an object with a single `text` field.
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably be emitted as the schema description and end up in the prompt's
# format instructions; confirm before converting this to a docstring.
class Schema(BaseModel):
    # The generated text itself; read back as `result.text` after parsing.
    text: str = Field(description='Given the instructions, write a piece of text')

In [280]:
# Prompt sent to the model for every sample. The placeholders are filled per call
# (topic, style, language, length bounds, optional extra instruction), while
# {format_instructions} is pre-filled once from the Pydantic parser below.
# The field labels are spelled out correctly ("Language", "symbols") because they
# are part of what the model reads.
template = """
You are a versatile AI text generator. Your task is to generate a piece of text.
Topic: "{topic}"
Style: "{style}"
Language: "{language}"
Length: {min_length}-{max_length} symbols
{additional_instruction}
{format_instructions}

Please generate the text based *only* on these instructions.
"""
# Parses the model's JSON answer into a Schema instance and supplies the matching
# format instructions for the prompt.
parser = PydanticOutputParser(pydantic_object=Schema)
prompt = PromptTemplate(
    template=template,
    # Every placeholder that must be supplied at invoke time; kept in sync with the
    # template so the declaration matches the keys passed to chain.invoke.
    input_variables=[
        'topic',
        'style',
        'language',
        'min_length',
        'max_length',
        'additional_instruction',
    ],
    partial_variables={'format_instructions': parser.get_format_instructions()},
)

In [281]:
# OpenAI-compatible client pointed at the OpenRouter endpoint. The API key is taken
# from the OPENROUTER_API_KEY environment variable (os.getenv yields None if unset).
# NOTE(review): `client` is not referenced by the other cells visible in this
# notebook (the chain is built with the OpenRouter wrapper) — confirm it is still needed.
client = OpenAI(
    base_url='https://openrouter.ai/api/v1',
    api_key=os.getenv('OPENROUTER_API_KEY'),
)

In [282]:
# CSV file that generated samples are appended to (relative to the working directory).
dataset_path = 'raw/generated.csv'
# Snapshot, taken when this cell runs, of whether that CSV is already present on disk.
file_exists = os.path.isfile(dataset_path)

In [None]:
# Generate samples until the cell is interrupted, appending each one to the CSV
# as soon as it is produced so that an interruption never loses finished rows.

# Make sure the output directory exists before the first (paid) API call is made.
dataset_dir = os.path.dirname(dataset_path)
if dataset_dir:
    os.makedirs(dataset_dir, exist_ok=True)

# The header row is needed only when the file is missing or still empty. Checking
# the size avoids parsing the file (pd.read_csv raises on a zero-byte file).
needs_header = not os.path.isfile(dataset_path) or os.path.getsize(dataset_path) == 0

i = 0

while True:
    # Draw an independent random configuration for this sample.
    model = random.choice(models)
    topic = random.choice(topics)
    style = random.choice(styles)
    language = random.choice(languages)
    min_length, max_length = random.choice(lengths)
    additional_instruction = random.choice(additional_instructions)
    temperature = random.random()

    # Use the sampled model (not a hard-coded one) so the progress line below
    # reports the model that actually produced the text.
    chain = (
        prompt
        | OpenRouter(model=model, temperature=temperature)
        | StrOutputParser()
        | JsonExtractor()
        | parser
    )

    result = chain.invoke({
        'topic': topic,
        'style': style,
        'language': language,
        'min_length': min_length,
        'max_length': max_length,
        'additional_instruction': additional_instruction,
    })
    record = {
        'id': uuid.uuid4().hex,
        'text': result.text,
        'is_human': 0,  # every row written by this notebook is machine-generated
    }

    # Append the single row; only the very first write to a new/empty file carries the header.
    pd.DataFrame([record]).to_csv(dataset_path, mode='a', header=needs_header, index=False)
    needs_header = False

    i += 1
    print(f'Generated {i} row: model = "{model}", topic = "{topic}" style = "{style}", length = {len(result.text)}')

Generated 1 row: model = "mistralai/mistral-small-24b-instruct-2501", topic = "Debugging code, writing functions in Python/JavaScript/etc., understanding algorithms, web development, app development" style = "News", length = 584


KeyboardInterrupt: 