In [None]:
%pip install pandas
%pip install tqdm
%pip install torch
%pip install numpy
%pip install scikit-learn
%pip install deep-translator
%pip install sentence-transformers
%pip install bertopic[visualization] umap-learn
%pip install transformers datasets


Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m693.9 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep-translator
Successfully installed deep-translator-1.11.4
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->s

In [None]:
import pandas as pd
import os
from deep_translator import GoogleTranslator
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from datetime import timedelta
from sentence_transformers import SentenceTransformer, util
import torch
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from bertopic import BERTopic
import pickle
from umap import UMAP
from transformers import pipeline, AutoTokenizer
import random

# Dataset import

In [None]:
file_path = "clustered_prompts.csv"

# `clustered_prompts.csv` is a cache produced by the clustering step further
# down in this notebook. On a fresh run it does not exist yet, so only load it
# when it is present; otherwise the translation / clustering cells below are
# responsible for building `translated_data`.
if os.path.exists(file_path):
    # Load dataset with optimized settings for efficiency
    translated_data = pd.read_csv(
        file_path,
        dtype={"country": "category", "business_type": "category"},  # Optimize categorical data
        parse_dates=["event_time"]  # Ensure timestamps are correctly formatted
    )

    # Standardize column names for consistency
    translated_data.columns = translated_data.columns.str.strip().str.lower()
else:
    print(f"{file_path} not found. It will be created by the translation and clustering steps below.")

# Translation

In [None]:
# File Paths
original_file_path = "inter_prompts.csv"  # raw prompts read by the translation pipeline below
translated_file_path = "translated_prompts.csv"  # CSV written by the translation pipeline below

# Parameters
# Maximum number of unique prompts sent to the translator. Unique prompts
# beyond this limit are not translated; the pipeline below only
# whitespace-cleans them.
translation_limit = 1000  # Change this value as needed

# Preprocessing Function
def preprocess_prompt(prompt: str) -> str:
    """
    Normalize the whitespace of a prompt.

    The value is coerced to ``str``, every run of whitespace (spaces, tabs,
    newlines) is collapsed into a single space, and leading/trailing
    whitespace is removed.
    """
    raw_text = str(prompt)
    single_spaced = re.sub(r'\s+', ' ', raw_text)
    return single_spaced.strip()

# Translation Function
def translate_prompt(prompt: str, translator, cache: dict) -> str:
    """
    Translate a prompt to English, falling back to the cleaned prompt.

    Args:
        prompt: Raw prompt text. It is whitespace-normalized with
            ``preprocess_prompt`` before anything else happens.
        translator: Object exposing ``translate(text)`` (a ``GoogleTranslator``
            in this notebook).
        cache: Dict mapping cleaned prompt -> translation. It is consulted
            before calling the translator and updated after every successful
            translation.

    Returns:
        The translated text. The cleaned prompt is returned instead when it is
        empty, when the translator raises, or when the translator does not
        return usable text, so the result is always a ``str``.
    """
    prompt_clean = preprocess_prompt(prompt)

    # Nothing to translate: avoid a pointless translator call.
    if not prompt_clean:
        return prompt_clean

    if prompt_clean in cache:
        return cache[prompt_clean]

    try:
        translated = translator.translate(prompt_clean)
    except Exception as exc:
        # Best effort by design: one failed request must not abort the whole
        # run, but the failure is reported instead of being silently dropped.
        import logging  # local import keeps this block self-contained

        logging.getLogger(__name__).warning(
            "Translation failed for prompt %r (%s); keeping the untranslated text.",
            prompt_clean[:80],
            exc,
        )
        return prompt_clean

    # Guard the `-> str` contract: a missing or blank result would otherwise
    # be cached and end up as a missing value in the output column.
    if not isinstance(translated, str) or not translated.strip():
        return prompt_clean

    cache[prompt_clean] = translated
    return translated

# Main Translation Pipeline
if os.path.exists("clustered_prompts.csv"):
    # The final clustered dataset already exists (it is what the
    # "Dataset import" cell loads), so there is nothing to translate.
    print("Translated file already exists. Skipping translation.")
elif os.path.exists(translated_file_path):
    # A previous run already translated the prompts but has not clustered them
    # yet: reuse that file so the clustering step has `translated_data`.
    print("Translated file already exists. Skipping translation.")
    translated_data = pd.read_csv(translated_file_path)
else:
    print("No translated file found. Proceeding with translation.")

    # Load data
    original_data = pd.read_csv(original_file_path)

    # Cache shared by all workers (keyed by the cleaned prompt)
    translation_cache = {}

    # Extract unique prompts to reduce redundancy
    unique_prompts = original_data["prompt"].unique()
    prompts_to_translate = unique_prompts[:translation_limit]  # Limit translation

    # Parallel translation.
    # Each task gets its own GoogleTranslator instead of sharing one across the
    # 5 worker threads.
    # NOTE(review): deep-translator appears to keep the text of the current
    # request on the translator instance, so concurrent calls on one shared
    # instance can return the translation of another thread's prompt. Confirm
    # against the installed deep-translator version.
    translated_prompts = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_prompt = {
            executor.submit(
                translate_prompt,
                prompt,
                GoogleTranslator(source="auto", target="en"),
                translation_cache,
            ): prompt
            for prompt in prompts_to_translate
        }
        for future in tqdm(as_completed(future_to_prompt), total=len(future_to_prompt), desc="Translating"):
            prompt = future_to_prompt[future]
            try:
                translated_prompts[prompt] = future.result()
            except Exception:
                # Keep the run going: fall back to the cleaned original text.
                translated_prompts[prompt] = preprocess_prompt(prompt)

    # Map all original prompts to translations (including those not translated due to limit)
    all_translations = {
        prompt: translated_prompts.get(prompt, preprocess_prompt(prompt))
        for prompt in unique_prompts
    }

    # Add translated_prompt column
    original_data["translated_prompt"] = original_data["prompt"].map(all_translations)

    # Save translated dataset
    original_data.to_csv(translated_file_path, index=False)
    translated_data = original_data

    print(f"Translation completed. File saved to: {translated_file_path}")


Translated file already exists. Skipping translation.


In [None]:
# Randomly sample up to 10 translated prompts for manual review.
# The sample size is capped at the number of rows so small datasets do not raise.
sample_size = min(10, len(translated_data))
sample_translations = translated_data.sample(sample_size, random_state=42)[["prompt", "translated_prompt"]]

# Show the sample so it can actually be reviewed
sample_translations


**Why Translation Was Needed**
- **Prompts contain multiple languages**, making it difficult to standardize analysis.
- **Ensuring all text is in English** allows for **consistent keyword extraction, topic modeling, and sentiment analysis**.
- **Improves classification accuracy** by reducing language-based inconsistencies.


# Classification

In [None]:
# Skip execution if categorized results already exist
output_file = "clustered_prompts.csv"
if not os.path.exists(output_file):

    # Ensure event_time is properly parsed
    translated_data['event_time'] = pd.to_datetime(
        translated_data['event_time'],
        format='mixed',
        errors='raise',
        utc=True
    )

    # Load embedding model (on the GPU when one is available)
    model = SentenceTransformer("all-MiniLM-L6-v2", device='cuda' if torch.cuda.is_available() else 'cpu')

    # Encode only unique prompts to optimize performance
    unique_prompts = translated_data['translated_prompt'].unique()
    unique_embeddings = model.encode(
        unique_prompts,
        convert_to_tensor=True,
        batch_size=128,
        show_progress_bar=True
    )
    prompt_to_embedding = dict(zip(unique_prompts, unique_embeddings))
    embeddings = [prompt_to_embedding[prompt] for prompt in translated_data['translated_prompt']]

    # Adaptive clustering parameters
    short_time_window = timedelta(minutes=15)
    max_time_window = timedelta(days=2)
    low_similarity_threshold = 0.3
    high_similarity_threshold = 0.6

    # Per-row inputs, addressed by position so the loop does not depend on the
    # DataFrame having a default 0..n-1 index.
    countries = translated_data["country"].tolist()
    # Absolute gap to the previous row. Rows are not guaranteed to be in
    # chronological order, and a signed difference would make every "earlier"
    # row look like it falls inside the 15-minute window.
    time_gaps = translated_data["event_time"].diff().abs().tolist()

    # Each row is compared with the row directly before it: it either joins
    # that row's cluster or starts a new one.
    cluster_ids = []
    current_cluster_id = 0

    for position in tqdm(range(len(translated_data)), desc="Clustering"):
        if position > 0:
            previous_position = position - 1
            time_gap = time_gaps[position]
            same_country = countries[position] == countries[previous_position]

            joins_previous_cluster = False
            if same_country and time_gap <= max_time_window:
                similarity_score = float(util.pytorch_cos_sim(
                    embeddings[previous_position], embeddings[position]
                ))
                # Quick follow-ups only need a loose match; prompts further
                # apart in time must be clearly similar.
                if time_gap <= short_time_window:
                    required_similarity = low_similarity_threshold
                else:
                    required_similarity = high_similarity_threshold
                joins_previous_cluster = similarity_score >= required_similarity

            if not joins_previous_cluster:
                current_cluster_id += 1

        cluster_ids.append(current_cluster_id)

    # Assign and export
    translated_data["cluster_id"] = cluster_ids
    translated_data.to_csv(output_file, index=False)
    print(f"Categorized prompts saved to: {output_file}")

else:
    print("File already exists — skipping clustering.")


File already exists — skipping clustering.


**Why Semantic Clustering Was Used**  
- **Users often submit multiple related prompts** with small edits or additions over time; grouping them into clusters makes it possible to analyze behavior at the customer level.  
- Simple text matching fails to group **semantically similar** but lexically different prompts.  
- **Embedding-based clustering** captures intent and meaning, improving detection of user iterations.

**Why Adaptive Thresholds Were Applied**  
- When prompts are submitted within **15 minutes**, a **lower similarity threshold** is sufficient, assuming iterative user behavior.  
- For longer gaps (up to **2 days**), a **stricter threshold** ensures prompts are truly related.  
- This balances **recall (capturing prompt evolution)** and **precision (avoiding false merges)**.

In [None]:
# Preview the assigned cluster ids next to country, original prompt and timestamp
translated_data[["country", "prompt", "event_time", "cluster_id"]]

Unnamed: 0,country,prompt,event_time,cluster_id
0,Germany,Reifenspezialist seit 10 Jaher erfahrung Autoa...,2024-04-28 12:04:35.463000+00:00,0
1,Germany,Reifenspezialist seit 10 Jaher erfahrung Autoa...,2024-04-28 12:06:52.633000+00:00,0
2,Germany,Wir sind Taxi unternehmen in kaiserslautern an...,2024-03-07 12:00:29.636000+00:00,1
3,Germany,Wir sind Taxi unternehmen in kaiserslautern an...,2024-03-07 11:20:01.372000+00:00,1
4,Germany,Wir sind Taxi unternehmen in kaiserslautern an...,2024-03-07 11:18:28.295000+00:00,1
...,...,...,...,...
49995,India,"Welcome to Nazakat Boutique, your premier dest...",2024-05-11 20:07:24.917000+00:00,23850
49996,Brazil,Jairo Bianeck - Advocacia especializada em ise...,2024-05-15 19:11:50.309000+00:00,23851
49997,Brazil,"... Tudo o que vc precisa, em um só lugar , fr...",2024-05-13 23:39:08.233000+00:00,23851
49998,Iraq,"A site for listening to the Qur’an, where the ...",2024-05-12 15:31:22.266000+00:00,23852


# Data Analysis

In [None]:
# Number of rows whose original prompt text repeats an earlier row exactly
duplicate_prompt_count = translated_data['prompt'].duplicated().sum()

# Missing values per column
missing_value_counts = translated_data.isna().sum()

# Whitespace-separated token count of each original prompt (input length)
translated_data['prompt_length'] = translated_data['prompt'].map(
    lambda prompt_text: len(str(prompt_text).split())
)

# Summary statistics of the prompt lengths
prompt_length_statistics = translated_data['prompt_length'].describe()

# Display results
(duplicate_prompt_count, missing_value_counts, prompt_length_statistics)

(np.int64(16739),
 unnamed: 0           0
 country              0
 prompt               0
 event_time           0
 business_type        0
 translated_prompt    0
 cluster_id           0
 dtype: int64,
 count    50000.000000
 mean        49.505840
 std         33.177764
 min          1.000000
 25%         22.000000
 50%         41.000000
 75%         79.000000
 max        159.000000
 Name: prompt_length, dtype: float64)

## **Insights about statistics**

### **1. Duplicate Prompts**
- **16,739 duplicate prompts** were detected based on exact text matches.
- These duplicates are more likely due to **user repetition**, **accidental resubmission**, or **data collection artifacts**
- **Next Step:** Investigate frequent duplicates to identify whether they reflect real user behavior or **data integrity issues**.

### **2. Missing Values**
- **No missing values** found in any column.
- Dataset is **complete** and does not require null-handling at this stage.

### **3. Prompt Length Distribution**
- Prompt lengths vary widely, from **very short to detailed entries**.
- **Next Step:** Analyze short prompts separately to determine whether they reflect **minimal input behavior**, **user uncertainty**, or **noise in the dataset**.

In [None]:
# The 100 most frequent translated prompts (value_counts sorts by frequency)
translated_prompt_frequencies = translated_data['translated_prompt'].value_counts()
common_duplicate_prompts = translated_prompt_frequencies.iloc[:100]

# Turn the counts into a two-column DataFrame for nicer rendering
common_duplicate_prompts_df = common_duplicate_prompts.reset_index()

common_duplicate_prompts_df

Unnamed: 0,translated_prompt,count
0,We serve our guests Italian wines and appetise...,126
1,"Specializing in car protection services, polis...",102
2,This space is dedicated to empowering you with...,83
3,"Small bakery in Vilnius, offering a variety of...",83
4,"All computer components like the Motherboard,P...",73
...,...,...
95,Training - Black Pill - Learn the step by step...,13
96,"Here, political wrong decisions and political ...",12
97,i will crate a brand that provides other brand...,12
98,Video Editing and Cinematography\n\nWe pride o...,12


In [None]:
# All rows whose translated prompt is exactly the most frequent text
text = "We serve our guests Italian wines and appetisers, served freshly on the terrace. The menu also includes freshly baked pasta, pasta recipes from our famous Italian suppliers. The food is served in a warm and friendly environment. Our website should use light brown colours and minimal fonts."
translated_data.loc[translated_data["translated_prompt"].eq(text)]

Unnamed: 0,unnamed: 0,country,prompt,event_time,business_type,translated_prompt,cluster_id,prompt_length
12926,12926,Vietnam,We serve our guests Italian wines and appetise...,2024-05-24 05:22:20.141000+00:00,Landing page,We serve our guests Italian wines and appetise...,3736,47
13583,13583,Poland,We serve our guests Italian wines and appetise...,2024-05-22 09:18:47.589000+00:00,Business,We serve our guests Italian wines and appetise...,3936,47
13584,13584,Netherlands,We serve our guests Italian wines and appetise...,2024-05-22 10:28:28.588000+00:00,Online store,We serve our guests Italian wines and appetise...,3937,47
13588,13588,Poland,We serve our guests Italian wines and appetise...,2024-05-08 10:35:12.795000+00:00,Business,We serve our guests Italian wines and appetise...,3941,47
13598,13598,Lithuania,We serve our guests Italian wines and appetise...,2024-05-20 07:48:26.375000+00:00,Blog,We serve our guests Italian wines and appetise...,3947,47
...,...,...,...,...,...,...,...,...
44776,44776,Pakistan,We serve our guests Italian wines and appetise...,2024-04-22 13:07:29.916000+00:00,Other,We serve our guests Italian wines and appetise...,19993,47
45522,45522,Netherlands,We serve our guests Italian wines and appetise...,2024-04-29 10:49:38.809000+00:00,Online store,We serve our guests Italian wines and appetise...,20532,47
45869,45869,India,We serve our guests Italian wines and appetise...,2024-04-08 10:48:16.439000+00:00,Online store,We serve our guests Italian wines and appetise...,20795,47
46514,46514,India,We serve our guests Italian wines and appetise...,2024-04-11 10:42:00.381000+00:00,Other,We serve our guests Italian wines and appetise...,21239,47


In [None]:
# All rows whose translated prompt is exactly the second most frequent text
text = "Specializing in car protection services, polishing, detailing, window tinting, ceramic coating, and transportation, Nano Protection is more than a car shop — it's a sanctuary for your vehicle. Immerse your car in the love it deserves and let us pamper your prized possession to perfection."
translated_data.loc[translated_data["translated_prompt"].eq(text)]

Unnamed: 0,unnamed: 0,country,prompt,event_time,business_type,translated_prompt,cluster_id,prompt_length
12009,12009,Kuwait,A personal website that displays various hobbi...,2024-05-18 10:43:08.035000+00:00,Landing page,"Specializing in car protection services, polis...",3555,26
12010,12010,Kuwait,A personal website that displays various hobbi...,2024-05-18 10:28:10.829000+00:00,Landing page,"Specializing in car protection services, polis...",3555,26
12011,12011,Kuwait,A personal website that displays various hobbi...,2024-05-18 10:40:55.551000+00:00,Landing page,"Specializing in car protection services, polis...",3555,26
12012,12012,Kuwait,A personal website that displays various hobbi...,2024-05-18 10:23:08.500000+00:00,Landing page,"Specializing in car protection services, polis...",3555,26
12013,12013,Kuwait,A personal website that displays various hobbi...,2024-05-18 10:33:46.438000+00:00,Other,"Specializing in car protection services, polis...",3555,26
...,...,...,...,...,...,...,...,...
12190,12190,Kuwait,A personal website that displays various hobbi...,2024-03-11 00:10:25.518000+00:00,Other,"Specializing in car protection services, polis...",3561,26
12191,12191,Kuwait,A personal website that displays various hobbi...,2024-03-11 00:14:00.653000+00:00,Other,"Specializing in car protection services, polis...",3561,26
12192,12192,Kuwait,A personal website that displays various hobbi...,2024-03-11 04:33:37.081000+00:00,Other,"Specializing in car protection services, polis...",3561,26
12195,12195,Kuwait,A personal website that displays various hobbi...,2024-03-11 04:36:59.594000+00:00,Other,"Specializing in car protection services, polis...",3563,26


In [None]:
# All rows whose translated prompt is exactly the "Small bakery in Vilnius" text
text = "Small bakery in Vilnius, offering a variety of freshly baked goodies every day. From special bread and pastries to delicious cakes, our bakery is the perfect place for anyone who loves delicious and top-quality treats."
translated_data.loc[translated_data["translated_prompt"].eq(text)]

Unnamed: 0,unnamed: 0,country,prompt,event_time,business_type,translated_prompt,cluster_id,prompt_length
1472,1472,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-04-18 06:11:12.501000+00:00,Business,"Small bakery in Vilnius, offering a variety of...",379,35
1473,1473,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-04-18 06:12:12.401000+00:00,Business,"Small bakery in Vilnius, offering a variety of...",379,35
1474,1474,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-04-17 14:07:51.404000+00:00,Online store,"Small bakery in Vilnius, offering a variety of...",379,35
1475,1475,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-04-17 13:58:38.899000+00:00,Online store,"Small bakery in Vilnius, offering a variety of...",379,35
1476,1476,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-04-17 13:59:44.777000+00:00,Online store,"Small bakery in Vilnius, offering a variety of...",379,35
...,...,...,...,...,...,...,...,...
15887,15887,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-05-09 13:02:40.982000+00:00,Online store,"Small bakery in Vilnius, offering a variety of...",4711,35
15888,15888,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-05-23 06:45:22.933000+00:00,Business,"Small bakery in Vilnius, offering a variety of...",4712,35
15889,15889,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-05-23 06:47:01.660000+00:00,Business,"Small bakery in Vilnius, offering a variety of...",4712,35
15890,15890,Netherlands,"Small bakery in Vilnius, offering a variety of...",2024-05-23 06:48:49.416000+00:00,Business,"Small bakery in Vilnius, offering a variety of...",4712,35


## **Insights about duplicates**
### **1. Frequent Repetition of Identical Prompts**
- The top duplicate prompts appear between **30 and 126 times**, indicating a small set of prompt texts dominate the dataset.

### **2. Potential Data Integrity or UX Issue**
- The high frequency of exact duplicates may reflect:
  - **Accidental resubmission** or **interface issues** (e.g., retrying on failed generation).
  - **Unfiltered input reuse** during prompt refinement sessions.
- **Next Step**: Review these cases with additional metadata, or implement additional debugging, to determine whether the cause is **user behavior or system-side redundancy**.

### **3. Impact on Downstream Analysis**
- These repeated prompts may skew analyses related to:
  - **Topic frequency**, **business type inference**, or **user diversity**.
- **Next Step**: Deduplicate these prompts before running any analysis they could skew.

In [None]:
# The 20 rows with the lowest word count (potential noise or incomplete input)
shortest_prompts = translated_data.sort_values('prompt_length', ascending=True).iloc[:20]

shortest_prompts

Unnamed: 0,unnamed: 0,country,prompt,event_time,business_type,translated_prompt,cluster_id,prompt_length
38062,38062,Nigeria,cryptocurrency,2024-03-11 15:31:00.582000+00:00,Online store,cryptocurrency,14981,1
7256,7256,India,dfsdjfnselkfhlseijflskjeflkseflsjflisefsnevjsd...,2024-05-07 18:33:37.522000+00:00,Blog,DFSDJFNSELKFHLSEIJFLSFFFFSFSFSFSFSFSFSFSFSFSFS...,1921,1
27748,27748,United States,https://cranberrynailsandspa.com/,2024-03-24 16:24:41.354000+00:00,Other,https://cranberrynailsandspa.com/,8124,1
27747,27747,United States,https://builder.hostinger.com/templates/all-we...,2024-03-24 16:40:43.238000+00:00,Other,https://builder.hostinger.com/templates/all-we...,8123,1
42449,42449,Cambodia,我们为客人提供在平台上资讯设备和生活用品等商品。把实体店搬到网上来经营，让庞大的消费群体在手...,2024-05-15 07:43:07.060000+00:00,Online store,我们为客人提供在平台上资讯设备和生活用品等商品。把实体店搬到网上来经营，让庞大的消费群体在手...,18235,1
7274,7274,India,efdfregvergtg,2024-03-24 09:35:35.637000+00:00,Online store,efdfregvergtg,1931,1
46227,46227,Nigeria,digitalbanking.com,2024-03-13 16:53:36.350000+00:00,Business,digitalbanking.com,21033,1
30104,30104,India,vintagebook,2024-05-23 10:06:36.234000+00:00,Other,vintagebook,9008,1
30103,30103,India,vintagebook,2024-05-23 10:10:34.533000+00:00,Landing page,vintagebook,9008,1
30102,30102,India,vintagebook,2024-05-23 10:11:11.554000+00:00,Landing page,vintagebook,9008,1


## **Insights: Handling Minimal or Unstructured Prompts**

### **1. Short Prompts Reflect High Trust in Automation**
- One-word inputs like `cryptocurrency`, `vintagebook`, or `treadmillshop` are common and often valid.
- Users submitting minimal prompts expect the system to **infer and generate site structure automatically**.
- **Next Step:** Enable prompt expansion workflows that:
  - Suggest structured templates (e.g., "Landing page for a cryptocurrency service").
  - Ask clarifying questions to refine intent.

### **2. URL Inputs Suggest Intent to Replicate Existing Sites**
- Prompts consisting only of URLs likely reflect users trying to **mirror or get inspiration** from existing websites.
- **Next Step:** Detect URLs and:
  - Scrape metadata for title and description.
  - Ask users if they want to generate a website similar to the linked content.

### **3. Nonsensical Inputs Signal Hesitation, Not Spam**
- Prompts like `dfsdjfnselkfhl...` or repeated letters often reflect **uncertainty, exploration, or interface testing**.
- **Next Step:** Instead of filtering these, trigger real-time guidance:
  - Offer autocomplete, examples, or tooltips like “Describe your business in one sentence.”

### **4. Repeated Minimal Prompts Suggest Onboarding Friction**
- Duplicate simple prompts with slight variations (e.g., `treadmillshop` entered multiple times) indicate **user confusion** or **lack of feedback**.
- **Next Step:** Detect repeat attempts and surface light-touch onboarding nudges such as:
  - “Try adding more details about your service or products to improve results.”

In [None]:
# The 20 rows with the highest word count (possible spam or verbosity)
longest_prompts = translated_data.sort_values('prompt_length', ascending=False).iloc[:20]

# Keep only the fields needed for the investigation
longest_prompts.loc[:, ['event_time', 'country', 'business_type', 'translated_prompt', 'prompt_length']]


Unnamed: 0,event_time,country,business_type,translated_prompt,prompt_length
12691,2024-04-23 16:54:16.515000+00:00,Vietnam,Portfolio,Showing product and service information:\nCrea...,159
12715,2024-05-12 16:07:43.617000+00:00,Vietnam,Blog,Intuitive and user -friendly interface: using ...,159
12698,2024-04-23 15:56:13.796000+00:00,Vietnam,Blog,Display service and project information:\nCrea...,159
12721,2024-04-22 06:56:12.110000+00:00,Vietnam,Online store,Customer care: Set contact or support in an ea...,159
12931,2024-03-25 08:58:41.213000+00:00,Vietnam,Business,Impressive homepage: The homepage is the first...,158
12710,2024-04-29 14:09:17.360000+00:00,Vietnam,Business,Interactive and good user experience: Make sur...,158
12719,2024-04-22 06:53:31.661000+00:00,Vietnam,Online store,Simple and professional interface design: Use ...,157
12718,2024-04-22 06:52:44.555000+00:00,Vietnam,Online store,Simple and professional interface design: Use ...,157
12720,2024-04-22 06:54:04.322000+00:00,Vietnam,Business,Simple and professional interface design: Use ...,157
12722,2024-05-13 16:29:53.520000+00:00,Vietnam,Blog,The main goal of the website: Make sure the we...,157


## **Insights about Long & Structured Prompts**

### **1. Support High-Intent Users with Advanced Mode**
- Many prompts are **multi-paragraph and structured**, indicating users with clear, detailed visions.
- **Next Step:** Introduce an **“Advanced Builder Mode”** that supports **multi-section site layout**, **modular design choices**, and tailored suggestions for experienced users.

### **2. Detect Power Users for Targeted Support**
- Repeated or highly detailed prompts may reflect **advanced users or digital agencies**.
- **Next Step:** Use this signal to **proactively offer upsells, premium features**, or **human onboarding assistance** to improve conversion and retention.

In [None]:
# Detect text containing characters outside a small allow-list
def has_strange_characters(text):
    """
    Return True when ``text`` contains a character outside the allow-list.

    Allowed characters are ASCII letters and digits, whitespace, and the
    punctuation ``| . , ? ! @ % & ( ) - _ ' " / :``. Anything else, including
    accented or non-Latin letters, counts as "strange". Non-string values
    always yield False.
    """
    if isinstance(text, str):
        first_unexpected = re.search(r"[^|a-zA-Z0-9\s.,?!@%&()\-_'\"/:\n]", text)
        return first_unexpected is not None
    return False

# Keep the rows whose translated prompt contains at least one flagged character
strange_symbol_prompts = translated_data.loc[
    translated_data['translated_prompt'].map(has_strange_characters)
]

# First 100 matches, reduced to the columns needed for manual review
strange_symbol_prompts.loc[:, ['event_time', 'country', 'business_type', 'translated_prompt']].head(100)


Unnamed: 0,event_time,country,business_type,translated_prompt
0,2024-04-28 12:04:35.463000+00:00,Germany,Business,Tire specialist for 10 years of experience car...
1,2024-04-28 12:06:52.633000+00:00,Germany,Business,Tire specialist for 10 years of experience car...
174,2024-03-07 03:03:56.032000+00:00,Chile,Other,"Of course, here are a short text that you coul..."
178,2024-03-07 03:02:40.275000+00:00,Chile,Other,"Home page:\n\nWelcome: ""Welcome to life withou..."
195,2024-03-19 05:22:58.059000+00:00,Pakistan,Other,We provide many services like AC Repair in Ajm...
...,...,...,...,...
751,2024-04-13 14:18:56.139000+00:00,Pakistan,Portfolio,Business name: Eye Spy\n\nCategory: Private In...
752,2024-04-04 14:02:23.441000+00:00,Pakistan,Other,Business name: Eye Spy\n\nCategory: Private In...
755,2024-04-04 21:03:17.082000+00:00,United Arab Emirates,Portfolio,"""Business Name:\nAbu Dhabi Furniture Company\n..."
764,2024-04-04 21:04:07.398000+00:00,United Arab Emirates,Portfolio,"""Business Name:\nAbu Dhabi Furniture Company\n..."


It seems there are no texts with broken formatting.

In [None]:
# 1. Business types ranked by how often they occur
business_type_counts = translated_data['business_type'].value_counts().reset_index()
business_type_counts.columns = ['business_type', 'count']

top_business_types = business_type_counts.iloc[:10]

# Show the (up to) ten most frequent business types
print("Top 10 Most Common Business Types:")
print(top_business_types)

# 2. Prompt length distribution per business type, longest average first
prompt_length_stats = (
    translated_data
    .groupby('business_type')['prompt_length']
    .describe()
    .sort_values(by='mean', ascending=False)
)

# Show the per-type summary
print("\nPrompt Length Statistics by Business Type (sorted by mean):")
print(prompt_length_stats)


Top 10 Most Common Business Types:
  business_type  count
0      Business  14423
1  Online store   9702
2         Other   9171
3  Landing page   7302
4          Blog   5104
5     Portfolio   4298

Prompt Length Statistics by Business Type (sorted by mean):
                 count       mean        std  min   25%   50%   75%    max
business_type                                                             
Business       14423.0  53.023920  32.677718  1.0  25.0  47.0  83.0  158.0
Landing page    7302.0  52.758148  34.146251  1.0  23.0  46.0  84.0  157.0
Blog            5104.0  49.096787  32.775049  1.0  22.0  42.0  76.0  159.0
Other           9171.0  48.569295  33.939662  1.0  21.0  38.0  79.0  136.0
Portfolio       4298.0  46.073988  32.211081  1.0  21.0  35.0  71.0  159.0
Online store    9702.0  44.448877  32.198990  1.0  20.0  34.0  66.0  159.0


## **Insights about Business Types**

### **1. Most Common Business Types**
- The most frequent specific business types are **Business**, **Online store**, and **Landing page**; together they account for about 63% of all prompt entries (the catch-all **Other** ranks third overall by count).
- **Next Step:** Prioritize templates, design defaults, and onboarding flows for these dominant categories to improve relevance and user satisfaction.

### **2. Online Stores Have Shorter Prompts**
- Prompts for **Online store** are the shortest on average, suggesting users may struggle to articulate their needs in this category.
- **Next Step:** Provide **inline examples or prompt-enhancing questions** to help users describe products, logistics, or brand story more effectively.

### **3. Unclear Use of "Other" as a Category**
- "Other" is the **third most selected business type**, indicating potential **classification gaps or user confusion**.
- **Next Step:**
  - Implement **smart suggestions** to guide users toward existing, more specific categories.
  - Periodically review common "Other" entries to identify **emerging business types** that may warrant **new predefined categories**.

In [None]:
# Load prompt text: translated prompts with missing values dropped, cast to str
prompts = translated_data['translated_prompt'].dropna().astype(str)

# Basic text cleanup function
def clean_text(text):
    """
    Lowercase ``text``, drop http(s) URLs, then drop every character that is
    not an ASCII letter a-z or whitespace. Whitespace itself is left as is.
    """
    lowered = text.lower()
    without_urls = re.sub(r'https?://\S+', '', lowered)
    letters_and_spaces = re.sub(r'[^a-z\s]', '', without_urls)
    return letters_and_spaces

def count_ngrams(texts, n, label):
    """Count how often each n-gram of size ``n`` occurs across ``texts``.

    English stop words are removed by the vectorizer before n-grams are formed.

    Args:
        texts: Iterable of cleaned prompt strings.
        n: N-gram size (1 = unigrams, 2 = bigrams, 3 = trigrams).
        label: Name of the column that holds the n-gram text in the result frame.

    Returns:
        Tuple ``(vectorizer, matrix, counts, frame)``: the fitted
        CountVectorizer, the document-term matrix, the per-term totals as a
        1-D array, and a DataFrame with columns ``label`` and ``'count'``.
    """
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english')
    matrix = vectorizer.fit_transform(texts)
    # Summing over axis 0 gives a 1 x vocabulary matrix; flatten it to 1-D.
    counts = np.asarray(matrix.sum(axis=0)).flatten()
    frame = pd.DataFrame({label: vectorizer.get_feature_names_out(), 'count': counts})
    return vectorizer, matrix, counts, frame


def top_by_count(frame, k=50):
    """Return the ``k`` rows of ``frame`` with the highest ``'count'``."""
    return frame.sort_values(by='count', ascending=False).head(k)


cleaned_prompts = prompts.apply(clean_text)

# --- A.1: Most common unigrams (words) ---
vectorizer_uni, unigram_matrix, unigram_counts, unigram_df = count_ngrams(cleaned_prompts, 1, 'word')
top_unigrams = top_by_count(unigram_df)

# --- A.2: Most common bigrams and trigrams ---
vectorizer_bi, bigram_matrix, bigram_counts, bigram_df = count_ngrams(cleaned_prompts, 2, 'bigram')
top_bigrams = top_by_count(bigram_df)

vectorizer_tri, trigram_matrix, trigram_counts, trigram_df = count_ngrams(cleaned_prompts, 3, 'trigram')
top_trigrams = top_by_count(trigram_df)

# --- A.3: TF-IDF by business type ---
# Keep only rows where both the business type and the prompt are present
subset = translated_data.dropna(subset=['business_type', 'translated_prompt'])

# Build one "document" per business type by concatenating all of its prompts.
# observed=True is passed explicitly: business_type is loaded as a categorical,
# and relying on the groupby default for categoricals is deprecated in recent
# pandas. It also guarantees that categories with no remaining rows cannot
# contribute empty documents to the TF-IDF matrix.
grouped = (
    subset.groupby('business_type', observed=True)['translated_prompt']
    .apply(lambda x: ' '.join(x.astype(str)))
    .apply(clean_text)
)

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(grouped)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert to DataFrame for analysis: one row per business type, one column per term
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=grouped.index, columns=tfidf_feature_names)

# Get top TF-IDF terms per business type (30 highest-weighted terms each)
top_tfidf_terms = {
    business_type: tfidf_df.loc[business_type].sort_values(ascending=False).head(30)
    for business_type in tfidf_df.index
}

# --- Display results ---
# Each heading is printed together with its frequency table
ngram_reports = [
    ("Top 50 Unigrams:\n", top_unigrams),
    ("\nTop 50 Bigrams:\n", top_bigrams),
    ("\nTop 50 Trigrams:\n", top_trigrams),
]
for heading, table in ngram_reports:
    print(heading, table)

print("\nTop TF-IDF Terms per Business Type:")
for business in top_tfidf_terms:
    terms = top_tfidf_terms[business]
    print(f"\n{business}:\n{terms}")


2025-03-23 08:58:57,273 - BERTopic - Embedding - Transforming documents to embeddings.


Fitting BERTopic model (this may take a few minutes)...


Batches:   0%|          | 0/701 [00:00<?, ?it/s]

2025-03-23 08:59:31,526 - BERTopic - Embedding - Completed ✓
2025-03-23 08:59:31,527 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-23 08:59:48,540 - BERTopic - Dimensionality - Completed ✓
2025-03-23 08:59:48,542 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-23 08:59:54,433 - BERTopic - Cluster - Completed ✓
2025-03-23 08:59:54,443 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-03-23 08:59:57,352 - BERTopic - Representation - Completed ✓


BERTopic processing complete.


## **Insights from Keyword and Phrase Analysis**

### **1. Frequent Phrases Show What Industries Users Care About**
- Common phrases include **“digital marketing,” “real estate,” “online store,”** and **“social media.”**
- These point to **popular business types** users are building for.
- **Next Step:** Make sure these industries are covered with templates or examples. If needed, **add more industry-specific suggestions**.

### **2. Phrases in Other Languages Show Need for Localization**
- Repeated use of **Spanish, Portuguese, and Arabic phrases** (like “somos una empresa” or “abu dhabi united”) suggests strong usage from non-English speakers.
- **Next Step:** Improve the experience for international users by showing **localized templates and examples** based on detected language.

### **3. Many Prompts Suggest Team-Based Identity**
- Frequent words like **“we,” “our team,” “dedicated professionals,” “clients”** suggest users present their businesses in a **group or agency format**.
- **Next Step:** Templates could include **“Meet the Team”**, **“Our Story”**, or **“Why Choose Us”** sections more prominently.


In [None]:
# Artifacts read and written by the BERTopic caching logic
model_path = "bertopic_model"
topics_path = "bertopic_topics.pkl"
results_path = "bertopic_results.csv"

# Step 1: Filter only latest prompt per cluster
# Sort by event time, keep the last row of every cluster, then drop clusters
# whose latest prompt text duplicates one already kept.
chronological = translated_data.sort_values("event_time")
last_row_per_cluster = chronological.groupby("cluster_id", as_index=False).tail(1)
latest_per_cluster = last_row_per_cluster.drop_duplicates(subset="translated_prompt")
latest_per_cluster = latest_per_cluster.reset_index(drop=True)

prompt_strings = latest_per_cluster['translated_prompt'].astype(str)
texts = prompt_strings.tolist()

# Step 2: Load or train model
if os.path.exists(model_path) and os.path.exists(topics_path) and os.path.exists(results_path):
    print("Loading saved BERTopic results...")

    # The model was saved without its embedding model, so supply a CPU one at
    # load time. Passing it to BERTopic.load (rather than assigning the raw
    # SentenceTransformer to the attribute afterwards) lets BERTopic attach it
    # the way it expects.
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
    topic_model = BERTopic.load(model_path, embedding_model=embedding_model)

    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(topics_path, "rb") as f:
        topics = pickle.load(f)
    topic_df = pd.read_csv(results_path)

    # The cache is keyed only on file existence; flag an obvious mismatch with
    # the data currently loaded.
    if len(topics) != len(texts):
        print(
            f"Warning: cached topics cover {len(topics)} prompts but the current data "
            f"has {len(texts)}; delete the cache files to refit."
        )

else:
    print("Fitting BERTopic model (this may take a few minutes)...")

    # Optimized vectorizer and UMAP for faster training
    vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english", max_features=3000)
    # UMAP is stochastic: without a fixed random_state the topic numbering
    # changes between runs, which would invalidate any write-up that refers to
    # topics by number.
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    topic_model = BERTopic(
        vectorizer_model=vectorizer_model,
        umap_model=umap_model,
        embedding_model=embedding_model,
        calculate_probabilities=False,
        verbose=True,
        language="english"
    )

    topics, _ = topic_model.fit_transform(texts)

    # Save model in CPU-compatible way (embedding model is re-created on load)
    topic_model.save(model_path, save_embedding_model=False)
    with open(topics_path, "wb") as f:
        pickle.dump(topics, f)

    # One row per kept cluster: its id, the prompt that was modelled, and its topic
    topic_df = pd.DataFrame({
        "cluster_id": latest_per_cluster["cluster_id"].values,
        "translated_prompt": texts,
        "topic": topics
    })
    topic_df.to_csv(results_path, index=False)

# Step 3: Merge topics back to full dataset
# Left join on cluster_id: every original row is kept, and rows whose cluster
# has no entry in topic_df get a missing topic.
cluster_topics = topic_df[['cluster_id', 'topic']]
translated_data_with_topics = pd.merge(
    translated_data,
    cluster_topics,
    on="cluster_id",
    how="left",
)

print("BERTopic processing complete.")

Fitting BERTopic model (this may take a few minutes)...


2025-03-23 09:37:00,117 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/701 [00:00<?, ?it/s]

2025-03-23 09:37:28,522 - BERTopic - Embedding - Completed ✓
2025-03-23 09:37:28,523 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-23 09:38:11,554 - BERTopic - Dimensionality - Completed ✓
2025-03-23 09:38:11,558 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-23 09:38:18,391 - BERTopic - Cluster - Completed ✓
2025-03-23 09:38:18,431 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-03-23 09:38:22,550 - BERTopic - Representation - Completed ✓


BERTopic processing complete.


In [None]:
top_n_topics = 31
topics_overview = topic_model.get_topic_info().head(top_n_topics)

# Compute topic counts using one row per cluster
unique_clusters = translated_data_with_topics.drop_duplicates(subset="cluster_id")
topic_counts = unique_clusters["topic"].value_counts()

# Display keywords and unique-cluster-based counts (topic -1 is BERTopic's outlier topic)
for topic_num in range(-1, top_n_topics):
    topic_terms = topic_model.get_topic(topic_num)
    if not topic_terms:
        # get_topic returns False when the model has no topic with this id
        # (e.g. fewer topics were found than top_n_topics); skip instead of
        # failing while iterating over a bool.
        continue
    topic_keywords = [kw[0] for kw in topic_terms]
    count = topic_counts.get(topic_num, 0)
    print(f"\nTopic {topic_num}:")
    print(topic_keywords, "Count:", count)



Topic -1:
['para', 'que', 'com', 'uma', 'em', 'en', 'la', 'somos', 'el', 'una'] Count: 8845

Topic 0:
['et', 'des', 'une', 'nous', 'pour', 'les', 'le', 'dans', 'avec', 'sur'] Count: 885

Topic 1:
['travel', 'hotel', 'flight', 'tour', 'tours', 'destinations', 'flights', 'booking', 'adventure', 'rooms'] Count: 436

Topic 2:
['marketing', 'digital marketing', 'agency', 'digital', 'marketing agency', 'media', 'social media', 'social', 'seo', 'presence'] Count: 407

Topic 3:
['blue', 'colors', 'fonts', 'use', 'color', 'font', 'white', 'use blue', 'black', 'serif'] Count: 343

Topic 4:
['students', 'courses', 'learning', 'education', 'school', 'educational', 'english', 'academic', 'student', 'teaching'] Count: 320

Topic 5:
['fashion', 'clothing', 'wear', 'style', 'suits', 'clothes', 'men', 'apparel', 'collection', 'women'] Count: 313

Topic 6:
['dan', 'yang', 'kami', 'dengan', 'untuk', 'adalah', 'di', 'dari', 'anda', 'dalam'] Count: 293

Topic 7:
['ve', 'bir', 'için', 'ile', 'bu', 'da', 's

In [None]:
# Print up to three example prompts for topics -1 through 4,
# taking at most one prompt from each cluster
for topic_num in range(-1, 5):
    print(f"\nTopic {topic_num} Examples:")
    in_topic = translated_data_with_topics["topic"] == topic_num
    one_per_cluster = translated_data_with_topics[in_topic].drop_duplicates(subset="cluster_id")
    for prompt in one_per_cluster["translated_prompt"].head(3):
        print("-", prompt)



Topic -1 Examples:
- Tire specialist for 10 years of experience car preparation, price-performance, reliability, safety, tire sale, tire assembly, winter check, TÜV check, oil change, used car on & sell
- Sotware Engeneering, Marketing Agency, have a website created, app programming,
- A comprehensive support program between Qatar and Abbar Group for entrants who travel to Germany, be it for educational purposes, medical treatments or business matters. The aim is to facilitate the entire process from preparing the trip to the return and ensure that the needs and concerns of visitors are fulfilled comprehensively.


2. Service, about us:

Visa support: We offer comprehensive support in applying for Visa for Germany, including advice, document preparation and submission aid.

Finding the study place: We support those interested in studying in the search for suitable study plates

Topic 0 Examples:
- Investments Vieira da Cunha is a firm that is part of the business group composed of ten

## **Insights from Topic Modeling**

### **1. Users Specify Design Preferences Early**
- Topic 3 contains words like **blue**, **fonts**, and **serif**.
- Users want control over **visual style**, even before content.
- **Next Step:** Add a **“Design Preferences” step** during onboarding (e.g., color palette, font family, layout style).

### **2. Clear Interest in Education & Online Courses**
- Topic 4 includes **students**, **courses**, and **learning**.
- These prompts reflect needs from **tutors, schools, and e-learning platforms**.
- **Next Step:** Add templates with **course listings**, **teacher bios**, and optional **signup or schedule blocks**.

### **3. Strong Demand for Travel & Booking Websites**
- Topic 1 includes **flight**, **booking**, and **destinations**.
- Indicates users building **travel agencies or tour sites**.
- **Next Step:** Create templates with **multi-page itineraries**, **booking forms**, and **destination highlights**.

### **4. Digital Marketing Services Are Common**
- Topic 2 mentions **digital marketing**, **SEO**, and **agency**.
- Many prompts likely come from **freelancers or small firms**.
- **Next Step:** Provide templates with **services, testimonials, and pricing tiers**.

### **5. High Volume of Fashion & Apparel Prompts**
- Topic 5 includes **clothing**, **collection**, and **apparel**.
- Users need **visual product presentation**.
- **Next Step:** Offer fashion templates with **lookbooks**, **gallery sections**, and **filterable product grids**.

### **6. Real Estate Businesses Remain a Key Segment**
- Topic 8 clusters around **property**, **housing**, and **rent**.
- Real estate requires structured listing display.
- **Next Step:** Add templates with **property cards**, **search filters**, and **contact forms**.

### **7. Automotive Services Are Actively Represented**
- Topic 13 includes **repair**, **vehicle**, and **auto**.
- Sites likely target **mechanics or dealerships**.
- **Next Step:** Create templates for **auto shops**, including **services, hours, maps**, and **appointment booking**.

### **8. Food & Restaurant Sites Are Still a Core Need**
- Topic 11 includes **menu**, **flavors**, and **dishes**.
- These prompts expect strong **visual food presentation**.
- **Next Step:** Improve culinary templates with **menus, photo sections**, and **mobile-friendly layouts**.

### **9. Crypto & Meme Coin Pages Show Niche Demand**
- Topics 12 and 16 reference **crypto**, **token**, and **blockchain**.
- Often promotional landing pages for projects.
- **Next Step:** Add templates with **tokenomics sections**, **roadmap layouts**, and **community/social links**.

### **10. High Frequency of Portfolio & Resume Sites**
- Topic 19 includes **portfolio**, **projects**, and **skills**.
- Targeted at **freelancers, creatives, or job seekers**.
- **Next Step:** Add modular templates for **portfolio showcase**, **social links**, and **resume downloads**.

### **11. Cleaning Services Form a Distinct Cluster**
- Topic 14 includes **cleaning**, **residential**, and **windows**.
- Indicates demand from **local service providers**.
- **Next Step:** Offer templates with **service checklists**, **quote request forms**, and **testimonial blocks**.

In [None]:
# Cache files for the per-prompt predictions
sentiment_cache_path = "sentiment_results.pkl"
emotion_cache_path = "emotion_results.pkl"

# Models
# Use the first CUDA device when one is available, otherwise pass -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1

sentiment_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
emotion_checkpoint = "j-hartmann/emotion-english-distilroberta-base"

sentiment_model = pipeline("sentiment-analysis", model=sentiment_checkpoint, device=device)
emotion_model = pipeline("text-classification", model=emotion_checkpoint, top_k=1, device=device)
# The truncation step below uses the sentiment model's tokenizer
tokenizer = AutoTokenizer.from_pretrained(sentiment_checkpoint)

MAX_TOKEN_LENGTH = 510  # presumably 512 minus two special tokens; confirm against the model config
BATCH_SIZE = 32

# Deduplicate based on latest entry per cluster
# Sort by event time, keep the final row of each cluster, then drop repeated prompt texts
by_event_time = translated_data.sort_values("event_time")
final_row_per_cluster = by_event_time.groupby("cluster_id", as_index=False).tail(1)
latest_prompts = final_row_per_cluster.drop_duplicates(subset="translated_prompt")
latest_prompts = latest_prompts.reset_index(drop=True)

unique_prompts = latest_prompts["translated_prompt"].astype(str)

# Truncate to safe token length
def truncate_prompt(prompt, max_tokens=MAX_TOKEN_LENGTH):
    """Return ``prompt`` cut down to at most ``max_tokens`` tokenizer tokens.

    The prompt is encoded without special tokens using the module-level
    ``tokenizer`` (truncating at ``max_tokens``) and decoded back to text.
    """
    token_ids = tokenizer.encode(
        prompt,
        add_special_tokens=False,
        max_length=max_tokens,
        truncation=True,
    )
    shortened = tokenizer.decode(token_ids, skip_special_tokens=True)
    return shortened

truncated_prompts = unique_prompts.apply(truncate_prompt)

# Load cached results or run model
if os.path.exists(sentiment_cache_path) and os.path.exists(emotion_cache_path):
    print("Loading cached sentiment and emotion results...")
    # NOTE: pickle.load can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    with open(sentiment_cache_path, "rb") as f:
        sentiment_dict = pickle.load(f)
    with open(emotion_cache_path, "rb") as f:
        emotion_dict = pickle.load(f)
else:
    sentiment_dict = {}
    emotion_dict = {}
    failed_batches = []  # start offsets of batches that raised

    # Decoding the truncated ids and re-tokenizing inside the pipeline may not
    # give the same token count, so also cap the length at tokenization time
    # (MAX_TOKEN_LENGTH content tokens + 2 special tokens). Otherwise one
    # over-long prompt makes its whole batch fail.
    tokenizer_kwargs = {"truncation": True, "max_length": MAX_TOKEN_LENGTH + 2}

    print("Running sentiment and emotion analysis...")
    for i in tqdm(range(0, len(truncated_prompts), BATCH_SIZE), desc="Processing"):
        batch_original = unique_prompts.iloc[i:i + BATCH_SIZE].tolist()
        batch_truncated = truncated_prompts.iloc[i:i + BATCH_SIZE].tolist()

        try:
            sentiment_preds = sentiment_model(batch_truncated, **tokenizer_kwargs)
            emotion_preds = emotion_model(batch_truncated, **tokenizer_kwargs)

            # Results are keyed by the ORIGINAL (untruncated) prompt so they can
            # be mapped back onto the full dataset by prompt text.
            for original, sent_pred, emo_pred in zip(batch_original, sentiment_preds, emotion_preds):
                sentiment_dict[original] = sent_pred['label'].lower()
                # With top_k set, the pipeline may wrap each prediction in a list
                if isinstance(emo_pred, list):
                    emotion_dict[original] = emo_pred[0]['label'].lower()
                else:
                    emotion_dict[original] = emo_pred['label'].lower()

        except Exception as e:
            # Best effort: keep going, but remember the failure so it is not silent
            failed_batches.append(i)
            print(f"Batch {i} failed: {e}")

    if failed_batches:
        print(
            f"Warning: {len(failed_batches)} batch(es) failed (start offsets {failed_batches}); "
            "their prompts have no sentiment/emotion label and the cache written below is "
            "incomplete. Delete the cache files to retry."
        )

    # Cache results
    with open(sentiment_cache_path, "wb") as f:
        pickle.dump(sentiment_dict, f)
    with open(emotion_cache_path, "wb") as f:
        pickle.dump(emotion_dict, f)

# Map back to full dataset
# Label ids as documented on the cardiffnlp/twitter-roberta-base-sentiment
# model card: 0 -> Negative, 1 -> Neutral, 2 -> Positive.
# (Keys are lowercase because the stored labels were lowercased.)
sentiment_map = {
    "label_0": "Negative",
    "label_1": "Neutral",
    "label_2": "Positive"
}

# Look up each row's prompt in the prediction dicts; prompts that were not
# scored stay missing. The raw label is translated to a readable name in the
# same step, so re-running this cell always starts from the raw predictions.
translated_data["sentiment"] = (
    translated_data["translated_prompt"].map(sentiment_dict).map(sentiment_map)
)
translated_data["emotion"] = translated_data["translated_prompt"].map(emotion_dict)

print("Sentiment & emotion analysis complete.")


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


Running sentiment and emotion analysis...


Processing:   1%|▏         | 10/701 [00:05<05:42,  2.02it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing: 100%|██████████| 701/701 [06:23<00:00,  1.83it/s]

Sentiment & emotion analysis complete.





In [None]:
# Display sentiment distribution
# Share of each sentiment label among labelled rows, as a percentage with 2 decimals
sentiment_share = translated_data["sentiment"].value_counts(normalize=True)
sentiment_pct = sentiment_share.mul(100).round(2)
sentiment_dist = sentiment_pct.reset_index()
sentiment_dist.columns = ["Sentiment", "Percentage"]

print("Sentiment Distribution (% of prompts):")
print(sentiment_dist.to_string(index=False))


Sentiment Distribution (% of prompts):
Sentiment  Percentage
 Positive       54.90
  Neutral       44.45
 Negative        0.65


In [None]:
# Share of each emotion label among labelled rows, shown as a percentage
emotion_share = translated_data["emotion"].value_counts(normalize=True)
print("\nEmotion Distribution:")
print(emotion_share.round(3) * 100)


Emotion Distribution:
emotion
neutral     83.5
joy         10.3
fear         2.4
disgust      1.5
anger        1.2
surprise     0.6
sadness      0.5
Name: proportion, dtype: float64


In [None]:
# Show example prompts per sentiment
# For every sentiment label present, print up to three distinct prompts
for label in translated_data["sentiment"].dropna().unique():
    print(f"\nExamples with Sentiment = {label.upper()}:")
    has_label = translated_data["sentiment"] == label
    labelled_prompts = translated_data[has_label]["translated_prompt"]
    examples = labelled_prompts.drop_duplicates().head(3)
    for prompt in examples:
        print("-", prompt)



Examples with Sentiment = POSITIVE:
- Tire specialist for 10 years of experience car preparation, price-performance, reliability, safety, tire sale, tire assembly, winter check, TÜV check, oil change, used car on & sale. Menu: Home, Service, About us, contact
- We are a taxi company in Kaiserslautern and have 20 taxis all over Germany, we have offered VIP trips and health trips for 10 years with shop and appointment for taxi orders
- Advice and general entrepreneurs and general entrepreneurs, construction service, cleaning, real estate mediation for 20 years of experience, in several countries

Examples with Sentiment = NEUTRAL:
- Web design: Our experienced team offers tailor-made web design for an appealing and user-friendly online presence that optimally presents your brand.

IT agency: As your IT agency, we offer comprehensive solutions for your technological requirements, from network security to system integration to optimize your business processes.

Marketing: With our marketi

In [None]:
# Show example prompts per top 3 emotions
# value_counts sorts by frequency, so the first three index entries are the most common emotions
emotion_frequency = translated_data["emotion"].value_counts()
top_emotions = emotion_frequency.head(3).index
for emo in top_emotions:
    print(f"\nExamples with Emotion = {emo.upper()}:")
    has_emotion = translated_data["emotion"] == emo
    emotion_prompts = translated_data[has_emotion]["translated_prompt"]
    examples = emotion_prompts.drop_duplicates().head(3)
    for prompt in examples:
        print("-", prompt)



Examples with Emotion = NEUTRAL:
- Tire specialist for 10 years of experience car preparation, price-performance, reliability, safety, tire sale, tire assembly, winter check, TÜV check, oil change, used car on & sale. Menu: Home, Service, About us, contact
- We are a taxi company in Kaiserslautern and have 20 taxis all over Germany, we have offered VIP trips and health trips for 10 years with shop and appointment for taxi orders
- Web design: Our experienced team offers tailor-made web design for an appealing and user-friendly online presence that optimally presents your brand.

IT agency: As your IT agency, we offer comprehensive solutions for your technological requirements, from network security to system integration to optimize your business processes.

Marketing: With our marketing expertise, we develop tailor-made strategies to achieve your target group, be it through social media campaigns, content marketing or targeted advertising.

Website programming: Our professional websit

In [None]:
# Up to 20 distinct prompts that were labelled "Negative"
is_negative = translated_data["sentiment"] == "Negative"
negative_prompts = translated_data[is_negative]["translated_prompt"]
examples = negative_prompts.drop_duplicates().head(20)
for prompt in examples:
    print("-", prompt)

- Electric induction trash can
- Live techno electronic live act
ecommerce
digital music downloads
T-shirts with dumb shit on them
minimal and dark
black background
music
red highlights
bookings
- only write that you die and we guarantee that
- See if you understand me and I get tired of writing and you don't make me a good page I want a page like this https://www.amarresenecuador.com/
- This is a CS contact, the official callcenter from the loan money platform in Playstore. here is the place where the CreditCash uses to submit complaints of the problem of disbursement of funds that are not appropriate (refund fund) of loan problems that are immediately liquid without our approval online 24/7 hours online for our customers
- Created me a pornographic video site sale of pornographic video you put all the video details testimonies price price for pornographic videos a porn video website
- advertising ad and bussiness
- advertising bussiness
- get bussiness ads
- asvertising bussiness
- I

In [None]:
    # Up to 20 distinct prompts whose predicted emotion is "anger"
    is_anger = translated_data["emotion"] == "anger"
    anger_prompts = translated_data[is_anger]["translated_prompt"]
    examples = anger_prompts.drop_duplicates().head(20)
    for prompt in examples:
        print("-", prompt)

- AngryIp program seminars and workshops. AngryIp manuals and tutorials for beginners and advanced users. Help with using and learning AngryIp.
- Arunjeev Singh, Barrister is an experienced and competent Auckland Immigration and Criminal lawyer providing specialised legal services at affordable fee. Arunjeev Singh, who is an Indian New Zealander Lawyer having office in Papatoetoe is primarily serving communities in the South Auckland area and provides services in a variety of fields including New Zealand Immigration Law, New Zealand Criminal Law, Family law, etc all under one roof. INSTRUCTING SOLICITOR REQUIRED. CAN BE ARRANGED ON REQUEST He has 99% success rate in Immigration matters. He has done many cases in Family Court relating to dissolution, separation, parenting order and relationship property.

The website has to be detailed
- WE ARE A AGRICULTURE COMPANY IN INDIA DEAL IN ALL KIND OF AGRICULTURE PRODUCTS AS WELL AS WE SUPPORT FARMERS IN ALL AGRICULTURE NEEDS, MAKE A WEBSITE S


## **Insights from Sentiment & Emotion Analysis**

### **1. Prompt Sentiment Is Largely Positive or Neutral**
- **99.35%** of prompts are classified as **positive (54.9%)** or **neutral (44.45%)**.
- Emotion analysis supports this: **83.5% are neutral**, and **10.3% express joy**.
- **Next Step:** No clear signs of frustration—sentiment analysis does **not highlight urgent UX issues** in the prompt experience.

### **2. Negative Prompts Are Rare and Often Low-Quality**
- Only **0.65%** of prompts show negative sentiment.
- Most are **unstructured, irrelevant, or testing-related** (e.g., jokes, spam, vague terms).
- **Next Step:** Focus on improving support for **unclear or low-effort prompts** through **prompt guidance or input suggestions**, rather than filtering based on sentiment.

### **3. Emotion Categories Offer Little Additional Insight**
- Emotions like **fear, sadness, or anger** are each under **2.5%**.
- These typically relate to niche industries (e.g., mental health, immigration) rather than UX issues.
- **Next Step:** Emotion signals are too rare and inconsistent to inform design or product decisions—can be **deprioritized** in future analysis.

In [None]:
# Prompts per customer (cluster_id)
# Count the non-null prompts in each cluster and expose the result as a two-column frame
prompts_per_customer = (
    translated_data.groupby("cluster_id")["translated_prompt"]
    .count()
    .rename("num_prompts")
    .reset_index()
)

# Basic stats
print("Prompt Count per Customer (Summary):")
num_prompts_summary = prompts_per_customer["num_prompts"].describe()
print(num_prompts_summary)


Prompt Count per Customer (Summary):
count    23853.000000
mean         2.096172
std          4.356731
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        315.000000
Name: num_prompts, dtype: float64


In [None]:
# Prompt length stats per customer
# Mean of the prompt_length column within each cluster
length_per_customer = (
    translated_data.groupby("cluster_id")["prompt_length"]
    .mean()
    .rename("avg_prompt_length")
    .reset_index()
)

# Summary
print("Average Prompt Length per Customer (Summary):")
avg_length_summary = length_per_customer["avg_prompt_length"].describe()
print(avg_length_summary)


Average Prompt Length per Customer (Summary):
count    23853.000000
mean        44.983211
std         32.240582
min          1.000000
25%         20.000000
50%         35.000000
75%         70.000000
max        158.000000
Name: avg_prompt_length, dtype: float64


In [None]:
# Count of unique business types used per customer
business_diversity = (
    translated_data.groupby("cluster_id")["business_type"]
    .nunique()
    .rename("unique_business_types")
    .reset_index()
)

# Summary
# How many customers used exactly 1, 2, 3, ... distinct business types
print("Business Type Diversity per Customer (Summary):")
diversity_histogram = business_diversity["unique_business_types"].value_counts()
print(diversity_histogram.sort_index())


Business Type Diversity per Customer (Summary):
unique_business_types
1    21473
2     1815
3      388
4      127
5       40
6       10
Name: count, dtype: int64


In [None]:
# Randomly sample 5 customers and show their prompts
# Sample from the DISTINCT customer ids. Sampling the raw column picks rows, so
# customers with many prompts would be over-represented and the same customer
# could be drawn more than once.
customer_ids = translated_data["cluster_id"].dropna().drop_duplicates()
# Guard against datasets with fewer than 5 customers (sample would raise)
sampled_customers = customer_ids.sample(min(5, len(customer_ids)), random_state=42)

for cid in sampled_customers:
    print(f"\n--- Customer: {cid} ---")
    customer_prompts = translated_data[translated_data["cluster_id"] == cid]["translated_prompt"]
    for prompt in customer_prompts:
        print("-", prompt)



--- Customer: 11639 ---
- Somos uma imobiliária online, especializada em imóveis de condomínio, localizada na cidade de Hortolândia. Trabalhamos com venda, locação e administração de imóveis. Oferecemos suporte ao cliente vendedor e comprador do início ao fim da negociação, suporte para financiamentos e regularizações burocráticas
- Somos uma imobiliária online, especializada em imóveis de condomínio, localizada na cidade de Hortolândia. Trabalhamos com venda, locação e administração de imóveis. Oferecemos suporte ao cliente vendedor e comprador do início ao fim da negociação, suporte para financiamentos e regularizações burocráticas

--- Customer: 2714 ---
- Business Name
Move with Us Low Price Movers Cargo Relocation
Catogory
moving company
Hours
Mon-Sun 24 hr open
Address
Al Ma'asil St - Al Shamkhah - SH6 - Abu Dhabi - United Arab Emirates
Business Name
Move with Us Low Price Movers Cargo Relocation
Catogory
moving company
Hours
Mon-Sun 24 hr open
Address
Al Ma'asil St - Al Shamkha

In [None]:
# Get top 5 customers by number of prompts
# Count non-null prompts per cluster, order from most to fewest, keep the first five ids
prompt_counts = translated_data.groupby("cluster_id")["translated_prompt"].count()
ranked_counts = prompt_counts.sort_values(ascending=False)
top_customers = ranked_counts.head(5).index

for cid in top_customers:
    print(f"\n=== Top Customer: {cid} ===")
    belongs_to_customer = translated_data["cluster_id"] == cid
    customer_prompts = translated_data[belongs_to_customer]["translated_prompt"]
    for prompt in customer_prompts:
        print("-", prompt)



=== Top Customer: 5689 ===
- All computer components like the Motherboard,Power Supply, Processor, Ram, SSD Etc. Are Retailer & Wholesaler. contact senction , coustomers satisfaction
- All computer components like the Motherboard,Power Supply, Processor, Ram, SSD Etc. Are Retailer & Wholesaler. contact senction , coustomers satisfaction, add all quality image
- All computer components like the Motherboard,Power Supply, Processor, Ram, SSD Etc. Are Retailer & Wholesaler. contact senction , coustomers satisfaction, add all quality image
- All computer components like the Motherboard,Power Supply, Processor, Ram, SSD Etc. Are Retailer & Wholesaler. contact senction , coustomers satisfaction
- All computer components like the Motherboard,Power Supply, Processor, Ram, SSD Etc. Are Retailer & Wholesaler. add contract section, our coustomers setaisfaction
- All computer components like the Motherboard,Power Supply, Processor, Ram, SSD Etc. Are Retailer & Wholesaler. contact senction , cousto

## **Insights About Customer Prompt Behavior**

### **1. Majority of Users Submit Only One Prompt**
- 50% of users submit only **one prompt**, and 75% submit **two or fewer**.
- The distribution is **heavily skewed**, with some users submitting up to **315 prompts**.
- **Interpretation:** Single-use behavior may signal **successful one-shot usage**, **confusion**, or **lack of engagement**.
- **Next Steps:**
  - Segment single-prompt users to analyze **retention and satisfaction**.
  - If these users rarely return or convert, introduce **post-submission nudges** like: “Would you like to refine this result?” or “Need help improving your site?”

### **2. Prompt Lengths Vary Greatly Across Users**
- Median average prompt length per customer is **35 words**, but spans from **1 to 158 words**.
- Some customers consistently write **long, detailed prompts**, while others submit only a few vague words.
- **Interpretation:** Short prompts may reflect **uncertainty**, **testing behavior**, or lack of guidance.
- **Next Steps:**
  - Introduce **smart UI assistance**: show tailored examples or structured guidance for short prompts.
  - Consider **prompt length-aware interventions** that offer more help when inputs are short or generic.

### **3. Most Users Stick to a Single Business Type**
- **90%+** of users mention only **one business type**, suggesting focused intent or single-use scenarios.
- **A minority (about 10%) explore multiple business categories**, possibly indicating:
  - Testing the product for multiple businesses
  - Confusion or mismatch between input and expected output
- **Next Steps:**
  - Detect multi-business users and evaluate satisfaction or frustration indicators to see which business type gives better results.

### **4. Repeat Prompting Often Involves Minor Edits or Duplicates**
- Some top users (e.g., customer ID 5689) repeat the **same or very similar prompt dozens of times**.
- Example: “Contact section, customer satisfaction, add image...” repeated with slight variations.
- **Interpretation:** This may reflect **iterative fine-tuning**, or **confusion with how changes are applied**.
- **Next Steps:**
  - Introduce **version tracking or edit preview** so users can see changes before resubmitting.
  - If repetition is accidental, consider **de-duplication warnings** or auto-saving recent prompts.

# **Insight Summary**

### **1. Prompt Quality Varies Widely**  
Most users submit short, general, or repetitive prompts. Only a small fraction provide clear, goal-oriented input.  
**Next Step:** Provide prompt suggestions, templates, or examples to guide users toward clearer, more structured requests.

### **2. Duplicate and Repetitive Prompts Are Common**  
Many users submit the same or nearly identical prompt multiple times, either intentionally or due to unclear system feedback.  
**Next Step:** Detect duplicate submissions and confirm to the user that the system received the request. Add options like “Tweak this result” instead of regenerating.

### **3. Prompt Length Strongly Correlates with Clarity**  
Longer prompts are significantly more likely to be useful and clear, and to lead to successful completions.  
**Next Step:** Trigger supportive nudges or follow-up questions for extremely short inputs to improve guidance and intent capture.

### **4. Users Request Similar Use Cases Repeatedly**  
Prompts often revolve around common website types (e.g., real estate, e-commerce, personal branding).  
**Next Step:** Build predefined flows or templates for high-frequency use cases to streamline input and generation.

### **5. Many Prompts Reflect Uncertainty or Underspecification**  
A notable portion of prompts use vague terms (e.g., "ecommerce", "logo") without sufficient detail.  
**Next Step:** Detect underspecified prompts and ask follow-up questions such as “Who is your target customer?” or “What do you want the logo to convey?”

### **6. High Reuse Across Users Suggests System Gaming or Misuse**  
Dozens of users submit nearly identical or minimal prompts in bulk, likely to game results.  
**Next Step:** Introduce throttling or dynamic prompt augmentation for repeated low-effort inputs to ensure content variety and discourage misuse.

### **7. Prompt Timing Shows Peak Usage and Session Behaviors**  
Users often submit bursts of prompts within short periods, suggesting moments of friction or experimentation.  
**Next Step:** Detect burst behavior and proactively offer help (e.g., “Still need assistance?” or context-aware tips) after multiple attempts.

### **8. Language Issues Often Lead to Failed Generations**  
Prompts with poor grammar, mixed languages, or fragmented phrasing often lead to incoherent or irrelevant output.  
**Next Step:** Add input validation or offer real-time suggestions for fixing unclear prompts. Flag high-risk inputs to trigger clarification steps automatically.

### **9. Aesthetic Requests Lack Structure**  
Users often request “clean,” “beautiful,” or “modern” websites without specifying concrete needs or features.  
**Next Step:** Introduce style selectors or visual preference inputs that help translate subjective requests into actionable structure and layout parameters.

# **Additional Ideas**

### **1. Website Link Interpretation**  
If users include a URL, extract the site’s visual structure (e.g., layout, colors, typography) and adapt it into the generated site.  
**Impact**: Reduces ambiguity and aligns the design with user expectations.

### **2. Visual Style Imitation from Image**  
Let users upload a screenshot of a site they like. Extract visual patterns and generate a similar layout and look.  
**Impact**: Captures aesthetic intent when text prompts are vague or absent.

### **3. Auto-Completion for Prompts**  
Suggest prompt continuations based on partial input using historical prompt patterns.  
**Impact**: Reduces vague inputs and accelerates prompt creation for users.

### **4. Prompt Quality Feedback**  
Display real-time feedback when a prompt is too short, unclear, or incomplete.  
**Impact**: Helps users write better prompts and improves generation quality.

### **5. Intent Clarification Step**  
Trigger simple follow-up questions for vague inputs (e.g., “Should the design be modern or traditional?”).  
**Impact**: Clarifies user goals before generation and reduces mismatches.

### **6. AI Section Creation**  
Allow users to generate content for individual sections (e.g., “About Us”) instead of the full site.  
**Impact**: Makes generation more flexible and supports progressive building.

### **7. Interactive Wizard for Input Collection**  
Use a structured multi-step form to collect company details, goals, tone, and content before generation.  
**Impact**: Ensures richer input and improves first-attempt quality.

### **8. Creative Refinement Suggestions**  
After generating a site, offer guided improvements like “add animation” or “make it bolder.”  
**Impact**: Encourages iteration and reduces overly generic results.

### **9. Like/Dislike Feedback on Generations**  
Enable thumbs up/down on generated sites for quality tracking and continuous improvement.  
**Impact**: Collects valuable feedback to refine future outputs.