
# Domain Name Dataset Generation & Analysis

This notebook demonstrates how to use the `data_generator` module to create a synthetic dataset of business descriptions and domain name suggestions. It also computes simple metrics and visualises the distribution of generated confidence scores and domain lengths.


In [None]:

# Import our generator and analysis libraries
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from data_generator import generate_dataset

# Enable inline plotting
%matplotlib inline


In [None]:

# Configuration parameters — everything a reader might want to tune lives here.
SEED = 42  # fixed seed so Restart & Run All regenerates the same dataset
num_records = 500  # total number of records to generate
blocked_prob = 0.1  # fraction of descriptions that contain banned terms
low_score_chance = 0.3  # chance of intentionally mismatching category for low scores
mission_prob = 0.3  # chance of appending a mission statement in the description
target_prob = 0.3   # chance of appending a target audience in the description
hyphen_prob = 0.3   # chance of inserting a hyphen between concatenated keywords
num_suggestions = 5  # number of domain suggestions per record

# Seed the global random number generators before generating anything.
# NOTE(review): this assumes data_generator draws from the global `random`
# and/or `numpy.random` state — confirm against the module. If it owns a
# private RNG these calls are harmless but have no effect on the output.
random.seed(SEED)
np.random.seed(SEED)

# Generate dataset using the above parameters
dataset = generate_dataset(
    num_records,
    blocked_prob=blocked_prob,
    low_score_chance=low_score_chance,
    mission_prob=mission_prob,
    target_prob=target_prob,
    hyphen_prob=hyphen_prob,
    num_suggestions=num_suggestions,
)
print(f"Generated {len(dataset)} records")


In [None]:

# Flatten the dataset into a DataFrame: one row per domain suggestion.
# A record whose status is not 'success' has no suggestions to expand, so it
# contributes a single placeholder row (no domain, no score) labelled
# 'blocked'; that keeps such records visible in the status distribution.
rows = []
for rec in dataset:
    output = rec['output']
    if output['status'] == 'success':
        rows.extend(
            {
                'domain': s['domain'],
                'score': s['confidence'],
                'status': 'success',
                # keywords_used is optional on a suggestion; default to none
                'keywords_used': ','.join(s.get('keywords_used', [])),
            }
            for s in output['suggestions']
        )
    else:
        # For blocked records, we record the status only
        rows.append({
            'domain': None,
            'score': None,
            'status': 'blocked',
            'keywords_used': '',
        })

# Create DataFrame. Passing the columns explicitly keeps the frame well-formed
# (and the status lookup below valid) even if no rows were produced.
df = pd.DataFrame(rows, columns=['domain', 'score', 'status', 'keywords_used'])
print(df.head())

# len(df) would also count the placeholder rows of blocked records, so the
# suggestion total is taken from the success rows only.
is_success = df['status'] == 'success'
print("Total suggestions:", int(is_success.sum()))
print("Blocked records:", int((~is_success).sum()))


In [None]:

# Summary statistics for the flattened suggestions in `df`.
# Every multi-line message uses an explicit "\n" escape: a string literal
# physically split across lines is a SyntaxError in Python.

# Success vs blocked distribution
status_counts = df['status'].value_counts()
print("Status distribution:\n", status_counts)

# Filter successful suggestions; .copy() so the feature columns added below
# are written to an independent frame rather than a view of `df`
success_df = df[df['status'] == 'success'].copy()

# Basic statistics on confidence scores
score_stats = success_df['score'].describe()
print("\nConfidence score statistics:\n", score_stats)

# Length of domain names (in characters)
success_df['length'] = success_df['domain'].str.len()
length_stats = success_df['length'].describe()
print("\nDomain length statistics:\n", length_stats)

# Correlation between length and score
corr = success_df[['score', 'length']].corr()
print("\nCorrelation between score and domain length:\n", corr)

# Digit and hyphen prevalence
success_df['has_digit'] = success_df['domain'].str.contains(r'\d')
# '-' is a plain character here, so match it literally rather than as a regex
success_df['has_hyphen'] = success_df['domain'].str.contains('-', regex=False)
print("\nPercentage of domains with digits:", success_df['has_digit'].mean() * 100)
print("Percentage of domains with hyphens:", success_df['has_hyphen'].mean() * 100)


In [None]:

# Figure 1 — histogram of confidence scores over [0, 1]
score_bins = np.linspace(0, 1, 21)  # 21 edges -> 20 equal-width bins
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(success_df['score'], bins=score_bins, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Confidence Scores')
ax.set_xlabel('Confidence Score')
ax.set_ylabel('Frequency')
ax.grid(axis='y', linestyle='--', alpha=0.5)
plt.show()

# Figure 2 — domain length against confidence score, one point per suggestion
fig, ax = plt.subplots(figsize=(8, 4))
ax.scatter(success_df['length'], success_df['score'], alpha=0.5)
ax.set_title('Domain Length vs Confidence Score')
ax.set_xlabel('Domain Length (characters)')
ax.set_ylabel('Confidence Score')
ax.grid(linestyle='--', alpha=0.5)
plt.show()

# Figure 3 — share of suggestions containing a digit / a hyphen
features = ['has_digit', 'has_hyphen']
values = [success_df[name].mean() for name in features]
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(features, values, color=['orange', 'purple'])
ax.set_title('Prevalence of Digits and Hyphens in Domain Names')
ax.set_ylabel('Fraction of suggestions')
ax.set_ylim(0, 1)  # fractions, so pin the axis to the full 0–1 range
plt.show()
