In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run draws the same random sequence
np.random.seed(42)

# How many rows the generated dataset will contain
num_samples = 100000

# Value pools for the categorical features
user_ids = np.arange(1, 10001)  # IDs 1 through 10,000 inclusive
device_types = [
    "Smartphone",
    "Tablet",
    "Laptop",
    "Smart TV",
    "Gaming Console",
]
app_categories = [
    "Video Streaming",
    "Live Streaming",
    "Gaming",
    "Social Media",
    "Music Streaming",
    "Browsing",
    "Video Calls",
    "Others",
]
time_of_day = ["Morning", "Afternoon", "Evening", "Night"]
days_of_week = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]
network_types = ["5G NSA", "5G SA", "mmWave", "Sub-6GHz"]
data_quality_levels = ["Low", "Medium", "High", "Ultra-HD"]
location_types = ["Urban", "Suburban", "Rural"]
indoor_outdoor = ["Indoor", "Outdoor"]

# Usage rates (MB/min) per app category; each tuple lists the rates in
# data_quality_levels order (Low, Medium, High, Ultra-HD).
usage_rates = {
    app: dict(zip(data_quality_levels, rates))
    for app, rates in (
        ("Video Streaming", (5, 10, 20, 30)),
        ("Live Streaming", (8, 15, 25, 40)),
        ("Gaming", (2, 5, 10, 15)),  # E.g., casual to cloud gaming
        ("Social Media", (2, 5, 10, 15)),  # E.g., text to video uploads
        ("Music Streaming", (1, 2, 4, 6)),
        ("Browsing", (2, 2, 2, 2)),  # Fixed rate
        ("Video Calls", (5, 10, 15, 25)),
        ("Others", (5, 5, 5, 5)),  # Fixed rate
    )
}

# App-category probabilities per device type; each row lists its weights in
# app_categories order and sums to 1. Rows follow device_types order.
device_app_probs = dict(zip(device_types, [
    [0.2, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1],  # Smartphone: balanced usage
    [0.3, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1],  # Tablet: video streaming focus
    [0.2, 0.1, 0.05, 0.1, 0.05, 0.3, 0.1, 0.1],  # Laptop: browsing focus
    [0.5, 0.2, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05],  # Smart TV: video streaming focus
    [0.1, 0.05, 0.6, 0.05, 0.05, 0.05, 0.05, 0.05],  # Gaming Console: gaming focus
]))

# Generate synthetic dataset
# Every column is sampled independently with num_samples entries.
# NOTE: the dict entries are evaluated top to bottom, so the order of the
# np.random calls below determines which random values each column receives;
# reordering them changes the generated data for a fixed seed.
data = {
    "user_id": np.random.choice(user_ids, num_samples),
    "device_type": np.random.choice(device_types, num_samples),
    "session_duration": np.random.randint(1, 301, num_samples),  # 1 to 300 minutes
    "time_of_day": np.random.choice(time_of_day, num_samples),
    "day_of_week": np.random.choice(days_of_week, num_samples),
    "network_type": np.random.choice(network_types, num_samples),
    "location_type": np.random.choice(location_types, num_samples),
    "indoor_outdoor": np.random.choice(indoor_outdoor, num_samples),
    "prev_usage": np.random.uniform(50, 5000, num_samples)  # MB usage in past sessions
}

df = pd.DataFrame(data)

# Generate app_category based on device_type probabilities
# One np.random.choice call is made per row, weighted by the probability row
# that device_app_probs holds for that row's device_type.
df["app_category"] = df["device_type"].apply(lambda x: np.random.choice(app_categories, p=device_app_probs[x]))

# Draw a base signal strength for every row; the integer range sampled from
# depends on the row's location_type (upper bounds are exclusive).
signal_samplers = {
    "Rural": lambda: np.random.randint(-110, -70),
    "Suburban": lambda: np.random.randint(-90, -50),
    "Urban": lambda: np.random.randint(-80, -40)
}
base_signal_strength = df["location_type"].map(signal_samplers).apply(lambda sampler: sampler())
df["signal_strength"] = base_signal_strength

# Indoor sessions are penalised by 10; outdoor sessions keep the base value
is_indoor = df["indoor_outdoor"] == "Indoor"
df["signal_strength"] = np.where(is_indoor, df["signal_strength"] - 10, df["signal_strength"])

# Calculate throughput (Mbps) based on network_type and signal_strength
def calculate_throughput(network_type, signal_strength):
    """Estimate throughput (Mbps) from the network type and signal strength.

    Each network type uses a linear model of the form
    ``base + (signal_strength + 110) * slope``. mmWave is a special case:
    the linear model only applies when ``signal_strength`` is greater than
    -70; otherwise the throughput is a flat 50. The result is always clamped
    to the range 50..2000.

    Args:
        network_type: One of "5G NSA", "5G SA", "mmWave" or "Sub-6GHz".
        signal_strength: Numeric signal strength value.

    Returns:
        The throughput, capped between 50 and 2000 Mbps.

    Raises:
        ValueError: If ``network_type`` is not one of the supported values
            (previously this surfaced as an opaque UnboundLocalError).
    """
    # (base, slope) of the linear model for each supported network type
    linear_models = {
        "5G NSA": (300, 5),
        "5G SA": (400, 6),
        "mmWave": (1000, 10),
        "Sub-6GHz": (500, 7),
    }
    try:
        base, slope = linear_models[network_type]
    except KeyError:
        raise ValueError(f"Unknown network_type: {network_type!r}") from None

    # Written as `not (x > -70)` so a NaN signal also takes the flat-50 path,
    # matching the original conditional expression.
    if network_type == "mmWave" and not signal_strength > -70:
        throughput = 50
    else:
        throughput = base + (signal_strength + 110) * slope

    return max(min(throughput, 2000), 50)  # Cap between 50 and 2000 Mbps

# Row-wise application: each row's network_type and signal_strength are fed
# to calculate_throughput to fill the throughput column.
df["throughput"] = df.apply(
    lambda record: calculate_throughput(record["network_type"], record["signal_strength"]),
    axis=1,
)

# Determine data_quality based on throughput
def get_data_quality(throughput):
    """Return the quality tier label for a throughput value.

    Tiers are checked in ascending order and the first one whose exclusive
    upper bound exceeds ``throughput`` wins: below 100 is "Low", below 300 is
    "Medium", below 600 is "High"; anything else is "Ultra-HD".
    """
    tiers = (
        (100, "Low"),
        (300, "Medium"),
        (600, "High"),
    )
    for upper_bound, label in tiers:
        if throughput < upper_bound:
            return label
    return "Ultra-HD"

# Label every row with the quality tier that get_data_quality assigns to its throughput
df["data_quality"] = df["throughput"].map(get_data_quality)

# Calculate background data (MB/min) based on device_type
def get_background_data(device_type):
    """Sample a background data rate (MB/min) for the given device type.

    A uniform value is drawn for every device type on each call, in the
    order listed below, and only the one matching ``device_type`` is
    returned. Each call therefore consumes five values from NumPy's global
    RNG regardless of which device is requested.

    Raises:
        KeyError: If ``device_type`` is not one of the listed devices.
    """
    # (device, low, high) bounds of the uniform draw, in draw order
    uniform_bounds = (
        ("Smartphone", 1, 10),
        ("Tablet", 0.5, 5),
        ("Laptop", 2, 15),
        ("Smart TV", 0, 2),
        ("Gaming Console", 0, 3),
    )
    draws = {
        device: np.random.uniform(low, high)
        for device, low, high in uniform_bounds
    }
    return draws[device_type]

# Sample a per-row background data rate (MB/min) according to device type
df["background_data"] = df["device_type"].apply(get_background_data)

# Look up the usage rate (MB/min) for each row's app_category / data_quality pair
df["usage_rate"] = df.apply(
    lambda record: usage_rates[record["app_category"]][record["data_quality"]],
    axis=1,
)

# Total data usage (MB): app traffic plus background traffic over the whole
# session, with Gaussian noise (mean 0, std 10) added, floored at zero.
foreground_mb = df["session_duration"] * df["usage_rate"]
background_mb = df["session_duration"] * df["background_data"]
usage_noise = np.random.normal(0, 10, num_samples)
df["data_usage"] = ((foreground_mb + background_mb) + usage_noise).clip(lower=0)  # Ensure non-negative

# Columns kept in the exported dataset, in output order; anything else on the
# frame (such as the intermediate usage_rate column) is dropped.
final_columns = [
    "user_id",
    "device_type",
    "app_category",
    "session_duration",
    "data_quality",
    "time_of_day",
    "day_of_week",
    "network_type",
    "signal_strength",
    "prev_usage",
    "location_type",
    "indoor_outdoor",
    "throughput",
    "background_data",
    "data_usage",
]
df = df.loc[:, final_columns]

# Write the dataset to CSV without the index column, then report the row count
df.to_csv("5G_Data_Usage_Prediction_Enhanced.csv", index=False)

print("Enhanced dataset '5G_Data_Usage_Prediction_Enhanced.csv' created with", len(df), "rows.")

Enhanced dataset '5G_Data_Usage_Prediction_Enhanced.csv' created with 100000 rows.
