## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from faker import Faker       #used to generate fake data to populate dataset
import random                 


In [None]:
fake = Faker()
np.random.seed(42) 

num_rows = 10000  

# Generate a skewed Satisfaction Score (more 6-9 ratings, fewer extreme ratings)
satisfaction_scores = np.round(np.random.normal(loc=7, scale=2, size=num_rows)).astype(int)
satisfaction_scores = np.clip(satisfaction_scores, 1, 10) 

# Generate a realistic Age distribution (Right-skewed)
young = np.random.normal(loc=25, scale=5, size=int(0.5 * num_rows))  # 50% young adults
middle_aged = np.random.normal(loc=45, scale=7, size=int(0.3 * num_rows))  # 30% middle-aged
elderly = np.random.normal(loc=65, scale=6, size=int(0.2 * num_rows))  # 20% elderly

ages = np.concatenate([young, middle_aged, elderly])
ages = np.round(ages).astype(int)

# Generate a skewed Subscription Months distribution 
subscription_months = np.random.exponential(scale=8, size=num_rows).astype(int)
subscription_months = np.clip(subscription_months, 1, 24)   

# Comments
negative_comments = [
    "The meals were bland and arrived late!",
    "Too expensive for the portion size.",
    "I canceled my subscription after a week.",
    "The ingredients didn’t seem fresh.",
    "Delivery was late."
]

neutral_comments = [
    "The service was okay, but portions could be bigger.",
    "Some meals were great, others were average.",
    "Good quality, needs more variety.",
    "I wish they had more flexible delivery times.",
    "Not bad, but not outstanding either."
]

positive_comments = [
    "Loved the variety and convenience of the meals!",
    "Healthy, tasty, and super easy to prepare!",
    "Great meal plans, and customer service is fantastic.",
    "The subscription saves me so much time!",
    "Fresh ingredients and well-balanced meals—highly recommend!"
]

# Function to assign comments based on satisfaction score
def get_comment(score):
    if score in range(1, 5):  # Low satisfaction
        return random.choice(negative_comments)
    elif score in range(5, 8):  # Neutral
        return random.choice(neutral_comments)
    else:  # High satisfaction (8-10)
        return random.choice(positive_comments)
    
# Function to introduce missing values in a column
def introduce_nulls(column, probability=0.1):  # 10% missing values
    return [value if random.random() > probability else np.nan for value in column]

# Generate dataset
data = {
    "CustomerID": range(1, num_rows + 1),
    "Age": introduce_nulls(ages, probability=0.15),
    "Subscription Months": introduce_nulls(subscription_months, probability=0.10),
    "Satisfaction Score (1-10)": satisfaction_scores,
    "Feedback": [get_comment(score) for score in satisfaction_scores],
}

df = pd.DataFrame(data)
df["Satisfaction Score (1-10)"] = df["Satisfaction Score (1-10)"].apply(lambda x: np.nan if random.random() < 0.15 else x)    # filling 15% of the values with null
df.to_csv("SatisfactionSurvey.csv", index=False)

In [39]:
df.head(10)

Unnamed: 0,CustomerID,Age,Subscription Months,Satisfaction Score (1-10),Feedback
0,1,22.0,1.0,8.0,Fresh ingredients and well-balanced meals—high...
1,2,23.0,24.0,7.0,"Good quality, needs more variety."
2,3,22.0,,8.0,"Great meal plans, and customer service is fant..."
3,4,,16.0,,"Great meal plans, and customer service is fant..."
4,5,31.0,9.0,7.0,"Some meals were great, others were average."
5,6,21.0,1.0,7.0,I wish they had more flexible delivery times.
6,7,30.0,1.0,10.0,The subscription saves me so much time!
7,8,21.0,15.0,9.0,Loved the variety and convenience of the meals!
8,9,21.0,4.0,,I wish they had more flexible delivery times.
9,10,29.0,2.0,8.0,"Great meal plans, and customer service is fant..."
