### Data Generation Process for the Project: Competitor Project Analysis in the Construction Industry

In [3]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import os

# Initialize Faker and seed every random source used below.
# Faker keeps its own random generator, so seeding `random` and NumPy alone
# leaves the generated dates and project names different on every run.
fake = Faker()
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Output path
output_dir = "construction_data"
os.makedirs(output_dir, exist_ok=True)

# Company names
companies = [
    "Murad Buildings", "NRG Uzbekistan", "Golden House", "Dream City Development",
    "Bizning Uylar", "PREMIER HOUSE", "Ulkan Development", "Discover Invest"
]

regions = [
    "Tashkent", "Samarkand", "Bukhara", "Fergana", "Namangan", "Andijan",
    "Khorezm", "Surkhandarya", "Karakalpakstan", "Jizzakh", "Qashqadaryo",
    "Navoi", "Sirdarya", "Nukus", "Almaty", "Bishkek", "Dushanbe", "Ashgabat"
]

# Generate Companies table
def generate_companies():
    """Build the Companies table, one row per entry in the module-level ``companies`` list.

    Returns
    -------
    pandas.DataFrame
        Columns ``CompanyID`` (1-based position in ``companies``),
        ``CompanyName``, ``TotalProjects`` (random integer in 20..120),
        ``MarketShare`` (Beta(2, 5) draw scaled to a percentage, 2 decimals)
        and ``ReputationScore`` (Normal(75, 10) draw, 2 decimals).
        Market shares are drawn independently per company and are not
        normalised to add up to 100.
    """

    def _company_record(company_id, company_name):
        # Key order fixes the order of the random draws, which keeps seeded
        # runs reproducible.
        return {
            "CompanyID": company_id,
            "CompanyName": company_name,
            "TotalProjects": random.randint(20, 120),
            "MarketShare": round(np.random.beta(2, 5) * 100, 2),
            "ReputationScore": round(np.random.normal(75, 10), 2),
        }

    records = [
        _company_record(position, label)
        for position, label in enumerate(companies, start=1)
    ]
    return pd.DataFrame(records)

# Generate Projects table
def generate_projects(n=500):
    """Build a Projects table with ``n`` synthetic construction projects.

    Each project gets a start date within the last five years, a duration of
    180 to 1200 days, a gamma-distributed planned cost and an actual cost equal
    to the planned cost times a Normal(1.1, 0.3) factor. About 1.5% of the
    actual costs are blanked out to simulate missing data. ``Status`` is drawn
    uniformly and independently of the dates.

    Parameters
    ----------
    n : int, default 500
        Number of projects; ``ProjectID`` runs from 1 to ``n``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ProjectID``, ``CompanyID``, ``ProjectName``, ``Region``,
        ``StartDate``, ``EndDate``, ``PlannedCost``, ``ActualCost``,
        ``SquareMeters`` and ``Status``.
    """
    status_options = ["Planned", "Ongoing", "Completed"]

    def _project_record(project_id):
        began_on = fake.date_between(start_date='-5y', end_date='today')
        finished_on = began_on + timedelta(days=random.randint(180, 1200))

        budget = round(np.random.gamma(10, 10000), 2)
        spent = budget * np.random.normal(1.1, 0.3)
        if random.random() < 0.015:
            spent = np.nan  # simulate a missing actual cost
        else:
            spent = round(spent, 2)

        return {
            "ProjectID": project_id,
            "CompanyID": random.randint(1, len(companies)),
            "ProjectName": fake.bs().title(),
            "Region": random.choice(regions),
            "StartDate": began_on,
            "EndDate": finished_on,
            "PlannedCost": budget,
            "ActualCost": spent,
            "SquareMeters": round(np.random.lognormal(8, 0.7)),
            "Status": random.choice(status_options),
        }

    return pd.DataFrame([_project_record(pid) for pid in range(1, n + 1)])

# Generate Milestones table
def generate_milestones(projects_df):
    """Generate a Milestones table with evenly spaced milestones per project.

    Each project receives 2 to 7 milestones. The project span is divided into
    that many equal steps and milestone ``k`` (1-based) is planned at the end
    of step ``k``, so the first milestone falls after the project start and
    the last one on or just before the project end. The actual completion is
    the planned date shifted by -10 to +20 days (never earlier than the
    project start), and is left empty for roughly 10% of milestones.

    Parameters
    ----------
    projects_df : pandas.DataFrame
        Needs the columns ``ProjectID``, ``StartDate`` and ``EndDate``.
        Projects whose start or end date is missing are skipped, because no
        schedule can be derived for them.

    Returns
    -------
    pandas.DataFrame
        Columns ``MilestoneID``, ``ProjectID``, ``Name``,
        ``PlannedCompletion``, ``ActualCompletion`` and ``Status``. The
        columns are present even when no milestone was generated.
    """
    columns = [
        "MilestoneID", "ProjectID", "Name",
        "PlannedCompletion", "ActualCompletion", "Status",
    ]
    milestone_names = ["Foundation", "Framing", "Plumbing", "Electrical", "Finishing"]
    data = []
    milestone_id = 1
    for _, row in projects_df.iterrows():
        start = row["StartDate"]
        end = row["EndDate"]
        if pd.isna(start) or pd.isna(end):
            # e.g. a project that has not started yet: nothing to schedule
            continue

        num_milestones = random.randint(2, 7)
        delta = (end - start).days // num_milestones
        for i in range(num_milestones):
            # (i + 1): a milestone completes at the END of its step. Using i
            # would plan the first milestone on the project's start date.
            planned = start + timedelta(days=delta * (i + 1))
            actual = planned + timedelta(days=random.randint(-10, 20))
            if actual < start:
                # work cannot be finished before the project has begun
                actual = start
            if random.random() < 0.1:
                actual = None  # simulate a missing completion date
            data.append({
                "MilestoneID": milestone_id,
                "ProjectID": row["ProjectID"],
                # names cycle when a project has more milestones than names
                "Name": milestone_names[i % len(milestone_names)],
                "PlannedCompletion": planned,
                "ActualCompletion": actual,
                "Status": random.choice(["On Track", "Delayed", "Completed"])
            })
            milestone_id += 1
    return pd.DataFrame(data, columns=columns)

# Generate Resources table
def generate_resources(projects_df, multiplier=3):
    """Generate a Resources table with many labor/material lines per project.

    For every project, ``random.randint(50, 130) * multiplier`` resource lines
    are created. About 30% are labor lines, the rest material lines. Quantity
    is exponentially distributed (mean 33 for labor, 345 for material), cost
    is a random integer (21..68 for labor, 45..623 for material) and roughly
    1% of the costs are blanked out to simulate missing data.

    Parameters
    ----------
    projects_df : pandas.DataFrame
        Needs a ``ProjectID`` column.
    multiplier : int, default 3
        Scales the number of resource lines generated per project.

    Returns
    -------
    pandas.DataFrame
        Columns ``ResourceID``, ``ProjectID``, ``Type``, ``Quantity``,
        ``Cost`` and ``Name``.
    """
    # Per-type settings: (mean quantity, cost bounds, candidate names)
    material_profile = (345, (45, 623), ["Brk", "Cem", "Snd", "Wtr", "Stl", "Pls", "Wrd", "Pnt"])
    labor_profile = (33, (21, 68), ["Mason", "Carp", "Plumb", "Elec", "Fins"])

    records = []
    for _, project in projects_df.iterrows():
        line_total = random.randint(50, 130) * multiplier
        for _line in range(line_total):
            if random.random() < 0.3:
                kind, profile = "Labor", labor_profile
            else:
                kind, profile = "Material", material_profile
            mean_quantity, (cost_low, cost_high), candidate_names = profile

            amount = np.random.exponential(mean_quantity)
            price = random.randint(cost_low, cost_high)
            if random.random() < 0.01:
                price = np.nan  # simulate a missing cost
            else:
                price = round(price, 2)
            label = random.choice(candidate_names)

            records.append({
                "ResourceID": len(records) + 1,  # running 1-based id across all projects
                "ProjectID": project["ProjectID"],
                "Type": kind,
                "Quantity": round(amount, 2),
                "Cost": price,
                "Name": label,
            })
    return pd.DataFrame(records)

# Build every table; milestones and resources are derived from the projects table
companies_df = generate_companies()
projects_df = generate_projects()
milestones_df = generate_milestones(projects_df)
resources_df = generate_resources(projects_df, multiplier=4)  # above the default of 3 for more resource rows

# Write each table to its own CSV file inside the output directory
csv_exports = {
    "Companies_py": companies_df,
    "Projects_py": projects_df,
    "Milestones_py": milestones_df,
    "Resources_py": resources_df,
}
for table_stem, table_df in csv_exports.items():
    table_df.to_csv(f"{output_dir}/{table_stem}.csv", index=False)

print("✅ Mock data generated and saved in 'construction_data' folder.")


✅ Mock data generated and saved in 'construction_data' folder.


In [8]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import os

# Initialize Faker and seed every random source used below.
# Faker keeps its own random generator, so seeding `random` and NumPy alone
# leaves the generated dates and project names different on every run.
fake = Faker()
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Output path
output_dir = "construction_data"
os.makedirs(output_dir, exist_ok=True)

# Company names
companies = [
    "Murad Buildings", "NRG Uzbekistan", "Golden House", "Dream City Development",
    "Bizning Uylar", "PREMIER HOUSE", "Ulkan Development", "Discover Invest"
]

regions = [
    "Tashkent", "Samarkand", "Bukhara", "Fergana", "Namangan", "Andijan",
    "Khorezm", "Surkhandarya", "Karakalpakstan", "Jizzakh", "Qashqadaryo",
    "Navoi", "Sirdarya", "Nukus", "Almaty", "Bishkek", "Dushanbe", "Ashgabat"
]

def generate_projects(n=500):
    """Build a Projects table whose dates, costs and revenue depend on status.

    Status is drawn with weights 65% Completed, 25% Ongoing, 10% Planned:

    * Completed - has both dates, an actual cost and a revenue. The end date
      is drawn between five years ago and 30 days ago and the start date is
      the end date minus the actual duration, so a completed project never
      ends in the future. The start date may therefore be more than five
      years back.
    * Ongoing - has a start date within the last two years and an actual
      cost so far; no end date and no revenue.
    * Planned - has only a planned cost; dates, actual cost and revenue are
      empty.

    Parameters
    ----------
    n : int, default 500
        Number of projects; ``ProjectID`` runs from 1 to ``n``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ProjectID``, ``CompanyID``, ``ProjectName``, ``Region``,
        ``Status``, ``StartDate``, ``EndDate``, ``PlannedDuration`` (days),
        ``PlannedCost``, ``ActualCost``, ``Revenue`` and ``SquareMeters``.
    """
    data = []
    for i in range(1, n + 1):
        square_meters = round(np.random.lognormal(8, 0.7))
        # Planned duration: a build rate of 20-30 m² per day (about 1 day per
        # 25 m²), then a uniform deviation of -20% to +35%.
        base_duration = square_meters / random.uniform(20, 30)
        deviation_factor = np.random.uniform(0.8, 1.35)
        planned_duration = int(base_duration * deviation_factor)

        # Assign status with custom probability
        status = random.choices(
            ["Completed", "Ongoing", "Planned"],
            weights=[0.65, 0.25, 0.10],
            k=1
        )[0]

        # Every project has a budget, whatever its status.
        planned_cost = round(np.random.gamma(10, 10000), 2)

        # Defaults describe a "Planned" project; the branches below fill in
        # whatever is known for the other statuses.
        start_date = None
        end_date = None
        actual_cost = np.nan
        revenue = np.nan

        if status == "Completed":
            # Draw the completion date in the past and work backwards. Drawing
            # the start date and adding the duration could push the end date
            # past today for a project that is supposedly finished.
            end_date = fake.date_between(start_date='-5y', end_date='-30d')
            actual_duration = int(planned_duration * random.uniform(0.85, 1.2))
            start_date = end_date - timedelta(days=actual_duration)

            actual_cost = planned_cost * np.random.normal(1.1, 0.3)
            scaling_factor = np.random.lognormal(mean=0.4, sigma=0.9)
            revenue = planned_cost * scaling_factor
            if random.random() < 0.05:
                # occasional outlier project with unusually high revenue
                revenue *= random.uniform(2.5, 5.0)

        elif status == "Ongoing":
            start_date = fake.date_between(start_date='-2y', end_date='today')
            actual_cost = planned_cost * np.random.normal(1.05, 0.25)

        data.append({
            "ProjectID": i,
            "CompanyID": random.randint(1, len(companies)),
            "ProjectName": fake.bs().title(),
            "Region": random.choice(regions),
            "Status": status,
            "StartDate": start_date,
            "EndDate": end_date,
            "PlannedDuration": planned_duration,
            "PlannedCost": planned_cost,
            "ActualCost": round(actual_cost, 2) if not pd.isna(actual_cost) else np.nan,
            "Revenue": round(revenue, 2) if not pd.isna(revenue) else np.nan,
            "SquareMeters": square_meters,
        })

    return pd.DataFrame(data)



# Build the projects table
projects_df = generate_projects()

# Persist it as a CSV file inside the output directory
projects_csv_path = f"{output_dir}/Projects.csv"
projects_df.to_csv(projects_csv_path, index=False)

print("✅ Mock data generated and saved in 'construction_data' folder.")

✅ Mock data generated and saved in 'construction_data' folder.
