In [2]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Load and preprocess data
df = pd.read_csv('mannual_data.csv')

# Initialize faker for company names
fake = Faker()
Faker.seed(42)
np.random.seed(42)
random.seed(42)

def generate_synthetic_data(original_df, num_records=100):
    # Create a copy to avoid modifying original data
    df = original_df.copy()
    
    # Separate company column and numerical data
    company_col = df['Company']
    numerical_df = df.drop(columns=['Company'])
    
    # Replace 0s with NaN for easier handling
    numerical_df.replace(0, np.nan, inplace=True)
    
    # Calculate company-specific averages for each column
    company_means = numerical_df.groupby(company_col).transform(lambda x: x.fillna(x.mean()))
    
    # Fill remaining NaNs with global averages
    global_means = numerical_df.mean()
    numerical_filled = company_means.fillna(global_means)
    
    # Calculate final averages for generation
    generation_means = numerical_filled.mean()
    generation_stds = numerical_filled.std()
    
    # Generate synthetic data
    synthetic_data = []
    
    # Create base company names for multi-year entries
    num_base_companies = random.randint(400, 500)  # Number of unique base companies
    base_companies = [fake.unique.company().split()[0].lower() for _ in range(num_base_companies)]
    
    for _ in range(num_records):
        # Select base company and generate year
        base = random.choice(base_companies)
        year = random.randint(2020, 2024)
        company = f"{base}-{year}"
        
        record = {'Company': company}
        for col in numerical_df.columns:
            # Generate value with normal distribution around mean
            base_value = np.random.normal(generation_means[col], generation_stds[col])
            
            # Apply column-specific adjustments
            if 'Emission' in col:
                base_value = abs(base_value)
            elif 'Employee' in col or 'Waste' in col:
                base_value = round(abs(base_value))
            elif 'Gender' in col or 'WOB' in col:
                base_value = abs(base_value) % 1  # Keep between 0-1
            elif 'certification' in col:
                base_value = round(abs(base_value))
            
            record[col] = max(base_value, 0)  # Ensure no negative values
        
        synthetic_data.append(record)
    
    return pd.DataFrame(synthetic_data)

# Generate 100-1000 records
num_records = random.randint(100, 1000)
#num_records = random.randint(10, 500)
synthetic_df = generate_synthetic_data(df, num_records)



cleaned_data = []

for entry in synthetic_df['Company']:

    
    if ',' in entry:
        name, year = entry.split(',')
    else:
        name, year = entry.rsplit('-', 1)  # In case there's no comma, get the last part as year

    # Remove all dashes from the name part and strip spaces
    name_cleaned = name.replace("--", "")
    name_cleaned1 = name_cleaned.replace("-", "")
    
    # Strip spaces and ensure we have only one dash before the year
    year_cleaned = year.replace("-", "")
    
    # Avoid double dash
    cleaned_entry = f"{name_cleaned1}-{year_cleaned}"
    
    cleaned_data.append(cleaned_entry)
synthetic_df['Company'] = cleaned_data

#remove duplicates
synthetic_df = synthetic_df.drop_duplicates(subset=['Company'])


# Save to CSV
synthetic_df.to_csv('synthetic_data.csv', index=False)
print(f"Generated {num_records} synthetic records saved to synthetic_data.csv")

Generated 337 synthetic records saved to synthetic_data.csv
