<a href="https://colab.research.google.com/github/wolfzxcv/ml-examples/blob/master/generate_maintenance_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

def create_large_dataset(data, num_samples, *, noise_std=1.0, seed=42):
    """Expand a small seed dataset into ``num_samples`` noisy rows.

    The seed rows are replayed in order, wrapping around to the first row
    as often as needed. Every feature value gets independent Gaussian noise
    added and is rounded to one decimal place; the ``'maintenance_required'``
    label is copied through unchanged.

    Args:
        data: Mapping of column name to a sequence of values. It must
            contain an ``'engine_temp'`` column, whose length is taken as
            the number of seed rows; every other column is indexed with the
            same row positions.
        num_samples: Number of rows to generate. Zero or a negative value
            yields an empty frame.
        noise_std: Standard deviation of the zero-mean noise added to each
            feature value. Larger values blur the features more.
        seed: Seed for the private random generator. The default reproduces
            the values obtained by seeding NumPy's global generator with 42.

    Returns:
        A ``pandas.DataFrame`` with the same columns as ``data``.

    Raises:
        ValueError: If rows are requested but the seed dataset has no rows.
    """
    label_key = 'maintenance_required'
    data_length = len(data['engine_temp'])
    if num_samples > 0 and data_length == 0:
        raise ValueError("cannot generate samples from an empty seed dataset")

    # Use a private generator rather than np.random.seed(): the legacy
    # RandomState yields the same stream for the same seed, but leaves the
    # caller's global random state untouched.
    rng = np.random.RandomState(seed)

    large_data = {key: [] for key in data}
    for sample_index in range(num_samples):
        # Wrap around the seed rows; this covers both the full passes and
        # the final partial pass with a single loop.
        row = sample_index % data_length
        for key, values in data.items():
            if key == label_key:
                # The label stays exact so it keeps its original type.
                large_data[key].append(values[row])
            else:
                noise = rng.normal(0, noise_std)
                large_data[key].append(round(values[row] + noise, 1))

    return pd.DataFrame(large_data)


# Seed observations that the generator replays with noise. Each key is one
# column and the lists are read position by position, so they are expected
# to be the same length. 'maintenance_required' is the 0/1 label column.
data = {
    'engine_temp': [85, 90, 75, 100, 65, 80, 95, 70, 105, 60, 90, 75, 100, 65, 80, 95, 70, 105, 60, 90, 75, 100, 65, 80, 95, 70, 105, 60, 90, 75, 100, 65, 80, 95, 70, 105, 60, 90, 75, 100, 65, 80, 95, 70, 105, 60, 90, 75, 100, 65, 80, 95, 70, 105, 60],
    'oil_level': [75, 70, 80, 65, 85, 78, 68, 82, 60, 88, 70, 80, 65, 85, 78, 68, 82, 60, 88, 70, 80, 65, 85, 78, 68, 82, 60, 88, 70, 80, 65, 85, 78, 68, 82, 60, 88, 70, 80, 65, 85, 78, 68, 82, 60, 88, 70, 80, 65, 85, 78, 68, 82, 60, 88],
    'tire_pressure': [32, 30, 34, 29, 35, 33, 31, 36, 28, 37, 30, 34, 29, 35, 33, 31, 36, 28, 37, 30, 34, 29, 35, 33, 31, 36, 28, 37, 30, 34, 29, 35, 33, 31, 36, 28, 37, 30, 34, 29, 35, 33, 31, 36, 28, 37, 30, 34, 29, 35, 33, 31, 36, 28, 37],
    'battery_voltage': [12.5, 12.3, 12.8, 12.1, 12.6, 12.4, 12.2, 12.9, 11.9, 13.0, 12.3, 12.8, 12.1, 12.6, 12.4, 12.2, 12.9, 11.9, 13.0, 12.3, 12.8, 12.1, 12.6, 12.4, 12.2, 12.9, 11.9, 13.0, 12.3, 12.8, 12.1, 12.6, 12.4, 12.2, 12.9, 11.9, 13.0, 12.3, 12.8, 12.1, 12.6, 12.4, 12.2, 12.9, 11.9, 13.0, 12.3, 12.8, 12.1, 12.6, 12.4, 12.2, 12.9, 11.9, 13.0],
    'fuel_consumption': [8, 7.5, 9, 7, 8.5, 8.2, 7.8, 9.1, 6.9, 9.5, 7.5, 9, 7, 8.5, 8.2, 7.8, 9.1, 6.9, 9.5, 7.5, 9, 7, 8.5, 8.2, 7.8, 9.1, 6.9, 9.5, 7.5, 9, 7, 8.5, 8.2, 7.8, 9.1, 6.9, 9.5, 7.5, 9, 7, 8.5, 8.2, 7.8, 9.1, 6.9, 9.5, 7.5, 9, 7, 8.5, 8.2, 7.8, 9.1, 6.9, 9.5],
    'maintenance_required': [1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0]
}

# Generate a larger dataset. create_large_dataset already returns a
# DataFrame, so it can be written out directly without re-wrapping it.
large_df = create_large_dataset(data, num_samples=987)

# index=False keeps the row index out of the CSV, leaving only the columns above.
large_df.to_csv('vehicle_maintenance_data.csv', index=False)