In [5]:
import random
import csv

# Read every line of updated_pollution_dataset.csv into memory.
with open('updated_pollution_dataset.csv', 'r') as f:
    lines = f.readlines()

# Fail with a clear message instead of an IndexError on lines[0].
if not lines:
    raise ValueError('updated_pollution_dataset.csv is empty; nothing to shuffle')

# Separate the header from the data rows.
# Every kept line is forced to end with a newline: when the input's last row
# has no terminator, readlines() returns it without one, and after shuffling
# it would be written directly in front of the following row, merging the two.
# Whitespace-only lines carry no record, so they are dropped rather than
# shuffled into the middle of the data.
header = lines[0] if lines[0].endswith('\n') else lines[0] + '\n'
data_lines = [
    line if line.endswith('\n') else line + '\n'
    for line in lines[1:]
    if line.strip()
]

# Shuffle the data rows in place (unseeded, so every run gives a new order).
random.shuffle(data_lines)

# Write the header followed by the shuffled rows to shuffled_pollution_dataset.csv.
# newline='' writes the line terminators exactly as they are held in memory.
with open('shuffled_pollution_dataset.csv', 'w', newline='') as f:
    f.write(header)
    f.writelines(data_lines)

In [6]:
import pandas as pd
import numpy as np

def clean_negative_values(input_file, output_file):
    """Clamp negative feature values in a CSV file to zero and save the result.

    Every column except the last one is treated as a feature column and is
    compared against zero; the last column is left untouched. Summary
    statistics of the feature columns are printed before and after the
    clean-up, together with the offending values of each affected column.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path the cleaned CSV is written to (without the index).
    """
    frame = pd.read_csv(input_file)

    # Everything but the trailing column is checked for negative values.
    feature_cols = frame.columns[:-1]

    print("Before cleaning:")
    print(frame[feature_cols].describe())

    for name in feature_cols:
        below_zero = frame[name] < 0
        n_negative = below_zero.sum()
        if n_negative <= 0:
            continue  # nothing to fix in this column

        print(f"\nFound {n_negative} negative values in {name}")
        print(f"Negative values: {frame.loc[below_zero, name].values}")
        # clip() raises every value below 0 up to 0 and leaves the rest as is.
        frame[name] = frame[name].clip(lower=0)

    print("\nAfter cleaning:")
    print(frame[feature_cols].describe())

    frame.to_csv(output_file, index=False)
    print(f"\nCleaned data saved to {output_file}")

if __name__ == "__main__":
    # Clean the shuffled dataset and store the result under a new file name.
    input_file, output_file = (
        "shuffled_pollution_dataset.csv",
        "cleaned_pollution_dataset.csv",
    )
    clean_negative_values(input_file, output_file)

Before cleaning:
       Temperature     Humidity        PM2.5         PM10          NO2  \
count  5000.000000  5000.000000  5000.000000  5000.000000  5000.000000   
mean     30.029020    70.056120    20.142140    30.218360    26.412100   
std       6.720661    15.863577    24.554546    27.349199     8.895356   
min      13.400000    36.000000     0.000000    -0.200000     7.400000   
25%      25.100000    58.300000     4.600000    12.300000    20.100000   
50%      29.000000    69.800000    12.000000    21.700000    25.300000   
75%      34.000000    80.300000    26.100000    38.100000    31.900000   
max      58.600000   128.100000   295.000000   315.800000    64.900000   

               SO2           CO  Proximity_to_Industrial_Areas  \
count  5000.000000  5000.000000                    5000.000000   
mean     10.014820     1.500354                       8.425400   
std       6.750303     0.546027                       3.610944   
min      -6.200000     0.650000                     

In [4]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import csv

def scale_values(input_file, output_file):
    """Scale the feature columns of a CSV file and write the result.

    The first row is taken as the header. In every following row all cells
    except the last are parsed as floats (the features) and the last cell is
    kept as a string label. The features are passed through
    ``StandardScaler().fit_transform`` and written back, each scaled row
    followed by its original label, under the unchanged header.

    NOTE(review): the scaler is fitted on every row of the file; if the data
    is later split into train/test sets, confirm that this is intended.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write.

    Raises:
        ValueError: If the input file is empty or a feature cell cannot be
            converted to float.
    """
    # newline='' is what the csv module asks for: it lets the reader do its
    # own line-ending handling, so quoted fields with embedded newlines parse
    # correctly.
    with open(input_file, 'r', newline='') as file:
        file_csv = csv.reader(file)
        header = next(file_csv, None)
        if header is None:
            raise ValueError(f"{input_file} is empty: expected a header row")

        feature_rows = []
        labels = []
        for row in file_csv:
            if not row:
                # csv.reader yields [] for a blank line; it holds no record.
                continue
            feature_rows.append([float(val) for val in row[:-1]])
            labels.append(row[-1])  # Keep label as string

    # The input file is closed at this point; everything needed is in memory.
    features = np.array(feature_rows)

    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)

    # Combine scaled features with labels
    scaled_data = [list(row) + [label] for row, label in zip(features_scaled, labels)]

    # Write scaled data
    with open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        writer.writerows(scaled_data)

# Scale the cleaned dataset and store the result as the final dataset.
input_file, output_file = (
    "cleaned_pollution_dataset.csv",
    "final_pollution_dataset.csv",
)
scale_values(input_file, output_file)


In [2]:
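# Optional augmentation step: appends a noisy copy of every row to the original
# data, doubling the dataset. Run it only if synthetic rows are really wanted.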

import csv
import random

def add_noise(data, noise_level=0.01):
    """Return a copy of ``data`` with multiplicative noise on every feature.

    Each row is a sequence whose last element is the label; all preceding
    elements are converted to float and perturbed by a random fraction drawn
    uniformly from ``[-noise_level, noise_level]`` of their own value. The
    label is carried over unchanged and ``data`` itself is not modified.

    Args:
        data: Iterable of rows (features followed by one label).
        noise_level: Largest relative perturbation applied to a feature.

    Returns:
        A new list of rows holding the perturbed floats plus the label.
    """
    def jitter(raw):
        # Draw the random factor first, then parse the value.
        factor = random.uniform(-noise_level, noise_level)
        base = float(raw)
        return base + factor * base

    return [
        [jitter(cell) for cell in record[:-1]] + [record[-1]]
        for record in data
    ]

# Read the original dataset.
# newline='' is what the csv module asks for on files handed to csv.reader, so
# that it can do its own line-ending handling. Blank lines come back from the
# reader as empty lists; they are skipped because add_noise indexes the last
# cell of every row.
with open('updated_pollution_dataset.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    data = [row for row in reader if row]

# Augment the dataset: one noisy copy per original row.
augmented_data = add_noise(data)

# Combine original and augmented data (originals first, noisy copies after).
combined_data = data + augmented_data
print(len(combined_data))
# Write the combined data to a new file
with open('augmented_pollution_dataset.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(combined_data)

10000
