In [3]:
import boto3
import os
import pandas as pd
# Define S3 bucket, key prefix, and the local download directory
s3_bucket = 'sagemaker-us-east-1-099237183269'
s3_folder = 'PollutionProject/'
local_dir = './data/'

# Create local directory if not exists (exist_ok makes the cell safe to re-run)
os.makedirs(local_dir, exist_ok=True)

# Initialize S3 client
s3 = boto3.client('s3')

# List and download files from S3.
# A single list_objects_v2 call returns at most 1000 keys, so use the
# paginator to follow continuation tokens and see every object under the prefix.
paginator = s3.get_paginator('list_objects_v2')
found_any_keys = False
for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_folder):
    # A page has no 'Contents' entry when nothing matches the prefix
    for obj in page.get('Contents', []):
        found_any_keys = True
        s3_file = obj['Key']
        if s3_file.endswith('/'):
            continue  # Skip directory placeholder keys
        # NOTE: only the basename is kept, so objects in different sub-prefixes
        # that share a file name will overwrite each other locally.
        local_file = os.path.join(local_dir, os.path.basename(s3_file))
        s3.download_file(s3_bucket, s3_file, local_file)
        print(f'Downloaded {s3_file} to {local_file}')

if not found_any_keys:
    print('No files found in S3 bucket.')

Downloaded PollutionProject/Urban_Traffic_Data.csv to ./data/Urban_Traffic_Data.csv
Downloaded PollutionProject/global air pollution dataset.csv to ./data/global air pollution dataset.csv
Downloaded PollutionProject/global_air_quality_data_10000.csv to ./data/global_air_quality_data_10000.csv


In [4]:
# Read each downloaded CSV into its own DataFrame
pollution_data = pd.read_csv('./data/global air pollution dataset.csv')
air_quality_data = pd.read_csv('./data/global_air_quality_data_10000.csv')
traffic_data = pd.read_csv('./data/Urban_Traffic_Data.csv')

# Pair every frame with the label used in its printed headings
labelled_frames = [
    ("Pollution Data", pollution_data),
    ("Air Quality Data", air_quality_data),
    ("Traffic Data", traffic_data),
]

# Preview: leading rows of each dataset via head()
for label, frame in labelled_frames:
    print(f"{label}:")
    display(frame.head())

# Summary statistics via describe()
for label, frame in labelled_frames:
    print(f"{label} Summary:")
    display(frame.describe())

# Per-column count of null entries
for label, frame in labelled_frames:
    print(f"Missing Values in {label}:")
    print(frame.isnull().sum())

Pollution Data:


Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category
0,Russian Federation,Praskoveya,51,Moderate,1,Good,36,Good,0,Good,51,Moderate
1,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,41,Good
2,Italy,Priolo Gargallo,66,Moderate,1,Good,39,Good,2,Good,66,Moderate
3,Poland,Przasnysz,34,Good,1,Good,34,Good,0,Good,20,Good
4,France,Punaauia,22,Good,0,Good,22,Good,0,Good,6,Good


Air Quality Data:


Unnamed: 0,City,Country,Date,PM2.5,PM10,NO2,SO2,CO,O3,Temperature,Humidity,Wind Speed
0,Bangkok,Thailand,2023-03-19,86.57,25.19,99.88,30.63,4.46,36.29,17.67,59.35,13.76
1,Istanbul,Turkey,2023-02-16,50.63,97.39,48.14,8.71,3.4,144.16,3.46,67.51,6.36
2,Rio de Janeiro,Brazil,2023-11-13,130.21,57.22,98.51,9.92,0.12,179.31,25.29,29.3,12.87
3,Mumbai,India,2023-03-16,119.7,130.52,10.96,33.03,7.74,38.65,23.15,99.97,7.71
4,Paris,France,2023-04-04,55.2,36.62,76.85,21.85,2.0,67.09,16.02,90.28,14.16


Traffic Data:


Unnamed: 0,City,Year,Average_Daily_Traffic_Counts,Peak_Hourly_Traffic_Volume,Percentage_of_Commercial_Vehicles,Number_of_Road_Accidents,Average_Traffic_Speed_kmh,Air_Quality_Index,Population_Million
0,New York,2015,19967,4705,15.3,159,30.7,113,11.24
1,London,2015,22128,3408,15.4,609,71.7,85,12.37
2,Tokyo,2015,44738,2017,9.1,775,52.3,53,11.97
3,Mumbai,2015,38295,4140,19.2,614,72.7,72,10.55
4,Sydney,2015,30337,4654,14.9,114,29.1,74,7.16


Pollution Data Summary:


Unnamed: 0,AQI Value,CO AQI Value,Ozone AQI Value,NO2 AQI Value,PM2.5 AQI Value
count,23463.0,23463.0,23463.0,23463.0,23463.0
mean,72.010868,1.368367,35.193709,3.063334,68.519755
std,56.05522,1.832064,28.098723,5.254108,54.796443
min,6.0,0.0,0.0,0.0,0.0
25%,39.0,1.0,21.0,0.0,35.0
50%,55.0,1.0,31.0,1.0,54.0
75%,79.0,1.0,40.0,4.0,79.0
max,500.0,133.0,235.0,91.0,500.0


Air Quality Data Summary:


Unnamed: 0,PM2.5,PM10,NO2,SO2,CO,O3,Temperature,Humidity,Wind Speed
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,77.448439,104.438161,52.198649,25.34449,5.047984,106.031643,14.89715,55.078579,10.231636
std,41.927871,55.062396,27.32049,14.091194,2.852625,55.081345,14.4438,25.982232,5.632628
min,5.02,10.0,5.01,1.0,0.1,10.04,-10.0,10.01,0.5
25%,41.185,57.1375,28.3475,13.19,2.56,58.38,2.2575,32.5275,5.29
50%,77.725,103.69,52.1,25.35,5.09,106.055,14.755,55.08,10.26
75%,113.3925,152.265,75.705,37.5,7.48,153.9825,27.3825,77.4425,15.07
max,149.98,200.0,100.0,49.99,10.0,200.0,40.0,99.99,20.0


Traffic Data Summary:


Unnamed: 0,Year,Average_Daily_Traffic_Counts,Peak_Hourly_Traffic_Volume,Percentage_of_Commercial_Vehicles,Number_of_Road_Accidents,Average_Traffic_Speed_kmh,Air_Quality_Index,Population_Million
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,2019.5,29655.313333,2972.856667,21.151,562.616667,50.828,99.01,7.947133
std,2.87708,11360.48309,1159.2434,9.544409,272.472188,17.596973,27.763512,4.06614
min,2015.0,10029.0,1005.0,5.1,104.0,20.5,50.0,1.02
25%,2017.0,20790.25,1964.25,12.6,315.75,34.575,74.0,4.1175
50%,2019.5,29401.0,3052.0,20.75,573.5,52.1,99.0,8.055
75%,2022.0,39997.75,3911.75,29.15,803.25,66.575,122.0,11.4575
max,2024.0,49935.0,4999.0,39.9,996.0,79.9,149.0,14.99


Missing Values in Pollution Data:
Country               427
City                    1
AQI Value               0
AQI Category            0
CO AQI Value            0
CO AQI Category         0
Ozone AQI Value         0
Ozone AQI Category      0
NO2 AQI Value           0
NO2 AQI Category        0
PM2.5 AQI Value         0
PM2.5 AQI Category      0
dtype: int64
Missing Values in Air Quality Data:
City           0
Country        0
Date           0
PM2.5          0
PM10           0
NO2            0
SO2            0
CO             0
O3             0
Temperature    0
Humidity       0
Wind Speed     0
dtype: int64
Missing Values in Traffic Data:
City                                 0
Year                                 0
Average_Daily_Traffic_Counts         0
Peak_Hourly_Traffic_Volume           0
Percentage_of_Commercial_Vehicles    0
Number_of_Road_Accidents             0
Average_Traffic_Speed_kmh            0
Air_Quality_Index                    0
Population_Million                   0
dtype