In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Read the source CSV into the working DataFrame `df`.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='Data.csv')

In [18]:
# Structural overview of the loaded frame: dimensions, column dtypes, summary statistics.
n_rows_cols = df.shape
print("Dataset Shape:", n_rows_cols)

print("\nDataset Info:")
df.info()  # info() writes its report to stdout itself, so no print() wrapper

summary_stats = df.describe()
print("\nDataset Description:")
print(summary_stats)

Dataset Shape: (29531, 16)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB

Dataset Description:
              PM2.5          PM10   

In [19]:
# Count the missing entries in each column.
missing_per_column = df.isna().sum()
print("Missing Values:")
print(missing_per_column)

Missing Values:
City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64


In [20]:
# Treat zero readings as missing, then keep only rows with no missing value.
# The chained, non-mutating form rebinds `df` instead of modifying the loaded
# frame object in place (no inplace=True side effect).
# NOTE(review): zeros are replaced in *every* column, and dropna() discards a row
# if *any* column is missing, so the most sparsely populated columns decide how
# many rows survive -- confirm this is the intended cleaning policy.
df = df.replace(0, np.nan).dropna()

# Sanity check: total count of missing cells remaining across the whole frame.
print("Missing Values After Cleaning:", df.isnull().sum().sum())

Missing Values After Cleaning: 0


In [21]:
# Remove rows that exactly repeat an earlier row (all columns compared),
# keeping the first occurrence of each.
df = df.drop_duplicates(subset=None, keep='first')

In [22]:
# One-hot encode the categorical AQI bucket; drop_first=True omits the indicator
# column of the first category.
# The guard keeps the cell re-runnable: after the first run the source column has
# been replaced by its indicator columns, and encoding it again would raise KeyError.
if 'AQI_Bucket' in df.columns:
    df = pd.get_dummies(df, columns=['AQI_Bucket'], drop_first=True)

In [23]:
# Preview the first ten rows of the current frame.
preview = df.head(n=10)
print(preview)

           City        Date  PM2.5    PM10    NO    NO2    NOx    NH3    CO  \
2123  Amaravati  25-11-2017  81.40  124.50  1.44  20.50  12.08  10.72  0.12   
2124  Amaravati  26-11-2017  78.32  129.06  1.26  26.00  14.85  10.28  0.14   
2125  Amaravati  27-11-2017  88.76  135.32  6.60  30.85  21.77  12.91  0.11   
2126  Amaravati  28-11-2017  64.18  104.09  2.56  28.07  17.01  11.42  0.09   
2127  Amaravati  29-11-2017  72.47  114.84  5.23  23.20  16.59  12.25  0.16   
2128  Amaravati  30-11-2017  69.80  114.86  4.69  20.17  14.54  10.95  0.12   
2129  Amaravati  01-12-2017  73.96  113.56  4.58  19.29  13.97  10.95  0.10   
2130  Amaravati  02-12-2017  89.90  140.20  7.71  26.19  19.87  13.12  0.10   
2131  Amaravati  03-12-2017  87.14  130.52  0.97  21.31  12.12  14.36  0.15   
2132  Amaravati  04-12-2017  84.64  125.00  4.02  26.98  17.58  14.41  0.18   

        SO2      O3  Benzene  Toluene  Xylene    AQI  AQI_Bucket_Moderate  \
2123  15.24  127.09     0.20     6.50    0.06  184.0 

In [24]:
# Outlier detection on AQI using the Z-score method.
aqi_values = df['AQI']
mean_aqi = aqi_values.mean()
std_aqi = aqi_values.std()  # pandas default: sample standard deviation (ddof=1)

print(f"Mean of AQI: {mean_aqi}")
print(f"Standard Deviation of AQI: {std_aqi}")

# Standardise: each value's distance from the mean, in units of standard deviation.
df['Z_Score'] = aqi_values.sub(mean_aqi).div(std_aqi)
print(df.loc[:, ['AQI', 'Z_Score']])

# Outliers are rows more than three standard deviations from the mean, either side.
is_outlier = df['Z_Score'].abs().gt(3)
outliers = df.loc[is_outlier]
print(outliers)

Mean of AQI: 138.48802144412798
Standard Deviation of AQI: 91.64490404411067
         AQI   Z_Score
2123   184.0  0.496612
2124   197.0  0.638464
2125   198.0  0.649376
2126   188.0  0.540259
2127   173.0  0.376584
...      ...       ...
29523   86.0 -0.572733
29524   77.0 -0.670938
29525   47.0 -0.998288
29526   41.0 -1.063758
29527   70.0 -0.747319

[5969 rows x 2 columns]
            City        Date   PM2.5    PM10     NO    NO2     NOx    NH3  \
3308    Amritsar  20-10-2017  248.12  445.44  29.93  40.11   70.05  60.89   
4265    Amritsar  03-06-2020   26.72  486.99  20.32  12.54   29.09  12.08   
10229      Delhi  01-01-2015  313.22  607.98  69.16  36.39  110.59  33.85   
10230      Delhi  02-01-2015  186.18  269.55  62.09  32.87   88.14  31.83   
10521      Delhi  20-10-2015  198.10  336.47  91.29  74.62  155.10  36.75   
...          ...         ...     ...     ...    ...    ...     ...    ...   
14880  Hyderabad  30-10-2015  516.20   82.97   3.03   0.62    2.72  10.78   
14881 

In [25]:
# Min-max scale AQI into [0, 1]: (x - min) / (max - min).
min_aqi = df['AQI'].min()
max_aqi = df['AQI'].max()
aqi_range = max_aqi - min_aqi

if aqi_range == 0:
    # Constant column (e.g. a single surviving row): dividing would give 0/0 and
    # a silent all-NaN column. Every value equals the minimum, so it maps to 0.0;
    # subtracting the minimum yields exactly that while leaving any NaN as NaN.
    df['AQI_normalized'] = df['AQI'] - min_aqi
else:
    df['AQI_normalized'] = (df['AQI'] - min_aqi) / aqi_range

In [26]:
# Show the raw AQI next to its min-max scaled counterpart.
aqi_comparison = df.loc[:, ['AQI', 'AQI_normalized']]
print(aqi_comparison)

         AQI  AQI_normalized
2123   184.0        0.246177
2124   197.0        0.266055
2125   198.0        0.267584
2126   188.0        0.252294
2127   173.0        0.229358
...      ...             ...
29523   86.0        0.096330
29524   77.0        0.082569
29525   47.0        0.036697
29526   41.0        0.027523
29527   70.0        0.071865

[5969 rows x 2 columns]
