In [None]:
import numpy as np
import pandas as pd

# Mount Google Drive at /content/drive (Colab only); the dataset CSV is read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the air-quality CSV from the mounted Drive into a pandas DataFrame,
# then show summary statistics of its numeric columns.
file_path = '/content/drive/My Drive/Training/UNJ/Dataset/data_indian_air_quality.csv'
dataset = pd.read_csv(
    file_path,
    encoding="ISO-8859-1",
)
dataset.describe()



The dataset primarily consists of five different types of pollutants measured over the years in different states and cities of India.

SO2 and NO2 are harmful gaseous emissions, while rspm, spm and pm2_5 come under suspended air pollutants.

> The count clearly shows that there is a variable number of non-null entries for each of the pollutants.

> To understand the dataset further, we will have a look at all the different columns now and store them for future reference.




In [None]:
# List every column name in the DataFrame.
dataset.columns
# Apart from the major pollutants, there are columns that refer to the respective states, agencies, sampling dates and the type.
# We will now have a look at what kind of data each of the columns consists of.

Understanding the pollutants briefly here.

**NO2**: Nitrogen dioxide, emitted mostly from combustion in power sources or transport.

**SO2**: Sulphur dioxide, emitted mostly from coal burning, oil burning and the manufacturing of sulphuric acid.

**spm**: Suspended particulate matter, known to be the deadliest form of air pollution. The particles are microscopic in nature and are found suspended in the earth's atmosphere.

**rspm**: Respirable suspended particulate matter. A sub-form of spm that is responsible for respiratory diseases.

**pm2_5**: Suspended particulate matter with diameters less than 2.5 micrometres. These particles tend to remain suspended for longer durations and are potentially very harmful.

Let us get back to the data again and see how it is stored.


In [None]:

# Print the dtype and non-null count of every column.
dataset.info()

# Now, we can immediately see that there are quite a few nulls in various columns, which need work and first need a closer inspection.

In [7]:
# Preview the first five rows to see how the raw values are stored.
dataset.head()

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
0,150.0,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",4.8,17.4,,,,,1990-02-01
1,151.0,February - M021990,Andhra Pradesh,Hyderabad,,Industrial Area,3.1,7.0,,,,,1990-02-01
2,152.0,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.2,28.5,,,,,1990-02-01
3,150.0,March - M031990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.3,14.7,,,,,1990-03-01
4,151.0,March - M031990,Andhra Pradesh,Hyderabad,,Industrial Area,4.7,7.5,,,,,1990-03-01


Clearly there are lots of null values, noticeably in stn_code and agency, both of which should therefore not be included further in the analysis.

> Intuitively, these two columns will hardly add much value to analysis.

> Now, focusing on the categorical variables, we are left with location_monitoring_station which consists of considerable nulls (approximately 27000).

**It would have been useful to have those values for an in-depth analysis, but for now we will keep it out because of the null values and come back later if needed.**

1. Out of the two date columns, immediate attention goes to sampling_date, which has different formats within it, highlighting some data input issues.
1. While it is important to have this metric, it is more useful to go back to the origin of the dataset and ask relevant questions: why are there different formats? Is it a human error, or an error due to incorporating different formats? For now, we will keep it out and only retain the date column.

In [None]:

# Drop the columns excluded from the analysis: stn_code, agency and
# location_monitoring_station (many nulls) and sampling_date (inconsistent formats).
# The result is re-bound rather than dropped in place, and errors='ignore' skips
# columns that are already gone, so re-running this cell does not raise KeyError.
dataset = dataset.drop(
    columns=['stn_code', 'agency', 'sampling_date', 'location_monitoring_station'],
    errors='ignore',
)
dataset.info()
dataset.head()

In [None]:
# Fix the missing values for all the pollutant columns first.
# Every gap is filled with the mean of its own column. This is done with pandas
# directly because sklearn.preprocessing.Imputer was removed in scikit-learn 0.22,
# so importing it fails on current installs.
# Columns are selected by name rather than by position (iloc[:, 3:8]) so the cell
# cannot silently impute the wrong columns if the column order ever changes.
pollutant_columns = ['so2', 'no2', 'rspm', 'spm', 'pm2_5']
pollutant_means = dataset[pollutant_columns].mean()
# fillna with a Series aligns on column labels, i.e. each column gets its own mean.
dataset[pollutant_columns] = dataset[pollutant_columns].fillna(pollutant_means)
dataset.info()
dataset.head()

In [None]:
# Fix the missing values in the column 'type'.
# Nulls are filled with the most frequent label. The label is taken from the data
# via mode() instead of being typed by hand: a hand-typed label that differs by a
# single character (e.g. 'Residential,Rural ...' vs 'Residential, Rural ...')
# matches no existing label and silently creates a new category.
common_value = dataset['type'].mode().iloc[0]
dataset['type'] = dataset['type'].fillna(common_value)
dataset.info()

We have fixed the missing values now and made the dataset much shorter to focus on the key variables.
> We should start with some preliminary visualisations, starting foremost with those of the pollutants

# Grouping the emissions by state.
> Having looked at the pollutant distributions, we will now focus on how these emissions are stacked across the Indian states.
> We will use groupby on the dataset DataFrame and store the result in another DataFrame named `statewise_emmissions`.

In [None]:

# Average each pollutant per state and plot the result as a stacked area chart.
# The pollutant columns are selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns (location, type, date),
# which raises TypeError on pandas >= 2.0.
statewise_emmissions = dataset.groupby('state')[['so2', 'no2', 'rspm', 'spm', 'pm2_5']].mean()
statewise_emmissions.plot.area()



**The highest emissions are for spm, for each of the states.**

> Getting the statistics for the highest emissions, when grouped statewise.

In [None]:
# Summary statistics of the per-state pollutant means.
statewise_emmissions.describe()


In [None]:
# Rank the states by mean NO2 (highest first) and keep the top ten rows.
Top10States_with_highest_No2 = (
    statewise_emmissions
    .sort_values(by='no2', ascending=False)
    .head(10)
)
# Keep only the NO2 column; the final head() displays the first five of the ten.
Top10States_with_highest_No2_sorted = Top10States_with_highest_No2[['no2']]
Top10States_with_highest_No2_sorted.head()

1. West Bengal and Delhi show the highest NO2 emissions over the years.
1. Questions to ask: Have the vehicles emitting NOx and NO2 been monitored well in these two states?
1. What type of industrial waste is being generated in these two states?

In [None]:
# Rank the states by mean SO2 (highest first) and keep the top ten rows.
Top10states_with_highest_So2 = (
    statewise_emmissions
    .sort_values(by='so2', ascending=False)
    .head(10)
)
# Keep only the SO2 column; the final head() displays the first five of the ten.
Top10states_with_highest_So2_sorted = Top10states_with_highest_So2[['so2']]
Top10states_with_highest_So2_sorted.head()

1. With Uttaranchal and Jharkhand right at the top, it would be wise to ask whether the coal mining industry in these two states is regulated well, as a substantial amount of SO2 emission can come from the combustion of coal.
> Importantly, there are no states from South India.

In [None]:
# Rank the states by mean rspm (highest first) and keep the top ten rows.
Top10states_with_highest_rspm = (
    statewise_emmissions
    .sort_values(by='rspm', ascending=False)
    .head(10)
)
# Keep only the rspm column; the final head() displays the first five of the ten.
Top10states_with_highest_rspm_sorted = Top10states_with_highest_rspm[['rspm']]
Top10states_with_highest_rspm_sorted.head()

In [None]:
# Rank the states by mean spm (highest first) and keep the top ten rows.
Top10states_with_highest_spm = (
    statewise_emmissions
    .sort_values(by='spm', ascending=False)
    .head(10)
)
# Keep only the spm column; the final head() displays the first five of the ten.
Top10states_with_highest_spm_sorted = Top10states_with_highest_spm[['spm']]
Top10states_with_highest_spm_sorted.head()

**The distribution for spm is generally on the higher side, but Delhi and Uttar Pradesh show the highest presence of suspended particulate matter.**

1. Are there specific industries that contribute more to the spm concentration in and around Delhi?
> To be underlined is the absence of Southern and North-Eastern states.

In [None]:
# Rank the states by mean pm2_5 (highest first) and keep the top ten rows.
Top10states_with_highest_pm2_5 = (
    statewise_emmissions
    .sort_values(by='pm2_5', ascending=False)
    .head(10)
)
# Keep only the pm2_5 column; the final head() displays the first five of the ten.
Top10states_with_highest_pm2_5_sorted = Top10states_with_highest_pm2_5[['pm2_5']]
Top10states_with_highest_pm2_5_sorted.head()

> The output can be misleading owing to the missing pm2_5 values and their subsequent replacement by the mean.
> Nevertheless, Delhi still shows the highest measured value of pm2_5.


In [None]:
# Get the city-wise statistics for the pollutants: mean of each pollutant per location.
# The pollutant columns are selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the text columns (state, type, date),
# which raises TypeError on pandas >= 2.0.
locationwise_emmissions = dataset.groupby('location')[['so2', 'no2', 'rspm', 'spm', 'pm2_5']].mean()

In [None]:
# Rank the cities by mean NO2 (highest first) and keep the top ten rows.
Top10Cities_with_highest_NO2 = (
    locationwise_emmissions
    .sort_values(by='no2', ascending=False)
    .head(10)
)
# Keep only the NO2 column; the final head() displays the first five of the ten.
Top10Cities_with_highest_NO2_sorted = Top10Cities_with_highest_NO2[['no2']]
Top10Cities_with_highest_NO2_sorted.head()


In [None]:

# Rank the cities by mean SO2 (highest first) and keep the top ten rows.
Top10Cities_with_highest_So2 = (
    locationwise_emmissions
    .sort_values(by='so2', ascending=False)
    .head(10)
)
# Keep only the SO2 column; the final head() displays the first five of the ten.
Top10Cities_with_highest_So2_sorted = Top10Cities_with_highest_So2[['so2']]
Top10Cities_with_highest_So2_sorted.head()



In [None]:
# Rank the cities by mean rspm (highest first) and keep the top ten rows.
Top10Cities_with_highest_rspm = (
    locationwise_emmissions
    .sort_values(by='rspm', ascending=False)
    .head(10)
)
# Keep only the rspm column; the final head() displays the first five of the ten.
Top10Cities_with_highest_rspm_sorted = Top10Cities_with_highest_rspm[['rspm']]
Top10Cities_with_highest_rspm_sorted.head()

In [None]:
# Rank the cities by mean spm (highest first) and keep the top ten rows.
Top10Cities_with_highest_spm = (
    locationwise_emmissions
    .sort_values(by='spm', ascending=False)
    .head(10)
)
# Keep only the spm column; the final head() displays the first five of the ten.
Top10Cities_with_highest_spm_sorted = Top10Cities_with_highest_spm[['spm']]
Top10Cities_with_highest_spm_sorted.head()

In [None]:
# Rank the cities by mean pm2_5 (highest first) and keep the top ten rows.
Top10Cities_with_highest_pm2_5 = (
    locationwise_emmissions
    .sort_values(by='pm2_5', ascending=False)
    .head(10)
)
# Keep only the pm2_5 column; the final head() displays the first five of the ten.
Top10Cities_with_highest_pm2_5_sorted = Top10Cities_with_highest_pm2_5[['pm2_5']]
Top10Cities_with_highest_pm2_5_sorted.head()