# Predicting particulate matter (PM2.5) using machine learning

## Step 0: Data Preparation for modeling 
- Load data 
- Dealing with missing data 
- EDA 
- Class definitions: if PM2.5 is less than 10 then class 1, otherwise (10 or more) class 0 
 

In [5]:
## import dependencies 

%matplotlib inline  
import pandas as pd
import numpy as np
import seaborn as sns
from pandas import Grouper

import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier 

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [6]:
# Read the Kensington air-quality CSV; the separate 'Date' and 'Time' columns
# are combined into a single 'Date_Time' column while parsing.
df_Kensing = pd.read_csv(
    '~/Downloads/Air-Quality-Data-SQUARES/2554737734-London_Kensington.csv',
    parse_dates=[['Date', 'Time']],
)
# Preview the first 5 rows
df_Kensing.head()

In [8]:
# Count the missing (NaN) entries in every column
df_Kensing.isna().sum()

Date_Time          0
Ozone              4
Status             4
Nitric oxide       4
Status.1           4
                ... 
Unnamed: 156       4
Unnamed: 157    9976
Unnamed: 158       4
Unnamed: 159    9976
Unnamed: 160       4
Length: 160, dtype: int64

In [9]:
# Keep only the first 43 columns; this discards, among others, the
# 'Unnamed: N' columns that show up in the null-count output.
df_Kensing = df_Kensing.iloc[:, :43]

# Create a new dataframe with the columns required for modelling:
# timestamp, pollutant readings, modelled weather and the PM measurements.
df_Kensington = df_Kensing[[
    'Date_Time',
    'Ozone',
    'Status',
    'Nitric oxide',
    'Nitrogen dioxide',
    'Nitrogen oxides as nitrogen dioxide',
    'Sulphur dioxide',
    'Carbon monoxide',
    'Modelled Wind Direction',
    'Modelled Wind Speed',
    'Modelled Temperature',
    'PM10 particulate matter (Hourly measured)',
    'PM2.5 particulate matter (Hourly measured)',
]]
# Summary statistics of the selected columns (displayed as the cell output)
df_Kensington.describe()


Unnamed: 0,Date_Time,Ozone,Status,Nitric oxide,Nitrogen dioxide,Nitrogen oxides as nitrogen dioxide,Sulphur dioxide,Carbon monoxide,Modelled Wind Direction,Modelled Wind Speed,Modelled Temperature,PM10 particulate matter (Hourly measured),PM2.5 particulate matter (Hourly measured)
count,9976,9972,9972,9972,9972,9972,9972,9972,9972,9972.0,9972,9972,9972
unique,9974,2587,2,6183,7919,7952,5033,4159,3065,118.0,360,1571,1279
top,nan nan,No data,V ugm-3,No data,No data,No data,No data,No data,No data,2.2,No data,No data,No data
freq,3,505,6576,97,99,99,1136,160,180,280.0,180,49,47


In [10]:
# Print the dtype and non-null count of each selected column
df_Kensington.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9976 entries, 0 to 9975
Data columns (total 13 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   Date_Time                                   9976 non-null   object
 1   Ozone                                       9972 non-null   object
 2   Status                                      9972 non-null   object
 3   Nitric oxide                                9972 non-null   object
 4   Nitrogen dioxide                            9972 non-null   object
 5   Nitrogen oxides as nitrogen dioxide         9972 non-null   object
 6   Sulphur dioxide                             9972 non-null   object
 7   Carbon monoxide                             9972 non-null   object
 8   Modelled Wind Direction                     9972 non-null   object
 9   Modelled Wind Speed                         9972 non-null   object
 10  Modelled Temperature    

In [11]:
# Treat the 'No data' placeholder as missing (np.nan), then drop every row
# that contains at least one missing value.
df_Ken = (
    df_Kensington
    .replace(to_replace='No data', value=np.nan)
    .dropna()
)


In [17]:
# Keep only the columns listed below (Date_Time, Status and PM10 are left out
# for the time being) and convert all of them to the float datatype.
df1 = df_Ken[[
    'Ozone',
    'Nitric oxide',
    'Nitrogen dioxide',
    'Nitrogen oxides as nitrogen dioxide',
    'Sulphur dioxide',
    'Carbon monoxide',
    'Modelled Wind Direction',
    'Modelled Wind Speed',
    'Modelled Temperature',
    'PM2.5 particulate matter (Hourly measured)',
]].astype(float)


In [18]:
# Confirm the conversion: print the dtype and non-null count of each column
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8224 entries, 39 to 9959
Data columns (total 10 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Ozone                                       8224 non-null   float64
 1   Nitric oxide                                8224 non-null   float64
 2   Nitrogen dioxide                            8224 non-null   float64
 3   Nitrogen oxides as nitrogen dioxide         8224 non-null   float64
 4   Sulphur dioxide                             8224 non-null   float64
 5   Carbon monoxide                             8224 non-null   float64
 6   Modelled Wind Direction                     8224 non-null   float64
 7   Modelled Wind Speed                         8224 non-null   float64
 8   Modelled Temperature                        8224 non-null   float64
 9   PM2.5 particulate matter (Hourly measured)  8224 non-null   float64
dtypes: float64(

### Class definitions: if PM2.5 concentration is less than 10 then class 1, otherwise (10 or more) class 0

In [20]:
def classifier(row, threshold=10):
    """Label one observation by its hourly PM2.5 concentration.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key 'PM2.5 particulate matter (Hourly measured)'.
    threshold : float, default 10
        Concentrations strictly below this value are labelled class 1.

    Returns
    -------
    int
        1 if the PM2.5 value is below ``threshold``; otherwise 0. A value
        equal to ``threshold`` gives 0, and so does NaN (the comparison
        evaluates to False).
    """
    pm25 = row['PM2.5 particulate matter (Hourly measured)']
    return 1 if pm25 < threshold else 0
# Add the binary target column by passing every row of df1 to classifier()
df1['AirQuality-PM2.5'] = df1.apply(classifier, axis='columns')


In [23]:
# Class 1 is encoded as 1, so summing the label column counts its members;
# all remaining rows of df1 belong to class 0.
class1 = df1['AirQuality-PM2.5'].sum()
print(f"Total examples in Class 1 is : {class1} \n Total examples in class 0 is: {len(df1) - class1}")

Total examples in Class 1 is : 6466 /n Total examples in class 0 is: 1758


## Step 1:  Baseline Model Training

- Define X and y
- split train and test set 
- split train dataset again to train and validation set 
- Create a dictionary of classifiers, logistic, knn, svm, and random forest
- Fit and Predict models 

In [None]:
# Inverse regularisation strength shared by the logistic and SVM baselines.
C = 1.0

# One baseline estimator per model family listed in Step 1, keyed by a short
# name. Every estimator comes from the import cell at the top of the notebook.
classifiers = {
    # L1-penalised logistic regression; 'saga' is a solver that supports L1.
    'logistic': LogisticRegression(C=C, penalty='l1',
                                   solver='saga',
                                   max_iter=10000,
                                   random_state=0),
    # k-nearest neighbours with scikit-learn's default settings.
    'knn': KNeighborsClassifier(),
    # probability=True makes predict_proba available on the fitted model.
    'svm': SVC(kernel='linear', C=C, probability=True,
               random_state=0),
    # Fixed seed so repeated runs build the same forest.
    'random forest': RandomForestClassifier(random_state=0),
}

## Step 2: Model Evaluation 

Calculate evaluation metrics for all models using the validation data and
plot the ROC curve for each model.