In [1]:
#importing the packages
import pandas as pd
import numpy as np
import random 

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import joblib # for saving algorithm and preprocessing objects
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw pollution CSV into a DataFrame and preview its first rows
DATA_PATH = 'pollution_us_2000_2016.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Show the column labels of the freshly loaded dataset
print(df.columns)

In [None]:
# Show the dimensions of the freshly loaded dataset
print(df.shape)

In [None]:
# Show summary statistics of the freshly loaded dataset
print(df.describe())

In [None]:
# Dropping the columns the analysis does not use: the CSV's unnamed index
# column, site/location identifiers, unit labels and the "1st Max Hour" fields.
unused_columns = [
    'Unnamed: 0',
    'State Code', 'County Code', 'Site Num', 'Address', 'County', 'City',
    'NO2 Units', 'O3 Units', 'SO2 Units', 'CO Units',
    'NO2 1st Max Hour', 'O3 1st Max Hour', 'SO2 1st Max Hour', 'CO 1st Max Hour',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Print the minimum and then the maximum of each AQI column (before outlier removal)
for aqi_column in ('NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI'):
    print(df[aqi_column].min())
    print(df[aqi_column].max())

In [None]:
# IQR range, computed on the numeric columns only.
# df still holds string columns at this point ('State', 'Date Local');
# pandas >= 2.0 no longer silently skips them in DataFrame.quantile and
# raises TypeError instead, so the numeric subset is selected explicitly.
numeric_df = df.select_dtypes(include=np.number)
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# Removing outliers: drop every row in which at least one numeric value lies
# more than 1.5 * IQR below Q1 or above Q3. Comparisons against NaN are False,
# so rows with missing values are kept here.
outlier_mask = ((numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))).any(axis=1)
df = df[~outlier_mask]
df.shape

In [None]:
# Print the minimum and then the maximum of each AQI column (after outlier removal)
for aqi_column in ('NO2 AQI', 'O3 AQI', 'SO2 AQI', 'CO AQI'):
    print(df[aqi_column].min())
    print(df[aqi_column].max())

In [None]:
# Encoding dates: keep only the text before the first '-' of 'Date Local',
# store it as an integer 'Year' column in second position, then discard the
# original date column.
year_values = df['Date Local'].str.split('-').str[0].astype('int')
df.insert(loc=1, column='Year', value=year_values)
df.drop(columns='Date Local', inplace=True)

In [None]:
# First pass over the missing values: every column from the third one onward
# has its NaNs replaced by the mean of that column within the same state.
for column in df.columns[2:]:
    state_mean = df.groupby('State')[column].transform('mean')
    df[column] = df[column].fillna(state_mean)

In [None]:
# One row per (State, Year) holding the mean of every remaining column;
# the grouping keys are kept as ordinary columns.
dfG = df.groupby(['State', 'Year'], as_index=False).mean()

In [None]:
# Smallest yearly 'NO2 Mean' recorded for Hawaii
dfG.loc[dfG['State'] == 'Hawaii', 'NO2 Mean'].min()

In [None]:
#function for inserting a row
def Insert_row_(row_number, df, row_value):
    """Return a copy of ``df`` with ``row_value`` inserted at position ``row_number``.

    Parameters
    ----------
    row_number : int
        Positional slot for the new row. Rows currently at this position and
        below are shifted down by one. A value of ``len(df)`` or more appends
        the row at the end.
    df : pandas.DataFrame
        Frame to insert into. It is not modified.
    row_value : sequence
        One value per column of ``df``, in column order.

    Returns
    -------
    pandas.DataFrame
        New frame with ``len(df) + 1`` rows and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``row_value`` does not supply exactly one value per column.
    """
    # Build the new row as its own frame instead of assigning through
    # ``.loc[row_number]`` on a slice: that assignment is label-based, so with
    # a non-default index it could overwrite an existing row rather than
    # insert, and it triggers chained-assignment warnings.
    new_row = pd.DataFrame([row_value], columns=df.columns)

    # Purely positional split around the insertion point.
    pieces = [df.iloc[:row_number], new_row, df.iloc[row_number:]]

    # Empty pieces are skipped so they cannot influence the result dtypes.
    return pd.concat([piece for piece in pieces if len(piece)], ignore_index=True)

In [None]:
# All distinct years present in df; used further down to find which years
# each state is missing in the grouped data
year_list = df['Year'].unique()
print(year_list)

In [None]:
# All distinct states present in df; iterated further down when adding the
# missing (state, year) rows
state_list = df['State'].unique()
print(state_list)

In [None]:
# Dimensions of the per-(State, Year) table before missing years are added
dfG.shape

In [None]:
# Summary statistics of the per-(State, Year) table before missing years are added
dfG.describe()

In [None]:
# Add one row for every (state, year) combination that is missing from dfG,
# so that each state covers every year in year_list before interpolation.
#
# The rows are collected first and appended in a single concat. The previous
# version inserted each row at an unseeded random position (hard-coded upper
# bound 494), which made the row order differ between runs and rebuilt the
# whole frame once per row. Row order does not matter here: the interpolation
# step sorts each state by 'Year' itself.
#
# NOTE(review): the fixed values used for a missing year 2000 and the all-zero
# row used for a missing year 2016 are kept exactly as they were. They act as
# end-point anchors for the interpolation, but their origin is not visible in
# this notebook -- confirm that they are intended.
YEAR_2000_VALUES = [25.9423, 42.7453, 40.4157, 0.0408353, 0.0501655, 43.1472,
                    4.30707, 6.65621, 7.93535, 0.721985, 1.094762, 10.577235]
value_count = len(YEAR_2000_VALUES)

missing_rows = []
for state in state_list:
    present_years = set(dfG.loc[dfG['State'] == state, 'Year'])
    for year in set(year_list) - present_years:
        if year == 2000:
            values = YEAR_2000_VALUES
        elif year == 2016:
            values = [0] * value_count
        else:
            # Left empty on purpose; filled later by interpolation
            values = [np.nan] * value_count
        missing_rows.append([state, year, *values])

# Skip the concat when nothing is missing, so an empty frame cannot affect dtypes
if missing_rows:
    dfG = pd.concat([dfG, pd.DataFrame(missing_rows, columns=dfG.columns)],
                    ignore_index=True)

In [None]:
# Replacing NaN values by interpolation, one state at a time.
# Each state's rows are ordered by year so the interpolation runs along the
# time axis.
groups = []
for _, group in dfG.groupby('State'):
    ordered = group.sort_values(by='Year')
    # Interpolate the numeric columns only: calling interpolate() on a frame
    # that contains an object column ('State') is deprecated in pandas.
    numeric_cols = ordered.select_dtypes(include=np.number).columns
    ordered[numeric_cols] = ordered[numeric_cols].interpolate()
    groups.append(ordered)

# DataFrame.append was removed in pandas 2.0; a single concat builds the same
# frame with a fresh 0..n-1 index.
final_df = pd.concat(groups, ignore_index=True)

In [None]:
# Calculating the average AQI: the plain mean of the four pollutant AQI columns.
# A NaN in any of the four propagates to the average.
average_AQI = (final_df['NO2 AQI'] + final_df['SO2 AQI']
               + final_df['CO AQI'] + final_df['O3 AQI']) / 4

# Plain column assignment adds the column at the end on the first run and
# overwrites it on a re-run; DataFrame.insert raised ValueError the second
# time because the column already existed.
final_df['average_AQI'] = average_AQI

In [None]:
# Preview the interpolated table, now including the 'average_AQI' column
final_df.head()

In [None]:
# Density plot of Missouri's yearly average AQI
missouri_aqi = final_df.loc[final_df["State"] == 'Missouri', 'average_AQI']
missouri_aqi.plot(kind='density', subplots=True, layout=(1, 2),
                  sharex=False, figsize=(10, 4));

In [None]:
# Scatter of Hawaii's average AQI against year
hawaii_rows = final_df[final_df['State'] == 'Hawaii']
plt.scatter(hawaii_rows['Year'], hawaii_rows['average_AQI']);

In [None]:
# Summary statistics of the final, interpolated table
final_df.describe()

In [None]:
# Save the processed table to disk (compressed), then load it straight back
processed_path = "./processed_data.joblib"
joblib.dump(final_df, processed_path, compress=True)
testing_Data = joblib.load(processed_path)

In [None]:
def state_data(state, data, df):
    """Fit a linear trend of ``data`` over ``Year`` for one state and extrapolate it.

    The rows of ``df`` belonging to ``state`` are used to fit a
    ``LinearRegression`` of the ``data`` column on ``Year``. Predictions are
    made for the years 2017 through 2025, the observed series followed by the
    predictions is drawn in red, the observed series alone is drawn on top of
    it, and the predicted values are printed.

    Parameters
    ----------
    state : str
        Value of the 'State' column selecting the rows to use.
    data : str
        Name of the column to model.
    df : pandas.DataFrame
        Table with at least the 'State', 'Year' and ``data`` columns.

    Returns
    -------
    None
        The predictions are printed, not returned.
    """
    history = df[df['State'] == state].sort_values(by='Year')

    model = LinearRegression()
    model.fit(history[['Year']], history[data])

    years = np.arange(2017, 2026, 1)

    # Predict once, from a frame carrying the same 'Year' column name the model
    # was fitted on; passing a bare ndarray makes scikit-learn warn about
    # missing feature names.
    predictions = model.predict(pd.DataFrame({'Year': years}))

    forecast = pd.DataFrame({'Year': years, data: predictions})
    pd.concat([history, forecast], sort=False).set_index('Year')[data].plot(color='red')
    history.set_index('Year')[data].plot(figsize=(15, 5), xticks=(np.arange(2000, 2026, 1)))

    print(predictions)

In [None]:
# Plot Missouri's average AQI with its linear extrapolation and print the
# predicted values for 2017-2025
state_data('Missouri', 'average_AQI', final_df)