In [None]:
import logging
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# load data

# Directory holding the LaqnData_*.csv files. The default is the location the
# notebook was originally written against; set the WESTMINSTER_DATA_DIR
# environment variable to read the same files from somewhere else without
# editing this cell.
DATA_DIR = Path(os.environ.get(
    "WESTMINSTER_DATA_DIR",
    r"C:\Users\tom_r\Desktop\data_science\Data-Science\Westminster Pollution\data",
))

# marylebone road
mar = pd.read_csv(DATA_DIR / "LaqnData_marylebone_road.csv")

# oxford street
oxf = pd.read_csv(DATA_DIR / "LaqnData_oxford_street.csv")

# elizabeth bridge
eli = pd.read_csv(DATA_DIR / "LaqnData_elizabeth_bridge.csv")

# covent garden
cov = pd.read_csv(DATA_DIR / "LaqnData_covent_garden.csv")

In [None]:
# Tag every frame with the label of the monitoring site it was loaded for,
# so the rows stay attributable once the frames are combined.
site_labels = (
    (mar, 'marylebone'),
    (oxf, 'oxford street'),
    (eli, 'elizabeth bridge'),
    (cov, 'covent garden'),
)
for site_frame, site_label in site_labels:
    site_frame['Site'] = site_label

In [None]:
# Collect the four site frames so the following cells can process them in one loop.
data = [mar, oxf, eli, cov]

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after each
# summary.
for d in data:
    d.info()

In [None]:
# drop unneeded column
for d in data:
    # The drop stays in place on purpose: `data` holds the same objects as
    # mar/oxf/eli/cov, which later cells display directly.
    # errors='ignore' keeps the cell re-runnable; without it a second run
    # raises KeyError because the column is already gone.
    d.drop(columns=['Provisional or Ratified'], inplace=True, errors='ignore')
    print(d.head(1))
    print(d['Species'].value_counts())

In [None]:
# Preview the first rows of the Elizabeth Bridge frame.
eli.head()

In [None]:
# Preview the first rows of the Oxford Street frame.
oxf.head()

In [None]:
# Parse the reading timestamps in every site frame, using an explicit
# day/month/year hour:minute pattern.
timestamp_format = "%d/%m/%Y %H:%M"
for site_frame in data:
    parsed_times = pd.to_datetime(site_frame['ReadingDateTime'], format=timestamp_format)
    site_frame['ReadingDateTime'] = parsed_times


In [None]:
# combine datasets
# Each site frame was read with its own default 0..n-1 row labels, so a plain
# concat would repeat labels across sites. ignore_index=True gives the
# combined frame a single unique index, which keeps label-based selection and
# index-aligned assignment unambiguous.
df = pd.concat(data, ignore_index=True)
df.info()

In [None]:
# Cast the label columns to categoricals and the readings to numbers.
for label_column in ('Site', 'Species', 'Units'):
    df[label_column] = df[label_column].astype('category')

# errors='coerce' turns readings that cannot be parsed into NaN instead of raising.
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')

In [None]:
# Re-check the column dtypes after the conversions in the previous cell.
df.info()

In [None]:
# Report how many missing values each column holds.
missing_per_column = df.isna().sum()
for column_name, missing_count in missing_per_column.items():
    print(column_name, missing_count)

In [None]:
# Rows whose reading is missing. The boolean mask from isna() indexes the
# frame directly; comparing it with `== True` was redundant.
nan = df[df['Value'].isna()]
nan

In [None]:
# Count the rows with a missing reading per species.
nan['Species'].value_counts()

In [None]:
# The analysis concentrates on the common species NO, NO2 and NOX, so rows
# without a reading are discarded and the index is renumbered from zero.
df = df.dropna(subset=['Value']).reset_index(drop=True)

In [None]:
# plot to see site trends throughout the year for nitric oxide (NO)

# .assign() returns a new frame, so the datetime column is not written onto a
# filtered slice of df (which is what raised pandas' SettingWithCopyWarning).
no = (
    df.loc[df['Species'] == 'NO']
    .assign(ReadingDateTime=lambda frame: pd.to_datetime(frame['ReadingDateTime']))  # ensure type
    .set_index('ReadingDateTime')
)

# Weekly mean reading per site. observed=True is stated explicitly because
# 'Site' is categorical: it groups only on sites present in `no` and does not
# depend on the pandas default for categorical groupers.
weekly_no = (
    no.groupby('Site', observed=True)
    .resample('W')['Value']
    .mean()
    .reset_index()
)

# Draw on the explicit Axes rather than through the pyplot state machine.
fig, ax = plt.subplots(figsize=(20, 6))
sns.lineplot(data=weekly_no, x='ReadingDateTime', y='Value', hue='Site', ax=ax)
ax.set_title("Nitric Oxide (NO) Trend in 2024")
ax.set_xlabel('Date')
# NOTE(review): the unit text is carried over from the original label (only the
# missing closing parenthesis was added) - confirm it against the `Units` column.
ax.set_ylabel('Reading Value (mg m-3)')
plt.show()

In [None]:
# Plot just Elizabeth bridge for nitric oxide (NO), nitrogen dioxide (NO2)
# and oxides of nitrogen (NOX)

eb_species = ['NO', 'NO2', 'NOX']

# Select the site and the species of interest before aggregating, so weekly
# means are only computed for the rows that are kept.
eb_mask = (df['Site'] == 'elizabeth bridge') & df['Species'].isin(eb_species)

eb = (
    df.loc[eb_mask]
    # .assign() works on a new frame, avoiding a column write on a filtered
    # slice; dropping unused categories keeps only the selected species as
    # category levels.
    .assign(Species=lambda frame: frame['Species'].cat.remove_unused_categories())
    .set_index('ReadingDateTime')
    # observed=True: group only on species present in the selection instead
    # of relying on the pandas default for categorical groupers.
    .groupby('Species', observed=True)
    .resample('W')['Value']
    .mean()
    .reset_index()
)

print(eb['Species'].unique())
print(len(eb))

In [None]:
# Weekly mean readings at Elizabeth Bridge, one line per species.
# Draw on the explicit Axes rather than through the pyplot state machine.
fig, ax = plt.subplots(figsize=(20, 6))
sns.lineplot(data=eb, x='ReadingDateTime', y='Value', hue='Species', ax=ax)
ax.set_title("Nitrogen Oxides (NO, NO2, NOX) at Elizabeth Bridge in 2024")
ax.set_xlabel('Date')
# NOTE(review): the unit text is carried over from the original label (only the
# missing closing parenthesis was added) - confirm it against the `Units` column.
ax.set_ylabel('Reading Value (mg m-3)')
# Matches the other plot cell and keeps the Axes repr out of the cell output.
plt.show()