In [116]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


sys.path.append('../')

# Loading Data
### Loader and Cleaner Functions

In [131]:
def make_func(offset=0):
    """Build a row formatter that pairs a reading with its status code.

    The returned callable reads two adjacent entries of a row *by position*:
    the entry at ``offset`` (the reading) and the entry at ``offset + 1``
    (its status text). Only the first space-delimited token of the status
    text is kept.

    Args:
        offset: Zero-based position of the reading within the row; the
            status entry is taken from the position immediately after it.

    Returns:
        A function ``func(x)`` that accepts a row (a pandas Series or any
        integer-indexable sequence) and returns ``'<reading> <status>'``.
        A status entry without a ``split`` method (e.g. a float NaN) is
        formatted unchanged, so a fully missing pair yields ``'nan nan'``.
    """
    def func(x):
        # Plain ``x[i]`` on a Series is a label lookup first; the positional
        # fallback for integer keys is deprecated in pandas. Go through
        # ``.iloc`` when the row has one, otherwise index the sequence as is.
        values = getattr(x, 'iloc', x)
        reading = values[offset]
        status = values[offset + 1]
        try:
            status = status.split(' ')[0]
        except AttributeError:
            # Non-string status (missing value): keep it and let format()
            # render it.
            pass
        return '{} {}'.format(reading, status)
    return func


def aurn_cleaner(loc):
    """Load an AURN CSV export and reshape it to one row per date and station.

    The file is read with its first three lines skipped and the literal
    ``'No data'`` treated as missing. Header cells that pandas labels
    ``Unnamed: ...`` are renamed ``status_<position>``; the first of these
    becomes ``Date``. The first parsed row is dropped as a meta row, and each
    (reading, status) column pair is collapsed to ``'<reading> <status code>'``
    before being split back into two columns.

    Args:
        loc: Path or buffer passed straight to ``pandas.read_csv``.

    Returns:
        DataFrame with columns ``Date``, ``station_name``, ``pm25_value`` and
        ``measurement_quality``. The last two are strings; missing entries
        appear as the literal string ``'nan'``.
    """
    # Read data
    aurn_data = pd.read_csv(loc, skiprows=3, na_values='No data')

    # Give the unnamed (status) columns a positional name
    aurn_data.columns = [
        "status_{}".format(idx) if "Unnamed" in col else col
        for idx, col in enumerate(aurn_data.columns)
    ]

    # The first unnamed column holds the dates
    aurn_data = aurn_data.rename(columns={'status_0': 'Date'})

    # Drop meta row
    aurn_data = aurn_data.drop(aurn_data.index[0])

    # Collapse each (reading, status) column pair into one string column
    # named after the station; pairs start at position 1 and step by 2.
    melted = pd.DataFrame(aurn_data.Date)
    for offset in range(1, aurn_data.shape[1] - 1, 2):
        station = '{}'.format(aurn_data.columns[offset])
        melted[station] = aurn_data.apply(make_func(offset), axis=1)

    # Long format: index is (Date, station), value is '<reading> <status>'
    stacked = melted.set_index('Date').stack()

    # expand=True splits into columns in one vectorised pass instead of
    # constructing a Series per row via .apply(pd.Series).
    clean_split = stacked.str.split(' ', expand=True)
    clean_split.columns = ['pm25_value', 'measurement_quality']

    # The unnamed station level surfaces as 'level_1' after reset_index
    clean_split = clean_split.reset_index().rename(columns={'level_1': 'station_name'})

    return clean_split

### Load the Data

In [133]:
# Parse the AURN export into long format, then preview the first five rows.
aurn_raw = aurn_cleaner(loc='../data/aurn_2010_18.csv')
print(aurn_raw.head(n=5))

         Date               station_name pm25_value measurement_quality
0  2010-01-01                   Aberdeen          4                   V
1  2010-01-01           Auchencorth Moss          8                   V
2  2010-01-01             Barnstaple A39        nan                 nan
3  2010-01-01             Belfast Centre         44                   V
4  2010-01-01  Birmingham A4540 Roadside        nan                 nan


### Filter NaNs

In [167]:
# Missing readings and quality flags are the literal string 'nan' at this
# point, so drop every row where either field is 'nan' before casting.
aurn_data = aurn_raw[~((aurn_raw['pm25_value'] == 'nan') | (aurn_raw['measurement_quality'] == 'nan'))]
print('{} Rows removed. \nTotal rows: {}'.format(len(aurn_raw) - len(aurn_data), len(aurn_data)))
# Readings become integers; the two text columns are cast to str.
aurn = aurn_data.astype(dict(station_name=str, pm25_value=int, measurement_quality=str))

95377 Rows removed. 
Total rows: 191063


### Store

In [169]:
# Persist the filtered, typed frame for later use.
# NOTE(review): no `index=` argument is passed, so pandas' default index
# handling applies — confirm whether consumers of this file expect an index column.
aurn.to_csv('../data/cleaned/aurn.csv')