# Load data

In [22]:
import pandas as pd
import numpy as np

In [23]:
df = pd.read_json('../data/Archive_Sofia_stations_RAW.json')
df.columns = df.iloc[0] # set first row as header
df = df.iloc[1: , :] # drop first row
df

Unnamed: 0,timest,station,param,level,isinvalid
1,2016-01-20 16:00:00,1,1,81.79,0
2,2016-01-20 16:00:00,2,1,106.47,0
3,2016-01-20 16:00:00,3,1,121.81,0
4,2016-01-20 16:00:00,4,1,140.35,0
5,2016-01-20 16:00:00,5,1,25.27,0
...,...,...,...,...,...
1252194,2018-06-19 10:00:00,5,10,0,0
1252195,2018-06-19 10:00:00,5,11,0,0
1252196,2018-06-19 10:00:00,6,9,0,0
1252197,2018-06-19 10:00:00,6,10,0,0


In [24]:
df['station'] = df['station'].astype('int')
df['param'] = df['param'].astype('int')

# Remove invalid rows

In [25]:
df = df[df['isinvalid'] == '0']
df.drop(['isinvalid'], axis=1, inplace=True) # drop isinvalid column
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['isinvalid'], axis=1, inplace=True) # drop isinvalid column


Unnamed: 0,timest,station,param,level
1,2016-01-20 16:00:00,1,1,81.79
2,2016-01-20 16:00:00,2,1,106.47
3,2016-01-20 16:00:00,3,1,121.81
4,2016-01-20 16:00:00,4,1,140.35
5,2016-01-20 16:00:00,5,1,25.27
...,...,...,...,...
1252194,2018-06-19 10:00:00,5,10,0
1252195,2018-06-19 10:00:00,5,11,0
1252196,2018-06-19 10:00:00,6,9,0
1252197,2018-06-19 10:00:00,6,10,0


# Add station coordinates

In [26]:
station_data = {
    1: {
        'name': 'Druzhba',
        'longitude': 23.400164,
        'latitude': 42.666508
    },
    2: {
        'name': 'Nadezhda',
        'longitude': 23.310972,
        'latitude': 42.732292
    },
    3: {
        'name': 'Krasno selo',
        'longitude': 23.400164,
        'latitude': 42.666508
    },
    4: {
        'name': 'Pavlovo',
        'longitude': 23.268403,
        'latitude': 42.669797
    },
    5: {
        'name': 'Kopitoto',
        'longitude': 23.268403,
        'latitude': 42.669797
    },
    6: {
        'name': 'Mladost',
        'longitude': 23.383271,
        'latitude': 42.655488
    },
}

In [27]:
conditions = [
    (df['station'] == 1),
    (df['station'] == 2),
    (df['station'] == 3),
    (df['station'] == 4),
    (df['station'] == 5),
    (df['station'] == 6)
]

df['station_name'] = np.select(
    conditions,
    [x['name'] for x in station_data.values()]
)

df['longitude'] = np.select(
    conditions,
    [x['longitude'] for x in station_data.values()]
)

df['latitude'] = np.select(
    conditions,
    [x['latitude'] for x in station_data.values()]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['station_name'] = np.select(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['longitude'] = np.select(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['latitude'] = np.select(


In [28]:
df

Unnamed: 0,timest,station,param,level,station_name,longitude,latitude
1,2016-01-20 16:00:00,1,1,81.79,Druzhba,23.400164,42.666508
2,2016-01-20 16:00:00,2,1,106.47,Nadezhda,23.310972,42.732292
3,2016-01-20 16:00:00,3,1,121.81,Krasno selo,23.400164,42.666508
4,2016-01-20 16:00:00,4,1,140.35,Pavlovo,23.268403,42.669797
5,2016-01-20 16:00:00,5,1,25.27,Kopitoto,23.268403,42.669797
...,...,...,...,...,...,...,...
1252194,2018-06-19 10:00:00,5,10,0,Kopitoto,23.268403,42.669797
1252195,2018-06-19 10:00:00,5,11,0,Kopitoto,23.268403,42.669797
1252196,2018-06-19 10:00:00,6,9,0,Mladost,23.383271,42.655488
1252197,2018-06-19 10:00:00,6,10,0,Mladost,23.383271,42.655488


# Add parameter information

In [29]:
parameter_data = {
    0: 'Particulate matter',
    1: 'NO2',
    2: 'NO',
    3: 'C6H6',
    4: 'CO',
    5: 'O3',
    6: 'SO2',
    7: 'Humidity',
    8: 'Atmospheric pressure',
    9: 'Wind',
    10: 'Sun radiation',
    11: 'Temperature'
}

In [30]:
df['param_name'] = [
    parameter_data[df.iloc[x]['param']] for x in range(df.shape[0])
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['param_name'] = [


In [31]:
df

Unnamed: 0,timest,station,param,level,station_name,longitude,latitude,param_name
1,2016-01-20 16:00:00,1,1,81.79,Druzhba,23.400164,42.666508,NO2
2,2016-01-20 16:00:00,2,1,106.47,Nadezhda,23.310972,42.732292,NO2
3,2016-01-20 16:00:00,3,1,121.81,Krasno selo,23.400164,42.666508,NO2
4,2016-01-20 16:00:00,4,1,140.35,Pavlovo,23.268403,42.669797,NO2
5,2016-01-20 16:00:00,5,1,25.27,Kopitoto,23.268403,42.669797,NO2
...,...,...,...,...,...,...,...,...
1252194,2018-06-19 10:00:00,5,10,0,Kopitoto,23.268403,42.669797,Sun radiation
1252195,2018-06-19 10:00:00,5,11,0,Kopitoto,23.268403,42.669797,Temperature
1252196,2018-06-19 10:00:00,6,9,0,Mladost,23.383271,42.655488,Wind
1252197,2018-06-19 10:00:00,6,10,0,Mladost,23.383271,42.655488,Sun radiation


In [33]:
df.to_csv('../data/Archive_Sofia_stations_processed.csv', index=False)