In [1]:
# Author: Tiago Tamagusko (tamagusko@gmail.com)
# Version: 3.0 (2023-09-17)

In [2]:
import pandas as pd

In [3]:
# Load the raw LTPP table from its CSV file into a DataFrame.
RAW_CSV_PATH = 'raw/ltpp_data.csv'
df = pd.read_csv(RAW_CSV_PATH)

In [4]:
# Preview the first five rows of the frame as loaded.
df.head(n=5)

Unnamed: 0,YEAR,AADTT,TRAFFIC_OPEN_DATE,SN,PRECIPITATION,TEMPERATURE,IRI,STATION_ID
0,1989,1190.0,1985,6.1,1372.800049,12.0,0.8984,34_1638
1,1989,490.0,1985,3.3,260.799988,6.1,1.2738,16_1021
2,1989,151.0,1982,4.3,634.200012,6.0,1.0314,30_1001
3,1989,195.0,1984,3.2,1546.099976,16.200001,1.0448,37_1030
4,1989,1690.0,1985,8.3,1125.199951,7.2,0.7238,23_1012


In [5]:
# Work on an independent deep copy so that `df` keeps the data exactly as loaded.
DATA = df.copy(deep=True)

In [6]:
# Print a summary of the working copy: column names, non-null counts and dtypes.
DATA.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   YEAR               395 non-null    int64  
 1   AADTT              395 non-null    float64
 2   TRAFFIC_OPEN_DATE  395 non-null    int64  
 3   SN                 395 non-null    float64
 4   PRECIPITATION      395 non-null    float64
 5   TEMPERATURE        395 non-null    float64
 6   IRI                395 non-null    float64
 7   STATION_ID         395 non-null    object 
dtypes: float64(5), int64(2), object(1)
memory usage: 24.8+ KB


In [7]:
# Feature engineering
# Every derived column is written back to DATA through index alignment, so the
# row order of DATA itself is left untouched.

# Difference between the observation YEAR and TRAFFIC_OPEN_DATE.
DATA['AGE'] = DATA['YEAR'] - DATA['TRAFFIC_OPEN_DATE']

# Order the observations by YEAR and group them per station once. Taking both
# the running sums and the running counts from this single grouped frame keeps
# them aligned even if DATA is ever filtered or re-indexed independently of
# the originally loaded `df`.
station_history = DATA.sort_values('YEAR').groupby(['STATION_ID'])

# 1-based position of each observation within its station's history.
observation_number = station_history.cumcount() + 1

# Running total of AADTT per station, up to and including each observation.
DATA['ACCUMULATED_AADTT'] = station_history['AADTT'].cumsum()

# NOTE: despite the MEDIAN_ prefix, these two columns hold the running
# arithmetic mean per station (cumulative sum / number of observations so
# far). The column names are kept as they are so the exported schema does
# not change.
DATA['MEDIAN_PRECIPITATION'] = (
    station_history['PRECIPITATION'].cumsum() / observation_number
)
DATA['MEDIAN_TEMPERATURE'] = (
    station_history['TEMPERATURE'].cumsum() / observation_number
)



In [8]:
# Discard the raw input columns that are no longer needed.
columns_to_drop = ['TRAFFIC_OPEN_DATE', 'AADTT', 'PRECIPITATION', 'TEMPERATURE']
DATA = DATA.drop(columns=columns_to_drop)
DATA.head()

Unnamed: 0,YEAR,SN,IRI,STATION_ID,AGE,ACCUMULATED_AADTT,MEDIAN_PRECIPITATION,MEDIAN_TEMPERATURE
0,1989,6.1,0.8984,34_1638,4,1190.0,1372.800049,12.0
1,1989,3.3,1.2738,16_1021,4,490.0,260.799988,6.1
2,1989,4.3,1.0314,30_1001,7,151.0,634.200012,6.0
3,1989,3.2,1.0448,37_1030,5,195.0,1546.099976,16.200001
4,1989,8.3,0.7238,23_1012,4,1690.0,1125.199951,7.2


In [9]:
# View data

def query_data(id):
    """Return the rows of the global ``DATA`` frame for one station.

    Parameters
    ----------
    id : str
        Value to match against the ``STATION_ID`` column (e.g. ``'37_1030'``).
        The name shadows the ``id`` builtin inside this function; it is kept
        so existing keyword calls continue to work.

    Returns
    -------
    pandas.DataFrame
        The subset of ``DATA`` whose ``STATION_ID`` equals ``id``, in the
        original row order; empty when no row matches.
    """
    # Build the mask from DATA itself rather than from another frame, so the
    # filter stays correct whatever index or row subset DATA currently has.
    return DATA[DATA['STATION_ID'] == id]

# Collect each distinct station identifier present in DATA.
unique_station_ids = DATA.loc[:, 'STATION_ID'].unique()
unique_station_ids

array(['34_1638', '16_1021', '30_1001', '37_1030', '23_1012', '50_1002',
       '6_1253', '18_2009', '53_1501', '56_2018', '85_1801', '85_1808',
       '26_1010', '6_2647', '26_1001', '27_1029', '28_3082', '13_4111',
       '12_9054', '12_4108', '12_4107', '12_4105', '12_4103', '12_4099',
       '12_4097', '12_3996', '17_1003', '28_3083', '1_1021', '48_3609',
       '48_3559', '48_1060', '48_1049', '47_3075', '45_1008', '40_4165',
       '40_1015', '35_2006', '35_1112', '35_1022', '12_3995', '29_1005',
       '12_1060', '28_1016', '6_8151', '1_4126', '6_2004', '6_8201',
       '5_3048', '1_4125', '15_7080', '40_1017', '15_1008', '15_1006',
       '15_1003'], dtype=object)

In [10]:
# Display every row recorded for station 37_1030.
query_data(id='37_1030')

Unnamed: 0,YEAR,SN,IRI,STATION_ID,AGE,ACCUMULATED_AADTT,MEDIAN_PRECIPITATION,MEDIAN_TEMPERATURE
3,1989,3.2,1.0448,37_1030,5,195.0,1546.099976,16.200001
45,1990,3.2,1.1028,37_1030,6,398.0,1330.700012,16.8
71,1991,3.2,1.1042,37_1030,7,609.0,1353.033325,16.866667
107,1992,3.2,0.9996,37_1030,8,818.0,1321.5,16.675
162,1993,3.2,1.0624,37_1030,9,1037.0,1278.780005,16.62
177,1994,3.2,1.1084,37_1030,10,1235.0,1270.466675,16.616667
208,1995,3.2,1.1546,37_1030,11,1464.0,1230.628575,16.585714
238,1997,3.2,1.1872,37_1030,13,1731.0,1200.587501,16.475
262,1998,3.2,1.1636,37_1030,14,2009.0,1211.266662,16.522222
304,1999,3.2,1.1698,37_1030,15,2298.0,1235.209991,16.53


In [12]:
# Save data
# Write the processed frame to CSV with a header row of column names.
# `index=False` (the documented boolean form) leaves the row index out of the
# file.
DATA.to_csv('processed/data.csv', index=False, header=True)