In [1]:
import pandas as pd
from scipy.stats import shapiro

In [2]:
def add_columns_per_m(df):
    """Add ``*_per_m`` columns: route-level totals divided by ``route_length``.

    ``numOfStops``, ``emission``, ``elevation_up``, ``elevation_down`` and
    ``timeloss`` are each divided element-wise by ``route_length`` and stored
    under a new ``*_per_m`` name.

    The columns are written onto ``df`` itself (the caller's frame is
    modified) and that same object is returned.
    """
    route_length = df['route_length']
    # New column name -> source column. Dict order fixes the order in which
    # the new columns are appended to the frame.
    per_m_sources = {
        'stops_per_m': 'numOfStops',
        'emission_per_m': 'emission',
        'elevation_up_per_m': 'elevation_up',
        'elevation_down_per_m': 'elevation_down',
        'timeloss_per_m': 'timeloss',
    }
    for target, source in per_m_sources.items():
        df[target] = df[source] / route_length

    return df

In [3]:
def drop_unnecessary_columns(df):
    """Return a new frame without the ``routeId``, ``tripId``, ``loc`` and ``seed`` columns.

    ``df`` itself is left untouched. A ``KeyError`` is raised by pandas if any
    of the four columns is missing.
    """
    unneeded = ['routeId', 'tripId', 'loc', 'seed']
    return df.drop(columns=unneeded)

In [4]:
def avg_speed(df):
    """Rescale the ``avgSpeed`` column by dividing it by its own maximum.

    The column is overwritten on ``df`` itself (the caller's frame is
    modified) and that same object is returned.
    """
    speed = df['avgSpeed']
    df['avgSpeed'] = speed.div(speed.max())
    return df

In [5]:
def drop_stops_and_route_length(df):
    """Return a new frame without the ``numOfStops`` and ``route_length`` columns.

    ``df`` itself is left untouched.
    """
    to_remove = ['numOfStops', 'route_length']
    return df.drop(columns=to_remove)

In [6]:
def drop_redundant_columns(df):
    """Return a new frame without the raw columns that have ``*_per_m`` counterparts.

    Removes ``numOfStops``, ``route_length``, ``emission``, ``elevation_up``,
    ``elevation_down`` and ``timeloss``. ``df`` itself is left untouched.
    """
    raw_columns = [
        'numOfStops',
        'route_length',
        'emission',
        'elevation_up',
        'elevation_down',
        'timeloss',
    ]
    return df.drop(columns=raw_columns)

In [7]:
def normalize(df):
    """Scale every column of ``df`` by its largest absolute value.

    Each column is divided by ``max(|column|)``, so the result lies in
    ``[-1, 1]`` and signs are preserved. A new frame is returned; ``df`` is
    not modified.

    A column whose largest absolute value is 0 (all zeros) is returned as
    zeros rather than being turned into NaN by a 0/0 division.
    """
    def _scale(column):
        peak = column.abs().max()
        # Dividing an all-zero column by its peak would give 0/0 = NaN;
        # divide by 1 instead so the zeros survive.
        return column / (peak if peak != 0 else 1)

    return df.apply(_scale, axis=0)

In [8]:
def add_delta_elevation(df):
    """Build a frame with ``delta_elevation_per_m`` in place of the two elevation columns.

    ``delta_elevation_per_m`` is ``elevation_up_per_m - |elevation_down_per_m|``.

    Side effects on the caller's frame: ``df['elevation_down_per_m']`` is
    overwritten with its absolute value and ``df`` gains the
    ``delta_elevation_per_m`` column. The returned object is a new frame from
    which ``elevation_up_per_m`` and ``elevation_down_per_m`` were dropped.
    """
    descent = df['elevation_down_per_m'].abs()
    df['elevation_down_per_m'] = descent
    df['delta_elevation_per_m'] = df['elevation_up_per_m'] - descent
    return df.drop(columns=['elevation_up_per_m', 'elevation_down_per_m'])

In [9]:
def drop_max_timeloss(df, n=1):
    """Return ``df`` without the ``n`` rows that have the largest ``timeloss_per_m``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``timeloss_per_m`` column. It is not modified.
    n : int, default 1
        How many maxima to remove, one after another. With the default the
        behaviour matches a single call of the original one-row version.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows removed (``df`` itself when nothing is
        removed).

    Notes
    -----
    Rows are removed by index label, so if labels are not unique every row
    sharing the label of the maximum is dropped. Removal stops early once the
    frame is empty instead of raising from ``idxmax``.
    """
    for _step in range(n):
        if df.empty:
            break
        df = df.drop(df['timeloss_per_m'].idxmax())

    return df

In [10]:
# Load the two semicolon-separated input tables.
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where the files exist at exactly this location.
tableEmission = pd.read_csv(
    'C:\\Users\\Admin\\Desktop\\onlab\\bme_onlab\\data\\emissionData.csv',
    sep=';',
)
tableBattery = pd.read_csv(
    'C:\\Users\\Admin\\Desktop\\onlab\\bme_onlab\\data\\batteryData.csv',
    sep=';',
)

## Filter by seed and drop unnecessary columns

In [11]:
# Keep only the rows whose seed is at most 3.
# NOTE(review): the reason for the cut-off value 3 is not documented here.
emission_seed_mask = tableEmission['seed'] <= 3
battery_seed_mask = tableBattery['seed'] <= 3
tableEmission = tableEmission.loc[emission_seed_mask]
tableBattery = tableBattery.loc[battery_seed_mask]

In [12]:
# Remove the routeId, tripId, loc and seed columns from the emission table,
# then display the result.
tableEmission = tableEmission.pipe(drop_unnecessary_columns)
tableEmission

Unnamed: 0,avgSpeed,timeloss,route_length,elevation_up,elevation_down,trafficScale,numOfStops,emission
0,4.563878,69.93,673.45,8.3200,-10.7300,0.3,2,267.681119
1,6.863021,26.94,660.10,2.4600,-2.3200,0.3,2,222.161497
2,5.729324,119.93,3647.97,105.7300,-68.5600,0.3,12,1140.608494
3,2.436582,1236.73,4048.13,22.4600,-79.7000,0.3,11,2167.769607
4,4.027150,574.89,3968.76,81.2400,-24.6800,0.3,11,1808.274473
...,...,...,...,...,...,...,...,...
1685,6.250916,32.55,633.16,4.2092,-5.1983,0.3,2,265.070995
1686,6.495458,28.85,648.58,3.0018,-3.8665,0.3,2,253.089093
1687,6.267747,381.75,9439.36,135.4277,-48.1504,0.3,22,3331.811618
1688,6.909029,156.34,3703.28,9.4831,-24.5813,0.3,10,1086.312362


In [13]:
# Remove the routeId, tripId, loc and seed columns from the battery table,
# then display the result.
tableBattery = tableBattery.pipe(drop_unnecessary_columns)
tableBattery

Unnamed: 0,avgSpeed,timeloss,route_length,elevation_up,elevation_down,trafficScale,numOfStops,emission
0,1.013591,339.81,575.27,5.3400,-7.9300,0.3,2,69.618208
1,6.425740,33.46,660.10,2.3400,-2.3200,0.3,2,141.947410
2,5.872307,134.66,3647.97,105.6900,-68.5300,0.3,12,1252.215855
3,3.558635,614.38,4075.61,22.7300,-79.8800,0.3,11,395.144354
4,2.757995,1019.37,4018.91,80.8600,-24.3000,0.3,11,953.118991
...,...,...,...,...,...,...,...,...
1620,5.549829,49.49,633.16,4.2245,-5.0366,0.3,2,176.893838
1621,5.549829,49.64,633.16,4.2245,-5.0366,0.3,2,176.869209
1622,6.102325,36.86,648.58,3.0112,-3.8609,0.3,2,192.287315
1623,6.438761,464.55,9439.36,135.5613,-48.2845,0.3,22,1858.849971


## Create new columns from existing ones

In [14]:
# Rescale avgSpeed by its per-table maximum (battery first, then emission).
tableBattery, tableEmission = avg_speed(tableBattery), avg_speed(tableEmission)

In [15]:
# Append the *_per_m columns to both tables (battery first, then emission).
tableBattery, tableEmission = (
    add_columns_per_m(frame) for frame in (tableBattery, tableEmission)
)

In [16]:
# Drop the raw columns that now have *_per_m counterparts.
tableBattery = drop_redundant_columns(tableBattery)
tableEmission = drop_redundant_columns(tableEmission)

# Remove the rows with the largest timeloss_per_m from the emission table,
# one maximum at a time.
# NOTE(review): only the emission table is trimmed; the battery table keeps
# all of its rows. Confirm that this asymmetry is intended.
N_TIMELOSS_ROWS_DROPPED = 3
for _drop_round in range(N_TIMELOSS_ROWS_DROPPED):
    tableEmission = drop_max_timeloss(tableEmission)

In [17]:
# Build the "DeltaEle" variants: frames that carry delta_elevation_per_m instead
# of the elevation_up_per_m / elevation_down_per_m pair.
# add_delta_elevation also modifies the frame passed in: it overwrites
# elevation_down_per_m with its absolute value and adds delta_elevation_per_m
# to tableEmission / tableBattery themselves.
tableEmissionDeltaEle = add_delta_elevation(tableEmission)
tableBatteryDeltaEle = add_delta_elevation(tableBattery)

# Remove the delta column that the calls above added to the original tables.
# These drops only succeed because of that in-place side effect, so this cell
# cannot be re-run on its own without re-running the cells before it.
# elevation_down_per_m stays in its absolute-value form from here on.
tableBattery = tableBattery.drop(['delta_elevation_per_m'], axis=1)
tableEmission = tableEmission.drop(['delta_elevation_per_m'], axis=1)

In [18]:
# Scale every column of both tables by its largest absolute value.
tableBattery = tableBattery.pipe(normalize)
tableEmission = tableEmission.pipe(normalize)
# NOTE(review): normalization of the DeltaEle tables is disabled below, so
# those two frames keep their unscaled values. Confirm that this is intended.
#tableBatteryDeltaEle = normalize(tableBatteryDeltaEle)
#tableEmissionDeltaEle = normalize(tableEmissionDeltaEle)

tableBattery

Unnamed: 0,avgSpeed,trafficScale,stops_per_m,emission_per_m,elevation_up_per_m,elevation_down_per_m,timeloss_per_m
0,0.102877,0.3,0.320765,0.174944,0.130642,0.194862,0.214397
1,0.652199,0.3,0.279544,0.310861,0.049891,0.049683,0.018398
2,0.596027,0.3,0.303500,0.496222,0.407751,0.265556,0.013398
3,0.361194,0.3,0.249017,0.140156,0.078491,0.277059,0.054714
4,0.279931,0.3,0.252530,0.342836,0.283164,0.085472,0.092061
...,...,...,...,...,...,...,...
1620,0.563296,0.3,0.291438,0.403875,0.093902,0.112448,0.028370
1621,0.563296,0.3,0.291438,0.403819,0.093902,0.112448,0.028456
1622,0.619373,0.3,0.284509,0.428583,0.065341,0.084149,0.020627
1623,0.653520,0.3,0.215035,0.284675,0.202118,0.072309,0.017863


In [19]:
# Write the four tables to the working directory as semicolon-separated CSV
# files without the row index. Output file name -> frame to write.
exports = {
    'battery.csv': tableBattery,
    'emission.csv': tableEmission,
    'batteryDeltaEle.csv': tableBatteryDeltaEle,
    'emissionDeltaEle.csv': tableEmissionDeltaEle,
}
for filename, frame in exports.items():
    frame.to_csv(filename, sep=';', index=False)

In [20]:
# Run scipy's Shapiro test on every column of the emission table and report
# whether the p-value exceeds 0.05.
for column in tableEmission.columns:
    statistic, p = shapiro(tableEmission[column])
    print(f'Column: {column}, p-value: {p}')
    looks_normal = p > 0.05
    print(
        f'Column "{column}" appears to be normally distributed.'
        if looks_normal
        else f'Column "{column}" does not appear to be normally distributed.'
    )

Column: avgSpeed, p-value: 6.134378703590581e-09
Column "avgSpeed" does not appear to be normally distributed.
Column: trafficScale, p-value: 2.1879623200137913e-28
Column "trafficScale" does not appear to be normally distributed.
Column: stops_per_m, p-value: 9.154898572945238e-31
Column "stops_per_m" does not appear to be normally distributed.
Column: emission_per_m, p-value: 7.164535086154426e-20
Column "emission_per_m" does not appear to be normally distributed.
Column: elevation_up_per_m, p-value: 1.1521747424630367e-27
Column "elevation_up_per_m" does not appear to be normally distributed.
Column: elevation_down_per_m, p-value: 8.594306021573055e-30
Column "elevation_down_per_m" does not appear to be normally distributed.
Column: timeloss_per_m, p-value: 1.6591838483081558e-40
Column "timeloss_per_m" does not appear to be normally distributed.
