# Data Import

In [1]:
import pandas as pd
import os
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from geopy.distance import geodesic

In [3]:
# Choose which input to load: 'run' reads the cleaned dataset, any other
# value (e.g. 'test') reads the previously enriched dataset.
run_code = 'run'

if run_code == 'run':
    dataset_path = '/kaggle/input/sncb-eda-data-clean/cleaned_ar41_for_ulb.csv'
else:
    dataset_path = '/kaggle/input/sncb-data-augumentation/enriched_cleaned_ar41_for_ulb.csv'

# Fail fast when the input is missing: every later cell needs `data`, so
# merely printing a message would only postpone the failure to a NameError.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The file {dataset_path} does not exist.")

data = pd.read_csv(dataset_path)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly rather than wrapped in print() (which printed "None").
data.info()

# First few rows of the dataframe
data_head = data.head()
print(data_head)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17677337 entries, 0 to 17677336
Data columns (total 18 columns):
 #   Column              Dtype  
---  ------              -----  
 0   timestamps_UTC      object 
 1   mapped_veh_id       float64
 2   lat                 float64
 3   lon                 float64
 4   RS_E_InAirTemp_PC1  float64
 5   RS_E_InAirTemp_PC2  float64
 6   RS_E_OilPress_PC1   float64
 7   RS_E_OilPress_PC2   float64
 8   RS_E_RPM_PC1        float64
 9   RS_E_RPM_PC2        float64
 10  RS_E_WatTemp_PC1    float64
 11  RS_E_WatTemp_PC2    float64
 12  RS_T_OilTemp_PC1    float64
 13  RS_T_OilTemp_PC2    float64
 14  date                object 
 15  hour                float64
 16  dayofweek           float64
 17  weekday             object 
dtypes: float64(15), object(3)
memory usage: 2.4+ GB
None
        timestamps_UTC  mapped_veh_id        lat       lon  \
0  2023-01-23 07:25:08          102.0  51.017864  3.769079   
1  2023-01-23 07:25:16          102.0  51.0

In [None]:
# Store both coordinate columns as 32-bit floats
for coord_col in ('lat', 'lon'):
    data[coord_col] = data[coord_col].astype('float32')

# Speed Calculation

In [None]:
# Parse the timestamp column into pandas datetime values
data['timestamps_UTC'] = data['timestamps_UTC'].pipe(pd.to_datetime)

def calculate_distances(lats, lons):
    """Return the geodesic distance in kilometers between consecutive points.

    Parameters
    ----------
    lats, lons : array-like of float
        Latitudes and longitudes, positionally aligned and of equal length.

    Returns
    -------
    numpy.ndarray
        Float array with ``len(lats)`` entries. Entry ``i`` holds the distance
        from point ``i - 1`` to point ``i``. Entry 0 is always 0, and so is
        every entry for which either endpoint has a NaN coordinate.

    Notes
    -----
    This is a plain Python loop that calls ``geodesic`` once per row; it is
    not vectorized, so expect it to be slow on large inputs.
    """
    # Work on positional float arrays so indexing never depends on a pandas index.
    lats = np.asarray(lats, dtype=float)
    lons = np.asarray(lons, dtype=float)
    distances = np.zeros(len(lats))

    # A point is usable only when both of its coordinates are present.
    valid = ~(np.isnan(lats) | np.isnan(lons))

    for i in range(1, len(lats)):
        # Both endpoints must be valid. Checking only point i let a NaN at
        # i - 1 reach geodesic().
        if valid[i - 1] and valid[i]:
            distances[i] = geodesic((lats[i - 1], lons[i - 1]), (lats[i], lons[i])).kilometers
    return distances

# Distance (km) between each row and the row before it
latitudes = data['lat'].to_numpy()
longitudes = data['lon'].to_numpy()
data['Distance'] = calculate_distances(latitudes, longitudes)

# Elapsed time between consecutive rows, in hours (NaN for the first row)
time_diffs = data['timestamps_UTC'].diff().dt.total_seconds() / 3600

# A row whose vehicle differs from the previous row's starts a new track: the
# preceding row belongs to another vehicle, so neither the distance nor the
# elapsed time between them is meaningful. Such rows are treated like the very
# first row (Distance 0, Speed NaN).
# NOTE(review): assumes rows are ordered by vehicle and then by timestamp — confirm.
track_start = data['mapped_veh_id'].ne(data['mapped_veh_id'].shift())
data.loc[track_start, 'Distance'] = 0.0
time_diffs = time_diffs.mask(track_start)

# Speed in km/h
speed = data['Distance'] / time_diffs

# A zero time gap yields +/-inf; store those as NaN. The result is assigned
# back explicitly instead of using inplace=True on a column selection, which
# may operate on a temporary copy and leave `data` unchanged.
data['Speed'] = speed.replace([np.inf, -np.inf], np.nan)

# Print the results
print(data[['timestamps_UTC', 'lat', 'lon', 'Distance', 'Speed']])

In [None]:
# Overwrite every speed above the cap with 0; NaN entries stay as they are
speed_cap = 120
data['Speed'] = data['Speed'].mask(data['Speed'] > speed_cap, 0)

In [None]:
# Store the derived columns as 32-bit floats to shrink the dataframe
for derived_col in ('Distance', 'Speed'):
    data[derived_col] = data[derived_col].astype('float32')

# Distribution of Distance

In [None]:
# Plot theme: tick-style axes with the muted palette
sns.set_theme(style='ticks', palette='muted')

# Initialize the matplotlib figure
plt.figure(figsize=(12, 6))

# Kernel density estimate of the row-to-row distance. The legend label now
# names the plotted column; it previously carried a sensor name copied from
# another plot.
sns.kdeplot(data['Distance'], fill=True, color="#1f77b4", label='Distance', bw_adjust=0.5)

# Titles and axis labels (Distance is computed in kilometers)
plt.title('Distribution of Distance', fontsize=16, fontweight='bold', family='serif')
plt.xlabel('Distance (km)', fontsize=14, family='serif')
plt.ylabel('Density', fontsize=14, family='serif')

# Remove the top/right spines and add gridlines
sns.despine(trim=True)
plt.grid(True)

# Frameless legend; the plotted quantity is a derived column, not a sensor
plt.legend(title='Variable', title_fontsize=13, fontsize=12, frameon=False)

plt.show()

# Distribution of Speed

In [None]:
# Plot theme: tick-style axes with the muted palette
sns.set_theme(style='ticks', palette='muted')

# Initialize the matplotlib figure
plt.figure(figsize=(12, 6))

# Kernel density estimate of the computed speed. The legend label now names
# the plotted column; it previously carried a sensor name copied from another
# plot.
sns.kdeplot(data['Speed'], fill=True, color="#1f77b4", label='Speed', bw_adjust=0.5)

# Titles and axis labels (Speed is kilometers divided by hours)
plt.title('Distribution of Speed', fontsize=16, fontweight='bold', family='serif')
plt.xlabel('Speed (km/h)', fontsize=14, family='serif')
plt.ylabel('Density', fontsize=14, family='serif')

# Remove the top/right spines and add gridlines
sns.despine(trim=True)
plt.grid(True)

# Frameless legend; the plotted quantity is a derived column, not a sensor
plt.legend(title='Variable', title_fontsize=13, fontsize=12, frameon=False)

plt.show()

# Weather Data

In [None]:
dataset_path = '/kaggle/input/weather-data-7d/WeatherData.csv'

# Fail fast when the input is missing: the following cells all need
# `weather_data`, so merely printing a message would only postpone the
# failure to a NameError.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The file {dataset_path} does not exist.")

weather_data = pd.read_csv(dataset_path)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly rather than wrapped in print() (which printed "None").
weather_data.info()

# First few rows of the dataframe
data_head = weather_data.head()
print(data_head)

In [None]:
# Keep only the first occurrence of each fully identical weather row
is_repeated_row = weather_data.duplicated()
weather_data = weather_data[~is_repeated_row]

In [None]:
# Convert 'datetime' to a datetime object so the x-axis is treated as time
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'])

# Plot theme. set_theme() matches the other plotting cells; sns.set() is only
# an alias for it.
sns.set_theme(style="whitegrid")

# 3x2 grid of panels; the last one is removed below
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 12))

# Temperature trends. Each line carries its own label: passing a bare list of
# labels to legend() pairs them with artists by drawing order, and when
# lineplot draws a confidence band for the first series the second label can
# end up on that band instead of on the second line.
sns.lineplot(x='datetime', y='temp_min', data=weather_data, ax=axes[0, 0], label='Min Temperature')
sns.lineplot(x='datetime', y='temp_max', data=weather_data, ax=axes[0, 0], label='Max Temperature')
axes[0, 0].set_title('Temperature Trends')
axes[0, 0].set_ylabel('Temperature (K)')
axes[0, 0].legend()

# Pressure trend
sns.lineplot(x='datetime', y='pressure', data=weather_data, ax=axes[0, 1])
axes[0, 1].set_title('Pressure Trend')
axes[0, 1].set_ylabel('Pressure (hPa)')

# Humidity trend
sns.lineplot(x='datetime', y='humidity', data=weather_data, ax=axes[1, 0])
axes[1, 0].set_title('Humidity Trend')
axes[1, 0].set_ylabel('Humidity (%)')

# Wind speed trend
sns.lineplot(x='datetime', y='wind', data=weather_data, ax=axes[1, 1])
axes[1, 1].set_title('Wind Speed Trend')
axes[1, 1].set_ylabel('Wind Speed (m/s)')

# Cloudiness trend
sns.lineplot(x='datetime', y='clouds', data=weather_data, ax=axes[2, 0])
axes[2, 0].set_title('Cloudiness Trend')
axes[2, 0].set_ylabel('Cloudiness (%)')

# Drop unused subplot
fig.delaxes(axes[2][1])

# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()

# Basic statistics for each numeric column; the bare name on the last line
# makes the notebook display the table
basic_statistics = weather_data.describe()

basic_statistics

# Merge Data

In [None]:
# Make sure both timestamp columns hold datetimes
data['timestamps_UTC'] = pd.to_datetime(data['timestamps_UTC'])
weather_data['datetime'] = pd.to_datetime(weather_data['datetime'])

# Round coordinates to two decimals so they can serve as join keys. Both
# sides are cast to float64 before rounding: `data` stores lat/lon as
# float32, and a rounded float32 (e.g. 51.02 -> 51.02000045776367 once
# upcast) never compares equal to the float64 value 51.02 on the weather
# side, so the left join would silently match almost nothing.
for frame in (data, weather_data):
    for coord in ('lat', 'lon'):
        frame[coord] = frame[coord].astype('float64').round(2)

# Hour bucket used as the time join key (timestamp rounded to the nearest
# hour). Lowercase 'h' is the current alias; uppercase 'H' is deprecated in
# recent pandas releases.
data['date_hour'] = data['timestamps_UTC'].dt.round('h')
weather_data['date_hour'] = weather_data['datetime'].dt.round('h')

# Left join: keep every row of `data`, attach weather columns where a
# matching (date_hour, lat, lon) exists
merged_data = pd.merge(data, weather_data, how='left', on=['date_hour', 'lat', 'lon'])
merged_data.head()

In [None]:
# Show only the merged rows that carry a weather_id value
has_weather = merged_data['weather_id'].notna()
merged_data[has_weather]

# Save to CSV

In [None]:
# Write the `data` frame, row index included, to a CSV in the working directory.
# NOTE(review): `merged_data` is not what gets saved here — confirm that is intended.
output_path = 'enriched_cleaned_ar41_for_ulb.csv'
data.to_csv(output_path, index=True)