In [None]:
# Installing libraries (third-party packages imported by this notebook):
# pip install geopy folium pandas numpy matplotlib tqdm miceforest

In [None]:
# 1
# removing duplicate lane entries
# removing lane info "_lane" from dataset
import pandas as pd

def _insert_decimal_point(values, digits_before):
    """Convert each value to text, insert '.' after ``digits_before`` characters and parse it as float."""
    return values.astype(str).map(
        lambda text: float(text[:digits_before] + '.' + text[digits_before:])
    )


def clean_dataset(df):
    """Return a de-duplicated, normalised copy of the detector table.

    The frame is expected to use integer column labels (it is read with
    ``header=None`` elsewhere in this notebook), with columns 5 and 6 sitting
    at positions 5 and 6.

    Steps:
      * keep only the first row for every distinct (column 5, column 6) pair;
      * column 1: cast to string and drop everything from the last underscore on;
      * column 5: insert a decimal point after the first 2 characters -> float;
      * column 6: insert a decimal point after the first 4 characters -> float
        (presumably the 4 characters are a sign plus three digits, e.g. "-115"
        -- confirm against the raw file).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw detector table. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns.

    Raises
    ------
    ValueError
        If a value in column 5 or 6 does not form a valid float once the
        decimal point is inserted (e.g. missing values).
    """
    # drop_duplicates hands back a frame pandas may still treat as derived from
    # `df`; take an explicit copy so the assignments below neither warn nor
    # depend on copy/view semantics.
    reduced_df = df.drop_duplicates(subset=[5, 6], keep='first').copy()

    # Replace whole columns instead of writing through .loc/.iloc: the new
    # values have a different dtype (str / float) than the original columns,
    # and an in-place write would have to upcast the existing column.
    reduced_df[1] = reduced_df[1].astype(str).str.rsplit('_', n=1).str[0]
    reduced_df[5] = _insert_decimal_point(reduced_df[5], 2)
    reduced_df[6] = _insert_decimal_point(reduced_df[6], 4)

    return reduced_df


In [None]:
# 2
# group data based on sensor location description 
import pandas as pd

def get_sensor_group(data, sensor_loc, ignore=None):
    """Select the rows whose description (column 3) belongs to one sensor group.

    Parameters
    ----------
    data : pandas.DataFrame
        Detector table with the textual location description in column 3.
    sensor_loc : str
        Prefix the description has to start with (e.g. "US-95 NB").
    ignore : str, optional
        Rows whose description contains this pattern are excluded. It is
        passed to ``Series.str.contains`` unchanged, so it is interpreted as a
        regular expression (the pandas default). ``None`` disables the filter.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``data``.
    """
    descriptions = data[3]

    # na=False: a missing description simply does not match. Without it the
    # mask contains NaN and boolean indexing raises.
    mask = descriptions.str.startswith(sensor_loc, na=False)
    if ignore is not None:
        mask = mask & ~descriptions.str.contains(ignore, na=False)

    return data[mask]


In [None]:
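# Connecting the sensors of a group into a network for plotting.
# Note: despite the name `tsp`, this is not a travelling-salesman tour: the closest pair of
# sensors is linked first, then each remaining sensor is attached to its nearest already-connected sensor.
# It is working as intended.
# TODO: make it a function that takes a sensor group and also produces a sensor adjacency list.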

import pandas as pd
import folium
from geopy.distance import distance

def calculate_distance(sensor_data):
    """Build the symmetric matrix of pairwise distances between sensors.

    Parameters
    ----------
    sensor_data : pandas.DataFrame
        One row per sensor; column 5 and column 6 hold the two coordinates
        that are passed (in that order) to geopy's ``distance``.

    Returns
    -------
    list[list[float]]
        ``distances[i][j]`` is the ``.meters`` value of the geopy distance
        between the i-th and j-th row (by position); the diagonal is 0.0.
        An empty frame yields an empty list.
    """
    # Pull the coordinates out once; re-materialising `sensor_data.iloc[i]`
    # inside the pair loop builds a new row Series on every iteration.
    coords = list(zip(sensor_data[5].tolist(), sensor_data[6].tolist()))
    num_sensors = len(coords)

    distances = [[0.0] * num_sensors for _ in range(num_sensors)]
    for i in range(num_sensors):
        for j in range(i + 1, num_sensors):
            dist = distance(coords[i], coords[j]).meters
            # The distance is symmetric, so fill both halves of the matrix.
            distances[i][j] = dist
            distances[j][i] = dist
    return distances

def _nearest_neighbour_links(distances):
    """Return the (new_sensor, anchor_sensor) index pairs that link all sensors, closest first."""
    num_sensors = len(distances)
    if num_sensors == 0:
        return []

    # Seed the network with the globally closest pair of sensors.
    shortest = float('inf')
    seed = None
    for i in range(num_sensors):
        for j in range(i + 1, num_sensors):
            if distances[i][j] < shortest:
                shortest = distances[i][j]
                seed = (i, j)

    if seed is None:
        # Single sensor (or no comparable distance at all): there is no pair
        # to start from, so start from the first sensor. Leaving the connected
        # set empty would make the loop below spin forever.
        connected = {0}
        links = []
    else:
        connected = set(seed)
        links = [seed]

    # Repeatedly attach the unconnected sensor that is closest to any sensor
    # already in the network.
    while len(connected) < num_sensors:
        shortest = float('inf')
        link = None
        for i in range(num_sensors):
            if i in connected:
                continue
            for j in connected:
                if distances[i][j] < shortest:
                    shortest = distances[i][j]
                    link = (i, j)
        if link is None:
            # Nothing reachable (e.g. NaN distances): stop instead of looping forever.
            break
        links.append(link)
        connected.add(link[0])

    return links


def tsp(sensor_data):
    """Draw a sensor group on a folium map and connect the sensors with red lines.

    Despite its name this does not compute a travelling-salesman tour: the
    closest pair of sensors is linked first, then every remaining sensor is
    linked to its nearest already-connected sensor.

    Parameters
    ----------
    sensor_data : pandas.DataFrame
        One row per sensor. Column 1 and column 3 are shown in the marker
        tooltip; columns 5 and 6 are used as the marker/line coordinates.

    Returns
    -------
    folium.Map
        Map centred on the mean of columns 5 and 6, containing one red
        polyline per link and one blue circle marker per sensor.
    """
    distances = calculate_distance(sensor_data)
    # Named `sensor_map` rather than `map` so the builtin is not shadowed.
    sensor_map = folium.Map(location=[sensor_data[5].mean(), sensor_data[6].mean()], zoom_start=10)

    # One polyline per link between two sensors.
    for new_idx, anchor_idx in _nearest_neighbour_links(distances):
        sensor1 = sensor_data.iloc[new_idx]
        sensor2 = sensor_data.iloc[anchor_idx]
        folium.PolyLine([(sensor1[5], sensor1[6]), (sensor2[5], sensor2[6])],
                        color='red').add_to(sensor_map)

    # One marker per sensor, with id and description as tooltip.
    for _, sensor in sensor_data.iterrows():
        tooltip_text = f"Sensor_ID: {sensor[1]}, Des: {sensor[3]}"
        folium.CircleMarker(
            location=[sensor[5], sensor[6]],
            radius=5,
            color='blue',
            fill=True,
            fill_color='blue',
            tooltip=tooltip_text  # Add tooltip
            ).add_to(sensor_map)
    return sensor_map



In [None]:
# Run the full process: load the detector list, pick one sensor group and plot it.

# TODO: wrap this cell in a main() function.

# Read the original detector data and clean it:
data = clean_dataset(pd.read_csv('../detector/detectors2018.csv', header=None))

# Select the sensor group (prefix of the description in column 3); uncomment exactly one:

#sensor_loc = "CC-215 WB"
#sensor_loc="CC-215 EB"
#sensor_loc="I-15 NB"
#sensor_loc= "I-515 SB"
#sensor_loc="US-95 SB"
sensor_loc="US-95 NB"


# Exclude sensors whose description contains this text from the sensor group
ignore = 'Ramp'

# Get the sensors belonging to the selected sensor group
sensor_group = get_sensor_group(data, sensor_loc, ignore)

# Plot the map of the sensor group (the bare last expression makes the notebook display the map)

sensor_map = tsp(sensor_group)
sensor_map

In [None]:
# Dataset preparation based on a sensor group:
# takes the sensor list of one sensor group and merges the matching per-sensor
# data files into a single dataset with one column per sensor.

import os

import pandas as pd
from tqdm import tqdm

# Read the original detector data and clean it:
data = clean_dataset(pd.read_csv('../detector/detectors2018.csv', header=None))
# Get the list of sensors using the sensor group name:
sensor_loc="I-15 NB"
ignore = 'Ramp'
sensor_group = get_sensor_group(data, sensor_loc, ignore)

##########################################################################################

# Merge the individual sensor data files that belong to the selected sensors.

# Expected file names: 'd_<value of column 1>.csv' for every sensor of the group
#file_list = pd.read_csv('/home/tzahid/Desktop/pred_sensor_list.csv')['pred_sensor'].tolist()
file_list = ['d_' + filename + '.csv' for filename in sensor_group[1]]

# Keep only the csv files in the folder whose name is in the expected list
path = '/home/tzahid/Desktop/new2018dump/'
all_files = os.listdir(path)
expected_files = set(file_list)  # set membership instead of scanning the list for every file
csv_files = [filename for filename in all_files if filename.endswith('.csv') and filename in expected_files]


print(f"{len(csv_files)} CSV files found matching with the list of file names.")


# Find the largest index value over all files (first column, headed 'index_date')
max_index = 0
for file in csv_files:
    try:
        # Separate name for the per-file frame so the cleaned detector table in
        # `data` is not overwritten by this cell.
        index_column = pd.read_csv(os.path.join(path, file), usecols=[0])
        max_index = max(max_index, index_column['index_date'].max())
    except FileNotFoundError:
        print(f"File not found: {file}. Skipping to next file.")

# The column maximum may come back as a numpy/float scalar; range() needs an int.
max_index = int(max_index)
index_range = range(1, max_index+1)

# Extract the 5th column (position 4) of every file, aligned on the full index range
sensor_columns = {}
for file in tqdm(csv_files, desc='Processing CSV files'):
    try:
        col_name = os.path.splitext(file)[0]
        col_name = col_name.replace(" ", "_")  # replace spaces with underscores in column names
        # header=0 together with `names` already replaces the file's header line.
        # The former extra skiprows=[0] removed the header line first, so pandas
        # consumed the first *data* row as the header and that record was lost.
        sensor_frame = pd.read_csv(
            os.path.join(path, file),
            usecols=[0, 4],  # modify usecols to define which columns to extract, here 0 and 4
            header=0,
            names=['index_date', col_name],
        )
        sensor_frame = sensor_frame.drop_duplicates(subset=['index_date'])  # keep the first row per index_date
        sensor_frame = sensor_frame.set_index('index_date')  # set the index to the 'index_date' column
        sensor_frame = sensor_frame.reindex(index_range, fill_value=0)  # add missing index rows and fill with 0
        sensor_columns[col_name] = sensor_frame[col_name]
    except FileNotFoundError:
        print(f"File not found: {file}. Skipping to next file.")

# Build the merged frame in one go; inserting columns one at a time fragments the frame.
df = pd.DataFrame(sensor_columns, index=index_range)

# save the merged dataframe to a new csv file
# name of the final file
_filename= sensor_loc +' without '+ ignore +'.csv'
df.to_csv(os.path.join(path, _filename), index_label='index_date')

# print the range of index values
print(f"Index range: 1 - {max_index}")

# print final message
print(f"Concatenation complete. {len(df.columns)} files have been concatenated.")



In [None]:
import pandas as pd
import numpy as np
import miceforest as mf

In [None]:
# Dataset analysis: mask zeros as missing, drop sparse sensor columns and impute the rest.

import matplotlib.pyplot as plt
import numpy as np

# Load the merged dataset. This read used to be commented out, so `raw_data`
# only existed as left-over kernel state and the cell failed with a NameError
# after "Restart & Run All".
raw_data=pd.read_csv("E:/I-15 Datataset/2018/I-15 NB without Ramp.csv")

# Get the row and column number of the dataframe

num_rows, num_columns = raw_data.shape
print(f"raw data rows: {num_rows} columns: {num_columns}")

# Replace 0 by NaN
data_masked = raw_data.mask(raw_data == 0, np.nan)

# Remove columns that have fewer than threshold_percentage % non-missing rows
threshold_percentage = 80
thresh_value = int((threshold_percentage / 100) * len(data_masked))
df_cleaned = data_masked.dropna(axis=1, thresh=thresh_value)
num_rows, num_columns = df_cleaned.shape
print(f"cleaned data rows: {num_rows} columns: {num_columns}")

# Display the NaN percentage for each column and the mean over all columns
nan_percentage = df_cleaned.isna().mean() * 100
total_nan_percentage=nan_percentage.mean()

print(f"Column NaN %: \n{nan_percentage}")
print(f"Total NaN %: {total_nan_percentage}")

# Impute the remaining NaN values with miceforest

kds = mf.ImputationKernel(
  df_cleaned,
  datasets=1,
  save_all_iterations=True,
  random_state=1991
)
# Run the MICE algorithm for 3 iterations
kds.mice(3)

# Next step: sample into 5/10/15 mins windows evenly

In [189]:
# Take the imputed dataset out of the kernel and attach the date table to it.
data_complete = kds.complete_data()

# Date lookup table; it is matched on its 'date_index' column against 'index_date'.
date_index = pd.read_csv("E:/I-15 Datataset/2018/dates.csv")
timed_data_complete = data_complete.merge(
    date_index,
    how='inner',
    left_on='index_date',
    right_on='date_index',
)


In [190]:
import pandas as pd

# Index the data by timestamp. Guarded so the cell can be re-run: after the
# first run 'date_time' is the index, no longer a column, and an unguarded
# second run would fail with a KeyError.
if 'date_time' in timed_data_complete.columns:
    timed_data_complete['date_time'] = pd.to_datetime(timed_data_complete['date_time'])
    timed_data_complete.set_index('date_time', inplace=True)

# Average into 5-minute windows ('5min' is the non-deprecated spelling of the '5T' alias)
timed_sampled = timed_data_complete.resample('5min').mean()

print(timed_sampled)

output_file_path = 'E:/I-15 Datataset/2018/5min_sampled_data.csv'  # Replace with the desired file path
timed_sampled.to_csv(output_file_path)


                     index_date  d_398_3_411  d_443_2_343  d_444_3_345  \
date_time                                                                
2018-01-02 00:00:00     36572.5   115.500000        82.75        240.0   
2018-01-02 00:05:00     36577.0   127.000000        89.20        240.0   
2018-01-02 00:10:00     36582.0   157.200000        85.00        240.0   
2018-01-02 00:15:00     36587.0   186.400000        94.60        240.0   
2018-01-02 00:20:00     36592.0   170.400000        94.80        240.0   
...                         ...          ...          ...          ...   
2018-12-31 23:35:00    373028.0   202.666667       240.00        240.0   
2018-12-31 23:40:00    373031.0   150.666667       240.00        240.0   
2018-12-31 23:45:00    373033.5   193.000000       240.00        240.0   
2018-12-31 23:50:00    373035.5   147.000000       240.00        240.0   
2018-12-31 23:55:00    373037.0   158.000000       240.00        240.0   

                     d_439_2_333  d_4