In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt

# Define the folder containing 15-minute set files
data_folder = './time_sets'

# Columns every file must provide before it is worth processing
required_columns = ['Latitude', 'Longitude', 'SOG', 'COG', 'Heading', 'Navigational status']

# Processed per-file frames. They are concatenated once after the loop so the
# combined frame is not re-copied on every iteration.
processed_frames = []

# Loop through all CSV files in the folder. The listing is sorted because
# os.listdir() returns entries in arbitrary order, which would otherwise make
# the row order (and the seeded train/test split built on it) non-reproducible.
for file in sorted(os.listdir(data_folder)):
    if not file.endswith('.csv'):
        continue

    file_path = os.path.join(data_folder, file)
    df = pd.read_csv(file_path)

    # Debugging: Show initial data structure
    print(f"Processing file: {file}")
    print(f"Initial data shape: {df.shape}")

    # Validate the schema up front so a file without the expected columns
    # (including Timestamp) is skipped instead of raising a KeyError below.
    if any(col not in df.columns for col in ['Timestamp'] + required_columns):
        print(f"Skipping {file}, missing required columns.")
        continue

    # Ensure Timestamp is a proper datetime object (unparseable values -> NaT)
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    print("Data types after Timestamp conversion:\n", df.dtypes)

    # Drop rows with invalid timestamps
    invalid_rows = df[df['Timestamp'].isna()]
    if not invalid_rows.empty:
        print("Rows with invalid timestamps:\n", invalid_rows)
        df = df.dropna(subset=['Timestamp'])

    if df.empty:
        print(f"Skipping {file}, no rows with a valid timestamp.")
        continue

    # Sort by Timestamp and use it as the index. A stable sort keeps the file
    # order among equal timestamps, so keep='first' below is deterministic.
    df = df.sort_values('Timestamp', kind='mergesort').set_index('Timestamp')
    print("Index after setting Timestamp:\n", df.index)

    # Remove duplicate timestamps (reindexing below requires a unique index)
    if df.index.duplicated().any():
        print("Duplicate timestamps detected. Removing duplicates.")
        df = df[~df.index.duplicated(keep='first')]

    # Debugging: Check time range and unique timestamps
    print("Time range:", df.index.min(), "to", df.index.max())
    print("Number of unique timestamps:", df.index.nunique())

    # Build the 1-minute grid. asfreq() is a plain reindex: it only keeps
    # samples that sit exactly on a minute boundary, so it is used here for
    # the grid (and the debug print), not as the interpolation input.
    df_resampled = df.resample('1min').asfreq()
    print("Data after resampling, before interpolation:\n", df_resampled.head(10))
    minute_grid = df_resampled.index

    # Interpolate on the union of the observed timestamps and the grid so that
    # off-grid observations still contribute to the values on the grid.
    aligned = df.reindex(df.index.union(minute_grid))
    numeric_cols = list(aligned.select_dtypes(include='number').columns)
    numeric_set = set(numeric_cols)
    label_cols = [col for col in aligned.columns if col not in numeric_set]

    # Numeric columns: interpolation weighted by the actual time gaps.
    # NOTE(review): if COG/Heading are compass angles in degrees, interpolating
    # across the 359 -> 0 wrap-around is inaccurate -- confirm whether it matters.
    if numeric_cols:
        aligned[numeric_cols] = aligned[numeric_cols].interpolate(method='time')

    # Non-numeric columns (e.g. the status label) cannot be interpolated;
    # carry the last observed value forward instead of leaving NaN.
    if label_cols:
        aligned[label_cols] = aligned[label_cols].ffill()

    # Keep only the grid points and restore Timestamp as a regular column
    df = aligned.reindex(minute_grid).rename_axis('Timestamp').reset_index()

    # Debugging: Check the final output
    print("Data shape after interpolation:", df.shape)
    print("Head of interpolated data:\n", df.head())

    processed_frames.append(df)

# Combine all processed sets; fail with a clear message when nothing was usable
if not processed_frames:
    raise ValueError(f"No usable CSV files found in {data_folder!r}")
all_data = pd.concat(processed_frames, ignore_index=True)

# Feature selection
features = ['Latitude', 'Longitude', 'SOG', 'COG', 'Heading']
target = 'Navigational status'

# Drop rows with missing values in the selected columns
all_data = all_data.dropna(subset=features + [target])

# Encode the target variable (categorical to numeric). The code -> status
# mapping is printed because the integer codes are all that the later
# classification report and confusion matrix will show.
target_categories = all_data[target].astype('category')
print("Target encoding (code -> status):\n", dict(enumerate(target_categories.cat.categories)))
all_data[target] = target_categories.cat.codes

# Debugging: Inspect class distribution
print("Class distribution:\n", all_data[target].value_counts())

# Define feature matrix (X) and target vector (y)
X = all_data[features]
y = all_data[target]

# Handle class imbalance with class weights. compute_class_weight returns the
# weights in the order of `classes`, so the dict is keyed by the class labels
# themselves rather than by their position in the array.
classes = np.unique(y)
class_weights = compute_class_weight('balanced', classes=classes, y=y)
class_weight_dict = {int(label): float(weight) for label, weight in zip(classes, class_weights)}
print("Computed class weights:\n", class_weight_dict)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Build the class-weighted Random Forest and fit it on the training portion
model = RandomForestClassifier(
    class_weight=class_weight_dict,
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)

# Summarise per-class precision/recall and the raw confusion counts
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)
confusion_counts = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_counts)

# Visualize the confusion matrix as an annotated heatmap.
# Drawn with matplotlib only: the original call used `sns`, which is never
# imported in this file and therefore raised NameError.
class_labels = np.unique(y)

# Pass the label set explicitly so the matrix always has one row/column per
# class in `y`, even when a class is absent from y_test and y_pred; otherwise
# the tick labels below could outnumber the matrix cells.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(10, 7))
image = ax.imshow(conf_matrix, cmap='Blues')
fig.colorbar(image, ax=ax)
ax.set_title("Confusion Matrix")

tick_positions = np.arange(len(class_labels))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(class_labels)
ax.set_yticklabels(class_labels)

# Write each count into its cell; switch to white text on dark cells so the
# numbers stay readable.
dark_threshold = conf_matrix.max() / 2.0 if conf_matrix.size else 0.0
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(
        col,
        row,
        format(int(count), 'd'),
        ha='center',
        va='center',
        color='white' if count > dark_threshold else 'black',
    )

ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()
