# COMFE
CPSC 300 Software Engineering


In [114]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score
import joblib

## Data Set Exploration

In [None]:
# Location of the dataset used throughout the notebook.
# Earlier local source, kept for reference:
#   "C:\\Users\\swagvillain\\Downloads\\bounding_boxes_with_labels.csv"
DATA_PATH = "data_with_dist_to_near1.csv"
data = pd.read_csv(DATA_PATH)

# Quick look at the first rows to confirm the file loaded as expected
print(data.head())

In [None]:
# Summary of data: column dtypes, non-null counts and memory usage.
# DataFrame.info() writes the report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

In [None]:
# Descriptive statistics for every column (include="all" covers non-numeric
# columns too), reported at the 10th, 25th, 50th and 75th percentiles.
summary_percentiles = [0.1, 0.25, 0.5, 0.75]
data.describe(percentiles=summary_percentiles, include="all")

### Finding rows with near-zero coefficients (flat objects)
Trying to find walls, floors, etc. that we may not want to consider in the model.

In [None]:
# Keep the rows in which at least one coefficient column is <= 1.0
coefficient_columns = ['coeffs_x', 'coeffs_y', 'coeffs_z']
has_small_coefficient = (data[coefficient_columns] <= 1.0).any(axis=1)
filtered_rows = data[has_small_coefficient]

# Show the matching rows
print(filtered_rows)



### Data Preprocessing

In [None]:
# Count fully duplicated rows: duplicated() marks each row that repeats an
# earlier one (the first occurrence is not marked) and sum() counts the marks.
data.duplicated().sum()

## Rounding (already done)

In [None]:
# import numpy as np

# def round_near_integers(data, threshold=1e-1):
#     """
#     Round values close to 0, 1, or -1 in a DataFrame to their nearest integers.

#     Parameters:
#         data (pd.DataFrame): The input DataFrame.
#         threshold (float): The tolerance for rounding near integers.

#     Returns:
#         pd.DataFrame: The modified DataFrame with rounded values.
#     """
#     # Define a rounding function
#     def round_values(value):
#         if np.isclose(value, 0, atol=threshold):
#             return 0
#         elif np.isclose(value, 1, atol=threshold):
#             return 1
#         elif np.isclose(value, -1, atol=threshold):
#             return -1
#         else:
#             return value

#     # Identify numeric columns to round, excluding 'ID' and 'room_ID'
#     numeric_columns = data.select_dtypes(include=[np.number]).columns
#     columns_to_round = [col for col in numeric_columns if col not in ['ID', 'room_ID']]
    
#     # Apply rounding function to the selected columns
#     data[columns_to_round] = data[columns_to_round].applymap(round_values)
#     return data

    


In [None]:
# Show the first rows again. The rounding cell above is commented out, so
# `data` is still the frame exactly as it was loaded.
print(data.head())

### Correlations and visualization

Let's look at the data correlations. First we split the data into numeric and categorical columns, then we look at the correlations between the numeric ones.

In [None]:
# Partition the columns by dtype: everything numeric on one side, the rest
# (treated as categorical) on the other.
numeric_kinds = [np.number]
numeric_data = data.select_dtypes(include=numeric_kinds)
categorical_data = data.select_dtypes(exclude=numeric_kinds)



In [None]:
# Pairwise correlation matrix of the numeric columns
corr = numeric_data.corr()

# Render it as a colour-graded table; the styled object is the cell's last
# expression so the notebook displays it.
heatmap = corr.style.background_gradient(cmap='coolwarm')
heatmap

## Modelling Time.
Let's make some models to train on our cleaned and processed data.

## calculating nearest other bounding box

In [None]:

# import pandas as pd
# import numpy as np
# from tqdm import tqdm

# # Assuming `data` is the DataFrame provided
# data['dist_to_nearest'] = np.inf  # Initialize with infinity for all rows

# # Function to calculate Euclidean distance between two bounding boxes
# def compute_distance(row1, row2, dimensions=['x', 'y', 'z']):
#     deltas = []
#     for dim in dimensions:
#         delta = (row1[f'centroid_{dim}'] - row2[f'centroid_{dim}']) ** 2
#         deltas.append(delta)
#     return np.sqrt(sum(deltas))

# # Iterate over each unique room_ID
# for room_id in tqdm(data['room_ID'].unique(), desc="Processing room_IDs"):
#     room_data = data[data['room_ID'] == room_id]
    
#     # Iterate over each bounding box in the room
#     for i, row in room_data.iterrows():
#         min_distance = np.inf
#         for j, other_row in room_data.iterrows():
#             if i != j:  # Skip the current box itself
#                 dist = compute_distance(row, other_row, dimensions=['x', 'y', 'z'])
#                 min_distance = min(min_distance, dist)
        
#         # Update the distance for this row in the main DataFrame
#         data.at[i, 'dist_to_nearest'] = min_distance

In [None]:
# Preview the first rows. head must be *called*: printing the bare attribute
# shows the bound method (with the whole frame's repr) instead of five rows.
print(data.head())

# Persist the frame. index=False keeps the row index out of the file;
# this CSV is also the notebook's input, so writing the index would add one
# more "Unnamed: 0"-style column on every load/save round trip.
data.to_csv('data_with_dist_to_near1.csv', index=False)

### Adding fake data to the mix

#### Remove basis from real data

In [None]:
# Columns to remove from the real data before modelling: the two unnamed
# index columns plus the nine basis_<row>_<col> entries (each listed once).
columns_to_drop = ['Unnamed: 0.1', 'Unnamed: 0'] + [
    f'basis_{row}_{col}' for row in range(3) for col in range(3)
]

# errors='ignore' skips labels that are already absent, so the cell can be
# re-run and also works on a file that was saved without the index columns.
data = data.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Number of synthetic ("fake") rows to generate
num_rows = 200000

# Seed NumPy's global RNG so the synthetic rows are reproducible.
# The dict entries below are drawn in the order written, so reordering them
# changes the generated values for the same seed.
np.random.seed(42)

dummy_data = {
    # Rotation basis columns are intentionally not generated:
    # "basis_0_0": np.random.randint(-1, 1, size=num_rows), ignore the rotation
    # "basis_0_1": np.random.randint(-1, 1, size=num_rows),
    # "basis_0_2": np.random.randint(-1, 1, size=num_rows),
    # "basis_1_0": np.random.randint(-1, 1, size=num_rows),
    # "basis_1_1": np.random.randint(-1, 1, size=num_rows),
    # "basis_1_2": np.random.randint(-1, 1, size=num_rows),
    # "basis_2_0": np.random.randint(-1, 1, size=num_rows),
    # "basis_2_1": np.random.randint(-1, 1, size=num_rows),
    # "basis_2_2": np.random.randint(-1, 1, size=num_rows),
    # Normal with mean -39345 and std 2000.
    # NOTE(review): an earlier comment quoted mean=39345, std=9316 - confirm
    # which values are meant to match the real data.
    "centroid_x": np.random.normal(-39345, 2000, size=num_rows),
    # Uniform on [-44081, 1000): uniform() takes (low, high), not (mean, std).
    # NOTE(review): an earlier comment quoted mean=-44081, std=6586 - confirm
    # that using -44081 as the lower bound is intended.
    "centroid_y": np.random.uniform(-44081, 1000, size=num_rows),
    # Normal with mean -3873 and std 22531
    "centroid_z": np.random.normal(-3873, 22531, size=num_rows),
    # Coefficients drawn uniformly from 0 up to a per-axis upper bound
    "coeffs_x": np.random.uniform(0, 19234, size=num_rows),
    "coeffs_y": np.random.uniform(0, 17126, size=num_rows),
    "coeffs_z": np.random.uniform(0, 7747, size=num_rows),
    # Integer IDs in [0, 3496) - the upper bound is exclusive
    "room_ID": np.random.randint(0, 3496, size=num_rows),
    # Exponential distribution; its scale parameter equals its mean (734.26)
    "dist_to_nearest": np.random.exponential(scale=734.26, size=num_rows),
    # Scalar label, broadcast to every row when the DataFrame is built
    "is_real": 0
}

# Convert to a pandas DataFrame
dummy_data = pd.DataFrame(dummy_data)

# Display the last few rows
print(dummy_data.tail())

### add column to real data

In [None]:
# Label every row of the real data with 1 (the scalar is broadcast to all
# rows); the generated rows carry is_real = 0.
data['is_real'] = 1

In [None]:
# Stack the real rows and the generated rows into one frame with a fresh
# 0..n-1 index. Columns present in only one of the two frames are filled
# with NaN for the rows of the other.
full_data = pd.concat([data, dummy_data], ignore_index=True)

# Preview the first rows. head must be *called*: printing the bare attribute
# shows the bound method (with the whole frame's repr) instead of five rows.
print(full_data.head())


### splitting data
We split the data into train and test sets.

In [None]:


# Separate the target column from the feature columns
TARGET = 'is_real'
y = full_data[TARGET]
X = full_data.drop(columns=[TARGET])

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity check: report the shape of each piece of the split
for label, part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(label, part.shape)




### Decision Tree

Making a tree classifier and evaluating it with cross-validation.

In [117]:
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import numpy as np
import pickle

# Shallow decision tree. random_state fixes the tie-breaking between equally
# good splits so repeated runs build the same tree.
dt_classifier = DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42)

# Stratified folds keep the real/fake ratio the same in every fold
cv = StratifiedKFold(n_splits=5)

# Cross-validate by hand so tqdm can show one tick per fold
dt_scores = []
fold_splits = cv.split(X_train, y_train)
n_folds = cv.get_n_splits(X_train, y_train)
for train_idx, test_idx in tqdm(fold_splits, total=n_folds, desc="Cross-Validation Progress"):
    X_fold_train, X_fold_test = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_fold_train, y_fold_test = y_train.iloc[train_idx], y_train.iloc[test_idx]

    # Fit an unfitted copy per fold so each score comes from an independent
    # model and dt_classifier itself is not left holding a fold-only fit.
    fold_model = clone(dt_classifier)
    fold_model.fit(X_fold_train, y_fold_train)
    dt_scores.append(fold_model.score(X_fold_test, y_fold_test))

# Convert scores to a NumPy array for mean and standard deviation calculations
dt_scores = np.array(dt_scores)

# Accuracy across folds
print("Mean Accuracy (Decision Tree):", dt_scores.mean())
print("Standard Deviation Accuracy (Decision Tree):", dt_scores.std())

# Fit the model that gets exported on the *whole* training set; the fold
# models above each saw only 4/5 of it.
dt_classifier.fit(X_train, y_train)

# Exporting/saving the model
# (joblib alternative: joblib.dump(dt_classifier, 'COMFE_model.pkl'))
with open('COMFE_model.pkl', 'wb') as f:
    pickle.dump(dt_classifier, f)


Cross-Validation Progress: 100%|██████████| 5/5 [00:07<00:00,  1.56s/it]

Mean Accuracy (Decision Tree): 0.9999883639560607
Standard Deviation Accuracy (Decision Tree): 3.878665642114124e-06





#### Finding Best Hyperparameters for DT

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Define hyperparameters grid for Decision Tree
# dt_param_grid = {
#     'max_depth': [None, 10, 20, 30],
# }

# # GridSearchCV for Decision Tree
# dt_grid_search = GridSearchCV(DecisionTreeClassifier(), dt_param_grid, cv=5)
# dt_grid_search.fit(X_train, y_train)

# # Best parameters for Decision Tree
# print("Best Parameters (Decision Tree):", dt_grid_search.best_params_)
# print("Best Score (Decision Tree):", dt_grid_search.best_score_)

We see that the best max_depth is 10. We will use that below.

#### making final (best) decision tree

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# # Define the Decision Tree classifier with the best parameters
# best_dt_classifier = DecisionTreeClassifier(max_depth=10)

# # Train the model on the full training set
# best_dt_classifier.fit(X_train, y_train)

# # Evaluate the model on the testing set
# dt_test_accuracy = best_dt_classifier.score(X_test, y_test)
# print("Test Accuracy (Decision Tree):", dt_test_accuracy)

### Logistic Regression

#### making the model and using cross validation

In [None]:
# print(X_train.isnull().sum())  # Shows count of NaNs per feature
# print(y_train.isnull().sum())  # Check for NaNs in target


In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import cross_val_score, StratifiedKFold
# from sklearn.preprocessing import StandardScaler

# # Scale the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)

# # Create Logistic Regression classifier with adjusted parameters
# logreg_classifier = LogisticRegression(max_iter=1000, class_weight='balanced')

# # Use StratifiedKFold for cross-validation
# stratified_cv = StratifiedKFold(n_splits=5)

# # Perform cross-validation for Logistic Regression classifier
# logreg_scores = cross_val_score(logreg_classifier, X_train_scaled, y_train, cv=stratified_cv)

# # Print the cross-validation scores
# print("Cross-Validation Scores (Logistic Regression):", logreg_scores)

# # Print the mean and standard deviation of the cross-validation scores
# print("Mean Accuracy (Logistic Regression):", logreg_scores.mean())
# print("Standard Deviation of Accuracy (Logistic Regression):", logreg_scores.std())


### Finding most important features in the decision tree classifier

In [None]:
# Read the importances from dt_classifier, the decision tree fitted earlier in
# this notebook. (best_dt_classifier only exists in a commented-out cell, so
# referencing it here raises NameError.)
feature_importances = dt_classifier.feature_importances_

# Get feature names
feature_names = X_train.columns

# Create a dictionary mapping feature names to their importances
feature_importance_dict = dict(zip(feature_names, feature_importances))

# Sort features by their importances, most important first
sorted_features = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# Print the most important features
print("Top 5 Most Important Features:")
for feature, importance in sorted_features[:5]:
    print(f"{feature}: {importance}")

# Plot feature importances (at most ten bars; fewer if there are fewer features)
top_features = sorted_features[:10]
top_names = [feat for feat, _ in top_features]
top_importances = [imp for _, imp in top_features]
bar_positions = range(len(top_features))

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_importances, align='center')
plt.yticks(bar_positions, top_names)
# barh draws position 0 at the bottom; flip the axis so the most important
# feature is the top bar.
plt.gca().invert_yaxis()
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Top 10 Most Important Features')
plt.show()