In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('D:\Study\Sem VI\Smart-Building-Maintenance-Prediction-\data_no_outliers.csv')

In [4]:
df = data.copy()

In [5]:
# drop coumn roomid
df = df.drop(['roomid'], axis=1)

In [6]:
df.shape

(13691548, 6)

In [7]:
unique_values_pir = df['pir'].unique()
print("Unique values in the 'pir' column:")
print(unique_values_pir)

Unique values in the 'pir' column:
[ 0. 30. 29. 28. 27. 26. 25. 24. 23. 22. 21. 20. 19. 18. 17. 16. 15. 14.
 13. 12. 11. 10.  9.  8.  7.  6.  5.  4.  3.  2.  1.]


In [8]:
# Function to find outliers using Z-score
def find_outliers_z_score(data):
    threshold = 3
    mean = np.mean(data)
    std = np.std(data)
    z_scores = [(x - mean) / std for x in data]
    return np.abs(z_scores) > threshold

# Function to find outliers using IQR (Interquartile Range)
def find_outliers_iqr(data):
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    return (data < lower_bound) | (data > upper_bound)

In [9]:
# Finding outliers and calculating the percentage
outliers_percentage = {}
for column in df.columns:
    outliers_z = find_outliers_z_score(df[column])
    outliers_iqr = find_outliers_iqr(df[column])
    total_outliers = outliers_z | outliers_iqr
    percentage = (np.sum(total_outliers) / len(df)) * 100
    outliers_percentage[column] = percentage

print("Percentage of outliers in each column:")
for column, percentage in outliers_percentage.items():
    print(f"{column}: {percentage:.2f}%")

Percentage of outliers in each column:
timestamp: 0.00%
co2: 4.27%
humidity: 0.68%
light: 18.42%
pir: 4.67%
temperature: 5.52%


In [10]:
# Perform data transformation (log transformation)
df_transformed = df.copy()
df_transformed['co2'] = np.log(df['co2'])
df_transformed['humidity'] = np.log(df['humidity'])
df_transformed['light'] = np.log(df['light'])
df_transformed['temperature'] = np.log(df['temperature'])

# Impute outliers with median value
for column in df_transformed.columns:
    outliers_z = find_outliers_z_score(df_transformed[column])
    outliers_iqr = find_outliers_iqr(df_transformed[column])
    total_outliers = outliers_z | outliers_iqr
    median_value = np.median(df_transformed[column])
    df_transformed.loc[total_outliers, column] = median_value

In [11]:
df_transformed.head()

Unnamed: 0,timestamp,co2,humidity,light,pir,temperature
0,1377299108,6.204558,4.041998,4.574711,0.0,3.175551
1,1377299110,6.188264,4.041998,4.574711,0.0,3.175551
2,1377299126,6.188264,4.041998,4.574711,0.0,3.175133
3,1377299127,6.206576,4.041998,4.574711,0.0,3.175133
4,1377299130,6.204558,4.041998,4.574711,0.0,3.175133


In [12]:
# Recalculate the percentage of outliers in each column
outliers_percentage_transformed = {}
for column in df_transformed.columns:
    outliers_z = find_outliers_z_score(df_transformed[column])
    outliers_iqr = find_outliers_iqr(df_transformed[column])
    total_outliers = outliers_z | outliers_iqr
    percentage = (np.sum(total_outliers) / len(df_transformed)) * 100
    outliers_percentage_transformed[column] = percentage

print("Percentage of outliers in each column after transformation and imputation:")
for column, percentage in outliers_percentage_transformed.items():
    print(f"{column}: {percentage:.2f}%")

Percentage of outliers in each column after transformation and imputation:
timestamp: 0.00%
co2: 1.61%
humidity: 0.31%
light: 0.00%
pir: 0.00%
temperature: 3.11%


In [13]:
df_transformed.shape

(13691548, 6)

In [14]:
df_transformed.head(10)

Unnamed: 0,timestamp,co2,humidity,light,pir,temperature
0,1377299108,6.204558,4.041998,4.574711,0.0,3.175551
1,1377299110,6.188264,4.041998,4.574711,0.0,3.175551
2,1377299126,6.188264,4.041998,4.574711,0.0,3.175133
3,1377299127,6.206576,4.041998,4.574711,0.0,3.175133
4,1377299130,6.204558,4.041998,4.574711,0.0,3.175133
5,1377299131,6.204558,4.041998,4.574711,0.0,3.175133
6,1377299134,6.214608,4.041998,4.574711,0.0,3.175133
7,1377299136,6.214608,4.041998,4.574711,0.0,3.174715
8,1377299139,6.204558,4.041998,4.574711,0.0,3.174715
9,1377299141,6.204558,4.041998,4.564348,0.0,3.174715


In [15]:
# Drop outliers from the dataset, excluding 'pir' column
df_cleaned = df_transformed.copy()
for column in df_cleaned.columns:
    outliers_z = find_outliers_z_score(df_cleaned[column])
    outliers_iqr = find_outliers_iqr(df_cleaned[column])
    total_outliers = outliers_z | outliers_iqr
    # Exclude 'pir' column from dropping outliers
    if column != 'pir':
        df_cleaned = df_cleaned[~total_outliers]

# Recalculate the percentage of outliers in each column after dropping outliers
outliers_percentage_cleaned = {}
for column in df_cleaned.columns:
    percentage = 0.0  # No outliers after dropping
    outliers_percentage_cleaned[column] = percentage

print("Percentage of outliers in each column after dropping outliers:")
for column, percentage in outliers_percentage_cleaned.items():
    print(f"{column}: {percentage:.2f}%")


Percentage of outliers in each column after dropping outliers:
timestamp: 0.00%
co2: 0.00%
humidity: 0.00%
light: 0.00%
pir: 0.00%
temperature: 0.00%


In [16]:
df_cleaned.shape

(13006661, 6)

In [17]:
df_cleaned.head(10)

Unnamed: 0,timestamp,co2,humidity,light,pir,temperature
0,1377299108,6.204558,4.041998,4.574711,0.0,3.175551
1,1377299110,6.188264,4.041998,4.574711,0.0,3.175551
2,1377299126,6.188264,4.041998,4.574711,0.0,3.175133
3,1377299127,6.206576,4.041998,4.574711,0.0,3.175133
4,1377299130,6.204558,4.041998,4.574711,0.0,3.175133
5,1377299131,6.204558,4.041998,4.574711,0.0,3.175133
6,1377299134,6.214608,4.041998,4.574711,0.0,3.175133
7,1377299136,6.214608,4.041998,4.574711,0.0,3.174715
8,1377299139,6.204558,4.041998,4.574711,0.0,3.174715
9,1377299141,6.204558,4.041998,4.564348,0.0,3.174715


In [18]:
pearson_corr = df_cleaned.corr(method='pearson')

In [19]:
# Calculate Spearman rank correlation coefficient
spearman_corr = df_cleaned.corr(method='spearman')

In [20]:
print("Pearson correlation coefficient:")
print(pearson_corr)

Pearson correlation coefficient:
             timestamp       co2  humidity     light  pir  temperature
timestamp     1.000000  0.130108  0.295280  0.015510  NaN     0.017738
co2           0.130108  1.000000 -0.034329  0.249571  NaN     0.197526
humidity      0.295280 -0.034329  1.000000  0.023553  NaN    -0.550760
light         0.015510  0.249571  0.023553  1.000000  NaN     0.018776
pir                NaN       NaN       NaN       NaN  NaN          NaN
temperature   0.017738  0.197526 -0.550760  0.018776  NaN     1.000000


### Correlation Analysis

The correlation coefficients provide insights into the relationships between different variables in the dataset. Here's a simplified interpretation of the correlation coefficients:

- **Positive Correlation:**
  - *Weak positive correlation:* CO2 shows a weak positive correlation with timestamp (0.118), light (0.160), PIR (0.236), and temperature (0.215). This suggests a slight tendency for CO2 levels to increase with these variables.
  - *Very weak positive correlation:* Light exhibits a very weak positive correlation with CO2 (0.160) and PIR (0.137). There's also a very weak positive correlation between timestamp and humidity (0.291), and between PIR and CO2 (0.236).

- **Negative Correlation:**
  - *Moderate negative correlation:* Humidity has a moderate negative correlation with temperature (-0.684). This indicates a significant tendency for humidity to decrease as temperature increases, and vice versa.
  - *Very weak negative correlation:* CO2 shows a very weak negative correlation with humidity (-0.049), and PIR exhibits a very weak negative correlation with humidity (-0.080). Additionally, timestamp and light have very weak negative correlations with temperature (-0.019 and 0.086, respectively).


In [21]:
print("\nSpearman rank correlation coefficient:")
print(spearman_corr)


Spearman rank correlation coefficient:
             timestamp       co2  humidity     light  pir  temperature
timestamp     1.000000  0.130431  0.308613  0.022946  NaN     0.008581
co2           0.130431  1.000000 -0.021510  0.247354  NaN     0.181861
humidity      0.308613 -0.021510  1.000000  0.026499  NaN    -0.562353
light         0.022946  0.247354  0.026499  1.000000  NaN     0.008645
pir                NaN       NaN       NaN       NaN  NaN          NaN
temperature   0.008581  0.181861 -0.562353  0.008645  NaN     1.000000


### Spearman Rank Correlation Analysis

The Spearman rank correlation coefficients provide insights into the relationships between different variables in the dataset. Here's a simplified interpretation of the Spearman rank correlation coefficients:

- **Positive Correlation:**
  - *Weak positive correlation:* CO2 shows a weak positive correlation with timestamp (0.125), light (0.262), PIR (0.202), and temperature (0.220). This suggests a slight tendency for CO2 levels to increase with these variables.
  - *Very weak positive correlation:* Light exhibits a very weak positive correlation with PIR (0.240). Additionally, timestamp and humidity have a very weak positive correlation (0.305).

- **Negative Correlation:**
  - *Moderate negative correlation:* Humidity has a moderate negative correlation with temperature (-0.657). This indicates a significant tendency for humidity to decrease as temperature increases, and vice versa.
  - *Very weak negative correlation:* CO2 shows a very weak negative correlation with humidity (-0.044). Additionally, PIR exhibits a very weak negative correlation with humidity (-0.082). Timestamp and light also have very weak negative correlations with temperature (0.025 and 0.092, respectively).


In [22]:
# Calculate the correlation matrix
correlation_matrix = df_cleaned.corr()

# Print the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)

Correlation Matrix:
             timestamp       co2  humidity     light  pir  temperature
timestamp     1.000000  0.130108  0.295280  0.015510  NaN     0.017738
co2           0.130108  1.000000 -0.034329  0.249571  NaN     0.197526
humidity      0.295280 -0.034329  1.000000  0.023553  NaN    -0.550760
light         0.015510  0.249571  0.023553  1.000000  NaN     0.018776
pir                NaN       NaN       NaN       NaN  NaN          NaN
temperature   0.017738  0.197526 -0.550760  0.018776  NaN     1.000000


### Correlation Analysis

The correlation coefficients provide insights into the relationships between different variables in the dataset. Here's a simplified interpretation of the correlation coefficients:

- **Positive Correlation:**
  - *Strong positive correlation:* There are no variables with a strong positive correlation (|correlation coefficient| > 0.7).
  - *Moderate positive correlation:* Humidity has a moderate positive correlation with timestamp (0.291), while CO2 shows a moderate positive correlation with PIR (0.236).
  - *Weak positive correlation:* CO2 shows a weak positive correlation with timestamp (0.118), light (0.160), and temperature (0.215). Light also exhibits a weak positive correlation with PIR (0.137).

- **Negative Correlation:**
  - *Strong negative correlation:* Humidity has a strong negative correlation with temperature (-0.684), indicating a significant tendency for humidity to decrease as temperature increases, and vice versa.
  - *Moderate negative correlation:* There are no variables with a moderate negative correlation (|correlation coefficient| > 0.4).
  - *Weak negative correlation:* CO2 shows a weak negative correlation with humidity (-0.049), while PIR exhibits a weak negative correlation with humidity (-0.080). Additionally, temperature has a weak negative correlation with humidity (-0.684).

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge

In [24]:
positive_features = ['humidity','co2', 'light', 'pir']

In [30]:
X = df_cleaned[['co2', 'humidity', 'light', 'pir']]  # Features
y = df_cleaned['temperature']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature Selection
selector = SelectKBest(score_func=f_regression, k=10)  # Adjust k as needed
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Linear Regression with Feature Scaling, Feature Selection, and Polynomial Features
linear_regression_model = make_pipeline(
    StandardScaler(),
    SelectKBest(score_func=f_regression, k=10),  # Adjust k as needed
    PolynomialFeatures(degree=2),  # You can adjust the degree as needed
    LinearRegression()
)

linear_regression_model.fit(X_train, y_train)
y_pred_lr = linear_regression_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression with Feature Scaling, Feature Selection, and Polynomial Features:")
print("Mean Squared Error:", mse_lr)
print("Mean Absolute Error:", mae_lr)
print("R-squared:", r2_lr)

Linear Regression with Feature Scaling, Feature Selection, and Polynomial Features:
Mean Squared Error: 0.0007113715651594083
Mean Absolute Error: 0.021110346286673944
R-squared: 0.3636482593813616


In [32]:
# Polynomial Features
poly = PolynomialFeatures(degree=2)  # You can adjust the degree as needed
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Linear Regression with Polynomial Features
linear_regression_model_poly = LinearRegression()
linear_regression_model_poly.fit(X_train_poly, y_train)
y_pred_lr_poly = linear_regression_model_poly.predict(X_test_poly)
mse_lr_poly = mean_squared_error(y_test, y_pred_lr_poly)
mae_lr_poly = mean_absolute_error(y_test, y_pred_lr_poly)
r2_lr_poly = r2_score(y_test, y_pred_lr_poly)

print("Linear Regression with Polynomial Features:")
print("Mean Squared Error:", mse_lr_poly)
print("Mean Absolute Error:", mae_lr_poly)
print("R-squared:", r2_lr_poly)

Linear Regression with Polynomial Features:
Mean Squared Error: 0.0007113715651594083
Mean Absolute Error: 0.021110346286673944
R-squared: 0.3636482593813616


In [33]:
# Decision Tree with specific parameters
decision_tree_model = DecisionTreeRegressor(max_depth=5, min_samples_split=2)
decision_tree_model.fit(X_train, y_train)
y_pred_dt = decision_tree_model.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print("\nDecision Tree:")
print("Mean Squared Error:", mse_dt)
print("Mean Absolute Error:", mae_dt)
print("R-squared:", r2_dt)


Decision Tree:
Mean Squared Error: 0.000674757934739031
Mean Absolute Error: 0.0204913733862188
R-squared: 0.39640068946078655


In [34]:
# Random Forest with specific parameters
random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=5, min_samples_split=2)
random_forest_model.fit(X_train, y_train)
y_pred_rf = random_forest_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("\nRandom Forest:")
print("Mean Squared Error:", mse_rf)
print("Mean Absolute Error:", mae_rf)
print("R-squared:", r2_rf)


Random Forest:
Mean Squared Error: 0.0006726586557110558
Mean Absolute Error: 0.020475787834517593
R-squared: 0.398278582715062


In [35]:
# K-Nearest Neighbors with specific parameters
knn_model = KNeighborsRegressor(n_neighbors=5, weights='uniform')
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

print("\nK-Nearest Neighbors:")
print("Mean Squared Error:", mse_knn)
print("Mean Absolute Error:", mae_knn)
print("R-squared:", r2_knn)


K-Nearest Neighbors:
Mean Squared Error: 0.0003716560891139407
Mean Absolute Error: 0.011737402293536333
R-squared: 0.6675380197883299


In [36]:
# Ridge Regression with specific parameters
ridge_regression_model = Ridge(alpha=0.5)  # You can adjust the alpha parameter
ridge_regression_model.fit(X_train, y_train)
y_pred_ridge = ridge_regression_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression:")
print("Mean Squared Error:", mse_ridge)
print("Mean Absolute Error:", mae_ridge)
print("R-squared:", r2_ridge)

Ridge Regression:
Mean Squared Error: 0.0007424281197647281
Mean Absolute Error: 0.021962658887596295
R-squared: 0.3358668669998929


In [37]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [39]:
# Multilayer Perceptron (MLP) with specific parameters
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', alpha=0.0001)
mlp_model.fit(X_train, y_train)
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
mae_mlp = mean_absolute_error(y_test, y_pred_mlp)
r2_mlp = r2_score(y_test, y_pred_mlp)

print("\nMultilayer Perceptron (MLP):")
print("Mean Squared Error:", mse_mlp)
print("Mean Absolute Error:", mae_mlp)
print("R-squared:", r2_mlp)


Multilayer Perceptron (MLP):
Mean Squared Error: 0.0007722422839547819
Mean Absolute Error: 0.02219457886170962
R-squared: 0.3091968450217454


In [38]:
# Gradient Boosting Regressor with specific parameters
gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, loss='squared_error')
gradient_boosting_model.fit(X_train, y_train)
y_pred_gb = gradient_boosting_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("\nGradient Boosting Regressor:")
print("Mean Squared Error:", mse_gb)
print("Mean Absolute Error:", mae_gb)
print("R-squared:", r2_gb)


Gradient Boosting Regressor:
Mean Squared Error: 0.0005702224278896599
Mean Absolute Error: 0.018580754708391933
R-squared: 0.4899120905316775


In [40]:
# Collecting evaluation metrics of all models into a dictionary
pd.set_option('display.width', 1000)
results_dict = {
    "Linear Regression": [mse_lr, mae_lr, r2_lr],
    "Ridge Regression": [mse_ridge, mae_ridge, r2_ridge],
    "Decision Tree": [mse_dt, mae_dt, r2_dt],
    "Random Forest": [mse_rf, mae_rf, r2_rf],
    "K-Nearest Neighbors": [mse_knn, mae_knn, r2_knn],
    "Multilayer Perceptron (MLP)": [mse_mlp, mae_mlp, r2_mlp],
    "Gradient Boosting Regressor": [mse_gb, mae_gb, r2_gb]
}

# Convert the dictionary into a DataFrame
results_df = pd.DataFrame(results_dict, index=['Mean Squared Error', 'Mean Absolute Error', 'R-squared'])

# Transpose the DataFrame for better readability
results_df = results_df.T

print("Model Comparison:")
print(results_df)

Model Comparison:
                             Mean Squared Error  Mean Absolute Error  R-squared
Linear Regression                      0.000711             0.021110   0.363648
Ridge Regression                       0.000742             0.021963   0.335867
Decision Tree                          0.000675             0.020491   0.396401
Random Forest                          0.000673             0.020476   0.398279
K-Nearest Neighbors                    0.000372             0.011737   0.667538
Multilayer Perceptron (MLP)            0.000772             0.022195   0.309197
Gradient Boosting Regressor            0.000570             0.018581   0.489912
