In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

The data is collected from week 2.

In [43]:
df = pd.read_csv("dht22 data.csv", header=None, names=['Timestamp', 'Humidity', 'Temperature'])
df.head()

Unnamed: 0,Timestamp,Humidity,Temperature
0,2024-07-20 10:37:47.756,36.9,23.4
1,2024-07-20 10:37:49.765,36.8,23.3
2,2024-07-20 10:37:51.771,36.6,23.2
3,2024-07-20 10:37:53.778,36.4,23.0
4,2024-07-20 10:37:55.784,36.6,23.0


### 1. Train the model

In [44]:
X = df.Temperature.values.reshape(-1, 1)
y = df.Humidity
model = LinearRegression()
model.fit(X, y)

### 2. Scatter plot for temperature vs humidity

In [45]:
x_range = np.linspace(X.min(), X.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

fig = px.scatter(df, x='Temperature', y='Humidity', opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit'))
fig.show()

In [46]:
# Metrics
y_pred = model.predict(X)

mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 5.464130382594111
R-squared: 0.3186979102236983


The plot shows us a negative correlation between the Temp and Humid. Using some metrics, the Mean Squared Error (MSE) or the square difference between actual values and predicted values, has a value of 5.46 average difference. The R^2 or the explained variability for the dependent variable, has a value of 0.32. This two metrics indicate the the linear model is not capturing the data very well, there might be outliers or the data is not linear.

In [47]:
# Identify outliers points using IQR
df2 = df.drop('Timestamp', axis=1)
Q1 = df2.quantile(0.25)
Q3 = df2.quantile(0.75)
IQR = Q3 - Q1

outliers = ((df2 < (Q1 - 1.5 * IQR)) | (df2 > (Q3 + 1.5 * IQR))).any(axis=1)

# Filter the outliers
outlier_points = df2[outliers]
outlier_points

Unnamed: 0,Humidity,Temperature
0,36.9,23.4
1,36.8,23.3
2,36.6,23.2
3,36.4,23.0
4,36.6,23.0
...,...,...
3310,41.2,22.5
3311,41.0,22.6
3312,41.1,22.5
3313,41.7,22.5


So using IQR, we found 63 outliers.

### 3. Filter out the outliers

In [48]:
# Filter out outliers
df_cleaned = df[~outliers]

# Re-train the model
X_cleaned = df_cleaned['Temperature'].values.reshape(-1, 1)
y_cleaned = df_cleaned['Humidity']

model_cleaned = LinearRegression()
model_cleaned.fit(X_cleaned, y_cleaned)

In [49]:
# Model evaluations
y_pred_cleaned = model_cleaned.predict(X_cleaned)
mse_cleaned = mean_squared_error(y_cleaned, y_pred_cleaned)
r2_cleaned = r2_score(y_cleaned, y_pred_cleaned)
print(f"Mean Squared Error (cleaned): {mse_cleaned}")
print(f"R-squared (cleaned): {r2_cleaned}")

Mean Squared Error (cleaned): 4.613830959329298
R-squared (cleaned): 0.33344475775338056


In [50]:
# Scatter plot
x_range = np.linspace(X_cleaned.min(), X_cleaned.max(), 100)
y_range = model.predict(x_range.reshape(-1, 1))

fig = px.scatter(df_cleaned, x='Temperature', y='Humidity', opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit'))
fig.show()

There is a slight increase in performance in the 2 metrics, however, it seems to me the data is not linear, so we need to use a non-linear data 

In [51]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_cleaned, y_cleaned)

In [52]:
# Model evaluation
y_pred_cleaned = rf_model.predict(X_cleaned)
mse_cleaned = mean_squared_error(y_cleaned, y_pred_cleaned)
r2_cleaned = r2_score(y_cleaned, y_pred_cleaned)

print(f"Mean Squared Error (cleaned): {mse_cleaned}")
print(f"R-squared (cleaned): {r2_cleaned}")

Mean Squared Error (cleaned): 2.492193747560458
R-squared (cleaned): 0.6399554249442736


We can also try to create new features, to see if there is any performance boost.

### 4. Create new features

In [53]:
# 1. Heat Index (simplified version for Celsius)
df_cleaned['Heat_Index'] = (
    -8.784695 + 1.61139411 * df_cleaned['Temperature'] + 2.338549 * df_cleaned['Humidity'] 
    - 0.14611605 * df_cleaned['Temperature'] * df_cleaned['Humidity'] - 0.01230809 * df_cleaned['Temperature']**2 
    - 0.01642482 * df_cleaned['Humidity']**2 + 0.00221173 * df_cleaned['Temperature']**2 * df_cleaned['Humidity'] 
    + 0.00072546 * df_cleaned['Temperature'] * df_cleaned['Humidity']**2 
    - 0.00000358 * df_cleaned['Temperature']**2 * df_cleaned['Humidity']**2
)

# 2. Temperature-Humidity Index (THI)
df_cleaned['THI'] = df_cleaned['Temperature'] - (0.55 - 0.55 * df_cleaned['Humidity'] / 100) * (df_cleaned['Temperature'] - 14.5)

# 3. Dew Point (Simplified)
df_cleaned['Dew_Point'] = df_cleaned['Temperature'] - ((100 - df_cleaned['Humidity']) / 5)

# 4. Temperature-to-Humidity Ratio
df_cleaned['Temp_Humidity_Ratio'] = df_cleaned['Temperature'] / (df_cleaned['Humidity'] + 1)

# 5. Product of Temperature and Humidity
df_cleaned['Temp_Humidity_Product'] = df_cleaned['Temperature'] * df_cleaned['Humidity']

# 6. Humidity Deviation from 50%
df_cleaned['Humidity_Deviation'] = np.abs(df_cleaned['Humidity'] - 50)

df_cleaned.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0,Timestamp,Humidity,Temperature,Heat_Index,THI,Dew_Point,Temp_Humidity_Ratio,Temp_Humidity_Product,Humidity_Deviation
56,2024-07-20 10:39:40.138,41.9,20.0,25.251384,18.242475,8.38,0.4662,838.0,8.1
57,2024-07-20 10:39:42.145,42.3,20.0,25.259004,18.254575,8.46,0.461894,846.0,7.7
58,2024-07-20 10:39:44.152,42.6,20.0,25.264016,18.26365,8.52,0.458716,852.0,7.4
59,2024-07-20 10:39:46.158,42.8,20.0,25.267023,18.2697,8.56,0.456621,856.0,7.2
60,2024-07-20 10:39:48.165,42.7,19.9,25.294391,18.19819,8.44,0.455378,849.73,7.3


In [58]:
# Fit the model
X = df_cleaned.drop(['Humidity','Timestamp'], axis=1)
y = df_cleaned.Humidity

rf_model2 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model2.fit(X, y)

In [59]:
# Model evaluation
y_pred = rf_model2.predict(X)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"Mean Squared Error (cleaned): {mse}")
print(f"R-squared (cleaned): {r2}")

Mean Squared Error (cleaned): 0.0001505752444134065
R-squared (cleaned): 0.9999782465548909


I can't plot a complex model like this, but as we can see from the given metrics, our model is now performing extremely well.