In [24]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from data_cleaning.data_cleaner import DataCleaner

print(tf.__version__)


2.16.2


# Linear Regression for Sea Level 1

**inputs**: Year, Country, Sea level rise, Average rainfall  
**output**: Average temperature

In [25]:
# Current working directory
current_directory = os.getcwd()

# Go up one directory to the root
root = os.path.abspath(os.path.join(current_directory, ".."))

# Get the path to the data
data_path = os.path.join(root, 'clean-data/processed_Sea_level_1_data.csv')

dc: DataCleaner = DataCleaner(data_path)
dc.preview(10)


Unnamed: 0,Year,Country,Avg Temperature (°C),Sea Level Rise (mm),Rainfall (mm)
0,2000,Argentina,16.9,4.0,2047.0
1,2000,Australia,11.933333,2.266667,2033.333333
2,2000,Brazil,31.2,3.7,803.0
3,2000,Canada,19.3,2.65,1383.0
4,2000,China,26.2,2.2,1849.0
5,2000,France,16.6,2.8,1819.666667
6,2000,Germany,9.75,1.45,2641.0
7,2000,India,21.25,3.25,1124.5
8,2000,Indonesia,23.585714,3.242857,1781.428571
9,2000,Mexico,16.9,1.2,1974.5


In [26]:
df_copy = dc.df.copy()

# One Hot Encode the country column
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
country_encoded = encoder.fit_transform(dc.df[['Country']])
country_columns = encoder.get_feature_names_out(['Country'])

df_encoded = pd.concat([dc.df.drop(columns=['Country']), pd.DataFrame(country_encoded, columns=country_columns)], axis=1)
print(df_encoded)

     Year  Avg Temperature (°C)  Sea Level Rise (mm)  Rainfall (mm)  \
0    2000             16.900000             4.000000    2047.000000   
1    2000             11.933333             2.266667    2033.333333   
2    2000             31.200000             3.700000     803.000000   
3    2000             19.300000             2.650000    1383.000000   
4    2000             26.200000             2.200000    1849.000000   
..    ...                   ...                  ...            ...   
335  2023             20.000000             2.750000    1772.500000   
336  2023             30.900000             3.300000     979.000000   
337  2023             19.600000             2.700000    1260.500000   
338  2023             30.300000             2.750000    1503.000000   
339  2023             16.257143             2.671429    1532.000000   

     Country_Argentina  Country_Australia  Country_Brazil  Country_Canada  \
0                  1.0                0.0             0.0             

In [27]:
# Extract features and target
X = df_encoded["Year"].values.reshape(-1, 1) # 2D column vector
X = np.hstack((X, df_encoded.drop(columns=["Avg Temperature (°C)", "Year"]).values)) # remove target column and year column (already included) stack year and other features horizontally, shape: (n_samples, total_features)
Y = df_encoded["Avg Temperature (°C)"].values.reshape(-1, 1) # 2D column vector

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [28]:
# Create the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1, input_dim=X.shape[1], activation='linear')  # Linear Regression
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [29]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

In [30]:
# Train the model
history = model.fit(X_train, Y_train, epochs=100, batch_size=5, validation_split=0.2, verbose=1)

Epoch 1/100
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1098451.8750 - val_loss: 822835.5000
Epoch 2/100
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 790235.3750 - val_loss: 582508.7500
Epoch 3/100
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 555668.6875 - val_loss: 397430.0312
Epoch 4/100
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 385741.7188 - val_loss: 263271.0938
Epoch 5/100
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 246181.3281 - val_loss: 168656.3438
Epoch 6/100
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 154412.1094 - val_loss: 104673.4375
Epoch 7/100
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 92600.6328 - val_loss: 63153.3086
Epoch 8/100
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 64287

In [31]:
# # Evaluate the model on the test data
# test_loss = model.evaluate(X_test, Y_test)
# print(f"Test Loss (MSE): {test_loss}")

In [32]:
# Predict on the test set
Y_pred = model.predict(X_test)
print(f"Y Predictions:\n{Y_pred[:10]}")
print(f"Y Test:\n{Y_test[:10]}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Y Predictions:
[[18.992044]
 [19.354977]
 [21.439741]
 [18.653927]
 [21.714447]
 [20.84709 ]
 [15.448186]
 [18.062258]
 [22.541473]
 [22.44096 ]]
Y Test:
[[33.3       ]
 [19.2       ]
 [12.05      ]
 [12.06666667]
 [21.9       ]
 [17.275     ]
 [15.4       ]
 [26.8       ]
 [19.6       ]
 [26.65      ]]


In [33]:
r2 = r2_score(Y_test, Y_pred)
print(f"R² Score: {r2:.4f}")

R² Score: -0.0927


In [2]:
# Flatten arrays for plotting
Y_test_flat = Y_test.flatten()
Y_pred_flat = Y_pred.flatten()

# Plot: Actual vs Predicted
plt.figure(figsize=(8, 6))
plt.scatter(Y_test_flat, Y_pred_flat, color='blue', alpha=0.6, label='Predicted Values')

# Line of equality (ideal prediction line)
plt.plot([min(Y_test_flat), max(Y_test_flat)], [min(Y_test_flat), max(Y_test_flat)], 
         color='red', linestyle='--', label='Equality Line')

# Labels and title
plt.xlabel('Actual Avg Temperature (°C)')
plt.ylabel('Predicted Avg Temperature (°C)')
plt.title('Actual vs Predicted Avg Temperature')

# Grid and legend
plt.grid(True)
plt.legend()

#plt.savefig("lin_reg_temp_trend.png", dpi=300, bbox_inches='tight')

# Show plot
plt.show()


NameError: name 'Y_test' is not defined