In [1]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, r2_score

# Multiple Linear Regression - Train on One Country, Test on Another

In [2]:
path = os.getcwd() + "/clean_data/processed_city_temperature_data.csv"
df = pd.read_csv(path)
df.head(50)

Unnamed: 0,dt,AverageTemperature,City,Country,Latitude,Longitude
0,1743-11-01,6.068,Århus,Denmark,57.05N,10.33E
1,1744-04-01,5.788,Århus,Denmark,57.05N,10.33E
2,1744-05-01,10.644,Århus,Denmark,57.05N,10.33E
3,1744-06-01,14.051,Århus,Denmark,57.05N,10.33E
4,1744-07-01,16.082,Århus,Denmark,57.05N,10.33E
5,1744-09-01,12.781,Århus,Denmark,57.05N,10.33E
6,1744-10-01,7.95,Århus,Denmark,57.05N,10.33E
7,1744-11-01,4.639,Århus,Denmark,57.05N,10.33E
8,1744-12-01,0.122,Århus,Denmark,57.05N,10.33E
9,1745-01-01,-1.333,Århus,Denmark,57.05N,10.33E


In [7]:
def linear_coefs(X, Y):
  """
  Args:
    X: N x d matrix of input features
    Y: N x 1 matrix (column vector) of output response

  Returns:
    Beta: d x 1 matrix of linear coefficients
  """
  beta = tf.linalg.solve(tf.matmul(tf.transpose(X), X), tf.matmul(tf.transpose(X), Y))  # Return closed form solution (use tf.transpose and tf.mathmul instead of np version)
  return beta

In [20]:
def convert_coordinate(coordinate_string):
    """
    Args:
        coordinate_string: Coordinate with 'N' or 'S' for latitude and 'W' or 'E' for longitude

    Returns:
        relative_coordinate: float representation of coordinate (where 'N' and 'E' are positive)
    """
    if 'S' in coordinate_string or 'W' in coordinate_string:
        relative_coordinate = -float(coordinate_string[:-1]) 
    else:
        relative_coordinate = float(coordinate_string[:-1]) 
        
    return relative_coordinate


In [33]:
def multiple_linear_reg(train_data, train_country, input_features, output_features):
    if train_data.empty:
        print(f"No data available for training ({train_country}).")
    else:
        # Training and training plot 
        # Need all datatypes to be the same for tensorflow
        X_train = train_data[input_features].apply(pd.to_numeric).values # Maintain original dimensions (have more than 1 input dimensions now)
        Y_train = train_data[output_features].apply(pd.to_numeric).values.reshape(-1, 1) # Convert to column vectors (stil one output value)

        # Generate linear coefficients on training data (with intercept)
        X_train_with_intercept = tf.concat([X_train, tf.ones([tf.shape(X_train)[0], 1])], axis=1)# Append a dummy feature of ones that is N x 1 (increase dimensionality by 1)
        beta_train_linear = linear_coefs(X_train_with_intercept, Y_train) # Run linear regression on X (including intercept) and outputs Y
        Y_train_prediction = tf.matmul(X_train_with_intercept, beta_train_linear)  # Multiply original inputs X by found features (dot product of input X with parameters produces output Y)

        # Evaluate accuracy of predictions on the training data
        mae_train = mean_absolute_error(Y_train, Y_train_prediction.numpy())
        print(f"Mean Absolute Error (in Celcius): {mae_train}")

        # Evaluate coefficient of determination
        r2_train = r2_score(Y_train, Y_train_prediction.numpy())
        print(f"Coefficient of Determination (r^2): {r2_train}")

In [34]:
# Compute the month offset (monotonically increasing) from the first date for each city
df['dt'] = pd.to_datetime(df['dt'])
df['month_offset'] = (
    (df['dt'].dt.year - df.groupby('City')['dt'].transform('min').dt.year) * 12
    + (df['dt'].dt.month - df.groupby('City')['dt'].transform('min').dt.month)
    + 1)

df_month = df[['month_offset', 'AverageTemperature', 'Country', 'Latitude', 'Longitude']]
df_month.loc[:, 'Latitude'] = df_month['Latitude'].apply(convert_coordinate) # Make sure latitude and longitude are numeric (no characters)
df_month.loc[:, 'Longitude'] = df_month['Longitude'].apply(convert_coordinate)
df_month = df_month.reset_index(drop=False)

# Select training data (regression over all the cities in one country)
train_country = "Indonesia"
train_data = df_month[df_month['Country'] == train_country]

input_features = ['month_offset', 'Latitude', 'Longitude']
output_features = 'AverageTemperature'
multiple_linear_reg(train_data=train_data, train_country=train_country, input_features=input_features, output_features=output_features)


Mean Absolute Error (in Celcius): 0.7060159110832422
Coefficient of Determination (r^2): 0.2567114930411032


# Multiple Linear Regression - Aggregate Over Year

In [37]:
df['year'] = pd.to_datetime(df['dt']).dt.year

# Define test dataframe
df_year = df[['year', 'AverageTemperature', 'Country', 'City', 'Latitude', 'Longitude']]
df_year.loc[:, 'Latitude'] = df_year['Latitude'].apply(convert_coordinate) # Make sure latitude and longitude are numeric (no characters)
df_year.loc[:, 'Longitude'] = df_year['Longitude'].apply(convert_coordinate)

# Filter outside of function call (this dataset is massive)
train_country = "Indonesia"
df_year = df_year[df_year['Country'] == train_country]
df_year = df_year.groupby(['Country', 'year', 'City']).mean()
df_year = df_year.reset_index(drop=False)

# Select training data (regression over all the cities in one country)
train_country = "Indonesia"
input_features = ['year', 'Latitude', 'Longitude']
output_features = 'AverageTemperature'
multiple_linear_reg(df_year, train_country=train_country, input_features=input_features, output_features=output_features)


Mean Absolute Error (in Celcius): 0.5820692232968211
Coefficient of Determination (r^2): 0.3399739750365909


In [None]:
# TODO: Come up with a good visualization scheme
        # plt.scatter(X_train, Y_train, label=f"Training Data ({train_country})", color="blue")
        # plt.plot(X_train, Y_train_prediction, label="Training Fit", color="red")

        # plt.ylabel(f"Output: {output_features}")
        # plt.xlabel(f"Input: {input_features}")
        # plt.title(f"Regression Model for {train_country} (Training Data)")
        # plt.legend()
        # plt.show()