<a href="https://colab.research.google.com/github/zixuantan/Databusters/blob/main/NUS_DSESC_DATABUSTERS_XX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1) Libraries Installation
##### The cell below is to help you keep track the libraries used and install them quickly.
##### Ensure the correct library names are used, and follow the syntax: **%pip install PACKAGE_NAME**.

In [None]:
# test 2

In [None]:
# %pip install pandas
# %pip install matplotlib
# add commented pip installation lines for packages used as shown above for ease of testing
# the line should follow the format %pip install PACKAGE_NAME


## 2) Main Section for Code
### **ALL code for machine learning and dataset analysis** should be entered below.
##### Ensure that your code is clear and readable.
##### Remember to include comments and markdown notes as necessary to explain and highlight important segments of your code.

In [3]:
# Set the URL of the CSV file
url <- 'https://raw.githubusercontent.com/zixuantan/Databusters/refs/heads/main/Quarterly%20Data.csv'

# Read the CSV file
quarterly <- read.csv(url)
Q_data = quarterly[3:nrow(quarterly),]


#################
# data cleaning #
#################
missing_values <- colSums(is.na(Q_data))
missing_values[missing_values > 32]
# from the summary, i discovered some variables with close to (or even more than) half of the obs being NA
# i decided to remove those with more than 32NAs, so that's 24 predictors --> good balance between number
# of predictors and observations removed

# moreover, looking through many of these 24 predictors, they seem to be highly correlated with some
# other predictors, so it is justifiable to remove them (find 2 e.g., add in abit of economic explanation)

#removing 24 predictors
# Step 1: Find columns with more than 32 NAs
columns_to_remove <- names(missing_values[missing_values > 32])  # Columns with more than 32 NAs

# Step 2: Remove those columns from the dataset
data_cleaned <- Q_data[, !(names(Q_data) %in% columns_to_remove)]

#Remove the 32 rows with NA values
# Find the last row's index
last_row_index <- nrow(data_cleaned)

#Remove rows with NA values, except for the last row
data_cleaned <- data_cleaned[complete.cases(data_cleaned) | rownames(data_cleaned) == rownames(data_cleaned)[last_row_index], ]

# tranform gdp growth
data_cleaned$GDP_an_growth <- c(0, diff(log(data_cleaned$GDPC1)*400))

####################################
# AR Lag for each forecast horizon #
####################################
install.packages("dplyr")
library(dplyr)
# Horizon 1: Q1 2025
max_lag <- 12

# Create lagged variables for Annualized GDP Growth
for (lag in 2:max_lag) {
  data_cleaned[[paste0("Lag_", lag, "_Annualized_GDP_Growth")]] <-
    dplyr::lag(data_cleaned$GDP_an_growth, lag)
}

# Drop rows with NA values introduced by lagging
data_cleaned <- na.omit(data_cleaned)

aic_bic_results <- data.frame(Lag_Length = integer(), AIC = numeric(), BIC = numeric())

# Fit models with different lag lengths and record AIC/BIC values
for (lag in 2:max_lag) {
  # Select the appropriate lagged variables as predictors
  predictors <- paste0("Lag_", 2:lag, "_Annualized_GDP_Growth")
  X_lagged <- data_cleaned[, predictors, drop = FALSE]
  y <- data_cleaned$GDP_an_growth

  # Add constant for intercept
  X_lagged <- cbind(Intercept = 1, X_lagged)

  # Fit the model using OLS
  model <- glm(y ~ ., data = as.data.frame(X_lagged))

  # Store AIC and BIC for each lag length
  aic_bic_results <- rbind(aic_bic_results,
                           data.frame(Lag_Length = lag,
                                      AIC = AIC(model),
                                      BIC = BIC(model)))
}

# Sort and print results by AIC
aic_bic_results <- aic_bic_results[order(aic_bic_results$AIC), ]
print(aic_bic_results)


# Horizon 2: Q2 2025
max_lag <- 12

# Create lagged variables for Annualized GDP Growth
for (lag in 3:max_lag) {
  data_cleaned[[paste0("Lag_", lag, "_Annualized_GDP_Growth")]] <-
    dplyr::lag(data_cleaned$GDP_an_growth, lag)
}

# Drop rows with NA values introduced by lagging
data_cleaned <- na.omit(data_cleaned)

aic_bic_results <- data.frame(Lag_Length = integer(), AIC = numeric(), BIC = numeric())

# Fit models with different lag lengths and record AIC/BIC values
for (lag in 3:max_lag) {
  # Select the appropriate lagged variables as predictors
  predictors <- paste0("Lag_", 3:lag, "_Annualized_GDP_Growth")
  X_lagged <- data_cleaned[, predictors, drop = FALSE]
  y <- data_cleaned$GDP_an_growth

  # Add constant for intercept
  X_lagged <- cbind(Intercept = 1, X_lagged)

  # Fit the model using OLS
  model <- glm(y ~ ., data = as.data.frame(X_lagged))

  # Store AIC and BIC for each lag length
  aic_bic_results <- rbind(aic_bic_results,
                           data.frame(Lag_Length = lag,
                                      AIC = AIC(model),
                                      BIC = BIC(model)))
}

# Sort and print results by AIC
aic_bic_results <- aic_bic_results[order(aic_bic_results$AIC), ]
print(aic_bic_results)


# Horizon 3: Q2 2025
max_lag <- 12

# Create lagged variables for Annualized GDP Growth
for (lag in 5:max_lag) {
  data_cleaned[[paste0("Lag_", lag, "_Annualized_GDP_Growth")]] <-
    dplyr::lag(data_cleaned$GDP_an_growth, lag)
}

# Drop rows with NA values introduced by lagging
data_cleaned <- na.omit(data_cleaned)

aic_bic_results <- data.frame(Lag_Length = integer(), AIC = numeric(), BIC = numeric())

# Fit models with different lag lengths and record AIC/BIC values
for (lag in 5:max_lag) {
  # Select the appropriate lagged variables as predictors
  predictors <- paste0("Lag_", 5:lag, "_Annualized_GDP_Growth")
  X_lagged <- data_cleaned[, predictors, drop = FALSE]
  y <- data_cleaned$GDP_an_growth

  # Add constant for intercept
  X_lagged <- cbind(Intercept = 1, X_lagged)

  # Fit the model using OLS
  model <- glm(y ~ ., data = as.data.frame(X_lagged))

  # Store AIC and BIC for each lag length
  aic_bic_results <- rbind(aic_bic_results,
                           data.frame(Lag_Length = lag,
                                      AIC = AIC(model),
                                      BIC = BIC(model)))
}

# Sort and print results by AIC
aic_bic_results <- aic_bic_results[order(aic_bic_results$AIC), ]
print(aic_bic_results)




###############################################
# Optimal ADL lag and its respective datasets #
###############################################

# Y(t) = aY(t-2) + b [Q1 2025]
growth_tmr_Q1 = data_cleaned$GDP_an_growth[4:231]
tsdata_Q1 = cbind(growth_tmr_Q1, data_cleaned[2:229,])


# Y(t) = aY(t-3) + b [Q2 2025)]
growth_tmr_Q2 = data_cleaned$GDP_an_growth[5:231]
tsdata_Q2 = cbind(growth_tmr_Q2, data_cleaned[2:228,])


# Y(t) = aY(t-5) + b [Q2 2025)]
growth_tmr_Q4 = data_cleaned$GDP_an_growth[7:231]
tsdata_Q4 = cbind(growth_tmr_Q4, data_cleaned[2:226,])



Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



   Lag_Length      AIC      BIC
1           2 1265.276 1275.430
2           3 1267.056 1280.594
3           4 1269.049 1285.971
4           5 1271.049 1291.356
5           6 1272.222 1295.914
6           7 1273.777 1300.853
7           8 1275.301 1305.761
8           9 1277.298 1311.143
9          10 1279.253 1316.482
10         11 1281.242 1321.856
11         12 1282.417 1326.416
   Lag_Length      AIC      BIC
1           3 1197.317 1207.300
2           4 1199.317 1212.628
3           5 1201.303 1217.942
4           6 1202.561 1222.529
5           7 1204.247 1227.542
6           8 1205.835 1232.458
7           9 1207.786 1237.737
8          10 1209.781 1243.060
9          11 1211.733 1248.340
10         12 1212.499 1252.434
  Lag_Length      AIC      BIC
1          5 1126.218 1136.022
2          6 1127.872 1140.944
3          7 1129.838 1146.178
4          8 1131.696 1151.303
5          9 1133.358 1156.233
6         10 1135.340 1161.483
7         11 1137.321 1166.732
8         12 113

#### Remember to rename your file name to **NUS_DSESC_DATABUSTERS_XX.ipynb** and ensure that it can run successfully. Good luck and have fun!