# 2D Design Project

<b>Problem Statement</b>: We wish to predict Singapore's GDP growth amidst COVID-19 based on various factors. By comparing the predicted growth rate with the actual growth rate, we can determine the effectiveness of Singapore's coping strategies against COVID-19.

Factors/Variables to consider (from most to least important):
- Time/date (time series data)
- Vaccination rate
- Daily active cases
- Hospitalised
- Recovered
- Government grants/funding
- Phases (circuit breaker, phase 1 etc.)

Predict : 
- Growth rate of GDP

## Data Pre Processing

In [2]:
# Multiple Linear Regression

import numpy as np
import pandas as pd
import datetime as dt


# Importing the dataset
# Start from 1 Aug 2021 (first record of vaccination rate)
df = pd.read_excel('Covid-19 SG.xlsx', skiprows=range(1, 557))

# processing with -
#  Vaccination rate
# - Daily active cases
# - Hospitalised
# - Recovered
# - daily new cases
sg_population = 5.686 * 10**6 # if got type we can find a dataset and map this

# Processing Nan Values
df = df[pd.notnull(df['Phase'])]
df['Cumulative Individuals Vaccinated'] = df['Cumulative Individuals Vaccinated'].fillna(0) # drop na if we want to remove vacine
df['Percentage Vaccinated'] = df['Cumulative Individuals Vaccinated'].divide(sg_population)

# Convert to Date time object for easier processing
df['Date'] = pd.to_datetime(df['Date'])

# Convert Date to numerical value
df['Date'] = df['Date'].map(dt.datetime.toordinal)

# Find 7 days Moving Average as another feature
df['7 days Moving Average'] = df['Daily Confirmed'].rolling(window=7).mean()

# Replace NaN values
df['7 days Moving Average'].fillna(df['Daily Confirmed'], inplace=True)

# df['Date'] = pd.to_datetime(df['Date'],format="%d/%m/%Y")


# Getting useful columns
# new_columns = ['Date','Daily Confirmed', 'Still Hospitalised','Phase','7 days Moving Average','Percentage Vaccinated']
new_columns = ['Date', 'Still Hospitalised','Phase','7 days Moving Average', 'Percentage Vaccinated']
df = df.reindex(columns=new_columns)

df = df[new_columns]


In [3]:
# Preprocessing SGX Data for Y Axis
sgx_df = pd.read_csv('./HistoricalPrices.csv', skipfooter=43)

sgx_df['Date'] = pd.to_datetime(sgx_df['Date'])
sgx_df['Date'] = sgx_df['Date'].map(dt.datetime.toordinal)
sgx_df = sgx_df.rename(columns={' Open':'STI Price'})
# Narrowing down to just open instead of high,low and close
sgx_df = sgx_df[['Date','STI Price']]

# merge wit dataset on date
merged_df = pd.merge(df,sgx_df, how='inner', on='Date')

  return func(*args, **kwargs)


In [15]:
merged_df

Unnamed: 0,Date,Still Hospitalised,Phase,7 days Moving Average,Percentage Vaccinated,STI Price,Phase_Phase 2 (Heightened Alert),Phase_Preparatory Stage,Phase_Stabilisation Phase
0,738004,572.0,Phase 2 (Heightened Alert),111.0,0.758562,3176.42,1,0,0
1,738005,562.0,Phase 2 (Heightened Alert),102.0,0.759976,3149.25,1,0,0
2,738006,525.0,Phase 2 (Heightened Alert),95.0,0.761512,3154.6,1,0,0
3,738007,547.0,Phase 2 (Heightened Alert),98.0,0.76282,3186.65,1,0,0
4,738008,516.0,Phase 2 (Heightened Alert),97.0,0.764014,3175.0,1,0,0
5,738012,478.0,Preparatory Stage,82.142857,0.769179,3187.53,0,1,0
6,738013,478.0,Preparatory Stage,77.571429,0.770952,3204.76,0,1,0
7,738014,440.0,Preparatory Stage,72.0,0.772724,3190.23,0,1,0
8,738015,458.0,Preparatory Stage,65.142857,0.774289,3167.03,0,1,0
9,738018,396.0,Preparatory Stage,55.571429,0.779473,3139.23,0,1,0


## Data Visualization : 
Tells us if we need to transform the data


In [16]:
new_df.to_csv('./output/processed_data.csv',index=False)

In [6]:
# Merging Dataset

X = merged_df.iloc[:, :-1].values # selects all the columns excluding STI price
Y = merged_df.iloc[:, -1].values # STI price column


#### Data preprocessing of the phase column

In [7]:

# Encoding categorical data which is the phase

# Encoding the Independent Variable

# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# from sklearn.compose import ColumnTransformer

# labelencoder_X = LabelEncoder()
# X[:, 2] = labelencoder_X.fit_transform(X[:, 2]) # phase column
# ct = ColumnTransformer([("Phase", OneHotEncoder(), [2])], remainder="passthrough")
# X = ct.fit_transform(X)


one_hot_cont = pd.get_dummies(merged_df['Phase'], prefix="Phase")
new_df = merged_df.join(one_hot_cont)


# Modeling with Linear Regression

In [8]:


# # Avoiding the Dummy Variable Trap (dummy variables: binary variables for categorical data)
# X = X[:, 1:] # avoid one of the dummy variables

# # Splitting the dataset into the Training set and Test set

# from sklearn.model_selection import train_test_split
# X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# # Fitting the Multiple Linear Regression in the Training set

# from sklearn.linear_model import LinearRegression
# regressor = LinearRegression()
# regressor.fit(X_Train, Y_Train)
# # print(regressor.intercept_)
# # print(regressor.coef_)

# # Predicting the Test set results

# Y_Pred = regressor.predict(X_Test)



In [9]:
def CostFunction(x,y,w,b):
    cost = np.sum((((x.dot(w) + b) - y) ** 2) / (2*len(y)))
    return cost

def GradientDescent(x, y, w, b, learning_rate, epochs):
    print(x.shape)
    cost_list = [0] * epochs
   
    for epoch in range(epochs):
        z = x.dot(w) + b
        loss = z - y
        
        weight_gradient = x.T.dot(loss) / len(y)
        bias_gradient = np.sum(loss) / len(y)
        
        w = w - learning_rate*weight_gradient
        b = b - learning_rate*bias_gradient
  
        cost = CostFunction(x, y, w, b)
        cost_list[epoch] = cost
        
        if (epoch%(epochs/10)==0):
            print("Cost is:",cost)
        
    return w, b, cost_list

def predict(X, w, b):
    return X.dot(w) + b

def r2score(y_pred, y):
    rss = np.sum((y_pred - y) ** 2)
    tss = np.sum((y-y.mean()) ** 2)
    
    r2 = 1 - (rss / tss)
    return r2

def train_test_split(df_feature, df_target, random_state=None, test_size=0.5):
    np.random.seed(random_state)
    N = df_feature.shape[0]
    print(N)
    sample = int(test_size*N)
    train_idx = np.random.choice(N, sample,replace=False)
    print(len(train_idx))
    
    df_feature_train = df_feature.iloc[train_idx]
    df_target_train = df_target.iloc[train_idx]

    test_idx = [idx for idx in range(N) if idx not in train_idx]
    print(len(test_idx))
    
    df_feature_test = df_feature.iloc[test_idx]
    df_target_test = df_target.iloc[test_idx]

    return df_feature_train, df_feature_test, df_target_train, df_target_test

In [10]:
def standard_scaling(df):
    dfout = df.apply(lambda x: (x - np.mean(x)) / np.std(x))
    return dfout

def min_max_scaling(df):
    return df.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

def normalization(df):
    return df.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))

In [11]:
feature_names = ['Date','Still Hospitalised','7 days Moving Average','Percentage Vaccinated','Phase_Phase 2 (Heightened Alert)','Phase_Preparatory Stage','Phase_Stabilisation Phase']
target_name = ["STI Price"]
X = new_df.loc[:,feature_names]
y = new_df.loc[:,target_name]

In [12]:
X.values

array([[7.38004000e+05, 5.72000000e+02, 1.11000000e+02, 7.58561731e-01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.38005000e+05, 5.62000000e+02, 1.02000000e+02, 7.59975906e-01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.38006000e+05, 5.25000000e+02, 9.50000000e+01, 7.61511783e-01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.38007000e+05, 5.47000000e+02, 9.80000000e+01, 7.62820436e-01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.38008000e+05, 5.16000000e+02, 9.70000000e+01, 7.64013894e-01,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.38012000e+05, 4.78000000e+02, 8.21428571e+01, 7.69178684e-01,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [7.38013000e+05, 4.78000000e+02, 7.75714286e+01, 7.70951636e-01,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [7.38014000e+05, 4.40000000e+02, 7.20000000e+01, 7.72723883e-01,
        0.00000000e+00, 1

In [13]:
X_train.shape[1]

NameError: name 'X_train' is not defined

In [None]:
# Dividing the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size = 0.25)

# Initial random weights
w = np.random.randn(X_train.shape[1])
# Initial bias
b = 0

# call the gradient descent function to get the finalised weights and bias (model training)
w, b, c= GradientDescent(X_train, y_train, np.zeros(X_train.shape[1]), 0, 0.002,epochs=100)
plt.plot(c)

y_pred = predict(X_test, w, b)

r2score(y_pred, y_test)

(44, 7)


ValueError: operands could not be broadcast together with shapes (7,) (7,44) 

## Optimization


In [None]:
# Building the optimal model using Backward Elimination

import statsmodels.api as sm
X = np.append(arr = np.ones((59, 1)).astype(int), values = X, axis = 1)
# X = np.append(arr = np.ones((59, 1)).astype('float64'), values = X, axis = 1)

X_Optimal = X[:, [0,1,2,3,4,5]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

X_Optimal = X[:, [0,1,2,4,5]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

X_Optimal = X[:, [0,1,4,5]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

X_Optimal = X[:, [0,1,4]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
regressor_OLS.summary()

# Fitting the Multiple Linear Regression in the Optimal Training set

X_Optimal_Train, X_Optimal_Test = train_test_split(X_Optimal,test_size = 0.2, random_state = 0)
regressor.fit(X_Optimal_Train, Y_Train)

# Predicting the Optimal Test set results

Y_Optimal_Pred = regressor.predict(X_Optimal_Test)

In [None]:
# X = merged_df.iloc[:, :-1].values # selects all the columns excluding STI price

# Evaluating the model


In [None]:
# optimized with bw elimation
X_Optimal = X[:, [0,1,2,3,4,5]]
X_Optimal = np.array(X_Optimal, dtype=float)
regressor_OLS = sm.OLS(endog = Y, exog = X_Optimal).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.529
Model:                            OLS   Adj. R-squared:                  0.485
Method:                 Least Squares   F-statistic:                     11.93
Date:                Tue, 26 Oct 2021   Prob (F-statistic):           9.36e-08
Time:                        12:56:52   Log-Likelihood:                -287.59
No. Observations:                  59   AIC:                             587.2
Df Residuals:                      53   BIC:                             599.7
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        2.38e+06   5.89e+05      4.044      0.0

In [None]:
Y_Pred

array([3087.27157853, 3063.1365116 , 3089.03177691, 3085.2729684 ,
       3109.35837855, 3166.32833225, 3066.88064559, 3104.94332752,
       3076.4737427 , 3086.97484616, 3151.43215778, 3116.52180025])

In [None]:
# importing r2_score module
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


# predicting the accuracy score
score=r2_score(Y_Test,Y_Pred)
print(f"r2 score is {score}")
print(f"mean_sqrd_error is == {mean_squared_error(Y_Test,Y_Pred)}")
print(f"root_mean_squared error of is == {np.sqrt(mean_squared_error(Y_Test,Y_Pred))}")


# After Optimization with BE
print("============================")
print("After Optimization with Backwards Elimination")
# predicting the accuracy score
score=r2_score(Y_Test,Y_Optimal_Pred)
print(f"r2 score is {score}")
print(f"mean_sqrd_error is == {mean_squared_error(Y_Test,Y_Pred)}")
print(f"root_mean_squared error of is == {np.sqrt(mean_squared_error(Y_Test,Y_Pred))}")

r2 score is 0.7798496442687198
mean_sqrd_error is == 375.25938334777646
root_mean_squared error of is == 19.371612822575628
After Optimization with Backwards Elimination
r2 score is 3.958279437643597e-05
mean_sqrd_error is == 375.25938334777646
root_mean_squared error of is == 19.371612822575628


# Data Visualization

In [None]:
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

x_range = np.linspace(X.min(), X.max(), 100)
y_range = regressor.predict(x_range.reshape(-1, 1))

fig = px.scatter(df, x='total_bill', y='STI index', opacity=0.65)
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Regression Fit'))
fig.show()

ValueError: X has 1 features, but LinearRegression is expecting 65 features as input.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8063f459-52be-4c78-9eaa-2f01d373f9b4' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>