In [8]:
## INFS630 - Final project - Linear Regression Model
## Priyadharshini Sakthivel - priyadharshini.sakthivel@mail.mcgill.ca
## Vaishali Mishra - vaishali.mishra@mail.mcgill.ca
## Linear Regression Model for the whole dataset

In [9]:
# importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import matplotlib.cm as cm
import seaborn as sns
import time
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [10]:
# Load the prepared CSV, parsing 'readable_time' as datetimes, and discard the
# unneeded 'Unnamed: 0' column in the same expression.
# NOTE(review): the path is absolute and machine-specific — consider a
# configurable data directory so the notebook runs elsewhere.
Data = pd.read_csv(
    '/Users/vaishalimishra/Desktop/Winter_2023/INFS 630 - Data Mining/Final Project/Data/Data_prepped.csv',
    parse_dates=['readable_time'],
).drop(columns=['Unnamed: 0'])
Data

Unnamed: 0,readable_time,temp,lu_vege_pc
0,2022-06-01 00:00:00,18.282500,5.11
1,2022-06-01 01:00:00,17.802500,5.11
2,2022-06-01 02:00:00,17.190000,5.11
3,2022-06-01 03:00:00,16.790000,5.11
4,2022-06-01 04:00:00,16.507500,5.11
...,...,...,...
91950,2023-02-01 06:00:00,-0.613333,63.85
91951,2023-02-01 07:00:00,-0.615000,63.85
91952,2023-02-01 08:00:00,-0.620000,63.85
91953,2023-02-01 09:00:00,-0.620000,63.85


In [11]:
# Inspect the column dtypes (readable_time should have been parsed as datetime)
Data.dtypes

readable_time    datetime64[ns]
temp                    float64
lu_vege_pc              float64
dtype: object

In [12]:
# LinearRegression cannot consume datetime values, so derive a numeric
# timestamp column (seconds, as floats) from 'readable_time'.
# NOTE(review): time.mktime interprets each timestamp in the local timezone of
# the machine running the notebook — confirm that this is the intended zone.
Data['unix_time'] = [
    time.mktime(stamp.timetuple()) for stamp in Data['readable_time']
]
Data

Unnamed: 0,readable_time,temp,lu_vege_pc,unix_time
0,2022-06-01 00:00:00,18.282500,5.11,1.654056e+09
1,2022-06-01 01:00:00,17.802500,5.11,1.654060e+09
2,2022-06-01 02:00:00,17.190000,5.11,1.654063e+09
3,2022-06-01 03:00:00,16.790000,5.11,1.654067e+09
4,2022-06-01 04:00:00,16.507500,5.11,1.654070e+09
...,...,...,...,...
91950,2023-02-01 06:00:00,-0.613333,63.85,1.675249e+09
91951,2023-02-01 07:00:00,-0.615000,63.85,1.675253e+09
91952,2023-02-01 08:00:00,-0.620000,63.85,1.675256e+09
91953,2023-02-01 09:00:00,-0.620000,63.85,1.675260e+09


In [13]:
# function to build linear regression model
def regression_model(a):
    """Fit a linear regression of ``temp`` on the other columns and print metrics.

    The rows are ordered by ``readable_time`` and split with
    ``TimeSeriesSplit(n_splits=2)``. Only the final split is used: the model is
    trained on that split's (earlier) training rows and evaluated on its
    (later) test rows.

    Parameters
    ----------
    a : pandas.DataFrame
        Must contain a ``readable_time`` column and a ``temp`` column. Every
        remaining column is used as a predictor. The frame is NOT modified,
        so the function can be called repeatedly on the same object.

    Returns
    -------
    None
        The intercept, coefficients, and the R^2 / RMSE for the training and
        testing sets are printed.

    Raises
    ------
    KeyError
        If ``readable_time`` or ``temp`` is missing from ``a``.
    """
    tss = TimeSeriesSplit(n_splits=2)

    # Build a time-ordered view without touching the caller's frame; the
    # previous in-place set_index/sort_index changed `a` itself, which made a
    # second call on the same frame fail with a KeyError.
    ordered = a.set_index('readable_time').sort_index()

    # dependent and independent variables
    x = ordered.drop(labels=['temp'], axis=1)
    y = ordered['temp']

    # Keep only the last split produced by TimeSeriesSplit (the one with the
    # largest training window); earlier splits are intentionally discarded.
    *_, (train_index, test_index) = tss.split(x)
    X_train, X_test = x.iloc[train_index, :], x.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # regression model
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    # print coefficients and constant
    c = lr.intercept_
    m = lr.coef_
    print(('c:{0}\n m:{1}').format(c, m))

    # using model on training dataset
    y_pred_train = lr.predict(X_train)
    # r2 and rmse; RMSE is taken as sqrt(MSE) because the `squared=False`
    # argument of mean_squared_error is deprecated/removed in newer scikit-learn
    r2_train = r2_score(y_train, y_pred_train)
    rmse_train = mean_squared_error(y_train, y_pred_train) ** 0.5
    print('r squared for training set:{0}\n rmse for training set:{1}'.format(r2_train, rmse_train))

    # using model on testing dataset
    y_pred_test = lr.predict(X_test)
    # r2 and rmse
    r2_test = r2_score(y_test, y_pred_test)
    rmse_test = mean_squared_error(y_test, y_pred_test) ** 0.5
    print('r squared for testing set:{0}\n rmse for testing set:{1}'.format(r2_test, rmse_test))

In [14]:
# Fit and evaluate the linear regression on the whole dataset; the function
# prints the intercept, coefficients, and train/test R^2 and RMSE
regression_model(Data)

c:1416.7605167464076
 m:[-4.49580545e-03 -8.41706752e-07]
r squared for training set:0.2877991697140412
 rmse for training set:5.1014330118462885
r squared for testing set:-1.594229719860944
 rmse for testing set:10.58692735071367
