In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import tensorflow as tf
from statsmodels.tsa.stattools import adfuller
import itertools
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the training data for the LSTM model from Google Drive (this cell only runs on Colab).
from google.colab import drive
drive.mount('/content/drive')

# The CSV is expected in the 'Time Series Hotel Project' folder of the mounted Drive.
file_path = '/content/drive/My Drive/Time Series Hotel Project/TrainingLoanData.csv'

data = pd.read_csv(file_path)
print('Shape of data:', data.shape)
# Preview the first rows of the loaded frame.
data.head()


Mounted at /content/drive
Shape of data: (1205112, 12)


Unnamed: 0,Location Name,Location City,Location Zip,Location County,Unit Capacity,Obligation End Date (YYYYMMDD),Filer Type,Total Room Receipts,Year,Population,Population Growth Rate,Quarter
0,OMNI AUSTIN HOTEL AT FIC CENTRE ...,AUSTIN,78701,227,314,20000229,50,1213477.0,2000,20944499.0,4.49,Q1
1,OMNI AUSTIN HOTEL AT FIC CENTRE ...,AUSTIN,78701,227,314,20000331,50,1586063.0,2000,20944499.0,4.49,Q1
2,OMNI AUSTIN HOTEL AT FIC CENTRE ...,AUSTIN,78701,227,314,20000430,50,1348446.0,2000,20944499.0,4.49,Q2
3,OMNI AUSTIN HOTEL AT FIC CENTRE ...,AUSTIN,78701,227,314,20000531,50,1483993.0,2000,20944499.0,4.49,Q2
4,OMNI AUSTIN HOTEL AT FIC CENTRE ...,AUSTIN,78701,227,314,20000630,50,1276623.0,2000,20944499.0,4.49,Q2


In [None]:
# List the distinct values of the obligation end date column (stored as YYYYMMDD integers).
data['Obligation End Date (YYYYMMDD)'].unique()

array([20000229, 20000331, 20000430, 20000531, 20000630, 20000731,
       20000831, 20000930, 20001031, 20001130, 20001231, 20000131,
       20020228, 20020331, 20020430, 20020531, 20020630, 20020731,
       20020831, 20020930, 20021031, 20021130, 20021231, 20020131,
       20030228, 20030331, 20030430, 20030531, 20030630, 20030731,
       20030831, 20030930, 20031031, 20031130, 20031231, 20030131,
       20040229, 20040331, 20040430, 20040531, 20040630, 20040731,
       20040831, 20040930, 20041031, 20041130, 20041231, 20040131,
       20050531, 20050630, 20050731, 20050831, 20050930, 20051031,
       20051130, 20051231, 20050430, 20050131, 20050228, 20050331,
       20060228, 20060331, 20060430, 20060531, 20060630, 20060731,
       20060831, 20060930, 20061031, 20061130, 20061231, 20060131,
       20070228, 20070331, 20070430, 20070531, 20070630, 20070731,
       20070831, 20070930, 20071031, 20071130, 20071231, 20070131,
       20080229, 20080331, 20080430, 20080531, 20080630, 20080

In [None]:
# Show the column labels of the loaded frame.
data.columns

Index(['Location Name', 'Location City', 'Location Zip', 'Location County',
       'Unit Capacity', 'Obligation End Date (YYYYMMDD)', 'Filer Type',
       'Total Room Receipts', 'Year', 'Population', 'Population Growth Rate',
       'Quarter'],
      dtype='object')

**Preparing the training data**

In [None]:
# Keep only the columns that are passed to the scaler, in this order
# (the order fixes the column positions of the scaled array built later).
trainingData = data[[
    "Obligation End Date (YYYYMMDD)",
    "Location Zip",
    "Unit Capacity",
    "Population",
    "Population Growth Rate",
    "Total Room Receipts",
]]

In [None]:
# Preview the first ten rows of the selected columns.
trainingData.head(10)

Unnamed: 0,Obligation End Date (YYYYMMDD),Location Zip,Unit Capacity,Population,Population Growth Rate,Total Room Receipts
0,20000229,78701,314,20944499.0,4.49,1213477.0
1,20000331,78701,314,20944499.0,4.49,1586063.0
2,20000430,78701,314,20944499.0,4.49,1348446.0
3,20000531,78701,314,20944499.0,4.49,1483993.0
4,20000630,78701,314,20944499.0,4.49,1276623.0
5,20000731,78701,314,20944499.0,4.49,1023982.0
6,20000831,78701,314,20944499.0,4.49,1246176.0
7,20000930,78701,314,20944499.0,4.49,1242596.0
8,20001031,78701,314,20944499.0,4.49,1476390.0
9,20001130,78701,314,20944499.0,4.49,1371780.0


In [None]:
# Rescale every selected column independently to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, including the ones Keras
# later holds out through validation_split - confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(trainingData)
trainingData_scaled = sc.transform(trainingData)
print(trainingData_scaled)

[[6.48577101e-04 8.20509909e-01 8.41171728e-03 0.00000000e+00
  1.00000000e+00 4.38044337e-02]
 [1.32362674e-03 8.20509909e-01 8.41171728e-03 0.00000000e+00
  1.00000000e+00 5.72541478e-02]
 [1.97882197e-03 8.20509909e-01 8.41171728e-03 0.00000000e+00
  1.00000000e+00 4.86765826e-02]
 ...
 [9.99331568e-01 7.70015899e-01 4.29991938e-04 1.00000000e+00
  1.17845118e-01 1.95652683e-05]
 [1.00000000e+00 7.70015899e-01 4.29991938e-04 1.00000000e+00
  1.17845118e-01 1.67496024e-05]
 [1.00000000e+00 8.21546193e-01 5.37489922e-05 1.00000000e+00
  1.17845118e-01 3.60982810e-06]]


In [None]:
# Number of rows in the scaled array; used as the upper bound when cutting windows.
maxLen = trainingData_scaled.shape[0]

In [None]:
# Build sliding-window samples: each input holds WINDOW consecutive scaled
# values of one column and the label is the value that immediately follows.
WINDOW = 6

# NOTE(review): column 0 of trainingData is "Obligation End Date (YYYYMMDD)",
# so the network learns to predict the scaled date. If the intended target is
# "Total Room Receipts" (the last selected column), change this index - and
# every place that un-scales predictions - together.
# NOTE(review): rows appear to be stacked location after location, so windows
# near a boundary mix two locations - confirm whether per-location windows
# are wanted.
series = trainingData_scaled[:, 0]

# Vectorised replacement for the per-row Python loop: row k of window_idx
# holds the positions k, k+1, ..., k+WINDOW-1.
n_samples = max(maxLen - WINDOW, 0)
window_idx = np.arange(n_samples)[:, None] + np.arange(WINDOW)[None, :]

X_train = series[window_idx]
# .copy() so the labels do not alias trainingData_scaled.
y_train = series[WINDOW:WINDOW + n_samples].copy()
del window_idx  # free the temporary index matrix (n_samples x WINDOW integers)

# Keras LSTM layers expect (samples, timesteps, features).
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

In [None]:
# Number of training windows (first axis of X_train).
X_train.shape[0]


1205106

In [None]:
# Show the number of rows in the scaled array.
print(maxLen)

1205112


**Creating LSTM Neural Network**

In [None]:
modelLSTM = tf.keras.Sequential()  # empty Sequential model; layers are appended one by one with .add()

In [None]:
# Single LSTM layer over windows of X_train.shape[1] timesteps, one feature each.
# return_sequences=False makes the layer emit only its final hidden state, so
# the model produces one prediction per window, (None, 1), which lines up with
# the one-label-per-window y_train. With return_sequences=True the model output
# was (None, 6, 1): six predictions per sample for a single target value.
modelLSTM.add(
    tf.keras.layers.LSTM(
        units=100,
        activation="tanh",
        return_sequences=False,
        input_shape=(X_train.shape[1], 1),
    )
)

In [None]:
modelLSTM.add(tf.keras.layers.Dropout(0.2))  # dropout with rate 0.2 to help prevent overfitting

In [None]:
modelLSTM.add(tf.keras.layers.Dense(1))  # output layer with a single unit

In [None]:
# Configure training: Adam optimizer, mean squared error as the loss.
modelLSTM.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
modelLSTM.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 6, 100)            40800     
                                                                 
 dropout (Dropout)           (None, 6, 100)            0         
                                                                 
 dense (Dense)               (None, 6, 1)              101       
                                                                 
Total params: 40901 (159.77 KB)
Trainable params: 40901 (159.77 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Fit for 10 epochs in mini-batches of 16, holding out 20% of the samples for
# validation; the object returned by fit() is kept in trainLSTM.
trainLSTM = modelLSTM.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=16,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math

# --- Rebuild the train / validation partition used during fitting ----------
# modelLSTM.fit(...) was called with validation_split=0.2. Keras takes that
# fraction from the END of the arrays (before shuffling), so slicing at the
# same index reproduces the held-out set. The original cell referenced
# X_val / y_val, which were never defined.
VALIDATION_SPLIT = 0.2  # keep in sync with the value passed to modelLSTM.fit
split_at = int(math.floor(len(X_train) * (1.0 - VALIDATION_SPLIT)))
X_fit, X_val = X_train[:split_at], X_train[split_at:]
y_fit, y_val = y_train[:split_at], y_train[split_at:]

# Column of trainingData_scaled that the windows and labels were cut from.
# NOTE(review): column 0 is "Obligation End Date (YYYYMMDD)". If the target is
# changed to another column when the windows are built, change it here too.
TARGET_COL = 0


def to_original_units(scaled_values):
    """Undo the MinMax scaling for TARGET_COL only.

    `sc` was fitted on all selected columns, so `sc.inverse_transform` cannot
    be applied to a single-column array. The per-column parameters are used
    instead (the forward transform is ``x * scale_ + min_``).
    """
    scaled_values = np.asarray(scaled_values, dtype=float)
    return (scaled_values - sc.min_[TARGET_COL]) / sc.scale_[TARGET_COL]


def predict_last_step(model, inputs):
    """Return one prediction per sample as a 1-D array.

    Handles a (n, 1) model output as well as a per-timestep (n, steps, 1)
    output, in which case the prediction at the final timestep is used.
    """
    # A larger batch size than the default keeps inference over the full
    # data set reasonably fast; it does not change the predictions.
    raw = np.asarray(model.predict(inputs, batch_size=1024))
    return raw.reshape(len(raw), -1)[:, -1]


# Predict on the rows the model was fitted on and on the held-out rows,
# then bring predictions and labels back to the original units.
train_predict = to_original_units(predict_last_step(modelLSTM, X_fit))
val_predict = to_original_units(predict_last_step(modelLSTM, X_val))
y_train_inv = to_original_units(y_fit)
y_val_inv = to_original_units(y_val)

# Calculate metrics
train_mae = mean_absolute_error(y_train_inv, train_predict)
train_rmse = math.sqrt(mean_squared_error(y_train_inv, train_predict))
val_mae = mean_absolute_error(y_val_inv, val_predict)
val_rmse = math.sqrt(mean_squared_error(y_val_inv, val_predict))

print('Training Data MAE:', train_mae)
print('Training Data RMSE:', train_rmse)
print('Validation Data MAE:', val_mae)
print('Validation Data RMSE:', val_rmse)

# Plot predictions vs actual values, both in original (un-scaled) units.
plt.figure(figsize=(10,6))
plt.plot(y_val_inv, label='Actual')
plt.plot(val_predict, label='Predicted')
plt.title('Validation Data - Actual vs Predicted')
plt.legend()
plt.show()


NameError: ignored