In [None]:
from numpy.lib.polynomial import RankWarning
import pandas as pd
import numpy as np
from datetime import datetime
import pytz
try:
  from pyspark.sql.types import *
  from pyspark.sql import SparkSession
  spark = SparkSession.builder.getOrCreate()
except Exception as e:
  !pip install pyspark
  from pyspark.sql.types import *
  from pyspark.sql import SparkSession
  spark = SparkSession.builder.getOrCreate()
from google.colab import drive
#attach Google Drive to this runtime so files under MyDrive can be read
drive.mount('/content/drive')

#read the raw csv into pandas and show it
csvfilepath = '/content/drive/MyDrive/RP1data/SPSIRDATA.csv'
rawdf = pd.read_csv(filepath_or_buffer=csvfilepath)
display(rawdf)

#expose the raw rows to Spark SQL as the temp view "slottable"
sparkdf = spark.createDataFrame(rawdf)
sparkdf.createOrReplaceTempView("slottable")

#every distinct value of field1, pulled back to the driver as a python list
distinctslots = spark.sql("select distinct field1 from slottable")
allslots = [entry["field1"] for entry in distinctslots.collect()]

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=a98ece946601bbbd932d5ef1f21aef0acfb4b58f53add4cc6850353f6c8cd494
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
Mounted at /content/drive


Unnamed: 0,created_at,entry_id,field1,field2,field3
0,2020-01-01 00:00:00 UTC,1,IR20,1,2000-0-8T45:162:0
1,2020-01-01 08:35:00 UTC,2,IR40,0,2000-0-1T45:162:0
2,2020-01-02 02:12:00 UTC,3,IR22,1,2000-0-4T45:162:0
3,2020-01-02 17:45:00 UTC,4,IR12,0,2000-0-5T45:162:0
4,2020-01-03 05:48:00 UTC,5,IR18,1,2000-0-3T45:162:0
...,...,...,...,...,...
2764,2023-12-04 23:18:00 UTC,2765,IR39,1,2000-0-4T45:162:0
2765,2023-12-05 16:10:00 UTC,2766,IR27,1,2000-0-4T45:162:0
2766,2023-12-06 00:29:00 UTC,2767,IR24,1,2000-0-1T45:162:0
2767,2023-12-06 15:37:00 UTC,2768,IR22,1,2000-0-8T45:162:0


In [None]:
#map the raw column names to descriptive ones
#(entry_id and field3 are listed but keep their original names)
renamecolumns = dict(
    created_at='Timestamp',
    entry_id='entry_id',
    field1='SlotID',
    field2='Availability',
    field3='field3',
)

#rename returns a new frame; rawdf itself is left untouched
df = rawdf.rename(renamecolumns, axis='columns')
display(df)

Unnamed: 0,Timestamp,entry_id,SlotID,Availability,field3
0,2020-01-01 00:00:00 UTC,1,IR20,1,2000-0-8T45:162:0
1,2020-01-01 08:35:00 UTC,2,IR40,0,2000-0-1T45:162:0
2,2020-01-02 02:12:00 UTC,3,IR22,1,2000-0-4T45:162:0
3,2020-01-02 17:45:00 UTC,4,IR12,0,2000-0-5T45:162:0
4,2020-01-03 05:48:00 UTC,5,IR18,1,2000-0-3T45:162:0
...,...,...,...,...,...
2764,2023-12-04 23:18:00 UTC,2765,IR39,1,2000-0-4T45:162:0
2765,2023-12-05 16:10:00 UTC,2766,IR27,1,2000-0-4T45:162:0
2766,2023-12-06 00:29:00 UTC,2767,IR24,1,2000-0-1T45:162:0
2767,2023-12-06 15:37:00 UTC,2768,IR22,1,2000-0-8T45:162:0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#extract features from timestamp
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Hourofday'] = df['Timestamp'].dt.hour
df['minute'] = df['Timestamp'].dt.minute
df['Dayofweek'] = df['Timestamp'].dt.dayofweek
df['Month'] = df['Timestamp'].dt.month

#encode slotid column
#`sparse_output` replaces the `sparse` argument, which was deprecated in
#scikit-learn 1.2 and removed in 1.4; False keeps the result a dense array
encoder = OneHotEncoder(sparse_output=False)
slotid_enc = encoder.fit_transform(df[['SlotID']])
cols_enc = encoder.get_feature_names_out(['SlotID'])
#create a df with one indicator column per slot id; it reuses df's index so
#the column-wise concat below lines up row for row
slotid_df = pd.DataFrame(slotid_enc, columns=cols_enc, index=df.index)

#extract features and target value
#feature order: the four time columns first, then the slot indicator columns
X = df[['minute', 'Hourofday', 'Dayofweek', 'Month']]
X = pd.concat([X, slotid_df], axis=1)
y = df['Availability']
print(X)


      minute  Hourofday  Dayofweek  Month  SlotID_IR1  SlotID_IR10  \
0          0          0          2      1         0.0          0.0   
1         35          8          2      1         0.0          0.0   
2         12          2          3      1         0.0          0.0   
3         45         17          3      1         0.0          0.0   
4         48          5          4      1         0.0          0.0   
...      ...        ...        ...    ...         ...          ...   
2764      18         23          0     12         0.0          0.0   
2765      10         16          1     12         0.0          0.0   
2766      29          0          2     12         0.0          0.0   
2767      37         15          2     12         0.0          0.0   
2768      35          6          3     12         0.0          0.0   

      SlotID_IR11  SlotID_IR12  SlotID_IR13  SlotID_IR14  ...  SlotID_IR46  \
0             0.0          0.0          0.0          0.0  ...          0.0   
1  



In [None]:
#hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
#random forest classifier with 100 trees and a fixed seed
model = RandomForestClassifier(random_state=42, n_estimators=100)
#fit on the training split only; the test split is kept for evaluation
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
#predict the held-out rows, show the raw predictions, then report accuracy
y_pred = model.predict(X_test)
print(y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("model accuracy: ", accuracy)

[0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 0 0 1 0 1 0 1 0
 0 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 1 1 0 0 0
 1 1 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1 1
 1 0 0 0 0 0 0 1 0 1 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 0
 0 0 1 1 1 0 1 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1 0 1 0 0 1 1 0 1 0 0 1 0 1 1 0
 1 1 1 0 0 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 1 0 1
 1 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1
 1 1 0 1 0 0 0 0 1 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 0 1
 0 1 0 0 1 0 1 0 1 0 1 1 0 0 1 1 0 0 0 1 1 0 1 1 0 0 1 1 1 0 1 0 1 0 1 0 1
 0 0 0 1 0 1 0 0 1 0 1 1 1 1 0 0 0 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 0 1
 1 1 0 1 0 1 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 1 1 1 0 0 1 1 1 1
 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0
 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 0 1
 0 1 1 1 0 1 0 0 1 0 0 0 

In [None]:
#input datetime
print("Predict slot availability at time: ")
giventime = input("MM dd HH mm = ").strip()
#the prompt carries no year and pandas would fall back to 1900, which yields
#the wrong weekday; the current year is assumed so Dayofweek matches the calendar
giventime = pd.to_datetime(f"{pd.Timestamp.now().year} {giventime}", format = '%Y %m %d %H %M')
givenday = giventime.dayofweek
givenhour = giventime.hour
givenminute = giventime.minute
givenmonth = giventime.month

#predict available slots for the given time of day
#build one candidate row per distinct slot: the first occurrence of each
#one-hot pattern in slotid_df, with the slot name read from the same row of df
#(relies on df and slotid_df sharing row labels, as the training features do)
slot_rows = slotid_df.drop_duplicates()
slot_ids = df.loc[slot_rows.index, 'SlotID'].to_numpy()
slot_rows = slot_rows.reset_index(drop=True)

#every candidate row carries the requested time. Previously the single time row
#was concatenated with all historical rows and the gaps filled with 0, so only
#the first row was actually evaluated at the requested time.
X_given = pd.DataFrame({
    'minute': givenminute,
    'Hourofday': givenhour,
    'Dayofweek': givenday,
    'Month': givenmonth
}, index=slot_rows.index)
X_given = pd.concat([X_given, slot_rows], axis=1)

slot_prediction = model.predict(X_given)
print(slot_prediction)
#row i of X_given describes slot_ids[i], so a boolean mask selects slot names directly
is_available = slot_prediction == 1
available_slots = slot_ids[is_available]
print("Predicted available slots: ",available_slots)
print(len(available_slots))

#probability of slots to be available
probability = model.predict_proba(X_given)
#look up the probability column of class 1 instead of assuming its position
available_column = list(model.classes_).index(1)
available_probability = probability[is_available, available_column]
#slot names and probabilities come from the same rows, so the pairs line up
slots_probability = dict(zip(available_slots, available_probability))
print("Predicted probabilities: ",slots_probability)
print(len(slots_probability))

Predict slot availability at time: 
MM dd HH mm = 8 8 8 8
[0 0 0 ... 1 0 1]
Predicted available slots:  ['IR12' 'IR18' 'IR7' 'IR29' 'IR20' 'IR14' 'IR31' 'IR33' 'IR46' 'IR5'
 'IR45' 'IR16' 'IR4' 'IR49' 'IR50' 'IR44' 'IR28' 'IR3' 'IR25' 'IR27' 'IR1'
 'IR19' 'IR24' 'IR10' 'IR21' 'IR48' 'IR36' 'IR17' 'IR23' 'IR2' 'IR47'
 'IR26' 'IR30' 'IR35' 'IR38' 'IR11' 'IR39' 'IR9']
38
Predicted probabilities:  {'IR12': 0.58, 'IR18': 0.7, 'IR7': 0.71, 'IR29': 0.58, 'IR20': 0.65, 'IR14': 0.57, 'IR31': 0.61, 'IR33': 0.57, 'IR46': 0.55, 'IR5': 0.52, 'IR45': 0.55, 'IR16': 0.56, 'IR4': 0.6, 'IR49': 0.51, 'IR50': 0.69, 'IR44': 0.51, 'IR28': 0.63, 'IR3': 0.61, 'IR25': 0.56, 'IR27': 0.66, 'IR1': 0.56, 'IR19': 0.59, 'IR24': 0.59, 'IR10': 0.55, 'IR21': 0.63, 'IR48': 0.63, 'IR36': 0.67, 'IR17': 0.58, 'IR23': 0.55, 'IR2': 0.53, 'IR47': 0.67, 'IR26': 0.55, 'IR30': 0.58, 'IR35': 0.55, 'IR38': 0.67, 'IR11': 0.63, 'IR39': 0.58, 'IR9': 0.53}
38


In [None]:
##mlp
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#fresh 70/30 split for the MLP, seeded with 41 (replaces the earlier split variables)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=41, test_size=0.3
)

In [None]:
#multi layer perceptron with four hidden layers of 10, 20, 20 and 10 units
model_mlp = MLPClassifier(
    random_state=41,
    max_iter=1500,
    hidden_layer_sizes=(10, 20, 20, 10),
)
#fit on the training split
model_mlp.fit(X_train, y_train)

In [None]:
#score the MLP on the held-out split
y_pred = model_mlp.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5282791817087846


In [None]:
#input datetime
print("Predict slot availability at time: ")
giventime = input("MM dd HH mm = ").strip()
#the prompt carries no year and pandas would fall back to 1900, which yields
#the wrong weekday; the current year is assumed so Dayofweek matches the calendar
giventime = pd.to_datetime(f"{pd.Timestamp.now().year} {giventime}", format = '%Y %m %d %H %M')
givenday = giventime.dayofweek
givenhour = giventime.hour
givenminute = giventime.minute
givenmonth = giventime.month

#predict available slots for the given time of day
#build one candidate row per distinct slot: the first occurrence of each
#one-hot pattern in slotid_df, with the slot name read from the same row of df
#(relies on df and slotid_df sharing row labels, as the training features do)
slot_rows = slotid_df.drop_duplicates()
slot_ids = df.loc[slot_rows.index, 'SlotID'].to_numpy()
slot_rows = slot_rows.reset_index(drop=True)

#every candidate row carries the requested time. Previously the single time row
#was concatenated with all historical rows and the gaps filled with 0, so only
#the first row was actually evaluated at the requested time.
X_given = pd.DataFrame({
    'minute': givenminute,
    'Hourofday': givenhour,
    'Dayofweek': givenday,
    'Month': givenmonth
}, index=slot_rows.index)
X_given = pd.concat([X_given, slot_rows], axis=1)

slot_prediction = model_mlp.predict(X_given)
print(slot_prediction)
#row i of X_given describes slot_ids[i], so a boolean mask selects slot names directly
is_available = slot_prediction == 1
available_slots = slot_ids[is_available]
print("Predicted available slots: ",available_slots)
print(len(available_slots))

#probability of slots to be available
probability = model_mlp.predict_proba(X_given)
#look up the probability column of class 1 instead of assuming its position
available_column = list(model_mlp.classes_).index(1)
available_probability = probability[is_available, available_column]
#slot names and probabilities come from the same rows, so the pairs line up
slots_probability = dict(zip(available_slots, available_probability))
print("Predicted probabilities: ",slots_probability)
print(len(slots_probability))

Predict slot availability at time: 
MM dd HH mm = 8 8 8 8
[1 0 1 ... 0 1 0]
Predicted available slots:  ['IR20' 'IR22' 'IR1' 'IR19' 'IR36' 'IR32' 'IR26']
7
Predicted probabilities:  {'IR20': 0.7880132284144041, 'IR22': 0.9946770456855875, 'IR1': 0.9946770456855875, 'IR19': 0.9682719990715111, 'IR36': 0.9141465831490161, 'IR32': 0.9141465831490161, 'IR26': 0.7084812747890866}
7


In [None]:
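#Result: the multi-layer perceptron reports higher availability probabilities for the slots it predicts as available than the random forest does. A higher predicted probability on its own does not show that the model is more accurate.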