In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Location of the charging-session export that this notebook reads.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
file_path = "/Users/freewheelin/Documents/ev_charging_station.txt"

# Parse the file with pandas' CSV reader into the working DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)

# Plain-text preview of the first five rows.
print(data.head(n=5))

   sessionId  kwhTotal  dollars              created                ended  \
0    1366563      7.78     0.00  0014-11-18 15:40:26  0014-11-18 17:11:04   
1    3075723      9.74     0.00  0014-11-19 17:40:26  0014-11-19 19:51:04   
2    4228788      6.76     0.58  0014-11-21 12:05:46  0014-11-21 16:46:04   
3    3173284      6.17     0.00  0014-12-03 19:16:12  0014-12-03 21:02:18   
4    3266500      0.93     0.00  0014-12-11 20:56:11  0014-12-11 21:14:06   

   startTime  endTime  chargeTimeHrs weekday platform  ...  managerVehicle  \
0         15       17       1.510556     Tue  android  ...               0   
1         17       19       2.177222     Wed  android  ...               0   
2         12       16       4.671667     Fri  android  ...               0   
3         19       21       1.768333     Wed  android  ...               0   
4         20       21       0.298611     Thu  android  ...               0   

   facilityType  Mon  Tues  Wed  Thurs  Fri  Sat  Sun  reportedZip  

In [315]:
# Distinct user identifiers currently stored in the `userId` column.
unique_user_ids = data["userId"].unique()
print(unique_user_ids)

[35897499 65023200 27283509 29165598 77373351 78533433 30828105 97867440
 90692118 26618922 88561539 49241808 33081741 57882330 19555569 78908148
 36768303 72512154 37392894 37412595 48585042 47158353 45267948 32751774
 76114467 77088033 84660741 98345808 81375624 32015313 92192265 92911698
 95980995 46667907 82888443 46009656 92283246 11299464 75922902 30464676
 33295482 94947534 87444027 50725917 58140225 24920478 30296079 10909503
 85580550 27476262 68581656 29309940 25628328 54832140 48821751 24478344
 81701631 50986683 86810130 81880524 74843010 66233970 41493375 39133512
 39279042 29845530 45460701 81295434 14260257 10427670 93202560 24408549
 17969193 13066218 90546786 41222907 95411349 75009330 83573325 39241917
 14996520 59574735 32070852 58023207 26098875]


In [293]:
# Rich preview of the first five rows of the working frame.
data.head(n=5)

Unnamed: 0,kwhTotal,created,ended,startTime,endTime,chargeTimeHrs,weekday,stationId,locationId,managerVehicle,...,Sat,Sun,created_hour,created_day,created_month,created_weekday,ended_hour,ended_day,ended_month,ended_weekday
0,7.78,2014-11-18 15:40:26,2014-11-18 17:11:04,15,17,1.510556,Tue,582873,461655,0,...,0,0,15,18,11,1,17,18,11,1
1,9.74,2014-11-19 17:40:26,2014-11-19 19:51:04,17,19,2.177222,Wed,549414,461655,0,...,0,0,17,19,11,2,19,19,11,2
2,6.76,2014-11-21 12:05:46,2014-11-21 16:46:04,12,16,4.671667,Fri,129465,461655,0,...,0,0,12,21,11,4,16,21,11,4
3,6.17,2014-12-03 19:16:12,2014-12-03 21:02:18,19,21,1.768333,Wed,569889,461655,0,...,0,0,19,3,12,2,21,3,12,2
4,0.93,2014-12-11 20:56:11,2014-12-11 21:14:06,20,21,0.298611,Thu,414088,566549,0,...,0,0,20,11,12,3,21,11,12,3


In [5]:
# Count the distinct userIds and locationIds in the dataset.
# (The recorded `unique()` output in this notebook lists 85 userIds.)
num_users, num_locations = (
    data[id_column].nunique() for id_column in ('userId', 'locationId')
)

In [7]:
def _encode_ids(frame, column):
    """Label-encode ``frame[column]`` in place and return ``(encoder, int32 tensor)``.

    The column is overwritten with the integer codes produced by a fresh
    ``LabelEncoder`` (0 .. number_of_classes - 1); the returned tensor holds
    those same codes, one per row.
    """
    encoder = LabelEncoder()
    frame[column] = encoder.fit_transform(frame[column])
    codes = tf.convert_to_tensor(frame[column].values, dtype=tf.int32)
    return encoder, codes


# Convert the raw identifiers into contiguous integer codes.
# NOTE(review): this overwrites the raw ids stored in `data`; re-running the
# cell refits the encoders on the already-encoded values.
user_encoder, user_input = _encode_ids(data, 'userId')              # Shape: (num_samples,)
location_encoder, location_input = _encode_ids(data, 'locationId')  # Shape: (num_samples,)

# Vocabulary sizes, taken from the fitted encoders.
num_users = len(user_encoder.classes_)
num_locations = len(location_encoder.classes_)

# Width of each embedding vector (free choice).
user_embedding_dim = 8
location_embedding_dim = 8

# One lookup table per identifier type.
user_embedding_layer = tf.keras.layers.Embedding(
    input_dim=num_users,
    output_dim=user_embedding_dim,
    name='user_embedding',
)
location_embedding_layer = tf.keras.layers.Embedding(
    input_dim=num_locations,
    output_dim=location_embedding_dim,
    name='location_embedding',
)

# Look up a vector for every row, then force a 2-D (num_samples, dim) layout
# so the result can be concatenated with the other feature blocks.
# NOTE(review): the layers are called directly here, outside any model shown
# in this notebook, so these vectors appear to come from the layers' initial
# (unseeded) weights - confirm this is intended.
user_embedding = tf.reshape(
    user_embedding_layer(user_input), [-1, user_embedding_dim]
)
location_embedding = tf.reshape(
    location_embedding_layer(location_input), [-1, location_embedding_dim]
)

In [9]:
# Weekday indicator columns, converted to a float32 tensor
# (one row per session, one column per day).
weekday_flags = data[['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']]
X_one_hot = tf.convert_to_tensor(weekday_flags.to_numpy(), dtype=tf.float32)

In [17]:
# Rescale the continuous columns with a min-max scaler fitted on these columns.
continuous_features = ['chargeTimeHrs', 'startTime', 'endTime']

scaler = MinMaxScaler()
X_continuous = tf.convert_to_tensor(
    scaler.fit_transform(data[continuous_features]),
    dtype=tf.float32,
)

In [27]:
# Join all feature blocks column-wise into the final input tensor.
# Per the recorded shapes in this notebook: 8 + 8 + 7 + 3 = 26 columns per row.
feature_blocks = [user_embedding, location_embedding, X_one_hot, X_continuous]
X_tf = tf.concat(feature_blocks, axis=1)

In [29]:
# Wrap the feature tensor in a tf.data pipeline that yields one row per element.
X = tf.data.Dataset.from_tensor_slices(tensors=X_tf)

In [25]:
# Sanity check: every feature block must have the same number of rows.
for feature_block in (user_embedding, location_embedding, X_one_hot, X_continuous):
    print(feature_block.shape)

(3395, 8)
(3395, 8)
(3395, 7)
(3395, 3)


In [33]:
# Inspect the first element produced by the feature dataset.
first_example = next(iter(X))
first_example

<tf.Tensor: shape=(26,), dtype=float32, numpy=
array([ 0.04555136,  0.02652695, -0.00805613, -0.01889117,  0.04915551,
        0.01613381,  0.00942421, -0.00199133, -0.04398698, -0.0167832 ,
        0.03193933,  0.04744608, -0.02144028, -0.00955456,  0.04770451,
        0.04516765,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.02712613,  0.65217394,
        0.73913044], dtype=float32)>

In [42]:
# Regression target: energy delivered per session (`kwhTotal` column),
# exposed as a tf.data pipeline with one value per element.
y_tf = data['kwhTotal'].to_numpy()
y = tf.data.Dataset.from_tensor_slices(y_tf)

Ignore the cells below this point; they still need revision.

In [35]:
# Draft cell (marked "ignore" by the author): still needs revision in the
# context of the whole notebook.
#
# Repairs the timestamp columns, parses them as datetimes, derives calendar
# features from them, and drops columns that are not used as features.
# The cell is safe to re-run: already-parsed columns are not string-edited
# again, and columns that were already dropped are skipped.
#
# NOTE(review): "userId" is dropped here, but the label-encoding / embedding
# cell reads data['userId']; running this cell first will break that cell.

timestamp_columns = ["created", "ended"]

for ts_column in timestamp_columns:
    # The raw file stores years with a leading "00" (e.g. "0014-11-18");
    # rewrite that prefix to "20" so the value parses as a 20xx date.
    # Skipped once the column is datetime64, where `.str` would raise.
    if not pd.api.types.is_datetime64_any_dtype(data[ts_column]):
        data[ts_column] = data[ts_column].str.replace(r"^00(\d{2})", r"20\1", regex=True)

    # Values that cannot be parsed become NaT instead of raising.
    data[ts_column] = pd.to_datetime(data[ts_column], errors="coerce")

    # Calendar features derived from the parsed timestamp.
    data[f"{ts_column}_hour"] = data[ts_column].dt.hour
    data[f"{ts_column}_day"] = data[ts_column].dt.day
    data[f"{ts_column}_month"] = data[ts_column].dt.month
    data[f"{ts_column}_weekday"] = data[ts_column].dt.weekday

# Drop the columns that are not used as features; tolerate columns that a
# previous run of this cell already removed.
data = data.drop(
    columns=["sessionId", "userId", "platform", "distance", "reportedZip", "dollars"],
    errors="ignore",
)