<a href="https://colab.research.google.com/github/xfo-03/Sequential-POI-recommendation/blob/main/LabModelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [None]:
features_df = pd.read_csv('/content/drive/MyDrive/Spatio_Temporal_Lab/Session_df.tsv', delimiter='\t')

In [None]:
features_df.drop(columns=['Date', 'Year', 'Day'], inplace=True)

In [None]:
features_df.head()

Unnamed: 0,UserId,VenueId,VenueType,GeoHash,Month,hour,Season,Part of Day,dayofweek,is_weekday,SessionId
0,103,1755,1,131916353535,7,5,3,1,0,1,1
1,656,1773,2,131916353522,7,5,3,1,0,1,1
2,787,1752,1,13235112511,7,5,3,1,0,1,1
3,828,1752,1,13235112511,7,5,3,1,0,1,1
4,244,1754,1,1323511259,7,5,3,1,0,1,1


In [None]:
unique_venue_ids = features_df['VenueId'].explode().unique()
len(unique_venue_ids)

994

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


## Data Loader



*   Discretize the data (encode GeoHash and VenueId)
*   Convert check-ins to per-session sequences
*   Train/test split
*   Pad the train and test sequences
*   X, y data split








In [None]:
## Discretizing data: integer codes for the GeoHash cells and for the venue ids

# GeoHash -> bin index; bins are numbered in the order unique() returns the values
unique_geohash_values = features_df['GeoHash'].unique()
geohash_to_bins = dict(zip(unique_geohash_values, range(len(unique_geohash_values))))
features_df['GeoHash_binned'] = features_df['GeoHash'].map(geohash_to_bins)

# VenueId -> integer label assigned by a LabelEncoder fitted on the whole column
label_encoder = LabelEncoder()
features_df['VenueId_encoded'] = label_encoder.fit_transform(features_df['VenueId'])

# The raw identifier columns are dropped once their encoded counterparts exist
features_df.drop(columns=['VenueId', 'GeoHash'], inplace=True)

In [None]:
print(len(unique_geohash_values))

35


In [None]:
features_df.head()

Unnamed: 0,UserId,VenueType,Month,hour,Season,Part of Day,dayofweek,is_weekday,SessionId,GeoHash_binned,VenueId_encoded
0,103,1,7,5,3,1,0,1,1,0,968
1,656,2,7,5,3,1,0,1,1,1,986
2,787,1,7,5,3,1,0,1,1,2,965
3,828,1,7,5,3,1,0,1,1,2,965
4,244,1,7,5,3,1,0,1,1,3,967


In [None]:
# Lookup table: VenueType -> list of the distinct encoded venue ids observed with that type.
# The ids pass through a set, so the order inside each list is not guaranteed.
venue_lookup = features_df.groupby('VenueType')['VenueId_encoded'].apply(lambda x: list(set(x))).to_dict()

# Print the lookup table (one line per venue type; the lists can be very long)
for venue_type, venue_ids in venue_lookup.items():
    print(f"VenueType {venue_type}: {venue_ids}")


'''# Step 1: Access the venue_id column
venue_ids = features_df['VenueId']

# Step 2: Retrieve unique venue IDs
unique_venue_ids = venue_ids.unique()

print("Unique Venue IDs:")
print(unique_venue_ids) '''

VenueType 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 2

'# Step 1: Access the venue_id column\nvenue_ids = features_df[\'VenueId\']\n\n# Step 2: Retrieve unique venue IDs\nunique_venue_ids = venue_ids.unique()\n\nprint("Unique Venue IDs:")\nprint(unique_venue_ids) '

In [None]:
## Convert to sequences
# One row per (UserId, SessionId); every feature column becomes the list of that
# session's values, in the row order of the input frame.
features_df = (
    features_df
    .groupby(['UserId', 'SessionId'])
    .agg(dict.fromkeys(
        ['VenueId_encoded', 'VenueType', 'GeoHash_binned', 'Month', 'hour',
         'Season', 'Part of Day', 'dayofweek', 'is_weekday'],
        list,
    ))
    .reset_index()
)

# Work on a copy and keep only the sessions holding at least 30 check-ins
data_df = features_df.copy()
data_df = data_df.loc[data_df['VenueId_encoded'].apply(len) >= 30]


In [None]:
##Train Test Split
def split_sequence(sequence, train_ratio=0.8):
    """Cut ``sequence`` into a leading part and the remaining tail.

    The cut index is ``int(len(sequence) * train_ratio)``, i.e. the product
    truncated toward zero. Returns a ``(head, tail)`` tuple of slices.
    """
    cut = int(len(sequence) * train_ratio)
    head = sequence[:cut]
    tail = sequence[cut:]
    return head, tail

# One train row and one test row are collected per session
train_data, test_data = [], []

for _, session in data_df.iterrows():
    head_row, tail_row = session.copy(), session.copy()
    for name in data_df.columns:
        value = session[name]
        if not isinstance(value, list):
            continue  # scalar cells were already carried over by copy()
        # List cells are replaced by their leading part (train) and their tail (test)
        head_row[name], tail_row[name] = split_sequence(value)
    train_data.append(head_row)
    test_data.append(tail_row)

# Assemble the collected rows into the two split frames
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)


In [None]:
print("Train DataFrame:")
train_df.head()

Train DataFrame:


Unnamed: 0,UserId,SessionId,VenueId_encoded,VenueType,GeoHash_binned,Month,hour,Season,Part of Day,dayofweek,is_weekday
0,0,1,"[921, 979, 921, 1, 921, 1, 965, 1, 965, 1, 966...","[3, 2, 3, 0, 3, 0, 1, 0, 1, 0, 1, 0, 3, 0, 1, ...","[3, 6, 3, 16, 3, 16, 2, 16, 2, 16, 2, 16, 3, 1...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[7, 12, 13, 16, 7, 16, 16, 19, 19, 20, 21, 23,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 3, 3, 3, 2, 3, 3, 4, 4, 4, 4, 4, 2, 3, 3, ...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,0,2,"[921, 1, 966, 1, 965, 1, 921, 1, 965, 1, 965, ...","[3, 0, 1, 0, 1, 0, 3, 0, 1, 0, 1, 0, 1, 0, 2, ...","[3, 16, 2, 16, 2, 16, 3, 16, 2, 16, 2, 16, 0, ...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[7, 16, 16, 18, 18, 21, 7, 16, 16, 18, 18, 20,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 3, 3, 4, 4, 4, 2, 3, 3, 4, 4, 4, 4, 4, 1, ...","[3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
2,0,3,"[982, 1, 921, 1, 968, 1, 968, 1, 964, 1, 968, ...","[2, 0, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 3, 0, 3, ...","[11, 16, 3, 16, 0, 16, 0, 16, 16, 16, 0, 16, 3...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[0, 0, 7, 16, 16, 17, 18, 19, 19, 20, 20, 22, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 3, 2, ...","[6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, ...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,0,4,"[982, 1, 921, 1, 973, 1, 965, 1, 965, 1, 921, ...","[2, 0, 3, 0, 1, 0, 1, 0, 1, 0, 3, 0, 1, 0, 1, ...","[11, 16, 3, 16, 2, 16, 2, 16, 2, 16, 3, 16, 2,...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[7, 8, 7, 16, 16, 18, 18, 19, 20, 21, 7, 16, 1...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3, ...","[6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,0,5,"[921, 1, 921, 1, 921, 1, 982, 1, 982, 1, 982, ...","[3, 0, 3, 0, 3, 0, 2, 0, 2, 0, 2, 0, 3, 0, 1, ...","[3, 16, 3, 16, 3, 16, 11, 16, 11, 16, 11, 16, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[7, 16, 7, 16, 7, 16, 0, 0, 7, 8, 0, 0, 7, 16,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 3, 2, 3, 2, 3, 1, 1, 2, 2, 1, 1, 2, 3, 3, ...","[2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."


In [None]:
print("\nTest DataFrame:")
test_df.head()


Test DataFrame:


Unnamed: 0,UserId,SessionId,VenueId_encoded,VenueType,GeoHash_binned,Month,hour,Season,Part of Day,dayofweek,is_weekday
0,0,1,"[1, 965, 1, 921, 1, 973, 1, 965, 1, 964, 1, 92...","[0, 1, 0, 3, 0, 1, 0, 1, 0, 1, 0, 3, 0]","[16, 2, 16, 3, 16, 2, 16, 2, 16, 16, 16, 3, 16]","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]","[19, 19, 21, 7, 16, 16, 18, 18, 20, 20, 22, 7,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]","[4, 4, 4, 2, 3, 3, 4, 4, 4, 4, 4, 2, 3]","[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,0,2,"[966, 1, 964, 1, 921, 1, 982, 1]","[1, 0, 1, 0, 3, 0, 2, 0]","[2, 16, 16, 16, 3, 16, 11, 16]","[7, 7, 7, 7, 7, 7, 7, 7]","[18, 19, 20, 21, 7, 16, 0, 0]","[3, 3, 3, 3, 3, 3, 3, 3]","[4, 4, 4, 4, 2, 3, 1, 1]","[3, 3, 3, 3, 4, 4, 5, 5]","[1, 1, 1, 1, 1, 1, 0, 0]"
2,0,3,"[921, 1, 921, 1, 921, 1, 964, 1, 965, 1, 968, ...","[3, 0, 3, 0, 3, 0, 1, 0, 1, 0, 1, 0, 3, 0, 3, ...","[3, 16, 3, 16, 3, 16, 16, 16, 2, 16, 0, 16, 3,...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[7, 16, 7, 16, 7, 16, 16, 18, 18, 20, 20, 21, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 3, 2, 3, 2, 3, 3, 4, 4, 4, 4, 4, 2, 3, 2, ...","[0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,0,4,"[965, 1, 965, 1, 965, 1, 921, 1, 921, 1]","[1, 0, 1, 0, 1, 0, 3, 0, 3, 0]","[2, 16, 2, 16, 2, 16, 3, 16, 3, 16]","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]","[8, 10, 10, 12, 12, 14, 7, 16, 7, 16]","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]","[2, 2, 2, 3, 3, 3, 2, 3, 2, 3]","[6, 6, 6, 6, 6, 6, 0, 0, 1, 1]","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1]"
4,0,5,"[1, 982, 1, 966, 1, 964, 1, 965, 1, 921, 1, 92...","[0, 2, 0, 1, 0, 1, 0, 1, 0, 3, 0, 3, 0]","[16, 11, 16, 2, 16, 16, 16, 2, 16, 3, 16, 3, 16]","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]","[0, 7, 8, 8, 10, 10, 11, 12, 14, 7, 16, 7, 16]","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]","[1, 2, 2, 2, 2, 2, 2, 3, 3, 2, 3, 2, 3]","[6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]"


In [None]:
## Padding train and test df

# Store each session's venue-sequence length in a new column of both frames
train_df['VenueId_length'] = train_df['VenueId_encoded'].map(len)
test_df['VenueId_length'] = test_df['VenueId_encoded'].map(len)

# Longest and shortest sequence per split (before any padding is applied)
train_max_length, train_min_length = train_df['VenueId_length'].max(), train_df['VenueId_length'].min()
test_max_length, test_min_length = test_df['VenueId_length'].max(), test_df['VenueId_length'].min()

# Display the results
print(f"Train DataFrame - Max sequence length: {train_max_length}, Min sequence length: {train_min_length}")
print(f"Test DataFrame - Max sequence length: {test_max_length}, Min sequence length: {test_min_length}")


def pad_or_truncate_sequence(seq, target_length, pad_value=0):
    """
    Return a new list of exactly ``target_length`` elements built from ``seq``.

    Args:
        seq: list to resize (it is not modified).
        target_length: required length of the result; must be >= 0.
        pad_value: element appended to sequences shorter than ``target_length``.
            Defaults to 0, which keeps the previous behaviour.

    Returns:
        The first ``target_length`` elements of ``seq`` when it is too long,
        otherwise ``seq`` followed by as many ``pad_value`` items as needed.

    Raises:
        ValueError: if ``target_length`` is negative. A negative value would
            otherwise be read as a slice offset from the end and silently
            return a list of the wrong length.
    """
    if target_length < 0:
        raise ValueError(f"target_length must be non-negative, got {target_length}")
    if len(seq) > target_length:
        return seq[:target_length]
    # Note: pad_value 0 is indistinguishable from a genuine 0 code in the data.
    return seq + [pad_value] * (target_length - len(seq))

# Fixed length given to every list-valued column of each split
target_length_train = 125
target_length_test = 125

for frame, fixed_length in ((train_df, target_length_train), (test_df, target_length_test)):
    for column in frame.columns:
        # The value in the first row decides whether a column holds sequences
        if isinstance(frame[column].iloc[0], list):
            frame[column] = frame[column].apply(
                lambda x, n=fixed_length: pad_or_truncate_sequence(x, n)
            )

# Re-measure the venue sequences now that padding/truncation has been applied
train_df['VenueId_length'] = train_df['VenueId_encoded'].map(len)
test_df['VenueId_length'] = test_df['VenueId_encoded'].map(len)

train_max_length, train_min_length = train_df['VenueId_length'].max(), train_df['VenueId_length'].min()
test_max_length, test_min_length = test_df['VenueId_length'].max(), test_df['VenueId_length'].min()

# Display the results
print(f"Train DataFrame - Max sequence length: {train_max_length}, Min sequence length: {train_min_length}")
print(f"Test DataFrame - Max sequence length: {test_max_length}, Min sequence length: {test_min_length}")


Train DataFrame - Max sequence length: 203, Min sequence length: 24
Test DataFrame - Max sequence length: 51, Min sequence length: 6
Train DataFrame - Max sequence length: 125, Min sequence length: 125
Test DataFrame - Max sequence length: 125, Min sequence length: 125


In [None]:
## X, y Data split of train and test data
def split_X_y(df):
    """Separate the target column from the remaining columns of ``df``.

    Returns a tuple ``(X, y)``: ``X`` is ``df`` without the
    'VenueId_encoded' column and ``y`` is that column as a Series.
    """
    target_column = 'VenueId_encoded'
    features = df.drop(columns=[target_column])
    labels = df[target_column]
    return features, labels

X_train, y_train = split_X_y(train_df)
X_test, y_test = split_X_y(test_df)

In [None]:
X_train.head()

Unnamed: 0,UserId,SessionId,VenueType,GeoHash_binned,Month,hour,Season,Part of Day,dayofweek,is_weekday,VenueId_length
0,0,1,"[3, 2, 3, 0, 3, 0, 1, 0, 1, 0, 1, 0, 3, 0, 1, ...","[3, 6, 3, 16, 3, 16, 2, 16, 2, 16, 2, 16, 3, 1...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[7, 12, 13, 16, 7, 16, 16, 19, 19, 20, 21, 23,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 3, 3, 3, 2, 3, 3, 4, 4, 4, 4, 4, 2, 3, 3, ...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",125
1,0,2,"[3, 0, 1, 0, 1, 0, 3, 0, 1, 0, 1, 0, 1, 0, 2, ...","[3, 16, 2, 16, 2, 16, 3, 16, 2, 16, 2, 16, 0, ...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[7, 16, 16, 18, 18, 21, 7, 16, 16, 18, 18, 20,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 3, 3, 4, 4, 4, 2, 3, 3, 4, 4, 4, 4, 4, 1, ...","[3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",125
2,0,3,"[2, 0, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 3, 0, 3, ...","[11, 16, 3, 16, 0, 16, 0, 16, 16, 16, 0, 16, 3...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[0, 0, 7, 16, 16, 17, 18, 19, 19, 20, 20, 22, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 3, 2, ...","[6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, ...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",125
3,0,4,"[2, 0, 3, 0, 1, 0, 1, 0, 1, 0, 3, 0, 1, 0, 1, ...","[11, 16, 3, 16, 2, 16, 2, 16, 2, 16, 3, 16, 2,...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[7, 8, 7, 16, 16, 18, 18, 19, 20, 21, 7, 16, 1...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3, ...","[6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",125
4,0,5,"[3, 0, 3, 0, 3, 0, 2, 0, 2, 0, 2, 0, 3, 0, 1, ...","[3, 16, 3, 16, 3, 16, 11, 16, 11, 16, 11, 16, ...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[7, 16, 7, 16, 7, 16, 0, 0, 7, 8, 0, 0, 7, 16,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 3, 2, 3, 2, 3, 1, 1, 2, 2, 1, 1, 2, 3, 3, ...","[2, 2, 3, 3, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...",125


In [None]:
y_train.head()

0    [921, 979, 921, 1, 921, 1, 965, 1, 965, 1, 966...
1    [921, 1, 966, 1, 965, 1, 921, 1, 965, 1, 965, ...
2    [982, 1, 921, 1, 968, 1, 968, 1, 964, 1, 968, ...
3    [982, 1, 921, 1, 973, 1, 965, 1, 965, 1, 921, ...
4    [921, 1, 921, 1, 921, 1, 982, 1, 982, 1, 982, ...
Name: VenueId_encoded, dtype: object

In [None]:
X_test.head()

Unnamed: 0,UserId,SessionId,VenueType,GeoHash_binned,Month,hour,Season,Part of Day,dayofweek,is_weekday,VenueId_length
0,0,1,"[0, 1, 0, 3, 0, 1, 0, 1, 0, 1, 0, 3, 0, 0, 0, ...","[16, 2, 16, 3, 16, 2, 16, 2, 16, 16, 16, 3, 16...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, ...","[19, 19, 21, 7, 16, 16, 18, 18, 20, 20, 22, 7,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, ...","[4, 4, 4, 2, 3, 3, 4, 4, 4, 4, 4, 2, 3, 0, 0, ...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...",125
1,0,2,"[1, 0, 1, 0, 3, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 16, 16, 16, 3, 16, 11, 16, 0, 0, 0, 0, 0, ...","[7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, ...","[18, 19, 20, 21, 7, 16, 0, 0, 0, 0, 0, 0, 0, 0...","[3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, ...","[4, 4, 4, 4, 2, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[3, 3, 3, 3, 4, 4, 5, 5, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",125
2,0,3,"[3, 0, 3, 0, 3, 0, 1, 0, 1, 0, 1, 0, 3, 0, 3, ...","[3, 16, 3, 16, 3, 16, 16, 16, 2, 16, 0, 16, 3,...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...","[7, 16, 7, 16, 7, 16, 16, 18, 18, 20, 20, 21, ...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...","[2, 3, 2, 3, 2, 3, 3, 4, 4, 4, 4, 4, 2, 3, 2, ...","[0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",125
3,0,4,"[1, 0, 1, 0, 1, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, ...","[2, 16, 2, 16, 2, 16, 3, 16, 3, 16, 0, 0, 0, 0...","[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, ...","[8, 10, 10, 12, 12, 14, 7, 16, 7, 16, 0, 0, 0,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, ...","[2, 2, 2, 3, 3, 3, 2, 3, 2, 3, 0, 0, 0, 0, 0, ...","[6, 6, 6, 6, 6, 6, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",125
4,0,5,"[0, 2, 0, 1, 0, 1, 0, 1, 0, 3, 0, 3, 0, 0, 0, ...","[16, 11, 16, 2, 16, 16, 16, 2, 16, 3, 16, 3, 1...","[9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 0, 0, ...","[0, 7, 8, 8, 10, 10, 11, 12, 14, 7, 16, 7, 16,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, ...","[1, 2, 2, 2, 2, 2, 2, 3, 3, 2, 3, 2, 3, 0, 0, ...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 1, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, ...",125


In [None]:
y_test.head()

0    [1, 965, 1, 921, 1, 973, 1, 965, 1, 964, 1, 92...
1    [966, 1, 964, 1, 921, 1, 982, 1, 0, 0, 0, 0, 0...
2    [921, 1, 921, 1, 921, 1, 964, 1, 965, 1, 968, ...
3    [965, 1, 965, 1, 965, 1, 921, 1, 921, 1, 0, 0,...
4    [1, 982, 1, 966, 1, 964, 1, 965, 1, 921, 1, 92...
Name: VenueId_encoded, dtype: object

##Model - LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [None]:
def get_max_sequence_length(df):
    """Return the length of the longest list stored in any sequence column of ``df``.

    A column counts as a sequence column when the value in its first row is a
    ``list``; all other columns are ignored.

    Args:
        df: DataFrame whose list-valued columns should be measured.

    Returns:
        The maximum ``len()`` over all lists in the sequence columns, or 0 when
        the frame has no rows or no sequence columns.
    """
    longest = 0
    for column in df.columns:
        values = df[column]
        # An empty column has no first row to probe; skip it instead of
        # letting iloc[0] raise IndexError.
        if values.empty or not isinstance(values.iloc[0], list):
            continue
        longest = max(longest, max(len(seq) for seq in values))
    return longest

max_sequence_length = get_max_sequence_length(train_df)
print(f"Maximum sequence length in the dataset: {max_sequence_length}")

max_sequence_length = get_max_sequence_length(test_df)
print(f"Maximum sequence length in the dataset: {max_sequence_length}")


Maximum sequence length in the dataset: 125
Maximum sequence length in the dataset: 125


In [None]:
# Feature columns fed to the model; their order fixes the order of the last array axis
sequence_columns = ['VenueType', 'GeoHash_binned', 'Month', 'hour', 'Season', 'Part of Day', 'dayofweek', 'is_weekday' ]

# Each column is stacked into one 2-D array, then the per-column arrays are
# joined along a new trailing axis.
train_feature_planes = [np.stack(X_train[name].values) for name in sequence_columns]
X_train_array = np.stack(train_feature_planes, axis=-1)

test_feature_planes = [np.stack(X_test[name].values) for name in sequence_columns]
X_test_array = np.stack(test_feature_planes, axis=-1)

# Verify shapes
print(f"X_train_array shape: {X_train_array.shape}")
print(f"X_test_array shape: {X_test_array.shape}")

# Targets: one row of venue labels per session
y_train_array = np.stack(y_train.values)
y_test_array = np.stack(y_test.values)

X_train_array shape: (30122, 125, 8)
X_test_array shape: (30122, 125, 8)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, TimeDistributed, Input

# Model dimensions are derived from the prepared data instead of being typed in by hand
max_sequence_length = X_train_array.shape[1]
num_features = X_train_array.shape[2]
# One output class per venue id known to the fitted LabelEncoder, so the softmax
# width always matches the label range of VenueId_encoded (no stale magic number).
num_classes = len(label_encoder.classes_)

# Define and compile the LSTM model: two stacked LSTM layers that return the full
# sequence, followed by a per-time-step softmax over the venue classes.
model = Sequential()
model.add(Input(shape=(max_sequence_length, num_features)))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.1))
model.add(TimeDistributed(Dense(num_classes, activation='softmax')))

# sparse_categorical_crossentropy takes the integer labels directly (no one-hot encoding)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 125, 128)          70144     
                                                                 
 dropout (Dropout)           (None, 125, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 125, 128)          131584    
                                                                 
 dropout_1 (Dropout)         (None, 125, 128)          0         
                                                                 
 time_distributed (TimeDist  (None, 125, 994)          128226    
 ributed)                                                        
                                                                 
Total params: 329954 (1.26 MB)
Trainable params: 329954 (1.26 MB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [None]:
# Train the model for 10 epochs with mini-batches of 64 sequences.
# The test arrays are passed as validation data, so the validation metrics
# reported per epoch are computed on the same data used for the final evaluation.
history = model.fit(X_train_array, y_train_array,
                    epochs=10,
                    batch_size=64,
                    validation_data=(X_test_array, y_test_array))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Train the model (second, identical call on the same `model` object).
# NOTE(review): Keras `fit` resumes from the model's current weights, so this
# cell adds 10 more epochs rather than retraining from scratch, and `history`
# is rebound to this run only. Confirm the extra epochs are intended; otherwise
# raise `epochs` in a single training cell.
history = model.fit(X_train_array, y_train_array,
                    epochs=10,
                    batch_size=64,
                    validation_data=(X_test_array, y_test_array))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Save the model to Google Drive (hard-coded Colab path; Drive must be mounted first).
# NOTE(review): the path has no file extension. Keras 3 requires a `.keras` or
# `.h5` suffix for `model.save`; verify against the installed Keras version.
model.save('/content/drive/MyDrive/Spatio_Temporal_Lab/model_saved')

In [None]:
# Load the model
loaded_model = tf.keras.models.load_model('/content/drive/MyDrive/Spatio_Temporal_Lab/model_saved')

# Verify the loaded model
loaded_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 125, 128)          70144     
                                                                 
 dropout (Dropout)           (None, 125, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 125, 128)          131584    
                                                                 
 dropout_1 (Dropout)         (None, 125, 128)          0         
                                                                 
 time_distributed (TimeDist  (None, 125, 994)          128226    
 ributed)                                                        
                                                                 
Total params: 329954 (1.26 MB)
Trainable params: 329954 (1.26 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [None]:
# Evaluate the loaded model on the test data
# NOTE(review): the test targets are zero-padded to a fixed length and the model
# has no masking, so this loss/accuracy is averaged over padded time steps too.
# It is therefore not comparable to metrics computed on un-padded labels only.
loss, accuracy = loaded_model.evaluate(X_test_array, y_test_array)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Test Loss: 0.2143285721540451, Test Accuracy: 0.9459002614021301


In [None]:
venue_lookup

{0: [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
 

In [None]:
import numpy as np

# y_test_array and X_test_array are numpy arrays built earlier in the notebook.
# Keep only the test sequences that contain more than 30 non-zero labels.
indices_to_predict = [i for i in range(len(y_test_array)) if np.count_nonzero(y_test_array[i]) > 30]

# Select the corresponding sequences from X_test_array
X_test_subset = X_test_array[indices_to_predict]

# Generate predictions with the loaded model
y_pred = loaded_model.predict(X_test_subset)

# Most probable class per time step
predicted_ids = np.argmax(y_pred, axis=-1)

# Print sample predictions and true data for the first 10 selected sequences
for i, idx in enumerate(indices_to_predict[:10]):
    print(f"Sample {i + 1}:")

    # Padding zeros are appended at the END of a sequence, so the real length is
    # the position of the last non-zero label + 1. Dropping every zero with a
    # boolean mask would also delete genuine class-0 labels inside the sequence
    # and shift the truth against the predictions. (Genuine zeros at the very
    # end still cannot be told apart from padding.)
    true_length = int(np.flatnonzero(y_test_array[idx])[-1]) + 1
    true_data = y_test_array[idx][:true_length]
    predicted_data = predicted_ids[i][:true_length]  # same positions as the true labels

    print("Predicted:", predicted_data)
    print("True Data:", true_data)
    print()  # Empty print to add spacing between samples


Sample 1:
Predicted: [972   0 964 976 964 654 966 654 972  73 964  97 869 986 897  10 964  10
 897 986 897  10 968  10 897 986 897  10 964  10 964  10]
True Data: [972  10 964 976 964  10 966  10 972  10 964  10 897 986 897  10 964  10
 897 986 897  10 968  10 897 986 897  10 964  10 964  10]

Sample 2:
Predicted: [471 972  73 984  51 964   6 964   6 964 976 964   6 968 664 966 664 964
 664 869 986 897  10 869 986 897  10 897 986 897  10]
True Data: [ 10 972  10 984  10 964  10 964  10 964 976 964  10 968  10 966  10 964
  10 897 986 897  10 897 986 897  10 897 986 897  10]

Sample 3:
Predicted: [984  45 964   6 869 986 850  10 984  10 964  10 964  10 966  10 984   6
 964   6 964   6 968 333 869 986 897  10 897 986 897  10]
True Data: [897  10 964  10 897 986 897  10 984  10 964  10 964  10 966  10 984  10
 964  10 964  10 968  10 897 986 897  10 897 986 897  10]

Sample 4:
Predicted: [667 922   1 922   1 965 993 965  18 965   1 965   1 965   1 921   1 965
 993 965   1 966   1 973   1 

In [None]:
print(len(indices_to_predict))

656


In [None]:
import numpy as np
import pandas as pd

# y_test_array and X_test_array are numpy arrays built earlier in the notebook.
# Keep only the test sequences that contain more than 30 non-zero labels.
indices_to_predict = [i for i in range(len(y_test_array)) if np.count_nonzero(y_test_array[i]) > 30]

# Select the corresponding sequences from X_test_array
X_test_subset = X_test_array[indices_to_predict]

# Generate predictions with the loaded model
y_pred = loaded_model.predict(X_test_subset)

# Most probable class per time step
predicted_ids = np.argmax(y_pred, axis=-1)

# Results for every selected sequence (used for the classification reports)
all_samples = []
all_predicted_venues = []
all_true_venues = []

# Results for the first 10 sequences only (display subset). These lists must be
# created here: appending to undefined names raises NameError on a fresh kernel.
samples = []
predicted_venues = []
true_venues = []

for i, idx in enumerate(indices_to_predict):
    # Padding zeros are appended at the END of a sequence, so the real length is
    # the position of the last non-zero label + 1. Masking out every zero would
    # also delete genuine class-0 labels inside the sequence and misalign the
    # truth with the predictions.
    true_length = int(np.flatnonzero(y_test_array[idx])[-1]) + 1
    true_data = y_test_array[idx][:true_length]
    predicted_data = predicted_ids[i][:true_length]

    # Store all processed sequences
    all_samples.append(f"Sample {i + 1}")
    all_predicted_venues.append(predicted_data.tolist())
    all_true_venues.append(true_data.tolist())

    # Store only the first 10 samples for display
    if i < 10:
        samples.append(f"Sample {i + 1}")
        predicted_venues.append(predicted_data.tolist())
        true_venues.append(true_data.tolist())

# Create a dataframe for all processed sequences
df_all = pd.DataFrame({
    "Sample": all_samples,
    "Predicted_Venues": all_predicted_venues,
    "True_Venues": all_true_venues
})



In [None]:
# Function to map venue ids to venue types
def map_to_venue_types(venue_ids):
    """Translate encoded venue ids into their venue types via ``venue_lookup``.

    Args:
        venue_ids: iterable of encoded venue ids.

    Returns:
        A list with one entry per input id: the venue type whose id list in the
        module-level ``venue_lookup`` contains the id, or ``None`` when the id
        appears under no venue type.
    """
    # Invert the lookup once per call instead of scanning every id list for every
    # venue id. setdefault keeps the first type that lists an id, which matches
    # the result of a first-match scan over venue_lookup.
    type_by_venue = {}
    for v_type, v_ids in venue_lookup.items():
        for v_id in v_ids:
            type_by_venue.setdefault(v_id, v_type)
    return [type_by_venue.get(venue_id) for venue_id in venue_ids]

# Map Predicted_Venues to Predicted_Venue_Types for display dataframe
df_all['Predicted_Venue_Types'] = df_all['Predicted_Venues'].apply(map_to_venue_types)

# Map True_Venues to True_Venue_Types for display dataframe
df_all['True_Venue_Types'] = df_all['True_Venues'].apply(map_to_venue_types)


In [None]:
# Print the final dataframe
print('Predicted_Venue_Types')
print(df_all['Predicted_Venue_Types'])
print('True_Venue_Types')
print(df_all['True_Venue_Types'])

Predicted_Venue_Types
0      [1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 3, 2, 3, ...
1      [0, 1, 0, 2, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, ...
2      [2, 0, 1, 0, 3, 2, 3, 0, 2, 0, 1, 0, 1, 0, 1, ...
3      [0, 3, 0, 3, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, ...
4      [2, 2, 1, 2, 0, 1, 0, 3, 2, 3, 0, 1, 2, 1, 0, ...
                             ...                        
651    [2, 0, 2, 0, 2, 0, 2, 0, 3, 2, 3, 0, 1, 0, 1, ...
652    [0, 2, 0, 2, 0, 3, 2, 3, 0, 1, 0, 1, 0, 1, 0, ...
653    [2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 3, 2, 3, ...
654    [0, 3, 2, 3, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, ...
655    [2, 0, 2, 0, 3, 2, 3, 0, 2, 0, 2, 0, 1, 0, 1, ...
Name: Predicted_Venue_Types, Length: 656, dtype: object
True_Venue_Types
0      [1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 3, 2, 3, ...
1      [0, 1, 0, 2, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, ...
2      [3, 0, 1, 0, 3, 2, 3, 0, 2, 0, 1, 0, 1, 0, 1, ...
3      [0, 3, 0, 3, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, ...
4      [2, 1, 0, 1, 0, 1, 0, 3, 2, 3, 0, 1, 2, 1, 

In [None]:
print('Predicted_Venues')
print(df_all['Predicted_Venues'])
print('True_Venues')
print(df_all['True_Venues'])

Predicted_Venues
0      [972, 0, 964, 976, 964, 654, 966, 654, 972, 73...
1      [471, 972, 73, 984, 51, 964, 6, 964, 6, 964, 9...
2      [984, 45, 964, 6, 869, 986, 850, 10, 984, 10, ...
3      [667, 922, 1, 922, 1, 965, 993, 965, 18, 965, ...
4      [989, 989, 969, 989, 312, 972, 641, 956, 986, ...
                             ...                        
651    [984, 40, 992, 529, 992, 5, 992, 5, 844, 979, ...
652    [0, 992, 0, 992, 0, 796, 979, 796, 539, 967, 5...
653    [984, 0, 967, 539, 967, 539, 967, 529, 967, 52...
654    [436, 922, 979, 796, 252, 967, 539, 967, 539, ...
655    [984, 40, 992, 529, 844, 979, 796, 539, 992, 5...
Name: Predicted_Venues, Length: 656, dtype: object
True_Venues
0      [972, 10, 964, 976, 964, 10, 966, 10, 972, 10,...
1      [10, 972, 10, 984, 10, 964, 10, 964, 10, 964, ...
2      [897, 10, 964, 10, 897, 986, 897, 10, 984, 10,...
3      [18, 834, 18, 834, 18, 965, 993, 965, 18, 965,...
4      [989, 968, 19, 968, 19, 972, 19, 956, 986, 956...
        

In [None]:
from sklearn.metrics import classification_report

# Concatenate the per-sample venue id lists into one flat list per side
true_venue_flat = []
for venue_ids in df_all['True_Venues']:
    true_venue_flat.extend(venue_ids)

predicted_venue_flat = []
for venue_ids in df_all['Predicted_Venues']:
    predicted_venue_flat.extend(venue_ids)

# Report as a dictionary; zero_division=0 silences the undefined-metric warnings
report_dict = classification_report(true_venue_flat, predicted_venue_flat, output_dict=True, zero_division=0)

# Overall metrics: accuracy, then macro and weighted precision / recall / F1
macro_avg = report_dict['macro avg']
weighted_avg = report_dict['weighted avg']
overall_metrics = (
    report_dict['accuracy'],
    macro_avg['precision'], macro_avg['recall'], macro_avg['f1-score'],
    weighted_avg['precision'], weighted_avg['recall'], weighted_avg['f1-score'],
)

# Print the overall metrics, one labelled line each
print("Classification Report for Predicted and True Venue Id's:")
metric_labels = (
    "Accuracy",
    "Macro Avg Precision", "Macro Avg Recall", "Macro Avg F1-score",
    "Weighted Avg Precision", "Weighted Avg Recall", "Weighted Avg F1-score",
)
for metric_label, metric_value in zip(metric_labels, overall_metrics):
    print(f"{metric_label}: {metric_value:.2f}")

Classification Report for Predicted and True Venue Id's:
Accuracy: 0.58
Macro Avg Precision: 0.34
Macro Avg Recall: 0.27
Macro Avg F1-score: 0.27
Weighted Avg Precision: 0.67
Weighted Avg Recall: 0.58
Weighted Avg F1-score: 0.58


In [None]:
# Concatenate the per-sample venue type lists into one flat list per side
true_venuetype_flat = []
for type_list in df_all['True_Venue_Types']:
    true_venuetype_flat.extend(type_list)

predicted_venuetype_flat = []
for type_list in df_all['Predicted_Venue_Types']:
    predicted_venuetype_flat.extend(type_list)

# Text report with per-class precision, recall and F1-score
print("Classification Report:")
venue_type_report = classification_report(true_venuetype_flat, predicted_venuetype_flat)
print(venue_type_report)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      8676
           1       0.97      0.99      0.98      5394
           2       0.97      1.00      0.99      4142
           3       1.00      0.98      0.99      3550

    accuracy                           0.99     21762
   macro avg       0.98      0.99      0.99     21762
weighted avg       0.99      0.99      0.99     21762

