In [1]:
import numpy as np
import pandas as pd
import ast
import seaborn as sns

In [2]:
travel = pd.read_csv("image_dataset.csv") 

In [3]:
travel.head()

Unnamed: 0,image_id,image_title,hashtag,location,image_url
0,1,Pulau Aur,beach vacations,Johor,images/aur.jpg
1,2,Kung Pao,foodie adventure,Johor,images/johorfood.jpg
2,3,Johor Bahru City,city exploration,Johor,images/johorcity.jpg
3,4,Pulau Sibu,beach vacations,Johor,images/sibu.jpg
4,5,Wat Koh Wanararm,cultural tours,Kedah,images/watkohwanararm.jpg


In [4]:
travel.columns
travel.shape

(49, 5)

In [5]:
travel.isnull().sum()

image_id       0
image_title    0
hashtag        0
location       0
image_url      0
dtype: int64

In [6]:
travel['image_title'].head()

0           Pulau Aur
1            Kung Pao
2    Johor Bahru City
3          Pulau Sibu
4    Wat Koh Wanararm
Name: image_title, dtype: object

In [7]:
travel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   image_id     49 non-null     int64 
 1   image_title  49 non-null     object
 2   hashtag      49 non-null     object
 3   location     49 non-null     object
 4   image_url    49 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.0+ KB


In [8]:
travel.duplicated().sum()

0

In [9]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
travel['hashtag'] = travel['hashtag'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(travel['hashtag'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape


(49, 11)

In [10]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
cosine_sim.shape

(49, 49)

In [12]:
for i in range(6):
    print(cosine_sim[i][:6])

[1. 0. 0. 1. 0. 1.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0.]
[1. 0. 0. 1. 0. 1.]
[0. 0. 0. 0. 1. 0.]
[1. 0. 0. 1. 0. 1.]


In [13]:
# Remove commas from hashtags
travel['hashtag'] = travel['hashtag'].str.replace(',', '')

# Example: Access hashtags without commas for all entries (using a loop)
for index, row in travel.iterrows():
    clean_hashtag = row['hashtag']
    print(f"Index: {index}, Clean hashtag: {clean_hashtag}")

# Alternative: Access all hashtags as a list (optional)
all_clean_hashtags = travel['hashtag'].tolist()

Index: 0, Clean hashtag: beach vacations
Index: 1, Clean hashtag: foodie adventure
Index: 2, Clean hashtag: city exploration
Index: 3, Clean hashtag: beach vacations
Index: 4, Clean hashtag: cultural tours
Index: 5, Clean hashtag: beach vacations
Index: 6, Clean hashtag: foodie adventure
Index: 7, Clean hashtag: cultural tours
Index: 8, Clean hashtag: foodie adventure
Index: 9, Clean hashtag: hiking
Index: 10, Clean hashtag: foodie adventure
Index: 11, Clean hashtag: city exploration
Index: 12, Clean hashtag: city exploration
Index: 13, Clean hashtag: foodie adventure
Index: 14, Clean hashtag: cultural tours
Index: 15, Clean hashtag: hiking
Index: 16, Clean hashtag: cultural tours
Index: 17, Clean hashtag: city exploration
Index: 18, Clean hashtag: foodie adventure
Index: 19, Clean hashtag: beach vacations
Index: 20, Clean hashtag: hiking
Index: 21, Clean hashtag: hiking
Index: 22, Clean hashtag: hiking
Index: 23, Clean hashtag: cultural tours
Index: 24, Clean hashtag: cultural tours
I

# hashtag similarity

In [14]:
def get_recommendations(location, hashtags_str):

  # Handle empty hashtags string
  if not hashtags_str:
      return []

  # Split the provided hashtags string into a list
  hashtags = hashtags_str.strip().split()

  # Filter the dataframe based on location (if provided)
  if location:
    filtered_df = travel[travel['location'] == location].copy()  # Make a copy of the filtered DataFrame
    if not filtered_df.empty:
      # Calculate hashtag similarity scores for filtered entries
      filtered_df['hashtag_sim_score'] = filtered_df['hashtag'].apply(
          lambda x: len(set(x.split()) & set(hashtags))
      )
      # Sort entries based on hashtag similarity score
      sorted_df = filtered_df.sort_values(by='hashtag_sim_score', ascending=False)
      # Get top 10 recommendations
      recommendations = sorted_df[['location', 'hashtag', 'image_url']].head(10).to_dict('records')
      return recommendations
  return []  # Return empty list if no recommendations found

In [15]:
recommendations = get_recommendations('Kedah', 'beach')
print(recommendations)

[{'location': 'Kedah', 'hashtag': 'cultural tours', 'image_url': 'images/watkohwanararm.jpg'}, {'location': 'Kedah', 'hashtag': 'cultural tours', 'image_url': 'images/langkawi.jpg'}]


# KNN K Nearest Neighbour

In [16]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder

csv_file_path = "image_dataset.csv"
df = pd.read_csv(csv_file_path)
encoder = OneHotEncoder(handle_unknown='ignore')

encoded_features = encoder.fit_transform(df[['location', 'hashtag']])
knn = NearestNeighbors(n_neighbors=10, algorithm='auto').fit(encoded_features)


user_location = input("Enter your preferred location: ")
user_hashtag = input("Enter your preferred hashtag: ")
user_input_df = pd.DataFrame({
    'location': [user_location],
    'hashtag': [user_hashtag]
})


encoded_user_input = encoder.transform(user_input_df)

distances, indices = knn.kneighbors(encoded_user_input)

recoms = df.iloc[indices[0]]

print(recoms[['image_title', 'location', 'hashtag', 'image_url']])

Enter your preferred location: perak
Enter your preferred hashtag: hiking
                   image_title         location hashtag  \
20               Kota Kinabalu            Sabah  hiking   
21             Mount Santubong          Sarawak  hiking   
31              Puteri Harbour            Johor  hiking   
15                 Wang Gunung           Perlis  hiking   
44  Taman Negara National Park         Selangor  hiking   
22               Gunung Serapi          Sarawak  hiking   
43                Taman Negara           Pahang  hiking   
45            Cameron Highland           Pahang  hiking   
29           Desaru fruit farm            Johor  hiking   
9             Bukit Batu Putih  Negeri Sembilan  hiking   

                     image_url  
20         images/kinabalu.jpg  
21        images/santubong.jpg  
31    images/puteriharbour.jpg  
15       images/wanggunung.jpg  
44  images/tamannegarapark.jpg  
22           images/serapi.jpg  
43      images/tamannegara.jpg  
45          

In [19]:
import pickle

In [20]:
pickle.dump(travel,open('travel.lis.pkl','wb'))