In [2]:
import numpy as np
import pandas as pd
import ast
import seaborn as sns

In [3]:
# Load the image metadata table (path is relative to the notebook's working directory)
travel = pd.read_csv(filepath_or_buffer="image_dataset.csv")

In [4]:
# Preview the first five rows
travel.head(n=5)

Unnamed: 0,image_id,image_title,hashtag,location,image_url
0,1,Pulau Aur,beach vacations,Johor,images/aur.jpg
1,2,Kung Pao,foodie adventure,Johor,images/johorfood.jpg
2,3,Johor Bahru City,city exploration,Johor,images/johorcity.jpg
3,4,Pulau Sibu,beach vacations,Johor,images/sibu.jpg
4,5,Wat Koh Wanararm,cultural tours,Kedah,images/watkohwanararm.jpg


In [5]:
# Only a cell's final expression is rendered, so the column names are printed
# explicitly instead of being left as a bare expression whose value is discarded.
print(travel.columns.tolist())
travel.shape

(27, 5)

In [6]:
# Number of missing values in each column
travel.isna().sum()

image_id       0
image_title    0
hashtag        0
location       0
image_url      0
dtype: int64

In [7]:
# Peek at the first few image titles
travel.loc[:, 'image_title'].head()

0           Pulau Aur
1            Kung Pao
2    Johor Bahru City
3          Pulau Sibu
4    Wat Koh Wanararm
Name: image_title, dtype: object

In [8]:
# Summarize each column's dtype and non-null count, plus memory usage
travel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   image_id     27 non-null     int64 
 1   image_title  27 non-null     object
 2   hashtag      27 non-null     object
 3   location     27 non-null     object
 4   image_url    27 non-null     object
dtypes: int64(1), object(4)
memory usage: 1.2+ KB


In [9]:
# Count rows that exactly repeat an earlier row (all columns compared)
travel.duplicated().sum()

0

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing hashtags become empty strings so the vectorizer only ever sees text
travel['hashtag'] = travel['hashtag'].fillna('')

# Vectorizer that drops common English stop words (e.g. 'the', 'a')
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the hashtag column and encode every row with it
tfidf_matrix = tfidf.fit_transform(travel['hashtag'])

# (number of rows, number of vocabulary terms)
tfidf_matrix.shape


(27, 9)

In [11]:
from sklearn.metrics.pairwise import linear_kernel

# TfidfVectorizer L2-normalizes its rows by default, so a plain dot product
# between rows is their cosine similarity. Omitting Y compares X with itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [12]:
# Square matrix: one row and one column per row of tfidf_matrix
cosine_sim.shape

(27, 27)

In [13]:
# Print the top-left 6x6 corner of the similarity matrix, one row per line
for row_idx in range(6):
    print(cosine_sim[row_idx, :6])

[1. 0. 0. 1. 0. 1.]
[0. 1. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0.]
[1. 0. 0. 1. 0. 1.]
[0. 0. 0. 0. 1. 0.]
[1. 0. 0. 1. 0. 1.]


In [14]:
# Remove commas from hashtags. regex=False treats the comma as an explicit
# literal, so the result does not depend on the pandas version's default for
# the `regex` argument.
# NOTE(review): commas are deleted, not replaced by a space, so a value such as
# "a,b" would become the single token "ab" — confirm that is intended.
travel['hashtag'] = travel['hashtag'].str.replace(',', '', regex=False)

# Show every cleaned hashtag. Series.items() yields (index, value) pairs
# directly, avoiding the per-row Series that iterrows() constructs.
for index, clean_hashtag in travel['hashtag'].items():
    print(f"Index: {index}, Clean hashtag: {clean_hashtag}")

# Alternative: all cleaned hashtags as a plain list (optional)
all_clean_hashtags = travel['hashtag'].tolist()

Index: 0, Clean hashtag: beach vacations
Index: 1, Clean hashtag: foodie adventure
Index: 2, Clean hashtag: city exploration
Index: 3, Clean hashtag: beach vacations
Index: 4, Clean hashtag: cultural tours
Index: 5, Clean hashtag: beach vacations
Index: 6, Clean hashtag: beach vacations
Index: 7, Clean hashtag: foodie adventure
Index: 8, Clean hashtag: cultural tours
Index: 9, Clean hashtag: foodie adventure
Index: 10, Clean hashtag: hiking
Index: 11, Clean hashtag: foodie adventure
Index: 12, Clean hashtag: city exploration
Index: 13, Clean hashtag: city exploration
Index: 14, Clean hashtag: foodie adventure
Index: 15, Clean hashtag: cultural tours
Index: 16, Clean hashtag: hiking
Index: 17, Clean hashtag: cultural tours
Index: 18, Clean hashtag: city exploration
Index: 19, Clean hashtag: foodie adventure
Index: 20, Clean hashtag: beach vacations
Index: 21, Clean hashtag: hiking
Index: 22, Clean hashtag: hiking
Index: 23, Clean hashtag: hiking
Index: 24, Clean hashtag: cultural tours


In [15]:
def get_recommendations(location, hashtags_str, data=None, top_n=10):
    """Recommend images for a location, ranked by hashtag overlap.

    Rows whose ``location`` equals ``location`` exactly are scored by the
    number of distinct whitespace-separated tokens their ``hashtag`` text
    shares with ``hashtags_str`` (compared case-insensitively). Rows that
    share no token are still returned, ranked after the matching ones, so the
    result is a relevance-ordered listing for the location.

    Args:
        location: Location to filter on (exact, case-sensitive match).
            A falsy value yields no recommendations.
        hashtags_str: Whitespace-separated hashtag tokens. ``None``, an empty
            string or a whitespace-only string yields no recommendations.
        data: DataFrame with ``location``, ``hashtag`` and ``image_url``
            columns. Defaults to the notebook-level ``travel`` frame.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` dicts with the keys ``location``,
        ``hashtag`` and ``image_url``; an empty list when nothing qualifies.
    """
    # str.split() with no arguments discards all surrounding whitespace, so a
    # blank or whitespace-only query produces no tokens at all.
    query_tags = {tag.lower() for tag in (hashtags_str or '').split()}
    if not query_tags or not location:
        return []

    # Fall back to the notebook-level frame only when no frame is supplied
    source = travel if data is None else data
    matches = source[source['location'] == location]
    if matches.empty:
        return []

    # Score = number of distinct tokens shared with the query
    scores = matches['hashtag'].fillna('').map(
        lambda text: len(set(text.lower().split()) & query_tags)
    )

    # assign() returns a new frame, leaving the source data untouched.
    # A stable sort keeps equally scored rows in their original order.
    ranked = matches.assign(hashtag_sim_score=scores).sort_values(
        by='hashtag_sim_score', ascending=False, kind='mergesort'
    )
    return ranked[['location', 'hashtag', 'image_url']].head(top_n).to_dict('records')

In [16]:
# Example query: images in Kedah ranked against the 'beach' hashtag
recommendations = get_recommendations(location='Kedah', hashtags_str='beach')
print(recommendations)

[{'location': 'Kedah', 'hashtag': 'beach vacations', 'image_url': 'images/langkawi.jpg'}, {'location': 'Kedah', 'hashtag': 'cultural tours', 'image_url': 'images/watkohwanararm.jpg'}]


In [17]:
import pickle

In [None]:
# Serialize the cleaned DataFrame. The context manager guarantees the file is
# flushed and closed once the dump finishes, even if pickling raises.
with open('travel.lis.pkl', 'wb') as pkl_file:
    pickle.dump(travel, pkl_file)