In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import requests
import os
from PIL import Image
import os
import random
import numpy as np
import cv2
from matplotlib import pyplot as plt
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pickle
import math

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def pre_process_sentence(sentence):
    sentence = sentence.lower()
    tokens = word_tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    filtered_tokens = [word for word in filtered_tokens if word.isalnum()]
    stemming = PorterStemmer()
    filtered_tokens = [stemming.stem(word) for word in filtered_tokens]
    lemmatizer = WordNetLemmatizer()
    filtered_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    filtered_tokens = list(filter(lambda token: token.strip() != '', filtered_tokens))
    return filtered_tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/utkarshpal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/utkarshpal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/utkarshpal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Q-2a) Reading the data and pre-processing

In [2]:
# Read the CSV file into a DataFrame
dataset_df = pd.read_csv('A2_Data.csv', names=["ID" , "Image" , "Review Text"])
data = {}

# Iterate over the rows of the DataFrame
for index, row in dataset_df[1:].iterrows():
    images = row["Image"]
    entry = {"Text":[] , "Images":[] , "Original": ""} 
    urls_list = images.strip("[]").split(", ")
    urls_list = [url.strip("'") for url in urls_list]
    entry["Images"] = urls_list
    if pd.isnull(row["Review Text"]):
        review = ["<no-review>"] # This is a placeholder for the missing review
        entry["Original"] = ""
    else:
        entry["Original"] = row["Review Text"]
        review = pre_process_sentence(row["Review Text"])
    entry["Text"] = review
    data[row["ID"]] = entry

In [3]:
features = {} # store features corresponding to a file_name

In [4]:
#strip a link 
def strip_link(link):
    return link.split("/")[-1]

In [5]:
# Create a directory to store the images
if not os.path.exists('images'):
    os.makedirs('images')

ct =0
for product_id in data.keys():
    temp_del = []
    for link in data[product_id]["Images"]:
        response = requests.get(link)
        if response.status_code == 200:
            file_name = "images/"+ strip_link(link)
            with open(file_name, 'wb') as image_file:
                image_file.write(response.content)
            try:
                img = Image.open(file_name)
                ct += 1
            except:
                temp_del.append(link)
                os.remove(file_name)

        else:
            temp_del.append(link)
    for link in temp_del:
        data[product_id]["Images"].remove(link)

# removes entries with no images
for product_id in list(data.keys()):
    if len(data[product_id]["Images"]) == 0:
        del data[product_id]


Q-1 a) Pre-Process Image

In [21]:
def rotate_image(image, angle):
    return image.rotate(angle)

def flip_image(image, flip):
    return image.transpose(flip)

def alter_brightness(image, factor):
    np_image = np.array(image)
    new_image = np_image + factor
    new_image = np.clip(new_image, 0, 255)
    new_image = Image.fromarray(np.uint8(new_image))
    return new_image

def alter_image(image):
    angle = random.uniform(-45, 45)
    flip = random.choice([Image.FLIP_LEFT_RIGHT, Image.FLIP_TOP_BOTTOM])
    brightness_factor = random.uniform(-50, 50)
    new_image = rotate_image(image, angle)
    new_image = flip_image(new_image, flip)
    new_image = alter_brightness(new_image, brightness_factor)
    return new_image

# Create a directory to store the altered images
os.makedirs('altered_images', exist_ok=True)
# Alter all the images and save them to the new directory
for i in data.keys():
    for link in data[i]["Images"]:        
        try:   
            image = Image.open('images/' + strip_link(link))
            new_image = alter_image(image)
            new_image.save('altered_images/' + strip_link(link))
        except:
            print("Error in image: ", i)
            continue


Q-1 b/c) Use PreTrained Resnet50 to extract features and normalize them

In [22]:
def normalize_tensor(tensor):
    norm = math.sqrt(sum([x**2 for x in tensor]))
    return tensor / norm

In [55]:
import torch
from torchvision import transforms
from torchvision.models import resnet50
from PIL import Image

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = resnet50(pretrained=True)
model.to(device)
transform = transforms.Compose([
    transforms.Resize(256),  # Resize the image to 256x256
    transforms.ToTensor()
])

def extract_features(image_path):
  img = Image.open(image_path)
  img = transform(img)
  img = img.unsqueeze(0)  
  model.eval()
  with torch.no_grad():
    features = model(img.to(device))
  features = features.squeeze(0).flatten()
  features = features/features.norm()
  # features = features / normalize_tensor(features) # Normalize the features
  return features




Pickle the features corresponding to every image

In [56]:
for i in data.keys():
    for link in data[i]["Images"]:
        try:
            features[strip_link(link)] = extract_features('altered_images/' + strip_link(link))
        except:
            print("Error in image: ", i)
            continue
with open('features.pkl', 'wb') as f:
    pickle.dump(features, f)


In [52]:
# read the features
with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

Q-2 b) Caluclation of tf-idf matrix

In [28]:
vocabulary = set()
for product_id in data.keys():
    words = data[product_id]["Text"]
    vocabulary.update(words)
vocabulary = list(vocabulary)
vocabulary.sort()

df = {word: 0 for word in vocabulary}
for product_id in data.keys():
    words = data[product_id]["Text"]
    for word in set(words):
        df[word] += 1

In [29]:
def calculate_tfidf(data):
    tfidf = []
    for product_id in data.keys():
        words = data[product_id]["Text"]
        tfidf_vector = []
        for word in vocabulary:
            tf = words.count(word) / (1+len(words)) # Add 1 to the denominator to prevent division by zero
            idf = math.log(len(data) / df[word])
            tfidf_value = tf * idf
            tfidf_vector.append(tfidf_value)
        data[product_id]["tfidf"] = tfidf_vector
        tfidf.append(tfidf_vector)
    
    return tfidf

tfidf = calculate_tfidf(data)
print(f"TF-IDF matrix: {len(tfidf)}x{len(tfidf[0])}")


TF-IDF matrix: 994x4431


In [30]:
# pickle tfidf
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [31]:
# Load the TF-IDF matrix from the file
with open('tfidf.pkl', 'rb') as f:
    tfidf = pickle.load(f)

In [32]:
def cosine_similarity(v1, v2):
    dot_product = sum(x * y for x, y in zip(v1, v2))
    norm1 = sum(x ** 2 for x in v1) ** 0.5
    norm2 = sum(x ** 2 for x in v2) ** 0.5
    cosine_similarity = dot_product / (norm1 * norm2)
    return cosine_similarity

In [33]:
def get_text_tfidf_vector(text):
    words = pre_process_sentence(text)
    tfidf_vector = []
    for word in vocabulary:
        tf = words.count(word) / (1+len(words)) # Add 1 to the denominator to prevent division by zero
        idf = math.log(len(data) / df[word])
        tfidf_value = tf * idf
        tfidf_vector.append(tfidf_value)
    return tfidf_vector

Q-3) Text Based Retrieval

In [61]:
def search_text_query(query, data, tfidf, k=3):
    similarities = [ (product , cosine_similarity(get_text_tfidf_vector(query), data[product]["tfidf"])) for product in data.keys()]
    similarities.sort(key=lambda x: x[1], reverse=True)
    results = []
    for i in range(k):
        product_id = similarities[i][0]
        product = data[product_id]
        results.append({"Index": product_id, "Text": product["Original"], "Images": product["Images"], "Similarity": similarities[i][1]})
    return results

def text_based_retreival(query):
    results = search_text_query(query[0], data, tfidf)
    for result in results:
        print("----------------------------------------------------------------")
        print(f"Images URL: {result['Images']}")
        query_features = features[strip_link(query[1])]
        similarity = sum([cosine_similarity(query_features, features[strip_link(image_link)] ) for image_link in result['Images']])/len(result['Images'])
        print(f"Product ID: {(result['Index'])}")
        print(f"Review: {result['Text']}")
        print(f"Cosine similarity of images: {similarity:.2f}")
        print(f"Cosine similarity of text: {result['Similarity']:.2f}")
        print(f"Composite similarity score:: {similarity*0.5 + result['Similarity']*0.5:.2f} ")    
        print()
        print()

In [62]:
query =  ["I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring ifthere is a break." 
          ,"https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg"]
text_based_retreival(query)

----------------------------------------------------------------
Images URL: ['https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg']
Product ID: 654.0
Review: I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
Cosine similarity of images: 1.00
Cosine similarity of text: 1.00
Composite similarity score:: 1.00 


----------------------------------------------------------------
Images URL: ['https://images-na.ssl-images-amazon.com/images/I/61DvLcapd8L._SY88.jpg']
Product ID: 644.0
Review: I went from fender chrome non-locking to fender gold locking. It made my guitar look beautiful and play beautiful. I think locking tuners are the way to go. If you are new to locking tuners look on YouTube for instructions.
Cosine similarity of images: 0.76
Cosine similarity of text: 0.27
Composite similarity score:: 0.52 


-------------------------------------

Q-4) Image Based Query

In [63]:
def search_image_query(query, k=5):
    query_features = features[strip_link(query[1])]
    similarities = []
    for product_id in data.keys():
        similarities_local = [ (cosine_similarity(query_features, features[strip_link(image_link)]) , image_link) for image_link in data[product_id]["Images"]]
        similarities_local.sort(key=lambda x: x[0], reverse=True)
        similarities.append(( similarities_local[0][0] , similarities_local[0][1] , product_id))
    similarities.sort(key=lambda x: x[0], reverse=True)
    
    results = []
    for i in range(k):
        print("----------------------------------------------------------------")
        print(f"Images URL: {similarities[i][1]}")
        print(f"Product ID: {(similarities[i][2])}")
        print(f"Review: {data[similarities[i][2]]['Original']}")
        print(f"Cosine similarity of images: {similarities[i][0]:.2f}")
        text_similarity = (cosine_similarity(get_text_tfidf_vector(query[0]) , data[similarities[i][2]]['tfidf']) )
        print(f"Cosine similarity of text: { text_similarity}")
        # print(f"Cosine similarity of text: { text_similarity:.2f}")
        print(f"Composite similarity score:: {text_similarity*0.5 + similarities[i][0]*0.5:.2f} ")    
        print()
        print()
    return results

In [None]:
query =  ["I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring ifthere is a break." 
          ,"https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg"]
search_image_query(query)

Q-4 Combined Retrival

In [66]:
def search_combined_query(query , k =3):
    query_features = features[strip_link(query[1])]
    similarities = []
    for product_id in data.keys():
        similarities_local = [ (cosine_similarity(query_features, features[strip_link(image_link)]) , image_link) for image_link in data[product_id]["Images"]]
        similarities_local.sort(key=lambda x: x[0], reverse=True)
        similarities.append(( similarities_local[0][0] + cosine_similarity(get_text_tfidf_vector(query[0]), data[product_id]["tfidf"]) , similarities_local[0][1] , product_id))
    
    # Get the  top k most similar documents
    similarities.sort(key=lambda x: x[0], reverse=True)
    results = []
    for i in range(k):
        # product = data[product_id]
        # results.append({"Index": product_id, "Text": ' '.join(product["Text"]), "Image": similarities[i][1], "Similarity": similarities[i][0]})
        print("----------------------------------------------------------------")
        print(f"Images URL: {similarities[i][1]}")
        print(f"Product ID: {(similarities[i][2])}")
        print(f"Review: {data[similarities[i][2]]['Original']}")
        print(f"Cosine similarity of images:{cosine_similarity(query_features,features[strip_link(similarities[i][1])]):.2f}")

        text_similarity = (cosine_similarity(get_text_tfidf_vector(query[0]),data[similarities[i][2]]['tfidf']) )
        print(f"Cosine similarity of text: { text_similarity}")
        print(f"Composite similarity score:: {text_similarity*0.5 + similarities[i][0]*0.5:.2f} ")    
        print()
        print()
    return results


In [67]:
query =  ["I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring ifthere is a break." ,"https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg"]
search_combined_query(query)

----------------------------------------------------------------
Images URL: https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
Product ID: 654.0
Review: I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
Cosine similarity of images:1.00
Cosine similarity of text: 1.0000000000000002
Composite similarity score:: 1.50 


----------------------------------------------------------------
Images URL: https://images-na.ssl-images-amazon.com/images/I/61g0lol4mUL._SY88.jpg
Product ID: 3772.0
Review: Nice tuners.  Installed on a strat neck and they are working great. Nice and smooth and has stayed in tune very well. Nothing wrong with these.
Cosine similarity of images:0.90
Cosine similarity of text: 0.16327964800350983
Composite similarity score:: 0.61 


----------------------------------------------------------------
Images URL: https://images-na.ss

[]