In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
dataset_path = '/content/drive/My Drive/IR/A2_Data.csv'
df = pd.read_csv(dataset_path)

# 1. Image Feature Extraction

In [None]:
import torch
import os
from torchvision import models, transforms
import requests
from PIL import Image, UnidentifiedImageError
import requests
from io import BytesIO
import pandas as pd
import pickle
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import string
from collections import defaultdict, Counter
from math import log

nltk.download('stopwords')
nltk.download('wordnet')

output_dir = '/content/drive/My Drive/IR'

# Define image preprocessing
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load a pre-trained ResNet model
resnet = models.resnet50(pretrained=True)
resnet.eval()  # Set the model to evaluation mode

# Function to extract features from an image
def extract_image_features(url):
    try:
        # Check if the URL is in a list format as a string, and convert if necessary
        import ast
        if url.startswith("[") and url.endswith("]"):
            url = ast.literal_eval(url)[0]  # Safely evaluates the string as a list and gets the first element

        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        img_t = image_transforms(img)
        img_t = img_t.unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            features = resnet(img_t)
        return features.cpu().numpy().flatten()
    except requests.exceptions.RequestException as e:
        print(f"RequestException for URL {url}: {e}")
    except UnidentifiedImageError:  # Use the exception directly without the 'PIL.' prefix
        print(f"UnidentifiedImageError: cannot identify image file from URL {url}.")
    except Exception as e:
        print(f"Unexpected error for URL {url}: {e}")
    return None

# Extract features
image_features = []


for index, row in df.iterrows():
    image_feature = extract_image_features(row['Image'])
    if image_feature is not None:
        image_features.append(image_feature)

# Save the results
with open(os.path.join(output_dir, 'image_features.pkl'), 'wb') as f:
    pickle.dump(image_features, f)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 64.1MB/s]


UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/710a2Pyh5lL._SY88.jpg.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/816NMd0LexL._SY88.jpg.


# 2. Text Feature Extraction

In [None]:
import pandas as pd
import numpy as np
import math
import pickle
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Function for text preprocessing
def preprocess_text(text):
    # Lowercase conversion
    text = text.lower()
    # Remove URLs, hashtags, and mentions
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove punctuation and non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    # Stopwords removal, stemming, and lemmatization
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens if not word in stop_words]
    return tokens

# Initialize NLP tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Assuming df is your DataFrame
text_data = df['Review Text'].fillna('').tolist()

# Preprocess text data
tokenized_texts = [preprocess_text(text) for text in text_data]

# Manual TF-IDF Calculation
def compute_tf_idf(tokenized_docs):
    # Calculate TF (term frequency)
    tf = [{word: doc.count(word) / len(doc) for word in doc} for doc in tokenized_docs]

    # Calculate document frequency (DF)
    df = {}
    for doc in tokenized_docs:
        for word in set(doc):
            df[word] = df.get(word, 0) + 1

    # Calculate IDF (inverse document frequency)
    idf = {word: math.log(len(tokenized_docs) / freq) for word, freq in df.items()}

    # Calculate TF-IDF
    tf_idf = [{word: freq * idf[word] for word, freq in doc.items()} for doc in tf]
    return tf_idf

tf_idf_scores = compute_tf_idf(tokenized_texts)

# Save the tokenized texts and TF-IDF scores using pickle
output_dir = '/content/drive/My Drive/IR'  # Ensure this directory exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

tokenized_texts_path = os.path.join(output_dir, 'tokenized_texts.pkl')
tf_idf_scores_path = os.path.join(output_dir, 'tf_idf_scores_manual_text.pkl')

with open(tokenized_texts_path, 'wb') as f:
    pickle.dump(tokenized_texts, f)

with open(tf_idf_scores_path, 'wb') as f:
    pickle.dump(tf_idf_scores, f)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 3. Image Retrieval and Text Retrieval

In [2]:
import numpy as np
import pandas as pd
import pickle
import requests
from PIL import Image, ImageEnhance
from io import BytesIO
from scipy.spatial.distance import cosine
import os
import torch
from torchvision import models, transforms
import requests
from PIL import Image, UnidentifiedImageError
import requests
from io import BytesIO
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import string
from collections import defaultdict, Counter
from math import log
import math
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re

# Define the path to your dataset and precomputed features
dataset_path = '/content/drive/My Drive/IR/A2_Data.csv'
output_dir = '/content/drive/My Drive/IR'
image_features_path = os.path.join(output_dir, 'image_features.pkl')
tf_idf_scores_path = os.path.join(output_dir, 'tf_idf_scores_manual_text.pkl')

# Load the dataset
df = pd.read_csv(dataset_path)

# Load precomputed image features and TF-IDF scores
with open(image_features_path, 'rb') as f:
    image_features = pickle.load(f)
with open(tf_idf_scores_path, 'rb') as f:
    tf_idf_scores = pickle.load(f)

resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set the model to evaluation mode

# Define image transformations
transform_pipeline = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


def cosine_similarity(v1, v2):

  if isinstance(v1, np.ndarray) and all(isinstance(v, np.ndarray) for v in v2):
          # Convert list of numpy arrays (v2) to a single 2D numpy array
          v2 = np.array(v2)
          # Normalize v1 and v2
          v1_norm = v1 / np.linalg.norm(v1)
          v2_norm = v2 / np.linalg.norm(v2, axis=1)[:, np.newaxis]
          # Calculate cosine similarity
          similarities = np.dot(v1_norm, v2_norm.T)

      # Case for sparse vectors (TF-IDF scores)
  elif isinstance(v1, dict) and all(isinstance(v, dict) for v in v2):
          similarities = []
          for tfidf_dict in v2:
              # Intersection of keys (terms present in both vectors)
              common_terms = set(v1.keys()) & set(tfidf_dict.keys())
              # Manual dot product for common terms
              dot_product = sum(v1[term] * tfidf_dict[term] for term in common_terms)
              # Norms of the vectors
              norm_v1 = np.sqrt(sum(value ** 2 for value in v1.values()))
              norm_v2 = np.sqrt(sum(value ** 2 for value in tfidf_dict.values()))
              # Cosine similarity
              if norm_v1 == 0 or norm_v2 == 0:
                  similarity = 0
              else:
                  similarity = dot_product / (norm_v1 * norm_v2)
              similarities.append(similarity)
          similarities = np.array(similarities)

  else:
          raise ValueError("Unsupported input types.")

  return similarities



def find_most_similar_images(processed_image, precomputed_features, top_n=3):
    # Assuming direct comparison of processed_image array to precomputed feature vectors
    similarities = cosine_similarity(processed_image, precomputed_features).flatten()
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    return top_indices, [similarities[i] for i in top_indices]

def find_most_similar_reviews(input_tfidf, precomputed_tfidf_scores, top_n=3):
    # Calculate cosine similarity between the input TF-IDF vector and each precomputed TF-IDF vector
    similarities = cosine_similarity(input_tfidf, precomputed_tfidf_scores).flatten()
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    top_similarities = [similarities[i] for i in top_indices]
    return top_indices, top_similarities

def preprocess_text(text):
    """Basic text preprocessing"""
    text = text.lower()  # Lowercase text
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = text.split()  # Tokenize by splitting on whitespace
    # Optionally remove stopwords here
    return tokens
def compute_tf(tokenized_review):
    tf = {}
    for word in tokenized_review:
        tf[word] = tf.get(word, 0) + 1

    # Normalize term frequencies by the total number of words in the document
    total_words = len(tokenized_review)
    tf = {word: count / total_words for word, count in tf.items()}

    return tf

def preprocess_image(image_url):
    """Fetch and preprocess an image from a URL, then extract features."""
    try:
        response = requests.get(image_url)
        image = Image.open(BytesIO(response.content)).convert('RGB')
        # Apply preprocessing transformations
        processed_image = transform_pipeline(image).unsqueeze(0)  # Add batch dimension

        # Extract features with the model
        with torch.no_grad():
            features = resnet_model(processed_image)

        # Convert features to a numpy array
        features_np = features.numpy().flatten()
        return features_np
    except Exception as e:
        print(f"Error processing image from URL {image_url}: {e}")
        return None
## INPUT ##
input_image_url = input("Enter the image URL: ")
input_review_text = input("Enter the review text: ")


# Preprocess review text
processed_tokens = preprocess_text(input_review_text)
input_review_tfidf = compute_tf(processed_tokens)

# Preprocess the image from URL
processed_image = preprocess_image(input_image_url)

if processed_image is not None:

    # Find the most similar images and reviews
    similar_image_indices, image_similarities = find_most_similar_images(processed_image, image_features)
    similar_review_indices, review_similarities = find_most_similar_reviews(input_review_tfidf, tf_idf_scores)

    print("Similar Image Indices:", similar_image_indices)
    print("Image Similarities:", image_similarities)
    print("Similar Review Indices:", similar_review_indices)
    print("Review Similarities:", review_similarities)
else:
    print("The specified image URL and review were not found in the dataset, or image processing failed.")

# Save the retrieval results
retrieval_results = {
    'similar_image_indices': similar_image_indices,
    'image_similarities': image_similarities,
    'similar_review_indices': similar_review_indices,
    'review_similarities': review_similarities,
}
results_path = os.path.join(output_dir, 'retrieval_results.pkl')
with open(results_path, 'wb') as f:
    pickle.dump(retrieval_results, f)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 124MB/s]


Enter the image URL: https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg
Enter the review text: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Similar Image Indices: [  0  62 193]
Image Similarities: [0.8524184, 0.836084, 0.8211998]
Similar Review Indices: [  0 794 750]
Review Similarities: [0.14389762284190002, 0.14002800840280097, 0.1264111506377715]


# 4. Combined Retrieval

In [3]:
import numpy as np
import pickle
from scipy.spatial.distance import cdist
import os
import pandas as pd
output_dir = '/content/drive/My Drive/IR'
dataset_path = '/content/drive/My Drive/IR/A2_Data.csv'
df = pd.read_csv(dataset_path)

# Assuming output_dir is already defined
results_path = os.path.join(output_dir, 'retrieval_results.pkl')

with open(results_path, 'rb') as f:
    retrieval_results = pickle.load(f)

# Extract individual components from the loaded retrieval results
similar_image_indices = retrieval_results['similar_image_indices']
image_similarities = retrieval_results['image_similarities']
similar_review_indices = retrieval_results['similar_review_indices']
review_similarities = retrieval_results['review_similarities']


def calculate_composite_scores(image_similarities, review_similarities):
    composite_scores = []
    for image_similarity, review_similarity in zip(image_similarities, review_similarities):
        # Calculate the average similarity score for each pair
        composite_score = (image_similarity + review_similarity) / 2
        composite_scores.append(composite_score)
    return composite_scores

composite_scores = calculate_composite_scores(image_similarities, review_similarities)

# Create a list of tuples (composite_score, image_index, review_index) and sort it
ranked_pairs = sorted(zip(composite_scores, similar_image_indices, similar_review_indices), reverse=True, key=lambda x: x[0])

# Display the ranked results
print("Ranked Combined Retrieval Results:")
for rank, (comp_score, img_idx, rev_idx) in enumerate(ranked_pairs, start=1):
    print(f"Rank: {rank}, Image Index: {img_idx}, Review Index: {rev_idx}, Composite Score: {comp_score:.4f}")


ranked_results_path = os.path.join(output_dir, 'ranked_combined_retrieval_results.pkl')
with open(ranked_results_path, 'wb') as f:
    pickle.dump(ranked_pairs, f)

print(f"Ranked combined retrieval results saved to: {ranked_results_path} \n")


def get_data_by_indices(df, image_indices, review_indices):
    # Extract image URLs and reviews by indices
    image_urls = df.loc[image_indices, 'Image'].tolist()
    reviews = df.loc[review_indices, 'Review Text'].tolist()
    return image_urls, reviews


image_urls, reviews = get_data_by_indices(df, similar_image_indices, similar_review_indices)



Ranked Combined Retrieval Results:
Rank: 1, Image Index: 0, Review Index: 0, Composite Score: 0.4982
Rank: 2, Image Index: 62, Review Index: 794, Composite Score: 0.4881
Rank: 3, Image Index: 193, Review Index: 750, Composite Score: 0.4738
Ranked combined retrieval results saved to: /content/drive/My Drive/IR/ranked_combined_retrieval_results.pkl 



# 5. Results and Analysis

In [4]:
output_dir = '/content/drive/My Drive/IR'
import pandas as pd
dataset_path = '/content/drive/My Drive/IR/A2_Data.csv'
df = pd.read_csv(dataset_path)

def calculate_composite_scores(image_similarities, review_similarities):
    composite_scores = []
    # Assuming image_similarities and review_similarities are aligned and of equal length
    for i in range(len(image_similarities)):
        comp_score = (image_similarities[i] + review_similarities[i]) / 2
        composite_scores.append((i, i, comp_score))  # Use i for both indices, or adjust as needed
    composite_scores.sort(key=lambda x: x[2], reverse=True)
    return composite_scores



## Calculate composite scores using only similarity scores
composite_scores = calculate_composite_scores(image_similarities, review_similarities)

## Display the combined retrieval results
print("USING IMAGE RETRIEVAL")
for i, (img_idx, rev_idx, comp_score) in enumerate(composite_scores, start=1):
     # Assuming each index points to relevant data in placeholder lists
     print(f"{i}) Image URL: {image_urls[img_idx]}")  # Example: Single URL or a list if applicable
     print(f"Review: {reviews[rev_idx]}")
     print(f"Cosine similarity of images - {image_similarities[img_idx]:.4f}")
     print(f"Cosine similarity of text - {review_similarities[rev_idx]:.4f}\n")

 # Assuming image_similarities and review_similarities are lists of scores from which composite scores were derived
composite_image_score = sum(image_similarities) / len(image_similarities)
composite_text_score = sum(review_similarities) / len(review_similarities)
final_composite_score = (composite_image_score + composite_text_score) / 2

print("Composite similarity scores of images:", f"{composite_image_score:.4f}")
print("Composite similarity scores of text:", f"{composite_text_score:.4f}")
print("Final composite similarity score:", f"{final_composite_score:.4f}\n")


print("USING TEXT RETRIEVAL")
for i, (img_idx, rev_idx, comp_score) in enumerate(composite_scores, start=1):
     # Assuming 'image_urls[img_idx]' fetches URLs of similar images based on text query
     # and 'reviews[rev_idx]' fetches the corresponding review
     print(f"{i}) Image URL: {image_urls[img_idx]}")  # List of Image URLs could be just this one for simplicity
     print(f"Review: {reviews[rev_idx]}")  # Extracted Review
     print(f"Cosine similarity of images - {image_similarities[img_idx]:.4f}")
     print(f"Cosine similarity of text - {review_similarities[rev_idx]:.4f}\n")

# # To compute final composite scores across all images and reviews:
final_composite_image_score = sum(image_similarities) / len(image_similarities)
final_composite_text_score = sum(review_similarities) / len(review_similarities)
final_composite_score = (final_composite_image_score + final_composite_text_score) / 2

print("Composite similarity scores of images:", final_composite_image_score)
print("Composite similarity scores of text:", final_composite_text_score)
print("Final composite similarity score:", final_composite_score)

USING IMAGE RETRIEVAL
1) Image URL: ['https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg']
Review: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Cosine similarity of images - 0.8524
Cosine similarity of text - 0.1439

2) Image URL: ['https://images-na.ssl-images-amazon.com/images/I/71nSUnv7znL._SY88.jpg']
Review: Good
Cosine similarity of images - 0.8361
Cosine similarity of text - 0.1400

3) Image URL: ['https://images-na.ssl-images-amazon.com/images/I/81Eq6y34BYL._SY88.jpg']
Review: Great Quality, adjustable tension. Well made.
Cosine similarity of images - 0.8212
Cosine similarity of text - 0.1264

Composite similarity scores of images: 0.8366
Composite similarity scores of text: 0.1368
Final composite similarity score: 0.4867

USING TEXT RETRIEVAL
1) Image URL: ['https://images-na.ssl-images-amazon.com/images/I/81q