In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
## Mount the Google Drive folder for accessing data etc
if('google.colab' in sys.modules):
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    # Change path below starting from /content/drive/MyDrive/Colab Notebooks/
    # depending on how data is organized inside your Colab Notebooks folder in
    # Google Drive
    DIR = '/content/drive/MyDrive/Colab Notebooks/ALA assingment'
    DATA_DIR = DIR+'/Data/'
else:
    DATA_DIR = 'Data/'

Mounted at /content/drive


In [3]:
FILENAME = DATA_DIR + 'moviereviews.csv'
df = pd.read_csv(FILENAME)
df.head()

Unnamed: 0,movie,review
0,The Lord of the Rings The Two Towers,remarkable display of fantasy action powerful ...
1,Inception,implanting stealing idea destroy gripping acti...
2,Spiderman Across the spider verse,mind bending wild action sequences intimate em...
3,The Dark Knight,Best live action portrayal beat organized crim...
4,Three colors red,mesmerising friendship turned love profound un...


In [None]:
# Calculate movie review vectors
movie_reviews = df.pivot_table(index='movie', columns='review', aggfunc='size', fill_value=0)
movie_reviews

In [None]:
# Calculate similarity using dot product formula
similarity_matrix = np.dot(movie_reviews, movie_reviews.T)
similarity_matrix

In [25]:
# Find top 3 pairs of similar movies
top_pairs = []
for i in range(len(similarity_matrix)):
    for j in range(i + 1, len(similarity_matrix)):
        top_pairs.append((df['movie'].iloc[i], df['movie'].iloc[j], similarity_matrix[i, j]))

# Sort the pairs by similarity in descending order
top_pairs.sort(key=lambda x: x[2], reverse=True)

# Print the top 3 pairs of similar movies
for i, pair in enumerate(top_pairs[:3]):
    movie1, movie2, similarity = pair
    print(f"Top {i+1}: {movie1} and {movie2}")


Top 1: The Lord of the Rings The Two Towers and Inception
Top 2: The Lord of the Rings The Two Towers and Spiderman Across the spider verse
Top 3: The Lord of the Rings The Two Towers and The Dark Knight


In [27]:
# Calculate dot product using hand-coded multiplication and addition
def dot_product_hand_coded(vector1, vector2):
    result = 0
    for i in range(len(vector1)):
        result += vector1[i] * vector2[i]
    return result

In [28]:
# Compare dot product speedup with numpy.dot()
import time

# Generate random vectors for testing
vector_size = 10000
vector1 = np.random.rand(vector_size)
vector2 = np.random.rand(vector_size)

# Measure time for numpy.dot()
start_time = time.time()
np_dot_result = np.dot(vector1, vector2)
np_dot_time = time.time() - start_time

# Measure time for hand-coded dot product
start_time = time.time()
hand_coded_dot_result = dot_product_hand_coded(vector1, vector2)
hand_coded_dot_time = time.time() - start_time

# Print results
print(f"Result of numpy.dot(): {np_dot_result} (Time: {np_dot_time:.6f} seconds)")
print(f"Result of hand-coded dot product: {hand_coded_dot_result} (Time: {hand_coded_dot_time:.6f} seconds)")

Result of numpy.dot(): 2527.690088037899 (Time: 0.000144 seconds)
Result of hand-coded dot product: 2527.690088037905 (Time: 0.006597 seconds)
