# Project 7: Norms, angles, and your movie choices

# Task 1: Load and Understand the Data

What happens here?
-- You load the provided .mat file
-- Extract different matrices and arrays from the file

In [2]:
# Import necessary libraries
import scipy.io
import numpy as np

# Load the .mat file
data = scipy.io.loadmat('users_movies.mat')

# Extract variables from the loaded data
movies = data['movies']  # Array of movie titles
users_movies = data['users_movies']  # Matrix of user ratings for movies
users_movies_sort = data['users_movies_sort']  # Extracted ratings for 20 most popular movies
index_small = data['index_small']  # Indexes of the popular movies
trial_user = data['trial_user']  # Ratings of the popular movies by a trial user

# Get the dimensions of the users_movies matrix
m, n = users_movies.shape

# Print the variables and their dimensions to verify
print(f"Movies: {movies.shape}")
print(f"Users Movies: {users_movies.shape}")
print(f"Users Movies Sort: {users_movies_sort.shape}")
print(f"Index Small: {index_small.shape}")
print(f"Trial User: {trial_user.shape}")
print(f"Dimensions of users_movies: {m} rows, {n} columns")

# Variables: movies, users_movies, users_movies_sort, index_small, trial_user, m, n


Movies: (3952, 1)
Users Movies: (6040, 3952)
Users Movies Sort: (6040, 20)
Index Small: (1, 20)
Trial User: (1, 20)
Dimensions of users_movies: 6040 rows, 3952 columns


# Task 2: Print the Titles of the 20 Most Popular Movies

In [3]:
# Print the titles of the 20 most popular movies
print('Rating is based on movies:')

# Loop through the index_small array and print the corresponding movie titles
for idx in index_small.flatten():
    print(movies[idx][0])

print('\n')


Rating is based on movies:
['Search for One-eye Jimmy, The (1996)']
['Little Women (1994)']
['Princess Bride, The (1987)']
['Wings of Desire (Der Himmel über Berlin) (1987)']
['Kalifornia (1993)']
["Billy's Hollywood Screen Kiss (1997)"]
['Dances with Wolves (1990)']
['10 Things I Hate About You (1999)']
['Fried Green Tomatoes (1991)']
['Snow White and the Seven Dwarfs (1937)']
['Love Letter, The (1999)']
['Brazil (1985)']
['Homeward Bound II']
['Thomas Crown Affair, The (1999)']
['Taxi Driver (1976)']
['Mass Appeal (1984)']
['Raiders of the Lost Ark (1981)']
['Scout, The (1994)']
['Shallow Grave (1994)']
['Unforgiven (1992)']




# Task 3: Select Users Who Rated All 20 Popular Movies

In [4]:
# Get the dimensions of the users_movies_sort matrix
m1, n1 = users_movies_sort.shape

# Initialize an empty list to store the ratings of users who have rated all 20 popular movies
ratings = []

# Loop through each row in users_movies_sort
for j in range(m1):
    # Check if the product of the elements in the row is not zero (meaning no zeros in the row)
    if np.prod(users_movies_sort[j, :]) != 0:
        # Append the row to the ratings list
        ratings.append(users_movies_sort[j, :])

# Convert the ratings list to a NumPy array
ratings = np.array(ratings)

# Print the resulting ratings array
print(f"Ratings: {ratings.shape}")

# Variables: ratings, m1, n1


Ratings: (125, 20)


# Task 4: Compute Euclidean Distances

In [5]:
# Get the dimensions of the ratings matrix
m2, n2 = ratings.shape

# Initialize an empty list to store the Euclidean distances
eucl = []

# Loop through each row in ratings
for i in range(m2):
    # Calculate the Euclidean distance between the trial_user vector and the current row of ratings
    distance = np.linalg.norm(ratings[i, :] - trial_user.flatten())
    # Append the distance to the eucl list
    eucl.append(distance)

# Convert the eucl list to a NumPy array
eucl = np.array(eucl)

# Print the resulting Euclidean distances
print(f"Euclidean distances: {eucl}")

# Variables: eucl


Euclidean distances: [359.99861111 255.11174022 510.05783986 360.73813217 360.71179631
 441.73068718 360.71041016 674.32484753 509.05795348 570.24468432
 510.05489901 761.37507183 568.4470072  359.95555281 720.91330963
   8.66025404 441.72389566 441.72955527 719.15227873   8.77496439
 510.04117481 624.64629992 360.72981579 360.69377594 624.26436708
 673.18942357 510.04705665 360.71041016 255.14505678 510.04117481
   8.42614977 255.11370014 569.79557738 360.0180551  843.94668078
 441.18816847 441.14736767 624.65670572 441.15530145 673.18422441
 360.72981579 510.06764257 673.93545685 569.33118657 441.15190128
 441.15416806 672.80829365 509.04027346 441.15303467 255.12349951
 441.74653366 624.65350395 441.15190128 568.90596763 673.94139211
 570.23854658 509.55863254 441.14963448 569.33909053 360.02361034
 255.10782034 440.59051283 360.7145686  255.15681453   8.24621125
 624.25315378 672.80160523 360.01111094 255.13917771 569.81400474
 255.13721798 254.10627698 360.01944392 673.16788396 50

# Task 5: Find the Most Similar User

In [6]:
# Sort the Euclidean distances in ascending order
DistIndex = np.argsort(eucl)
MinDist = np.sort(eucl)

# Find the index of the closest user
closest_user_Dist = DistIndex[0]

# Print the results
print(f"Sorted Euclidean distances: {MinDist}")
print(f"Indices of users sorted by distance: {DistIndex}")
print(f"Index of closest user: {closest_user_Dist}")

# Variables: MinDist, DistIndex, closest_user_Dist


Sorted Euclidean distances: [  7.28010989   7.87400787   8.18535277   8.18535277   8.24621125
   8.36660027   8.42614977   8.42614977   8.66025404   8.77496439
   9.05538514   9.05538514   9.11043358 253.09286833 254.10627698
 254.12005037 254.13579047 255.10782034 255.11174022 255.11370014
 255.11370014 255.12349951 255.12349951 255.13721798 255.13917771
 255.14505678 255.15093572 255.15681453 359.95555281 359.99861111
 360.01111094 360.0180551  360.01944392 360.02361034 360.69377594
 360.69654836 360.69932076 360.71041016 360.71041016 360.71179631
 360.7145686  360.72149922 360.72149922 360.72288533 360.72288533
 360.72981579 360.72981579 360.73120187 360.73813217 440.57802941
 440.5848386  440.59051283 441.14623426 441.14736767 441.14963448
 441.15190128 441.15190128 441.15303467 441.15416806 441.15416806
 441.15530145 441.16436846 441.18816847 441.72389566 441.72955527
 441.73068718 441.74540179 441.74653366 441.7487974  441.75106112
 508.08660679 508.53908404 508.55481514 508.5666

# Task 6: Understanding Pearson Correlation for Similarity

In [7]:
# Centralize the columns of the matrix ratings
ratings_cent = ratings - np.mean(ratings, axis=1).reshape(-1, 1)

# Centralize the trial_user vector
trial_user_cent = trial_user - np.mean(trial_user)

# Print the centralized ratings and trial_user vectors
print(f"Centralized ratings: \n{ratings_cent}")
print(f"Centralized trial_user: \n{trial_user_cent}")

# Variables: ratings_cent, trial_user_cent


Centralized ratings: 
[[-0.4   0.6   0.6  ...  0.6  -0.4  -1.4 ]
 [-0.55  0.45  0.45 ...  0.45  0.45  0.45]
 [ 0.6   0.6   0.6  ...  0.6   0.6  -0.4 ]
 ...
 [ 0.55  0.55  0.55 ...  0.55  0.55 -0.45]
 [-0.5   0.5   0.5  ...  0.5   0.5   0.5 ]
 [-0.4  -0.4  -0.4  ...  0.6   0.6   0.6 ]]
Centralized trial_user: 
[[-0.45  1.55  1.55  1.55 -0.45 -2.45  0.55  1.55 -1.45  1.55 -1.45  1.55
  -0.45  0.55  0.55 -1.45 -1.45 -2.45 -0.45  1.55]]


# Task 7: Compute Pearson Correlation with Other Users

Steps:
* Loop through each user in ratings_cent.
* Compute Pearson correlation coefficient using np.corrcoef().
* Store results in a vector pearson.

In [8]:


# Initialize the pearson array
pearson = np.zeros(m2)

# Compute Pearson correlation coefficients
for i in range(m2):
    pearson[i] = np.corrcoef(ratings_cent[i, :], trial_user_cent.flatten())[0, 1]

# Print the resulting Pearson correlation coefficients
print(f"Pearson correlation coefficients: {pearson}")

# Variables: pearson


Pearson correlation coefficients: [ 0.07561935  0.36369309  0.0245783  -0.16556179  0.03582872  0.2140846
  0.14763779 -0.02178972 -0.0100936   0.09702463  0.05321628 -0.2062086
  0.16821448  0.60769118  0.32403366  0.19674451  0.36226722  0.33180702
 -0.0053019   0.12647862  0.30960763  0.44063497 -0.06205716  0.4257963
 -0.183647   -0.15140397  0.24374604  0.14284374  0.19674451  0.39302868
  0.33727631  0.46562671  0.13151475 -0.01950268  0.15690109 -0.34105647
  0.24896632  0.21031844  0.15617376 -0.01194291 -0.11582368 -0.08011545
  0.10748615  0.39938107  0.21573203  0.16556179 -0.24313206  0.20259223
  0.27942446  0.33700654  0.08682524  0.13122024  0.32557134  0.0533586
  0.14811562  0.36933894 -0.00739965  0.28900603  0.13876403 -0.14800564
  0.33653018 -0.15539269  0.19444977 -0.10748615  0.35165724  0.06931045
  0.28756254  0.00267796  0.21716794 -0.32403366  0.13291394  0.24573705
 -0.11582368  0.42128131 -0.07397705  0.00267796  0.1170161   0.16556179
  0.3024463   0.23837

# Task 8: Finding the Most Similar User (Using Pearson Correlation)

* The correlation coefficient ranges from -1 to 1.
* Closer to 1 → More similar user
* Closer to -1 → Opposite tastes
* Sort users by Pearson correlation (highest first).
* Identify the most similar user (highest Pearson correlation).

In [9]:
# Sort the Pearson correlation coefficients in descending order
PearsonIndex = np.argsort(pearson)[::-1]
MaxPearson = np.sort(pearson)[::-1]

# Find the index of the user with the highest correlation coefficient
closest_user_Pearson = PearsonIndex[0]

# Print the results
print(f"Sorted Pearson correlation coefficients: {MaxPearson}")
print(f"Indices of users sorted by Pearson correlation: {PearsonIndex}")
print(f"Index of user with highest Pearson correlation: {closest_user_Pearson}")

# Variables: MaxPearson, PearsonIndex, closest_user_Pearson


Sorted Pearson correlation coefficients: [ 0.62896156  0.60769118  0.59988588  0.51951641  0.47449447  0.46562671
  0.44063497  0.43115223  0.4257963   0.42128131  0.39938107  0.39302868
  0.36933894  0.36369309  0.36226722  0.35165724  0.33727631  0.33700654
  0.33653018  0.33180702  0.32924378  0.32557134  0.32403366  0.31402432
  0.30960763  0.3024463   0.30217235  0.28900603  0.28858643  0.28756254
  0.27942446  0.2549193   0.2533473   0.24896632  0.24573705  0.24374604
  0.24222691  0.23837048  0.23837048  0.23580013  0.21716794  0.21691509
  0.21573203  0.21452952  0.2140846   0.21031844  0.20259223  0.19674451
  0.19674451  0.19444977  0.1800582   0.16909002  0.16821448  0.16556179
  0.16556179  0.15690109  0.15617376  0.15107865  0.14811562  0.14763779
  0.14284374  0.13876403  0.13545965  0.13291394  0.13151475  0.13122024
  0.12869663  0.12647862  0.12459704  0.1170161   0.10748615  0.10551014
  0.09871488  0.09702463  0.09057869  0.08682524  0.08682524  0.07561935
  0.069310

# Task 9: Comparing Pearson and Euclidean Distance Results

* closest_user_Dist → Most similar user by Euclidean distance
* closest_user_Pearson → Most similar user by Pearson correlation

In [10]:
# Compare the elements of the vectors DistIndex and PearsonIndex
print("Indices sorted by Euclidean distance:", DistIndex)
print("Indices sorted by Pearson correlation:", PearsonIndex)

# Check if the variables closest_user_Pearson and closest_user_Dist are the same
if closest_user_Pearson == closest_user_Dist:
    print("The variables closest_user_Pearson and closest_user_Dist are the same.")
else:
    print("The variables closest_user_Pearson and closest_user_Dist are different.")


Indices sorted by Euclidean distance: [ 87 112  78  98  64 120 107  30  15  19  96  79  84  95  71 123 101  60
   1 102  31  49  92  70  68  28  94  63  13   0  67  33  72  59  23  80
  81   6  27   4  62 110 111 121 104  40  22  90   3  83 116  61 108  36
  57  52  44  48  45  77  38 115  35  16  17   5  76  50  86  75 122  99
  89  74  47 114   8 113  56  88  97  29  20  26 100  10   2  41 105  12
  53  43  58  82  91  32  69 118  55   9 124  65  24  21  51  37 117  93
  66  46  73  85  39  25  42  54 119   7 103  18  14 109  11 106  34]
Indices sorted by Pearson correlation: [ 87  13 112  98 118  31  21  81  23  73  43  29  55   1  16  64  30  49
  60  17  95  52  14 106  20  78 120  57  80  66  48 108 113  36  71  26
 110  96  79 107  68  92  44 102   5  37  47  28  15  62  91 123  12  45
  77  34  38 101  54   6  27  58  94  70  32  51 100  19 115  76  42  83
  99   9 114  93  50   0  65 103  86 104  53  10 116  82   4   2 109 111
  67  75  18  56   8  88  39  84 117  90  33   7  

# Task 10: Recommending Movies to the Trial User
we recommend movies based on their highest-rated movies (rating = 5).

Steps
* Find movies that the most similar user (by distance) rated 5 → recommend_dist
* Find movies that the most similar user (by Pearson) rated 5 → recommend_pearson
* Find movies the trial user already liked (rated 5 on their 20 movies list) → liked

In [11]:
print("index_small shape:", index_small.shape)
print("trial_user shape:", trial_user.shape)


index_small shape: (1, 20)
trial_user shape: (1, 20)


In [12]:
import scipy.io
import numpy as np

# Load the .mat file
data = scipy.io.loadmat('users_movies.mat')

# Extract variables from the loaded data
movies = data['movies']  # Array of movie titles
users_movies = data['users_movies']  # Matrix of user ratings for movies
users_movies_sort = data['users_movies_sort']  # Extracted ratings for 20 most popular movies
index_small = data['index_small'].flatten()  # Flatten index_small to 1D array
trial_user = data['trial_user'].flatten()  # Ensure trial_user is 1D array

# Variables: movies, users_movies, users_movies_sort, index_small, trial_user
m, n = users_movies.shape

# Recommendations based on the distance criterion
recommend_dist = []
for k in range(n):
    if users_movies[closest_user_Dist, k] == 5:
        recommend_dist.append(k)

# Recommendations based on the Pearson correlation coefficient criterion
recommend_pearson = []
for k in range(n):
    if users_movies[closest_user_Pearson, k] == 5:
        recommend_pearson.append(k)

# Movies liked by the trial user
liked = []
for k in range(20):
    if trial_user[k] == 5:
        # Convert 2D index_small to 1D index and add to liked list
        if k < len(index_small):
            liked.append(index_small[k])

# Convert indices to movie titles
liked_titles = [movies[i][0] for i in liked]
recommend_dist_titles = [movies[i][0] for i in recommend_dist]
recommend_pearson_titles = [movies[i][0] for i in recommend_pearson]

# Print the results
print("Movies liked by the trial user:", liked_titles)
print("Recommended movies based on distance criterion:", recommend_dist_titles)
print("Recommended movies based on Pearson correlation criterion:", recommend_pearson_titles)

# Variables: liked, recommend_dist, recommend_pearson


Movies liked by the trial user: [array(['Little Women (1994)'], dtype='<U19'), array(['Princess Bride, The (1987)'], dtype='<U26'), array(['Wings of Desire (Der Himmel über Berlin) (1987)'], dtype='<U47'), array(['10 Things I Hate About You (1999)'], dtype='<U33'), array(['Snow White and the Seven Dwarfs (1937)'], dtype='<U38'), array(['Brazil (1985)'], dtype='<U13'), array(['Unforgiven (1992)'], dtype='<U17')]
Recommended movies based on distance criterion: [array(['Taxi Driver (1976)'], dtype='<U18'), array(["Schindler's List (1993)"], dtype='<U23'), array(['Fargo (1996)'], dtype='<U12'), array(['Godfather, The (1972)'], dtype='<U21'), array(['North by Northwest (1959)'], dtype='<U25'), array(['Casablanca (1942)'], dtype='<U17'), array(['Citizen Kane (1941)'], dtype='<U19'), array(['Mr. Smith Goes to Washington (1939)'], dtype='<U35'), array(['Bonnie and Clyde (1967)'], dtype='<U23'), array(['Bob Roberts (1992)'], dtype='<U18'), array(['Paris Is Burning (1990)'], dtype='<U23'), array

# Task 11K Display the Movie Titles

In [13]:
# Function to print movie titles based on indices
def print_movie_titles(indices, movie_titles):
    print("Movie Titles:")
    for index in indices:
        print(movie_titles[index])
    print()

# Print titles of movies liked by the trial user
print("Movies liked by the trial user:")
print_movie_titles(liked, movies)

# Print recommendations based on the distance criterion
print("Recommended movies based on distance criterion:")
print_movie_titles(recommend_dist, movies)

# Print recommendations based on the Pearson correlation criterion
print("Recommended movies based on Pearson correlation criterion:")
print_movie_titles(recommend_pearson, movies)


Movies liked by the trial user:
Movie Titles:
[array(['Little Women (1994)'], dtype='<U19')]
[array(['Princess Bride, The (1987)'], dtype='<U26')]
[array(['Wings of Desire (Der Himmel über Berlin) (1987)'], dtype='<U47')]
[array(['10 Things I Hate About You (1999)'], dtype='<U33')]
[array(['Snow White and the Seven Dwarfs (1937)'], dtype='<U38')]
[array(['Brazil (1985)'], dtype='<U13')]
[array(['Unforgiven (1992)'], dtype='<U17')]

Recommended movies based on distance criterion:
Movie Titles:
[array(['Taxi Driver (1976)'], dtype='<U18')]
[array(["Schindler's List (1993)"], dtype='<U23')]
[array(['Fargo (1996)'], dtype='<U12')]
[array(['Godfather, The (1972)'], dtype='<U21')]
[array(['North by Northwest (1959)'], dtype='<U25')]
[array(['Casablanca (1942)'], dtype='<U17')]
[array(['Citizen Kane (1941)'], dtype='<U19')]
[array(['Mr. Smith Goes to Washington (1939)'], dtype='<U35')]
[array(['Bonnie and Clyde (1967)'], dtype='<U23')]
[array(['Bob Roberts (1992)'], dtype='<U18')]
[array(['Pa

# Task 12

In [14]:
import numpy as np

# Manually specify your ratings for the 20 popular movies
# Example: Dislike some movies (1), Like some movies (5), Random ratings for others
# Replace these values with your own ratings
myratings = np.array([5, 1, 4, 3, 2, 5, 1, 4, 3, 5, 2, 5, 1, 3, 4, 5, 2, 1, 4, 3])

# Ensure myratings is a row vector (1D array with 20 elements)
print("My Ratings Vector:")
print(myratings)


My Ratings Vector:
[5 1 4 3 2 5 1 4 3 5 2 5 1 3 4 5 2 1 4 3]


# Task 13: Generate Personal Recommendations

In [15]:
import numpy as np
import scipy.io

# Load the .mat file
data = scipy.io.loadmat('users_movies.mat')

# Extract variables from the loaded data
movies = data['movies']  # Array of movie titles
users_movies = data['users_movies']  # Matrix of user ratings for movies
users_movies_sort = data['users_movies_sort']  # Extracted ratings for 20 most popular movies
index_small = data['index_small'].flatten()  # Flatten index_small to 1D array

# Define your own ratings vector (myratings)
myratings = np.array([5, 1, 4, 3, 2, 5, 1, 4, 3, 5, 2, 5, 1, 3, 4, 5, 2, 1, 4, 3])

# Step 4: Select users who rated all 20 popular movies
[m1, n1] = users_movies_sort.shape
ratings = [users_movies_sort[j, :] for j in range(m1) if np.all(users_movies_sort[j, :] > 0)]
ratings = np.array(ratings)

# Step 5: Compute Euclidean distances
eucl = np.linalg.norm(ratings - myratings, axis=1)

# Step 6: Find the closest user based on Euclidean distance
MinDist, DistIndex = np.sort(eucl), np.argsort(eucl)
closest_user_Dist = DistIndex[0]

# Step 7: Centralize ratings and myratings for Pearson correlation
ratings_cent = ratings - np.mean(ratings, axis=1, keepdims=True)
myratings_cent = myratings - np.mean(myratings)

# Step 8: Compute Pearson correlation coefficients
pearson = np.sum(ratings_cent * myratings_cent, axis=1) / (
    np.sqrt(np.sum(ratings_cent ** 2, axis=1)) * np.sqrt(np.sum(myratings_cent ** 2))
)

# Step 9: Find the closest user based on Pearson correlation
MaxPearson, PearsonIndex = np.sort(pearson)[::-1], np.argsort(pearson)[::-1]
closest_user_Pearson = PearsonIndex[0]

# Step 10: Create recommendations based on the distance criterion
recommend_dist = [k for k in range(users_movies.shape[1]) if users_movies[closest_user_Dist, k] == 5]

# Step 11: Create recommendations based on the Pearson correlation criterion
recommend_pearson = [k for k in range(users_movies.shape[1]) if users_movies[closest_user_Pearson, k] == 5]

# Create the list of movies liked by the trial user
liked = [index_small[k] for k in range(20) if myratings[k] == 5]

# Convert indices to movie titles
liked_titles = [movies[i][0] for i in liked]
recommend_dist_titles = [movies[i][0] for i in recommend_dist]
recommend_pearson_titles = [movies[i][0] for i in recommend_pearson]

# Print the results
print("Movies liked by the trial user:")
print(liked_titles)

print("Recommended movies based on distance criterion:")
print(recommend_dist_titles)

print("Recommended movies based on Pearson correlation criterion:")
print(recommend_pearson_titles)


Movies liked by the trial user:
[array(['Search for One-eye Jimmy, The (1996)'], dtype='<U36'), array(["Billy's Hollywood Screen Kiss (1997)"], dtype='<U36'), array(['Snow White and the Seven Dwarfs (1937)'], dtype='<U38'), array(['Brazil (1985)'], dtype='<U13'), array(['Mass Appeal (1984)'], dtype='<U18')]
Recommended movies based on distance criterion:
[array(['Star Wars Episode IV - A New Hope (1977)'], dtype='<U40'), array(['Silence of the Lambs, The (1991)'], dtype='<U32'), array(['Casablanca (1942)'], dtype='<U17'), array(['Maltese Falcon, The (1941)'], dtype='<U26'), array(['Wizard of Oz, The (1939)'], dtype='<U24'), array(['Gone with the Wind (1939)'], dtype='<U25'), array(['Citizen Kane (1941)'], dtype='<U19'), array(['2001'], dtype='<U4'), array(['Paris Is Burning (1990)'], dtype='<U23'), array(['Raiders of the Lost Ark (1981)'], dtype='<U30'), array(['Bridge on the River Kwai, The (1957)'], dtype='<U36'), array(['Patton (1970)'], dtype='<U13'), array(['High Noon (1952)'], dt