## a)	Utility matrix as a Boolean, compute Jaccard distance between each pair

In [5]:
import pandas as pd
import numpy as np

# Sample Data: Replace this with your actual data load method
# Assuming 'data' is your DataFrame with users as rows and books as columns
data = pd.DataFrame({
    'User vs. Book Reading Table': ['User 1', 'User 2', 'User 3', 'User 4', 'User 5'],
    'Built to Last': [5.0, np.nan, np.nan, np.nan, 5.0],
    'The HP Way': [np.nan, np.nan, np.nan, 5.0, 3.0],
    'Physics Made Easy': [np.nan, 5.0, 2.0, np.nan, np.nan],
    'The Wisdom of Teams': [np.nan, np.nan, np.nan, 4.0, 2.0],
    'Baby Bear': [2.0, np.nan, 4.0 , np.nan, 4.0],
    'Experimental Chemistry': [np.nan, 4.0, 5.0, np.nan, np.nan],
    'Charlie the Ranch Dog': [3.0, np.nan, 5.0, np.nan, np.nan] ,
    'The Good to Great': [5.0, np.nan, np.nan, 5.0, 4.0],
    'Electronic devices and circuits – Millman & Halkias': [np.nan, 5.0, 5.0, 1.0, np.nan]
})

# Convert the ratings to a Boolean format
boolean_data = data.set_index('User vs. Book Reading Table').notnull()

# Function to compute Jaccard distance
def jaccard_distance(user1, user2):
    intersection = np.sum(user1 & user2)
    union = np.sum(user1 | user2)
    return 1 - intersection / union if union != 0 else 1

# Compute the Jaccard distance for each pair of users
n_users = boolean_data.shape[0]
jaccard_distances = np.zeros((n_users, n_users))

for i in range(n_users):
    for j in range(n_users):
        jaccard_distances[i, j] = jaccard_distance(boolean_data.iloc[i], boolean_data.iloc[j])

# Creating a DataFrame for the Jaccard distances
jaccard_distances_df = pd.DataFrame(jaccard_distances,
                                    index=boolean_data.index,
                                    columns=boolean_data.index)

# Display the Jaccard distance matrix
print(jaccard_distances_df)

User vs. Book Reading Table    User 1    User 2    User 3    User 4    User 5
User vs. Book Reading Table                                                  
User 1                       0.000000  1.000000  0.714286  0.857143  0.500000
User 2                       1.000000  0.000000  0.400000  0.833333  1.000000
User 3                       0.714286  0.400000  0.000000  0.875000  0.888889
User 4                       0.857143  0.833333  0.875000  0.000000  0.500000
User 5                       0.500000  1.000000  0.888889  0.500000  0.000000


## b)	Repeat a with Cosine distance

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Sample Data: Replace this with your actual data load method
# Assuming 'data' is your DataFrame with users as rows and books as columns
data = pd.DataFrame({
    'User vs. Book Reading Table': ['User 1', 'User 2', 'User 3', 'User 4', 'User 5'],
    'Built to Last': [5.0, np.nan, np.nan, np.nan, 5.0],
    'The HP Way': [np.nan, np.nan, np.nan, 5.0, 3.0],
    'Physics Made Easy': [np.nan, 5.0, 2.0, np.nan, np.nan],
    'The Wisdom of Teams': [np.nan, np.nan, np.nan, 4.0, 2.0],
    'Baby Bear': [2.0, np.nan, 4.0 , np.nan, 4.0],
    'Experimental Chemistry': [np.nan, 4.0, 5.0, np.nan, np.nan],
    'Charlie the Ranch Dog': [3.0, np.nan, 5.0, np.nan, np.nan] ,
    'The Good to Great': [5.0, np.nan, np.nan, 5.0, 4.0],
    'Electronic devices and circuits – Millman & Halkias': [np.nan, 5.0, 5.0, 1.0, np.nan]
})

# Fill NaN values with 0
data_filled = data.set_index('User vs. Book Reading Table').fillna(0)

# Compute the Cosine similarity matrix
cosine_sim = cosine_similarity(data_filled)

# Convert similarity to distance
cosine_dist = 1 - cosine_sim

# Creating a DataFrame for the Cosine distances
cosine_distances_df = pd.DataFrame(cosine_dist,
                                   index=data_filled.index,
                                   columns=data_filled.index)

# Display the Cosine distance matrix
print(cosine_distances_df)


User vs. Book Reading Table    User 1        User 2        User 3  \
User vs. Book Reading Table                                         
User 1                       0.000000  1.000000e+00  7.026994e-01   
User 2                       1.000000  2.220446e-16  3.054094e-01   
User 3                       0.702699  3.054094e-01 -2.220446e-16   
User 4                       0.615202  9.248099e-01  9.373284e-01   
User 5                       0.201901  1.000000e+00  8.037954e-01   

User vs. Book Reading Table        User 4        User 5  
User vs. Book Reading Table                              
User 1                       6.152024e-01  2.019014e-01  
User 2                       9.248099e-01  1.000000e+00  
User 3                       9.373284e-01  8.037954e-01  
User 4                       1.110223e-16  3.721122e-01  
User 5                       3.721122e-01  1.110223e-16  


## c)	Treat 3, 4, 5 as 1 and 1,2, and blank as 0. Compute Jaccard distance between each pair

In [3]:
from google.colab import drive
import pandas as pd

# This will prompt for authorization to access your Google Drive
drive.mount('/content/drive')

# After mounting, use the file path to your dataset on Google Drive
file_path = '/content/drive/My Drive/Final Exam 256/Final Exam 256.xlsx'  # Update this path

# Function to map ratings to 1 and 0
def map_rating(rating):
    if pd.isna(rating) or rating < 3:
        return 0
    else:
        return 1

# Function to compute Jaccard distance
def jaccard_distance(user1, user2):
    intersection = np.sum(user1 & user2)
    union = np.sum(user1 | user2)
    return 1 - intersection / union if union != 0 else 1

# Load your data
data = pd.read_excel(file_path)

# Apply the mapping (3, 4, 5 as 1 and 1, 2, and blank as 0)
data_mapped = data.set_index('User vs. Book Reading Table').applymap(map_rating)

# Compute the Jaccard distance for each pair of users
n_users = data_mapped.shape[0]
jaccard_distances = np.zeros((n_users, n_users))

for i in range(n_users):
    for j in range(n_users):
        jaccard_distances[i, j] = jaccard_distance(data_mapped.iloc[i], data_mapped.iloc[j])

# Creating a DataFrame for the Jaccard distances
jaccard_distances_df = pd.DataFrame(jaccard_distances,
                                    index=data_mapped.index,
                                    columns=data_mapped.index)

# Display the Jaccard distance matrix
print(jaccard_distances_df)

Mounted at /content/drive
User vs. Book Reading Table    User 1  User 2    User 3  User 4    User 5
User vs. Book Reading Table                                              
User 1                       0.000000     1.0  0.833333     0.8  0.600000
User 2                       1.000000     0.0  0.600000     1.0  1.000000
User 3                       0.833333     0.6  0.000000     1.0  0.857143
User 4                       0.800000     1.0  1.000000     0.0  0.600000
User 5                       0.600000     1.0  0.857143     0.6  0.000000


## d)	Repeat C with Cosine distance

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Load the data from the Excel file
# Make sure to replace 'path_to_your_file.xlsx' with the actual path to your Excel file
data_full = pd.read_excel('/content/drive/My Drive/Final Exam 256/Final Exam 256.xlsx')

# Function to map ratings to 1 and 0
def map_rating(rating):
    if pd.isna(rating) or rating < 3:
        return 0
    else:
        return 1

# Apply the mapping to the data
data_full_mapped = data_full.set_index('User vs. Book Reading Table').applymap(map_rating)

# Compute the Cosine similarity matrix
cosine_sim = cosine_similarity(data_full_mapped)

# Convert similarity to distance
cosine_dist = 1 - cosine_sim

# Creating a DataFrame for the Cosine distances
cosine_distances_df = pd.DataFrame(cosine_dist,
                                   index=data_full_mapped.index,
                                   columns=data_full_mapped.index)

# Display the Cosine distance matrix
print(cosine_distances_df)


User vs. Book Reading Table        User 1        User 2    User 3  \
User vs. Book Reading Table                                         
User 1                      -2.220446e-16  1.000000e+00  0.711325   
User 2                       1.000000e+00 -2.220446e-16  0.422650   
User 3                       7.113249e-01  4.226497e-01  0.000000   
User 4                       6.666667e-01  1.000000e+00  1.000000   
User 5                       4.226497e-01  1.000000e+00  0.750000   

User vs. Book Reading Table        User 4   User 5  
User vs. Book Reading Table                         
User 1                       6.666667e-01  0.42265  
User 2                       1.000000e+00  1.00000  
User 3                       1.000000e+00  0.75000  
User 4                      -2.220446e-16  0.42265  
User 5                       4.226497e-01  0.00000  


## e)	Normalize the matrix by subtracting from each nonblank entry the average value for its user.

In [10]:
import pandas as pd
import numpy as np

# Load the data from the Excel file
# Make sure to replace 'path_to_your_file.xlsx' with the actual path to your Excel file
data_full = pd.read_excel('/content/drive/My Drive/Final Exam 256/Final Exam 256.xlsx')

# Set the index to user names
data_full.set_index('User vs. Book Reading Table', inplace=True)

# Calculate the average rating for each user
user_averages = data_full.mean(axis=1)

# Normalize the matrix
normalized_data = data_full.sub(user_averages, axis=0)

# Replace NaN values back to the normalized matrix
normalized_data.fillna(value=np.nan, inplace=True)

# Display the normalized data
print(normalized_data)

                             Built to Last  The HP Way  Physics Made Easy  \
User vs. Book Reading Table                                                 
User 1                                1.25         NaN                NaN   
User 2                                 NaN         NaN           0.333333   
User 3                                 NaN         NaN          -2.200000   
User 4                                 NaN        1.25                NaN   
User 5                                1.40       -0.60                NaN   

                             The Wisdom of Teams  Baby Bear  \
User vs. Book Reading Table                                   
User 1                                       NaN      -1.75   
User 2                                       NaN        NaN   
User 3                                       NaN      -0.20   
User 4                                      0.25        NaN   
User 5                                     -1.60       0.40   

                  

## f)	Using the normalized matrix from e, compute the cosine distance between each pair?

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# Load the data from the Excel file
# Make sure to replace 'path_to_your_file.xlsx' with the actual path to your Excel file
data_full = pd.read_excel('/content/drive/My Drive/Final Exam 256/Final Exam 256.xlsx')

# Set the index to user names
data_full.set_index('User vs. Book Reading Table', inplace=True)

# Calculate the average rating for each user
user_averages = data_full.mean(axis=1)

# Normalize the matrix
normalized_data = data_full.sub(user_averages, axis=0)

# Replace NaN values with 0 for the purpose of cosine similarity calculation
normalized_data_for_cosine = normalized_data.fillna(0)

# Compute the Cosine similarity matrix
cosine_sim = cosine_similarity(normalized_data_for_cosine)

# Convert similarity to distance
cosine_dist = 1 - cosine_sim

# Creating a DataFrame for the Cosine distances
cosine_distances_df = pd.DataFrame(cosine_dist,
                                   index=normalized_data.index,
                                   columns=normalized_data.index)

# Display the Cosine distance matrix
print(cosine_distances_df)

User vs. Book Reading Table    User 1        User 2    User 3        User 4  \
User vs. Book Reading Table                                                   
User 1                       0.000000  1.000000e+00  1.036901  8.165727e-01   
User 2                       1.000000  1.110223e-16  1.469668  1.342415e+00   
User 3                       1.036901  1.469668e+00  0.000000  1.257314e+00   
User 4                       0.816573  1.342415e+00  1.257314 -2.220446e-16   
User 5                       0.738376  1.000000e+00  1.013453  1.086938e+00   

User vs. Book Reading Table        User 5  
User vs. Book Reading Table                
User 1                       7.383757e-01  
User 2                       1.000000e+00  
User 3                       1.013453e+00  
User 4                       1.086938e+00  
User 5                       1.110223e-16  
