# Homework 4 - Recommendation systems and clustering everywhere

This work as been done by : Vianney Desbazeille, mailing adress : vianney.desbazeille@eleve.isep.fr.
Do not hesitate to reach me if you have any questions about my work. 

## 1. Recommendation sytem

### 1.2 Minhash Signatures

In [None]:
import hashlib
import pandas as pd

def minhash_signature(set_elements, num_perm):
    # Step 1: Set Parameters
    hash_functions = [hashlib.sha256(str(i).encode()).hexdigest() for i in range(num_perm)]
    minhash_sig = [float('inf')] * num_perm  # Step 3: Initialize MinHash Signature

    # Step 4: Update Signature
    for element in set_elements:
        for i, hash_function in enumerate(hash_functions):
            hash_val = int(hashlib.sha256((str(element) + hash_function).encode()).hexdigest(), 16)
            minhash_sig[i] = min(minhash_sig[i], hash_val)

    # Step 5: Final Signature
    return minhash_sig

# Importing data 
csv_file_path = 'vodclickstream_uk_movies_03.csv'  
data = pd.read_csv(csv_file_path)
print(data)
# Number of hash functions for MinHash
num_perm = 128

# Create MinHash signatures for each user
minhashes = {}
for index, row in data.iterrows():
    user_id = row['user_id']
    genres = row['genres'].split(',')  # Assuming genres are stored as a comma-separated string

    minhash = minhash_signature(genres, num_perm)
    minhashes[user_id] = minhash

# Simple function to calculate Jaccard similarity between two sets
def jaccard_similarity(set1, set2):
    intersection = len(set(set1) & set(set2))
    union = len(set(set1) | set(set2))
    return intersection / union if union != 0 else 0.0

# Threshold for similarity
threshold = 0.6

# Group users with similar interests into buckets
buckets = {}
for user_id, minhash in minhashes.items():
    similar_users = [other_user for other_user, other_minhash in minhashes.items()
                     if jaccard_similarity(minhash, other_minhash) >= threshold]
    buckets[user_id] = similar_users
    print(buckets)

# Display the buckets
for user_id, similar_users in buckets.items():
    print(f"User {user_id} is in a bucket with similar users: {similar_users}")


        Unnamed: 0             datetime  duration  \
0            58773  2017-01-01 01:15:09       0.0   
1            58774  2017-01-01 13:56:02       0.0   
2            58775  2017-01-01 15:17:47   10530.0   
3            58776  2017-01-01 16:04:13      49.0   
4            58777  2017-01-01 19:16:37       0.0   
...            ...                  ...       ...   
671731      730504  2019-06-30 21:37:08     851.0   
671732      730505  2019-06-30 21:49:34   91157.0   
671733      730506  2019-06-30 22:00:44       0.0   
671734      730507  2019-06-30 22:04:23       0.0   
671735      730508  2019-06-30 22:35:24       0.0   

                                            title  \
0              Angus, Thongs and Perfect Snogging   
1                    The Curse of Sleeping Beauty   
2                               London Has Fallen   
3                                        Vendetta   
4                 The SpongeBob SquarePants Movie   
...                                          

### 1.3 Locality-Sensitive Hashing (LSH)

In [69]:
import pandas as pd

# Read data from CSV file
csv_file_path = 'vodclickstream_uk_movies_03.csv'
data = pd.read_csv(csv_file_path)

title_counts = data['title'].value_counts()

# Add a new column based on the key-value mapping
data['clicks'] = data['title'].map(title_counts)

# Save the updated DataFrame to a new CSV file
output_csv_path = 'vodclickstream_uk_movies_03_2.csv'  
data.to_csv(output_csv_path, index=False)

data = pd.read_csv(output_csv_path)
print(data)


        Unnamed: 0             datetime  duration  \
0            58773  2017-01-01 01:15:09       0.0   
1            58774  2017-01-01 13:56:02       0.0   
2            58775  2017-01-01 15:17:47   10530.0   
3            58776  2017-01-01 16:04:13      49.0   
4            58777  2017-01-01 19:16:37       0.0   
...            ...                  ...       ...   
671731      730504  2019-06-30 21:37:08     851.0   
671732      730505  2019-06-30 21:49:34   91157.0   
671733      730506  2019-06-30 22:00:44       0.0   
671734      730507  2019-06-30 22:04:23       0.0   
671735      730508  2019-06-30 22:35:24       0.0   

                                            title  \
0              Angus, Thongs and Perfect Snogging   
1                    The Curse of Sleeping Beauty   
2                               London Has Fallen   
3                                        Vendetta   
4                 The SpongeBob SquarePants Movie   
...                                          

In [23]:
import pandas as pd
import hashlib

# Function to create MinHash signature
def minhash_signature(set_elements, num_perm):
    hash_functions = [hashlib.sha256(str(i).encode()).hexdigest() for i in range(num_perm)]
    minhash_sig = [float('inf')] * num_perm

    for element in set_elements:
        for i, hash_function in enumerate(hash_functions):
            hash_val = int(hashlib.sha256((str(element) + hash_function).encode()).hexdigest(), 16)
            minhash_sig[i] = min(minhash_sig[i], hash_val)

    return minhash_sig

# Function to calculate Jaccard similarity
def jaccard_similarity(set1, set2):
    intersection = len(set(set1) & set(set2))
    union = len(set(set1) | set(set2))
    return intersection / union if union != 0 else 0.0

# Read data from CSV file
csv_file_path = 'vodclickstream_uk_movies_03_2.csv'
data = pd.read_csv(csv_file_path)

# Number of hash functions for MinHash
num_perm = 128

# Create MinHash signatures for each user
minhashes = {}
for index, row in data.iterrows():
    user_id = row['user_id']
    genres = row['genres'].split(',')  # Assuming genres are stored as a comma-separated string

    minhash = minhash_signature(genres, num_perm)
    minhashes[user_id] = minhash

# Function to find similar users based on MinHash similarity
def find_similar_users(query_user_id, minhashes, threshold=0.6):
    query_minhash = minhashes[query_user_id]
    similar_users = [(other_user, jaccard_similarity(query_minhash, other_minhash))
                     for other_user, other_minhash in minhashes.items()
                     if other_user != query_user_id and jaccard_similarity(query_minhash, other_minhash) >= threshold]

    # Sort by similarity in descending order
    similar_users.sort(key=lambda x: x[1], reverse=True)
    
    return similar_users[:2]  # Return the two most similar users

# Function to recommend movies to a user
def recommend_movies(user_id, minhashes, data):
    similar_users = find_similar_users(user_id, minhashes)
    
    recommended_movies = []
    
    # Check common movies between the two most similar users
    common_movies = set(data[data['user_id'] == similar_users[0][0]]['movie_id']).intersection(
                    set(data[data['user_id'] == similar_users[1][0]]['movie_id']))
    
    if common_movies:
        # Sort common movies based on the total number of clicks by similar users
        common_movies = sorted(common_movies, key=lambda x: data[(data['user_id'] == similar_users[0][0]) |
                                                                 (data['user_id'] == similar_users[1][0])]['clicks'].sum(),
                               reverse=True)
        
        recommended_movies.extend(common_movies[:min(5, len(common_movies))])
    
    # If no common movies, recommend most clicked movies by the most similar user first, then the other user
    else:
        most_clicked_by_similar = data[data['user_id'] == similar_users[0][0]].groupby('movie_id')['clicks'].sum()
        most_clicked_by_other = data[data['user_id'] == similar_users[1][0]].groupby('movie_id')['clicks'].sum()
        
        # Sort by clicks in descending order
        most_clicked_by_similar = most_clicked_by_similar.sort_values(ascending=False)
        most_clicked_by_other = most_clicked_by_other.sort_values(ascending=False)
        
        # Get movie IDs from the sorted Series
        most_clicked_by_similar = most_clicked_by_similar.index.tolist()
        most_clicked_by_other = most_clicked_by_other.index.tolist()
        
        recommended_movies.extend(most_clicked_by_similar[:min(3, len(most_clicked_by_similar))])
        recommended_movies.extend(most_clicked_by_other[:min(2, len(most_clicked_by_other))])
    
    return recommended_movies

# Example: Recommend movies for a specific user
user_id_to_recommend = '1dea19f6fe'  # Replace with the user ID for which you want to recommend movies
recommended_movies = recommend_movies(user_id_to_recommend, minhashes, data)

print(f"Recommended movies for User {user_id_to_recommend}: {recommended_movies}")


Recommended movies for User 1dea19f6fe: ['f33856f257', 'eb91c694cc', 'e0203fcd07', '254b5432f4', '2d01b1371c']


## 2. Grouping Users together!

### 2.1 Getting your data + feature engineering

Favorite genre :

In [134]:
import pandas as pd

# Read data from CSV file
csv_file_path = 'vodclickstream_uk_movies_03_2.csv'
data = pd.read_csv(csv_file_path)

# Split genres into a list
data['genres'] = data['genres'].str.split(',')

# Duplicate rows based on the number of genres
data_expanded = data.explode('genres')

# Calculate the most frequently occurring genre for each user
favorite_genre = data_expanded.groupby('user_id')['genres'].agg(lambda x: x.value_counts().idxmax()).reset_index()

# Create a dictionary mapping user IDs to their favorite genres
user_favorite_genres = dict(zip(favorite_genre['user_id'], favorite_genre['genres']))

# Add a new column based on the key-value mapping
data['favorite genre'] = data['user_id'].map(user_favorite_genres)

# Reassemble genres as a single string
data['genres'] = data['genres'].apply(','.join)

# Save the updated DataFrame to a new CSV file
output_csv_path = 'vodclickstream_uk_movies_03_3.csv'  
data.to_csv(output_csv_path, index=False)

data = pd.read_csv(output_csv_path)
print(data)


        Unnamed: 0             datetime  duration  \
0            58773  2017-01-01 01:15:09       0.0   
1            58774  2017-01-01 13:56:02       0.0   
2            58775  2017-01-01 15:17:47   10530.0   
3            58776  2017-01-01 16:04:13      49.0   
4            58777  2017-01-01 19:16:37       0.0   
...            ...                  ...       ...   
671731      730504  2019-06-30 21:37:08     851.0   
671732      730505  2019-06-30 21:49:34   91157.0   
671733      730506  2019-06-30 22:00:44       0.0   
671734      730507  2019-06-30 22:04:23       0.0   
671735      730508  2019-06-30 22:35:24       0.0   

                                            title  \
0              Angus, Thongs and Perfect Snogging   
1                    The Curse of Sleeping Beauty   
2                               London Has Fallen   
3                                        Vendetta   
4                 The SpongeBob SquarePants Movie   
...                                          

Average duration click :

In [135]:
import pandas as pd

# Read data from CSV file
csv_file_path = 'vodclickstream_uk_movies_03_3.csv'
data = pd.read_csv(csv_file_path)

# Calculate the average duration per user
average_duration = data.groupby('user_id')['duration'].mean().reset_index()
print(average_duration)

# Add a new column based on the key-value mapping
average_duration_per_user = dict(zip(average_duration['user_id'], average_duration['duration']))

data['average duration'] = data['user_id'].map(average_duration_per_user)

# Save the updated DataFrame to a new CSV file
output_csv_path = 'vodclickstream_uk_movies_03_3.csv'  
data.to_csv(output_csv_path, index=False)

data = pd.read_csv(output_csv_path)
print(data)



           user_id      duration
0       00004e2862      0.000000
1       000052a0a0   2024.166667
2       000090e7c8      0.000000
3       000118a755     -0.250000
4       000296842d   9663.375000
...            ...           ...
161913  fffd9bf758   8495.000000
161914  fffe7b777b   1785.000000
161915  fffeac83be  40606.272727
161916  ffff2c5f9e      0.000000
161917  ffffd36adf      0.000000

[161918 rows x 2 columns]
        Unnamed: 0             datetime  duration  \
0            58773  2017-01-01 01:15:09       0.0   
1            58774  2017-01-01 13:56:02       0.0   
2            58775  2017-01-01 15:17:47   10530.0   
3            58776  2017-01-01 16:04:13      49.0   
4            58777  2017-01-01 19:16:37       0.0   
...            ...                  ...       ...   
671731      730504  2019-06-30 21:37:08     851.0   
671732      730505  2019-06-30 21:49:34   91157.0   
671733      730506  2019-06-30 22:00:44       0.0   
671734      730507  2019-06-30 22:04:23       0

Time of the day : 
- 6 - 14 h morning 
- 14 - 20 h afternoon 
- 20 - 6 h night

In [136]:
import pandas as pd
from datetime import datetime, time

# Read data from CSV file
csv_file_path = 'vodclickstream_uk_movies_03_3.csv'
data = pd.read_csv(csv_file_path)

def categorize_time(row):
    # Extract only the time part
    time_part = row['datetime'].time()

    # Categorize the time
    if time(6, 0, 0) <= time_part < time(13, 0, 0):
        return 'Morning'
    elif time(13, 0, 0) <= time_part < time(20, 0, 0):
        return 'Afternoon'
    else:
        return 'Night'

def timeframe():
    # Convert 'datetime' column to datetime type
    data['datetime'] = pd.to_datetime(data['datetime'])

    # Create a new column 'time_category' based on the categorization function
    data['time_category'] = data.apply(categorize_time, axis=1)

    # Print the DataFrame with the new category
    print(data)
    output_csv_path = 'vodclickstream_uk_movies_03_3.csv'  
    data.to_csv(output_csv_path, index=False)

timeframe()


        Unnamed: 0            datetime  duration  \
0            58773 2017-01-01 01:15:09       0.0   
1            58774 2017-01-01 13:56:02       0.0   
2            58775 2017-01-01 15:17:47   10530.0   
3            58776 2017-01-01 16:04:13      49.0   
4            58777 2017-01-01 19:16:37       0.0   
...            ...                 ...       ...   
671731      730504 2019-06-30 21:37:08     851.0   
671732      730505 2019-06-30 21:49:34   91157.0   
671733      730506 2019-06-30 22:00:44       0.0   
671734      730507 2019-06-30 22:04:23       0.0   
671735      730508 2019-06-30 22:35:24       0.0   

                                            title  \
0              Angus, Thongs and Perfect Snogging   
1                    The Curse of Sleeping Beauty   
2                               London Has Fallen   
3                                        Vendetta   
4                 The SpongeBob SquarePants Movie   
...                                           ...   
6717

Movie preference :

In [143]:
import pandas as pd
import numpy as np

# Read data from CSV file
csv_file_path = 'vodclickstream_uk_movies_03_3.csv'
data = pd.read_csv(csv_file_path)

# Convert 'datetime' column to datetime type
data['release_date'] = pd.to_datetime(data['release_date'], errors='coerce')

# Extract only the year
data['year'] = data['release_date'].dt.year

# Use np.where to conditionally assign values
data['movie_preference'] = np.where(data['year'] > 2010, 'recent movies', 'old movies')

output_csv_path = 'vodclickstream_uk_movies_03_3.csv'  
data.to_csv(output_csv_path, index=False)

# Print the result
print(data)

        Unnamed: 0             datetime  duration  \
0            58773  2017-01-01 01:15:09       0.0   
1            58774  2017-01-01 13:56:02       0.0   
2            58775  2017-01-01 15:17:47   10530.0   
3            58776  2017-01-01 16:04:13      49.0   
4            58777  2017-01-01 19:16:37       0.0   
...            ...                  ...       ...   
671731      730504  2019-06-30 21:37:08     851.0   
671732      730505  2019-06-30 21:49:34   91157.0   
671733      730506  2019-06-30 22:00:44       0.0   
671734      730507  2019-06-30 22:04:23       0.0   
671735      730508  2019-06-30 22:35:24       0.0   

                                            title  \
0              Angus, Thongs and Perfect Snogging   
1                    The Curse of Sleeping Beauty   
2                               London Has Fallen   
3                                        Vendetta   
4                 The SpongeBob SquarePants Movie   
...                                          

### 2.2 Choose your features (variables)!

Here are some reasons why normalization can be a good idea:

Scale Sensitivity : Some machine learning algorithms, such as k-nearest neighbors or gradient descent-based optimization algorithms, are sensitive to the scale of features. Normalizing the features ensures that no single feature dominates the others.

Convergence : Normalization can help algorithms converge faster during training. It can prevent large-scale features from overshadowing small-scale features, allowing the algorithm to make progress more consistently.

Distance-Based Algorithms : Algorithms that rely on distances, such as k-means clustering or support vector machines, can benefit from normalization. It ensures that the distances between data points are calculated based on the relative importance of features.

Now, let's apply normalization and dimensionality reduction to your data. For demonstration purposes, I'll use the MinMaxScaler for normalization and Principal Component Analysis (PCA) for dimensionality reduction.

In [145]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Read data from CSV file
csv_file_path = 'vodclickstream_uk_movies_03_3.csv'
data = pd.read_csv(csv_file_path)

# Update some features to use them 
data['movie_preference'] = data['movie_preference'].replace({'old movies': 0, 'recent movies': 1})
data['time_category'] = data['time_category'].replace({'Morning': 0, 'Afternoon': 1, 'Night': 2})

# Extract features from dataframe
feature_columns = ['movie_preference', 'time_category', 'average duration']
X = data[feature_columns]

# Normalize features using Min-Max scaling
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
n_components = 2  # Adjust the number of components as needed
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_normalized)

# Create a new DataFrame with normalized and reduced features
columns_pca = [f'PC{i+1}' for i in range(n_components)]
data_normalized_pca = pd.DataFrame(X_pca, columns=columns_pca)

# Concatenate the new DataFrame with the original data
data = pd.concat([data, data_normalized_pca], axis=1)

# Print the result
print(data)


        Unnamed: 0             datetime  duration  \
0            58773  2017-01-01 01:15:09       0.0   
1            58774  2017-01-01 13:56:02       0.0   
2            58775  2017-01-01 15:17:47   10530.0   
3            58776  2017-01-01 16:04:13      49.0   
4            58777  2017-01-01 19:16:37       0.0   
...            ...                  ...       ...   
671731      730504  2019-06-30 21:37:08     851.0   
671732      730505  2019-06-30 21:49:34   91157.0   
671733      730506  2019-06-30 22:00:44       0.0   
671734      730507  2019-06-30 22:04:23       0.0   
671735      730508  2019-06-30 22:35:24       0.0   

                                            title  \
0              Angus, Thongs and Perfect Snogging   
1                    The Curse of Sleeping Beauty   
2                               London Has Fallen   
3                                        Vendetta   
4                 The SpongeBob SquarePants Movie   
...                                          