In [1]:
import numpy as np
import pandas as pd



In [2]:
user_movie_ratings = pd.io.parsers.read_csv('ratings.dat', 
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter=',')


In [3]:
# user_movie_ratings.drop(0, inplace=True)


In [4]:
# user_movie_ratings

user_movie_ratings['movie_id'] = user_movie_ratings['movie_id'].astype(int)
user_movie_ratings['user_id'] = user_movie_ratings['user_id'].astype(int)
user_movie_ratings = user_movie_ratings[['user_id', 'movie_id', 'rating']]
# user_movie_ratings = user_movie_ratings.sort_values(by='movie_id')

user_movie_ratings


Unnamed: 0,user_id,movie_id,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
...,...,...,...
26024284,270896,58559,5.0
26024285,270896,60069,5.0
26024286,270896,63082,4.5
26024287,270896,64957,4.5


In [5]:

try:
    movie_data = pd.io.parsers.read_csv('movies.dat',
                                        names=['genres','movie_id', 'original_title', 'overview', 'release_date','title'],
                                        engine='python', delimiter=',', encoding='utf-8')
except UnicodeDecodeError:
    print("UTF-8 decoding failed. Trying with 'latin1' encoding.")
    movie_data = pd.io.parsers.read_csv('movies.dat',
                                        names=['genres','movie_id', 'original_title', 'overview', 'release_date','title'],
                                        engine='python', delimiter=',', encoding='latin1')


movie_data['movie_id'] = pd.to_numeric(movie_data['movie_id'], errors='coerce')

movie_data = movie_data.dropna(subset=['movie_id'])

movie_data['movie_id'] = movie_data['movie_id'].astype(int)
movie_data['title'] = movie_data['title'].astype(str)  # Ensure title column is string type
movie_data = movie_data.sort_values(by='movie_id')

movie_data['serial_number'] = range(1, len(movie_data) + 1)
movie_data['serial_number'] = movie_data['serial_number'].astype(int)
movie_data = movie_data[['movie_id', 'title', 'serial_number']]
movie_data


movie_data

Unnamed: 0,movie_id,title,serial_number
4342,2,Ariel,1
12947,3,Shadows in Paradise,2
17,5,Four Rooms,3
474,6,Judgment Night,4
256,11,Star Wars,5
...,...,...,...
45078,465044,Abduction,45459
45273,467731,Tragedy in a Temporary Town,45460
21891,468343,Silja - nuorena nukkunut,45461
45398,468707,Thick Lashes of Lauri Mäntyvaara,45462


In [6]:
merged_df = pd.merge(user_movie_ratings, movie_data, on='movie_id', how='left')
merged_df = merged_df.dropna(subset=['title'])
merged_df = merged_df.dropna(subset=['movie_id'])
merged_df['serial_number'] = merged_df['serial_number'].astype(int)

merged_df

Unnamed: 0,user_id,movie_id,rating,title,serial_number
0,1,110,1.0,Three Colors: Red,74
1,1,147,4.5,The 400 Blows,106
2,1,858,5.0,Sleepless in Seattle,679
4,1,1246,5.0,Rocky Balboa,832
5,1,1968,4.0,Fools Rush In,1256
...,...,...,...,...,...
26025338,270896,48780,5.0,Boat,19869
26025340,270896,49530,4.0,In Time,20090
26025347,270896,54001,4.0,The Traveler,21298
26025349,270896,54503,4.0,The Mystery of Chess Boxing,21433


In [7]:
user_movie_ratings = merged_df[['user_id', 'serial_number', 'rating']]
user_movie_ratings = user_movie_ratings.sort_values(by='serial_number')
user_movie_ratings


Unnamed: 0,user_id,serial_number,rating
4317246,44742,1,4.0
2759339,28611,1,3.0
9864815,101678,1,3.5
21197820,220194,1,3.0
21196980,220178,1,1.0
...,...,...,...
19358893,201077,35216,3.0
14410634,149751,35216,3.0
3385417,35192,35219,3.5
17175157,178311,35220,3.0


In [8]:
movie_data = merged_df[[ 'serial_number', 'title']]
movie_data = movie_data.drop_duplicates(subset='title', keep='first')
movie_data = movie_data.dropna(subset=['title'])
movie_data = movie_data.sort_values(by='serial_number')

movie_data

Unnamed: 0,serial_number,title
1711,1,Ariel
3793,2,Shadows in Paradise
27,3,Four Rooms
905,4,Judgment Night
147,5,Star Wars
...,...,...
3385406,35214,Enter the Dangerous Mind
14410634,35216,White Reindeer
3385417,35219,Behind the Rising Sun
17175157,35220,Parsifal


In [9]:
movie_data['movie_id'] = range(1, len(movie_data) + 1)
merged_df = pd.merge(user_movie_ratings, movie_data, on='serial_number', how='left')
merged_df = merged_df.dropna(subset=['title'])



merged_df

Unnamed: 0,user_id,serial_number,rating,title,movie_id
0,44742,1,4.0,Ariel,1.0
1,28611,1,3.0,Ariel,1.0
2,101678,1,3.5,Ariel,1.0
3,220194,1,3.0,Ariel,1.0
4,220178,1,1.0,Ariel,1.0
...,...,...,...,...,...
11437632,201077,35216,3.0,White Reindeer,7406.0
11437633,149751,35216,3.0,White Reindeer,7406.0
11437634,35192,35219,3.5,Behind the Rising Sun,7407.0
11437635,178311,35220,3.0,Parsifal,7408.0


In [10]:

merged_df['movie_id'] = merged_df['movie_id'].astype(int)
merged_df

Unnamed: 0,user_id,serial_number,rating,title,movie_id
0,44742,1,4.0,Ariel,1
1,28611,1,3.0,Ariel,1
2,101678,1,3.5,Ariel,1
3,220194,1,3.0,Ariel,1
4,220178,1,1.0,Ariel,1
...,...,...,...,...,...
11437632,201077,35216,3.0,White Reindeer,7406
11437633,149751,35216,3.0,White Reindeer,7406
11437634,35192,35219,3.5,Behind the Rising Sun,7407
11437635,178311,35220,3.0,Parsifal,7408


In [11]:
user_movie_ratings = merged_df[['user_id', 'movie_id', 'rating']]   
user_movie_ratings = user_movie_ratings.sort_values(by='movie_id')
unique_serial_numbers = user_movie_ratings['movie_id'].nunique()
print("Number of unique serial number values:", unique_serial_numbers)
user_movie_ratings

Number of unique serial number values: 7409


Unnamed: 0,user_id,movie_id,rating
0,44742,1,4.0
17379,270376,1,2.0
17378,190548,1,4.0
17377,45781,1,5.0
17376,188465,1,4.0
...,...,...,...
11437632,201077,7406,3.0
11437633,149751,7406,3.0
11437634,35192,7407,3.5
11437635,178311,7408,3.0


In [12]:
movie_data = merged_df[['movie_id','title']]
movie_data = movie_data.drop_duplicates(subset='title', keep='first')
movie_data.to_csv('output.csv', index=False)  # Change 'output.csv' to the desired file path and nam
movie_data

Unnamed: 0,movie_id,title
0,1,Ariel
26060,2,Shadows in Paradise
41557,3,Four Rooms
56815,4,Judgment Night
84710,5,Star Wars
...,...,...
11437631,7405,Enter the Dangerous Mind
11437632,7406,White Reindeer
11437634,7407,Behind the Rising Sun
11437635,7408,Parsifal


In [13]:
# import numpy as np

# def svd_on_batches(data, batch_size):
#     num_users = data['user_id'].nunique()
#     num_movies = data['movie_id'].nunique()

#     U_list = []
#     Sigma_list = []
#     V_list = []

#     for start_idx in range(0, num_users, batch_size):
#         end_idx = min(start_idx + batch_size, num_users)
#         user_batch = data[data['user_id'].between(start_idx + 1, end_idx)]

#         # Create a dictionary to store ratings for each movie
#         movie_ratings = {movie_id: [] for movie_id in range(1, num_movies + 1)}

#         # Collect ratings for each movie in the batch
#         for _, row in user_batch.iterrows():
#             movie_ratings[int(row['movie_id'])].append(row['rating'])

#         # Process each movie separately to reduce memory consumption
#         for movie_id, ratings in movie_ratings.items():
#             ratings_vec = np.array(ratings)
#             U, Sigma, V = np.linalg.svd(ratings_vec.reshape(1, -1))  # SVD on a single row matrix
#             U_list.append(U)
#             Sigma_list.append(Sigma)
#             V_list.append(V)

#     # Combine results from all movies
#     U_combined = np.concatenate(U_list, axis=0)
#     Sigma_combined = np.concatenate(Sigma_list, axis=0)
#     V_combined = np.concatenate(V_list, axis=0)

#     return U_combined, Sigma_combined, V_combined

# # Example usage
# batch_size = 1000  # Adjust batch size based on memory constraints
# U, Sigma, V = svd_on_batches(user_movie_ratings, batch_size)


In [14]:
# ratings_mat = np.ndarray(
#     shape = (np.max(user_movie_ratings.user_id.values),np.max(user_movie_ratings.movie_id.values))
# )
# ratings_mat[user_movie_ratings.user_id.values-1, user_movie_ratings.movie_id.values-1] = user_movie_ratings.rating.values
# print(np.shape(ratings_mat))

In [15]:
# import numpy as np

# # Assuming ratings_mat is your original matrix

# chunk_size = 100
# num_rows = ratings_mat.shape[0]
# num_batches = num_rows // chunk_size + 1 if num_rows % chunk_size != 0 else num_rows // chunk_size

# # Initialize an empty list to store normalized chunks
# normalized_chunks = []

# # Calculate mean and standard deviation across the entire dataset
# mean = np.mean(ratings_mat)
# std_dev = np.std(ratings_mat)

# # Loop through the data in chunks
# for i in range(num_batches):
#     start_idx = i * chunk_size
#     end_idx = min((i + 1) * chunk_size, num_rows)
    
#     # Extract the current chunk
#     chunk = ratings_mat[start_idx:end_idx, :]
    
#     # Normalize the chunk using mean and standard deviation of the entire dataset
#     normalized_chunk = (chunk - mean) / std_dev
    
#     # Append the normalized chunk to the list
#     normalized_chunks.append(normalized_chunk)

# # Concatenate the normalized chunks into a single matrix
# normalized_mat = np.vstack(normalized_chunks)

# print(normalized_mat)
# print(normalized_mat.shape)


In [16]:
# df_normalized = pd.DataFrame(normalized_mat)

# # Save DataFrame to a CSV file
# df_normalized.to_csv('normalized_matrix.csv', index=False)

In [18]:
import pandas as pd
import numpy as np
import os

# Function to save SVD results to CSV file
def save_svd_results(U, s, VT, filepath):
    df_U = pd.DataFrame(U)
    df_s = pd.DataFrame(np.diag(s))
    df_VT = pd.DataFrame(VT)
    
    # Check if file exists, if not, write header
    write_header = not os.path.exists(filepath)
    
    # Append to CSV file
    with open(filepath, 'a') as f:
        df_U.to_csv(f, header=write_header, index=False)
        df_s.to_csv(f, header=False, index=False)
        df_VT.to_csv(f, header=False, index=False)

# Load the matrix from the CSV file into a DataFrame
chunk_size = 1000
svd_filepath = 'svd_results.csv'
reader = pd.read_csv('normalized_matrix.csv', chunksize=chunk_size)

# Iterate over chunks and compute SVD
for i, chunk in enumerate(reader):
    # Convert DataFrame to numpy array
    matrix_chunk = chunk.to_numpy()

    # Apply Singular Value Decomposition (SVD)
    U, s, VT = np.linalg.svd(matrix_chunk, full_matrices=False)
    
    # Save SVD results to CSV file
    save_svd_results(U, s, VT, svd_filepath)

    print(f"Processed chunk {i + 1}")

print("SVD results saved to 'svd_results.csv'.")


Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6
Processed chunk 7
Processed chunk 8
Processed chunk 9
Processed chunk 10
Processed chunk 11
Processed chunk 12
Processed chunk 13
Processed chunk 14
Processed chunk 15
Processed chunk 16
Processed chunk 17
Processed chunk 18
Processed chunk 19
Processed chunk 20
Processed chunk 21
Processed chunk 22
Processed chunk 23
Processed chunk 24
Processed chunk 25
Processed chunk 26
Processed chunk 27
Processed chunk 28
Processed chunk 29
Processed chunk 30
Processed chunk 31
Processed chunk 32
Processed chunk 33
Processed chunk 34
Processed chunk 35
Processed chunk 36
Processed chunk 37
Processed chunk 38
Processed chunk 39
Processed chunk 40
Processed chunk 41
Processed chunk 42
Processed chunk 43
Processed chunk 44
Processed chunk 45
Processed chunk 46
Processed chunk 47
Processed chunk 48
Processed chunk 49
Processed chunk 50
Processed chunk 51
Processed chunk 52
Processed chunk 53
Pr

KeyboardInterrupt: 

In [None]:
# U, sigma, V = np.linalg.svd(A)
# print(np.shape(U))
# print(np.shape(sigma))
# print(np.shape(V))
# print(V)


NameError: name 'A' is not defined

In [None]:
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]


In [None]:
def print_similar_movies(movie_data, movie_id, top_indexes):
    movie_subset = movie_data[movie_data.movie_id == movie_id]
    if not movie_subset.empty:
        print('Recommendations for {0}: \n'.format(movie_subset.title.values[0]))
        for id in top_indexes + 1:
            id = type(movie_data.movie_id.values[0])(id)
            similar_movie_subset = movie_data[movie_data.movie_id == id]
            if not similar_movie_subset.empty:
                print(similar_movie_subset.title.values[0])
    else:
        print('Movie with ID {0} not found.'.format(movie_id))


In [None]:
id = 41
top_n = 10
movie_id = type(movie_data.movie_id.values[0])(id)


indexes = top_cosine_similarity(V.T, id, top_n)
print_similar_movies(movie_data, movie_id, indexes)


Recommendations for Toy Story (1995): 

Toy Story (1995)
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
Get Shorty (1995)
Raising Arizona (1987)
Big (1988)
Beetlejuice (1988)
Dead Man Walking (1995)
Thelma & Louise (1991)
Cool Hand Luke (1967)
Ghostbusters (1984)
