In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csc_matrix, csr_matrix, coo_matrix, hstack, save_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import ast
import gc
import os

----

In [2]:
# Ratings: only the three columns used downstream are kept.
ratings = pd.read_csv("raw_data/ratings_small.csv")[['rating', 'userId', 'movieId']]

# MovieLens <-> TMDB id links: drop incomplete rows, keep one row per movieId
# and one per tmdbId, then cast every id column to int.
movie_links = (
    pd.read_csv("raw_data/links_small.csv")
    .dropna()
    .drop_duplicates(subset='movieId')
    .drop_duplicates(subset='tmdbId')
    .astype(int)
)

# TMDB metadata: rows without an id, duplicate ids and three hard-coded row
# labels are removed before the id column is cast to int.
# NOTE(review): the three labels are presumably rows with a non-numeric id
# in the raw CSV - confirm against the file.
movies_metadata = (
    pd.read_csv("raw_data/movies_metadata.csv", low_memory=False, encoding="utf-8")
    .dropna(subset='id')
    .drop_duplicates(subset='id')
    .drop([19730, 29503, 35587])
)
movies_metadata['id'] = movies_metadata['id'].astype(int)
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [3]:
# Keep only the metadata columns used as features and rename the TMDB key so
# that it matches the key column of movie_links.
movies_metadata = movies_metadata[[
    'id', 'adult', 'genres', 'original_title', 'overview', 'production_companies',
    'release_date', 'runtime', 'spoken_languages', 'vote_average', 'vote_count',
]].rename(columns={'id': 'tmdbId'})
movie_links = movie_links.drop_duplicates(subset='tmdbId')

In [4]:
# Restrict the links to rated movies, attach the TMDB metadata to them, and
# then restrict the ratings to the movies that still have metadata.
movie_links = movie_links.loc[movie_links['movieId'].isin(ratings['movieId'])]
movies_metadata = (
    movie_links
    .merge(movies_metadata, on='tmdbId')
    .drop_duplicates(subset='movieId')
    .sort_values(by='movieId')
)
ratings = (
    ratings
    .loc[ratings['movieId'].isin(movies_metadata['movieId'])]
    .sort_values(by='movieId')
)

# Untouched copy (still has ids and titles) used later for lookups.
original_movies_metadata = movies_metadata.copy()

movies_metadata.head(2)

Unnamed: 0,movieId,imdbId,tmdbId,adult,genres,original_title,overview,production_companies,release_date,runtime,spoken_languages,vote_average,vote_count
0,1,114709,862,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'name': 'Pixar Animation Studios', 'id': 3}]",1995-10-30,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",7.7,5415.0
1,2,113497,8844,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,When siblings Judy and Peter discover an encha...,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",1995-12-15,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",6.9,2413.0


In [5]:
# Sanity check: ratings and metadata should contain the same number of distinct movies.
ratings['movieId'].unique().shape, movies_metadata['movieId'].unique().shape

((9025,), (9025,))

In [6]:
# The id columns are not features; original_movies_metadata still holds them.
movies_metadata = movies_metadata.drop(columns=['movieId', 'imdbId', 'tmdbId'])

In [7]:
# Summary statistics over the rows with a positive runtime and a vote average of at least 1.
movies_metadata[(movies_metadata['runtime'] > 0) & (movies_metadata['vote_average'] >= 1)].describe()

Unnamed: 0,runtime,vote_average,vote_count
count,8964.0,8964.0,8964.0
mean,105.958724,6.396408,443.703034
std,29.988813,0.92947,1001.384331
min,2.0,1.0,1.0
25%,93.0,5.8,30.0
50%,102.0,6.5,99.0
75%,115.0,7.1,370.0
max,1140.0,10.0,14075.0


In [8]:
# Fallback values, each computed only from the rows where the column looks valid.
runtime_median = movies_metadata.loc[movies_metadata['runtime'] > 0, 'runtime'].median()
vote_avg_mean = movies_metadata.loc[movies_metadata['vote_average'] >= 1, 'vote_average'].mean()
vote_count_median = movies_metadata.loc[movies_metadata['vote_count'] > 0, 'vote_count'].median()

In [9]:
# Column dtypes and non-null counts before imputation.
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9025 entries, 0 to 9024
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 9025 non-null   object 
 1   genres                9025 non-null   object 
 2   original_title        9025 non-null   object 
 3   overview              9013 non-null   object 
 4   production_companies  9025 non-null   object 
 5   release_date          9025 non-null   object 
 6   runtime               9025 non-null   float64
 7   spoken_languages      9025 non-null   object 
 8   vote_average          9025 non-null   float64
 9   vote_count            9025 non-null   float64
dtypes: float64(3), object(7)
memory usage: 705.2+ KB


In [10]:
#movies_metadata[movies_metadata['production_companies'].isna()]

In [11]:
#movies_metadata.at[19729, 'production_companies'] = "[{'name': 'Centurion'}, {'name':'Telescene Film Group Productions'}, {'name':'The Carousel Picture Company'}, {'name':'Vision View Entertainment'}]"
#movies_metadata.at[29502, 'production_companies'] = "[{'name': 'Mardock Scramble Production Committee'}, {'name':'GoHands'}, {'name':'King Records'}]"
#movies_metadata.at[35586, 'production_companies'] = "[{'name': 'Odyssey Media'}, {'name':'Pulser Productions'}, {'name':'Rogue State'}]"

In [12]:
# A value of exactly 0 in these columns is replaced by the fallback statistic.
movies_metadata['runtime'] = movies_metadata['runtime'].mask(
    movies_metadata['runtime'].eq(0), runtime_median
)
movies_metadata['vote_average'] = movies_metadata['vote_average'].mask(
    movies_metadata['vote_average'].eq(0), vote_avg_mean
)
movies_metadata['vote_count'] = movies_metadata['vote_count'].mask(
    movies_metadata['vote_count'].eq(0), vote_count_median
)

# Useful for the large dataset version: fill the remaining missing values.
# 'NANULL' and '9999' are sentinels that later cells look for.
movies_metadata = movies_metadata.fillna({
    'overview': 'NANULL',
    'release_date': '9999',
    'runtime': runtime_median,
    'spoken_languages': "[{'name':'NANULL'}]",
    'vote_average': vote_avg_mean,
    'vote_count': vote_count_median,
})

In [13]:
def _names_from_literal(raw):
    """Parse a stringified list of dicts and return the 'name' value of each dict."""
    return [entry['name'] for entry in ast.literal_eval(raw)]


def _clean_language(label):
    """Return 'NANULL' for an empty label; otherwise replace '?'-runs and '\\x9a' in it."""
    if len(label) == 0:
        return 'NANULL'
    return label.replace('??????', 'NANULL').replace('?????', 'NANULL').replace('\x9a', 'š')


# Free text used later for the TF-IDF similarity.
movies_metadata['title_and_overview'] = movies_metadata['original_title'] + " " + movies_metadata['overview']

# Stringified lists of dicts -> plain lists of names.
movies_metadata['genres'] = movies_metadata['genres'].map(_names_from_literal)
movies_metadata['production_companies'] = movies_metadata['production_companies'].map(_names_from_literal)

# Release date -> release year; the 9999 sentinel becomes the median of the known years.
movies_metadata['release_date'] = movies_metadata['release_date'].map(lambda date_str: int(date_str[:4]))
release_date_median = movies_metadata.loc[movies_metadata['release_date'] < 9999, 'release_date'].median()
movies_metadata['release_date'] = movies_metadata['release_date'].map(
    lambda year: release_date_median if year == 9999 else year
)

movies_metadata['spoken_languages'] = movies_metadata['spoken_languages'].map(
    lambda raw: [_clean_language(label) for label in _names_from_literal(raw)]
)

# Vote average pulled towards the global mean, with a prior weight of 30 votes.
weighted_average = (
    (movies_metadata['vote_average'] * movies_metadata['vote_count'] + 30 * vote_avg_mean)
    / (movies_metadata['vote_count'] + 30)
)
movies_metadata['vote_weighted_average'] = weighted_average.round(4)

movies_metadata = movies_metadata.drop(columns=['original_title', 'overview', 'vote_average', 'vote_count'])

movies_metadata.head(2)

Unnamed: 0,adult,genres,production_companies,release_date,runtime,spoken_languages,title_and_overview,vote_weighted_average
0,False,"[Animation, Comedy, Family]",[Pixar Animation Studios],1995,81.0,[English],"Toy Story Led by Woody, Andy's toys live happi...",7.6928
1,False,"[Adventure, Fantasy, Family]","[TriStar Pictures, Teitler Film, Interscope Co...",1995,104.0,"[English, Français]",Jumanji When siblings Judy and Peter discover ...,6.8938


In [14]:
# Cast information: keep the id and cast columns, one row per id, and rename
# the key so it matches the metadata frame.
credits = (
    pd.read_csv("raw_data/credits.csv")[['id', 'cast']]
    .drop_duplicates(subset='id')
    .rename(columns={'id': 'tmdbId'})
)
credits.head(2)

Unnamed: 0,tmdbId,cast
0,862,"[{'cast_id': 14, 'character': 'Woody (voice)',..."
1,8844,"[{'cast_id': 1, 'character': 'Alan Parrish', '..."


In [15]:
# Number of credit rows before restricting to the retained movies.
credits.shape

(45432, 2)

In [16]:
# Keep only the credits of the retained movies AND put them in the same row
# order as original_movies_metadata. The similarity matrices built from the
# different sources are later added element-wise, so row i must describe the
# same movie everywhere; a plain `isin` filter would keep the credits.csv
# order instead. An inner merge preserves the order of the left keys.
credits = original_movies_metadata[['tmdbId']].merge(credits, on='tmdbId', how='inner')
assert len(credits) == len(original_movies_metadata), "some retained movies have no credits row"
credits.shape

(9025, 2)

In [17]:
# Stringified list of cast dicts -> list of actor names.
credits = credits.assign(
    cast=credits['cast'].map(lambda raw: [member['name'] for member in ast.literal_eval(raw)])
)
credits.head(2)

Unnamed: 0,tmdbId,cast
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
1,8844,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,..."


In [18]:
# Keywords: one row per id, key renamed to match the metadata frame.
keywords = (
    pd.read_csv("raw_data/keywords.csv")
    .drop_duplicates(subset='id')
    .rename(columns={'id': 'tmdbId'})
)
keywords.head(2)

Unnamed: 0,tmdbId,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."


In [19]:
# Number of keyword rows before restricting to the retained movies.
keywords.shape

(45432, 2)

In [20]:
# Keep only the keywords of the retained movies AND put them in the same row
# order as original_movies_metadata. The similarity matrices built from the
# different sources are later added element-wise, so row i must describe the
# same movie everywhere; a plain `isin` filter would keep the keywords.csv
# order instead. An inner merge preserves the order of the left keys.
keywords = original_movies_metadata[['tmdbId']].merge(keywords, on='tmdbId', how='inner')
assert len(keywords) == len(original_movies_metadata), "some retained movies have no keywords row"
keywords.shape

(9025, 2)

In [21]:
# Stringified list of keyword dicts -> list of keyword strings.
keywords = keywords.assign(
    keywords=keywords['keywords'].map(lambda raw: [kw['name'] for kw in ast.literal_eval(raw)])
)
keywords.head(2)

Unnamed: 0,tmdbId,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."


In [22]:
#all_g = set()
#for v in movies_metadata['genres']:
#    for g in v:
#        all_g.add(g)

In [23]:
def onehot_column(df, col, custom=None, delete_col=True, save_cols=False):
    """Multi-hot encode a column whose cells are iterables of labels.

    The sparse matrix is built directly from (row, column) coordinates, so no
    dense ``len(df) x n_labels`` table is ever materialised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place when ``delete_col``
        is true.
    col : str
        Name of the column to encode; every cell must be an iterable of
        hashable labels.
    custom : collection of hashable labels, optional
        When given, only labels contained in it get an output column. It is
        converted to a set once so membership tests are constant time.
    delete_col : bool, default True
        Drop ``col`` from ``df`` (in place) after encoding.
    save_cols : bool, default False
        Also return the list of labels, one per output column.

    Returns
    -------
    scipy.sparse.csr_matrix, or (csr_matrix, list) when ``save_cols`` is true
        Boolean matrix of shape ``(len(df), n_labels)``. Row ``r`` describes
        the r-th row of ``df`` (by position); columns follow the order in
        which labels are first encountered. The placeholder label 'NANULL'
        never gets a column.
    """
    allowed = None if custom is None else frozenset(custom)

    label_to_col = {}   # label -> output column, in first-seen order
    cells = set()       # (row position, output column) pairs; a set ignores repeated labels
    for row_pos, labels in enumerate(df[col]):
        for label in labels:
            if label == 'NANULL':
                continue
            if allowed is not None and label not in allowed:
                continue
            cells.add((row_pos, label_to_col.setdefault(label, len(label_to_col))))

    coords = np.array(sorted(cells), dtype=np.int64).reshape(-1, 2)
    onehot = csr_matrix(
        (np.ones(len(coords), dtype=bool), (coords[:, 0], coords[:, 1])),
        shape=(len(df), len(label_to_col)),
    )

    if delete_col:
        df.drop(columns=col, inplace=True)

    if save_cols:
        return onehot, list(label_to_col)
    return onehot

In [24]:
# Count the occurrences of every production company, then list the company
# names from most to least frequent (ties keep first-seen order).
top_prod_comp = {}
for companies in movies_metadata['production_companies']:
    for company in companies:
        top_prod_comp[company] = top_prod_comp.get(company, 0) + 1
top_prod_comp_keys = [
    name for name, _count in sorted(top_prod_comp.items(), key=lambda pair: pair[1], reverse=True)
]

In [25]:
# Count the occurrences of every cast member, then list the names from most
# to least frequent (ties keep first-seen order).
top_cast = {}
for cast_members in credits['cast']:
    for member in cast_members:
        top_cast[member] = top_cast.get(member, 0) + 1
top_cast_keys = [
    name for name, _count in sorted(top_cast.items(), key=lambda pair: pair[1], reverse=True)
]

In [26]:
# Preview of the frame right before the list columns are one-hot encoded.
movies_metadata.head(2)

Unnamed: 0,adult,genres,production_companies,release_date,runtime,spoken_languages,title_and_overview,vote_weighted_average
0,False,"[Animation, Comedy, Family]",[Pixar Animation Studios],1995,81.0,[English],"Toy Story Led by Woody, Andy's toys live happi...",7.6928
1,False,"[Adventure, Fantasy, Family]","[TriStar Pictures, Teitler Film, Interscope Co...",1995,104.0,"[English, Français]",Jumanji When siblings Judy and Peter discover ...,6.8938


In [27]:
# Multi-hot encode every list-valued column. onehot_column drops the encoded
# column from the frame it is given (delete_col defaults to True), so
# movies_metadata, credits and keywords are all modified here.
genres_onehot = onehot_column(movies_metadata, 'genres')
# Only the 100 most frequent production companies get a column.
prod_comp_onehot = onehot_column(movies_metadata, 'production_companies', custom=top_prod_comp_keys[:100])
spoken_lang_onehot = onehot_column(movies_metadata, 'spoken_languages')
# Only the 5000 most frequent cast members get a column.
cast_onehot = onehot_column(credits, 'cast', custom=top_cast_keys[:5000])
keywords_onehot = onehot_column(keywords, 'keywords')

In [28]:
# Bucket the numeric columns into intervals so they can be one-hot encoded.

# Runtime: 20-wide bins starting at 1.
max_runtime = movies_metadata['runtime'].max()
runtime_bins = np.arange(1, max_runtime + 20, 20)
movies_metadata['runtime'] = pd.cut(movies_metadata['runtime'], runtime_bins, include_lowest=True, precision=0)

# Release year: 5-wide bins starting at the earliest year.
release_stats = movies_metadata['release_date'].describe()
min_release_date, max_release_date = release_stats['min'], release_stats['max']
release_bins = np.arange(min_release_date, max_release_date + 5, 5)
movies_metadata['release_date'] = pd.cut(movies_metadata['release_date'], release_bins, include_lowest=True, precision=0)

# Weighted vote average: 0.5-wide bins from 1 to 10.
vote_bins = np.arange(1, 10.5, 0.5)
movies_metadata['vote_weighted_average'] = pd.cut(movies_metadata['vote_weighted_average'], vote_bins)

movies_metadata.head(2)

Unnamed: 0,adult,release_date,runtime,title_and_overview,vote_weighted_average
0,False,"(1992.0, 1997.0]","(61.0, 81.0]","Toy Story Led by Woody, Andy's toys live happi...","(7.5, 8.0]"
1,False,"(1992.0, 1997.0]","(101.0, 121.0]",Jumanji When siblings Judy and Peter discover ...,"(6.5, 7.0]"


In [29]:
# Column dtypes after binning.
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9025 entries, 0 to 9024
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   adult                  9025 non-null   object  
 1   release_date           9025 non-null   category
 2   runtime                9025 non-null   category
 3   title_and_overview     9025 non-null   object  
 4   vote_weighted_average  9025 non-null   category
dtypes: category(3), object(2)
memory usage: 172.3+ KB


In [31]:
stop_words_file = pd.read_csv("raw_data/ranksnl_oldgoogle.txt", header=None)
stop_words_set = set(w.strip() for w in stop_words_file[0])


def _clean_text(text):
    """Lower-case `text`, drop stop words and repeated tokens, keep first-occurrence order.

    The kept tokens are the same ones a `set(...) - stop_words_set` would
    keep, but their order is now the order of the original sentence. Joining
    a set instead yields an arbitrary order that changes between interpreter
    runs (string hashing is randomised), which made the 2- and 3-gram TF-IDF
    features below meaningless and non-reproducible.
    """
    unique_tokens = dict.fromkeys(text.lower().split(" "))
    return " ".join(token for token in unique_tokens if token not in stop_words_set)


movies_metadata['title_and_overview'] = movies_metadata['title_and_overview'].apply(_clean_text)

# TF-IDF over 1- to 3-grams, then sparse pairwise cosine similarity.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words=None)
corpus = tfidf.fit_transform(movies_metadata['title_and_overview'])
corpus_sims = cosine_similarity(corpus, corpus, dense_output=False)
# Zero out every similarity below 0.009 by multiplying with the boolean mask.
corpus_sims = corpus_sims.multiply(corpus_sims >= 0.009)

movies_metadata.drop(columns='title_and_overview', inplace=True)

corpus_sims.shape

(9025, 9025)

In [392]:
#corpus_sims_idxs = corpus_sims[0].argsort().flatten()[:-(10+1):-1]
#corpus_sims_idxs[1:]

In [393]:
## --------------------------------------------------------------------- ##

In [32]:
# One-hot encode the columns still left in movies_metadata. The name is
# rebound from a DataFrame to the encoder's sparse output.
ohe = OneHotEncoder()
movies_metadata = ohe.fit_transform(movies_metadata)
movies_metadata

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 36100 stored elements and shape (9025, 62)>

In [33]:
# Column-wise concatenation of the encoded scalar features with the
# production-company and spoken-language matrices.
other_features = hstack([movies_metadata, prod_comp_onehot, spoken_lang_onehot])
other_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 55957 stored elements and shape (9025, 227)>

In [34]:
# Sparse pairwise cosine similarity between the rows of other_features.
other_features_sims = cosine_similarity(other_features, other_features, dense_output=False)
# Zero out every similarity below 0.39 by multiplying with the boolean mask.
other_features_sims = other_features_sims.multiply(other_features_sims >= 0.39)
other_features_sims.shape

(9025, 9025)

In [35]:
# Sparse pairwise cosine similarity between the genre rows.
genre_sims = cosine_similarity(genres_onehot, genres_onehot, dense_output=False)
# Zero out every similarity below 0.45 by multiplying with the boolean mask.
genre_sims = genre_sims.multiply(genre_sims >= 0.45)
genre_sims.shape

(9025, 9025)

In [36]:
# Sparse pairwise cosine similarity between the cast rows.
cast_sims = cosine_similarity(cast_onehot, cast_onehot, dense_output=False)
# Zero out every similarity below 0.04 by multiplying with the boolean mask.
cast_sims = cast_sims.multiply(cast_sims >= 0.04)
cast_sims.shape

(9025, 9025)

In [37]:
# Sparse pairwise cosine similarity between the keyword rows.
keywords_sims = cosine_similarity(keywords_onehot, keywords_onehot, dense_output=False)
# Zero out every similarity below 0.05 by multiplying with the boolean mask.
keywords_sims = keywords_sims.multiply(keywords_sims >= 0.05)
keywords_sims.shape

(9025, 9025)

In [38]:
## --------------------------------------------------------------------- ##

In [39]:
# Weight of each content-based similarity matrix in the blend.
cast_w = 0.115
keywords_w = 0.205
other_features_w = 0.06
genre_w = 0.245
corpus_w = 0.375

# Compare with a tolerance: a sum of decimal fractions is generally not
# exactly 1.0 in binary floating point, so `== 1.0` can reject valid weights.
content_weights_total = cast_w + keywords_w + other_features_w + corpus_w + genre_w
assert np.isclose(content_weights_total, 1.0), \
    f"Sim weights need to add to 1 ({content_weights_total})"

In [40]:
# Weighted sum of the five content-based similarity matrices. They are added
# element-wise, so every matrix must use the same row order.
content_sims = cast_sims * cast_w + keywords_sims * keywords_w + other_features_sims * other_features_w + corpus_sims * corpus_w + genre_sims * genre_w
# 0.05 / 0.65 / 0.3 || 0.1 / 0.275 / 0.625

In [403]:
#K = 10
#results = content_sims[46].argsort().flatten()[:-(K+2):-1][1:]

In [404]:
#original_movies_metadata['original_title'].iloc[results]

-----

In [41]:
# def make_GBAMatrix(M):
#     if not isinstance(M, csr_matrix):
#         M = M.tocsr()
#         
#     global_mean = np.mean(M.data)
# 
#     row_means = []
#     for i in range(M.shape[0]):
#         start_idx = M.indptr[i]
#         end_idx = M.indptr[i+1]
#         row_data = M.data[start_idx:end_idx]
#         if len(row_data) > 0:
#             row_means.append(np.mean(row_data))
#     row_means = np.array(row_means)
#     rows_diff = row_means - global_mean
# 
#     M = M.tocsc()
#     col_means = []
#     for i in range(M.shape[1]):
#         start_idx = M.indptr[i]
#         end_idx = M.indptr[i+1]
#         col_data = M.data[start_idx:end_idx]
#         if len(col_data) > 0:
#             col_means.append(np.mean(col_data))
#     col_means = np.array(col_means)
#     cols_diff = col_means - global_mean
# 
#     M = M.tocsr()
#     return global_mean, rows_diff, cols_diff


def RowCenterMatrix(M):
    """Subtract from every stored entry the mean of the stored entries of its row.

    Only explicitly stored values take part: implicit zeros neither contribute
    to a row mean nor get modified. Rows without stored entries are left as
    they are.

    Parameters
    ----------
    M : scipy sparse matrix
        A floating-point CSR matrix is centered in place and returned as the
        same object. Any other format is converted to CSR first, and a
        non-floating dtype (int, bool) is cast to float64 first; in both cases
        the centered copy is returned and the argument is left untouched.

    Returns
    -------
    scipy sparse CSR matrix
        The row-centered matrix.
    """
    # The loop reads `indptr`, which only the compressed formats provide.
    if getattr(M, "format", None) != "csr":
        M = M.tocsr()
    # An in-place subtraction of a float mean cannot be stored in an integer array.
    if not np.issubdtype(M.dtype, np.inexact):
        M = M.astype(np.float64)

    for start, end in zip(M.indptr[:-1], M.indptr[1:]):
        if end > start:
            M.data[start:end] -= np.mean(M.data[start:end])

    return M


def CosSim_Matrix(M):
    """Pairwise cosine similarity between the rows of a sparse matrix.

    Parameters
    ----------
    M : scipy sparse matrix or sparse array
        One vector per row.

    Returns
    -------
    scipy.sparse.csr_matrix
        Square matrix with one row and column per row of ``M``. Only the pairs
        with a stored dot product are stored. Row norms below 1e-6 are clamped
        to 1e-6 so that all-zero rows do not cause a division by zero.
    """
    gram = M.dot(M.T).tocoo()
    shape = gram.shape

    # np.asarray(...).ravel() accepts both the np.matrix returned by sparse
    # matrices and the ndarray returned by sparse arrays; `.A1` only exists on
    # np.matrix.
    norms = np.sqrt(np.asarray(M.multiply(M).sum(axis=1))).ravel()
    norms[norms < 1e-6] = 1e-6

    # Divide each stored dot product by the two row norms, without ever
    # building the dense outer product of the norms.
    row, col = gram.row, gram.col
    scaled = gram.data / (norms[row] * norms[col])

    # Release the dot-product matrix before the result is assembled.
    del gram, norms
    gc.collect()

    return coo_matrix((scaled, (row, col)), shape=shape).tocsr()


# def estimate_score(mat, SM, gba, nn, row, col):
#     N, _ = mat.shape
# 
#     sims = list(zip(SM[row].toarray().flatten(), range(N)))
#     sims.sort(reverse=True)
#     
#     cnt, S, Ssims = 0, 0, 0
#     for  sim, idx in sims[1:]:
#         if (mat[idx, col] != 0) and (sim > 0) and (cnt <= nn):
#             r = gba[0] + gba[1][idx] + gba[2][col]
#             S += sim * (mat[idx, col] - r)
#             Ssims += sim
#             cnt += 1
#     
#     r = gba[0] + gba[1][row] + gba[2][col]
# 
#     return r if Ssims <= 0 else r + S / Ssims


In [42]:
# movieId -> consecutive row index, in the order the ids appear in `ratings`.
movie_mappings = {m: i for i, m in enumerate(ratings['movieId'].unique())}

In [43]:
# Attach the row index of each rating's movie.
ratings['movieId_mapping'] = ratings['movieId'].map(movie_mappings)
ratings.head()

Unnamed: 0,rating,userId,movieId,movieId_mapping
18633,4.5,124,1,0
7142,4.0,43,1,0
70088,3.0,484,1,0
46068,4.0,333,1,0
51144,4.0,380,1,0


In [44]:
# Sparse movie-by-user rating matrix: row = mapped movie index, column = userId - 1.
# NOTE(review): presumably userIds are 1-based - confirm against the ratings file.
ii_matrix = csr_matrix((ratings['rating'], (ratings['movieId_mapping'], ratings['userId']-1)))
ii_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 99810 stored elements and shape (9025, 671)>

In [45]:
# Center every movie's stored ratings on their mean, then compute the
# movie-to-movie cosine similarity. `ii_matrix` is rebound from the
# movie-by-user ratings to the movie-by-movie similarities.
ii_matrix = RowCenterMatrix(ii_matrix)
ii_matrix = CosSim_Matrix(ii_matrix)
ii_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 14896786 stored elements and shape (9025, 9025)>

In [410]:
#K = 10
#results = ii_matrix.todense()[1].argsort().A1[:-(K+2):-1][1:]
#results_mapped = [idx for result in results for idx, mapping in movie_mappings.items() if mapping == result]

In [411]:
#original_movies_metadata['original_title'][original_movies_metadata['movieId'].isin(results_mapped)]

---

In [46]:
# Weights of the content-based and collaborative-filtering similarities.
content_w = 0.65
cf_w = 0.35

# Compare with a tolerance: a sum of decimal fractions is generally not
# exactly 1.0 in binary floating point, so `== 1.0` can reject valid weights.
hybrid_weights_total = content_w + cf_w
assert np.isclose(hybrid_weights_total, 1.0), f"Sim weights need to add to 1 ({hybrid_weights_total})"

In [47]:
# Blend of the content-based and collaborative similarities; both matrices
# must use the same movie order for the element-wise sum to be meaningful.
final_sims = content_sims * content_w + ii_matrix * cf_w   # 0.35 / 0.65

In [414]:
#final = final_sims[1].argsort().A1[:-(K+2):-1][1:]
#original_movies_metadata['original_title'].iloc[final]

In [48]:
I = 0    # positional index of the query movie
K = 10   # number of recommendations to show

final_sims_dense = final_sims.getrow(I).todense()
# Rank every movie by decreasing similarity and drop the query movie by its
# index. Blindly discarding the first hit would drop the wrong movie whenever
# the query is not ranked first (for example on ties).
ranked = final_sims_dense.argsort().A1[::-1]
final = ranked[ranked != I][:K]
original_movies_metadata['original_title'].iloc[final]

2501           Toy Story 2
7506           Toy Story 3
1862          A Bug's Life
3795        Monsters, Inc.
4595          Finding Nemo
328          The Lion King
8979              Zootopia
8825            Inside Out
8993          Finding Dory
912     The Wrong Trousers
Name: original_title, dtype: object

---

##### TODO: Turn sims to sparse output, set threshold for zeros -> save memory

In [532]:
#def min_sim_from_top_sims(matrix, top_k=30, title=""):
#    min_sim = np.inf
#    for sim in matrix:
#        top_sims = sorted(sim.flatten(), reverse=True)[:top_k]
#        lowest_sim = top_sims[-1]
#        i = -2
#        if lowest_sim == 0.0:
#            while lowest_sim == 0.0 and i <= -len(top_sims):
#                lowest_sim = top_sims[i]
#                i -= 1
#        elif lowest_sim < min_sim and lowest_sim != 0.0:
#            min_sim = lowest_sim
#    if title != "":
#        title = f"for matrix {title}"
#    return print(f"Minimum similarity score in the top {top_k} most similar items {title}   ->   {min_sim}\n")

In [449]:
#min_sim_from_top_sims(genre_sims, title="GENRE")
#min_sim_from_top_sims(corpus_sims, title="CORPUS")
#min_sim_from_top_sims(keywords_sims, title="KEYWORDS")
#min_sim_from_top_sims(cast_sims, title="CAST")
#min_sim_from_top_sims(other_features_sims, title="OTHER FEATURES")

Minimum similarity score in the top 30 most similar items for matrix GENRE   ->   0.4999999999999999

Minimum similarity score in the top 30 most similar items for matrix CORPUS   ->   0.009983781003186835

Minimum similarity score in the top 30 most similar items for matrix KEYWORDS   ->   0.051434449987363975

Minimum similarity score in the top 30 most similar items for matrix CAST   ->   0.062017367294604234

Minimum similarity score in the top 30 most similar items for matrix OTHER FEATURES   ->   0.39999999999999997



In [None]:
# 0.450
# 0.009
# 0.050
# 0.040
# 0.390

In [None]:
#import pickle
#
#pickle1 = pickle.dumps(csr_matrix(final_sims))
#pickle2 = pickle.dumps(csr_matrix(final_sims2))
#
#memory_size_bytes1 = len(pickle1)
#print(f"Memory size using pickle: {memory_size_bytes1 / 1024**2:.2f} MB")
#memory_size_bytes2 = len(pickle2)
#print(f"Memory size using pickle: {memory_size_bytes2 / 1024**2:.2f} MB")