In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [11]:
df = pd.read_csv('movies_ratings_cleaned.csv')
df

Unnamed: 0,userId,movieId,rating,timestamp,title,user_rating,genres_cleaned
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),1,Adventure Animation Children Comedy Fantasy
1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),1,Comedy Romance
2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),1,Action Crime Thriller
3,1,47,5.0,2000-07-30 19:03:35,Seven (a.k.a. Se7en) (1995),1,Mystery Thriller
4,1,50,5.0,2000-07-30 18:48:51,"Usual Suspects, The (1995)",1,Crime Mystery Thriller
...,...,...,...,...,...,...,...
100831,610,166534,4.0,2017-05-03 21:53:22,Split (2017),1,Drama Horror Thriller
100832,610,168248,5.0,2017-05-03 22:21:31,John Wick: Chapter Two (2017),1,Action Crime Thriller
100833,610,168250,5.0,2017-05-08 19:50:47,Get Out (2017),1,Horror
100834,610,168252,5.0,2017-05-03 21:19:12,Logan (2017),1,Action Sci-Fi


In [12]:
df.drop_duplicates(subset=['title'], inplace=True)
df

Unnamed: 0,userId,movieId,rating,timestamp,title,user_rating,genres_cleaned
0,1,1,4.0,2000-07-30 18:45:03,Toy Story (1995),1,Adventure Animation Children Comedy Fantasy
1,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men (1995),1,Comedy Romance
2,1,6,4.0,2000-07-30 18:37:04,Heat (1995),1,Action Crime Thriller
3,1,47,5.0,2000-07-30 19:03:35,Seven (a.k.a. Se7en) (1995),1,Mystery Thriller
4,1,50,5.0,2000-07-30 18:48:51,"Usual Suspects, The (1995)",1,Crime Mystery Thriller
...,...,...,...,...,...,...,...
100820,610,160341,2.5,2016-11-19 08:55:49,Bloodmoon (1997),0,Action Thriller
100821,610,160527,4.5,2016-11-19 08:43:18,Sympathy for the Underdog (1971),1,Action Crime Drama
100823,610,160836,3.0,2017-05-03 20:53:14,Hazard (2005),0,Action Drama Thriller
100827,610,163937,3.5,2017-05-03 21:59:49,Blair Witch (2016),0,Horror Thriller


In [13]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genres_cleaned'])  # نصوص الأنواع بعد التنظيف


In [14]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [15]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df, top_n=10):
    if title not in indices:
        return f"Movie '{title}' not found in the dataset."
    
    idx = indices[title]  # الحصول على الفهرس المرتبط بالعنوان
    sim_scores = list(enumerate(cosine_sim[idx]))  # الحصول على درجات التشابه مع الأفلام الأخرى
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # ترتيب الأفلام بناءً على التشابه
    sim_scores = sim_scores[1:top_n+1]  # الحصول على أعلى N توصيات (باستثناء الفيلم نفسه)
    movie_indices = [i[0] for i in sim_scores]  # استخراج الفهارس للأفلام الموصى بها
    
    # إرجاع التوصيات مع العنوان والأنواع
    return df[['title', 'genres_cleaned']].iloc[movie_indices]



In [16]:
# اختبار: الحصول على توصيات لفيلم Toy Story (1995)
recommended_movies = get_recommendations("Toy Story (1995)")
print(recommended_movies)

                                                   title  \
927                                   Toy Story 2 (1999)   
947                                Monsters, Inc. (2001)   
2714                                         Antz (1998)   
2962      Adventures of Rocky and Bullwinkle, The (2000)   
3127                    Emperor's New Groove, The (2000)   
3448                              Shrek the Third (2007)   
3639                            The Good Dinosaur (2015)   
13859  Asterix and the Vikings (Astérix et les Viking...   
15860                     Tale of Despereaux, The (2008)   
17856                                       Moana (2016)   

                                    genres_cleaned  
927    Adventure Animation Children Comedy Fantasy  
947    Adventure Animation Children Comedy Fantasy  
2714   Adventure Animation Children Comedy Fantasy  
2962   Adventure Animation Children Comedy Fantasy  
3127   Adventure Animation Children Comedy Fantasy  
3448   Adventure Anim