In [16]:
# ## Hücre 1: Gerekli Kütüphanelerin Yüklenmesi
import pandas as pd

In [17]:
# ## Hücre 2: Veri Setlerinin Yüklenmesi
# MovieLens ve IMDb veri setlerini yükleme
ratings_data = pd.read_csv('ratings.csv')
links_data = pd.read_csv('links.csv')
movies_data = pd.read_csv('movies.csv')
imdb_data = pd.read_csv('movie_metadata.csv')

print("Datasets loaded successfully.")

Datasets loaded successfully.


In [18]:
# ## Hücre 3: MovieLens ve Links Veri Setlerinin Birleştirilmesi
# MovieLens'in movies.csv dosyasını links.csv ile birleştiriyoruz
movie_links = pd.merge(movies_data, links_data, on='movieId', how='inner')

print("MovieLens and Links datasets merged successfully.")
print("\nMovieLinks Dataset Head:")
print(movie_links.head())

MovieLens and Links datasets merged successfully.

MovieLinks Dataset Head:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  imdbId   tmdbId  
0  Adventure|Animation|Children|Comedy|Fantasy  114709    862.0  
1                   Adventure|Children|Fantasy  113497   8844.0  
2                               Comedy|Romance  113228  15602.0  
3                         Comedy|Drama|Romance  114885  31357.0  
4                                       Comedy  113041  11862.0  


In [19]:
# ## Hücre 4: IMDb Veri Seti ile Birleştirme
# IMDb veri setinden movie_imdb_link sütunundan imdb_id çıkarma
imdb_data['imdb_id'] = imdb_data['movie_imdb_link'].apply(lambda x: x.split('/')[4][2:] if isinstance(x, str) and 'tt' in x else None)
if imdb_data['imdb_id'].isnull().all():
    print("Error: imdb_id extraction failed. Please check the 'movie_imdb_link' column format.")
else:
    print("imdb_id extraction successful.")
    print("\nExtracted IMDb IDs:")
    print(imdb_data[['movie_imdb_link', 'imdb_id']].head())


# links.csv'deki imdbId formatını uygun hale getirme
movie_links['imdbId'] = movie_links['imdbId'].astype(str).str.zfill(7)

# IMDb veri setiyle birleştirme
final_merge = pd.merge(movie_links, imdb_data, left_on='imdbId', right_on='imdb_id', how='inner')

print("MovieLens-Links dataset successfully merged with IMDb dataset.")
print("\nFinal Merged Dataset Head:")
print(final_merge.head())


imdb_id extraction successful.

Extracted IMDb IDs:
                                     movie_imdb_link  imdb_id
0  http://www.imdb.com/title/tt0499549/?ref_=fn_t...  0499549
1  http://www.imdb.com/title/tt0449088/?ref_=fn_t...  0449088
2  http://www.imdb.com/title/tt2379713/?ref_=fn_t...  2379713
3  http://www.imdb.com/title/tt1345836/?ref_=fn_t...  1345836
4  http://www.imdb.com/title/tt5289954/?ref_=fn_t...  5289954
MovieLens-Links dataset successfully merged with IMDb dataset.

Final Merged Dataset Head:
   movieId                           title  \
0        1                Toy Story (1995)   
1       10                GoldenEye (1995)   
2       11  American President, The (1995)   
3       14                    Nixon (1995)   
4       15         Cutthroat Island (1995)   

                                      genres_x   imdbId   tmdbId  \
0  Adventure|Animation|Children|Comedy|Fantasy  0114709    862.0   
1                    Action|Adventure|Thriller  0113189    710.0   
2   

In [20]:
# ## Hücre 5: Nihai Veri Setinin İncelenmesi
# Veri tipi ve format kontrolü
print("\nFinal Merged Dataset Info:")
print(final_merge.info())


Final Merged Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3649 entries, 0 to 3648
Data columns (total 34 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   movieId                    3649 non-null   int64  
 1   title                      3649 non-null   object 
 2   genres_x                   3649 non-null   object 
 3   imdbId                     3649 non-null   object 
 4   tmdbId                     3647 non-null   float64
 5   color                      3647 non-null   object 
 6   director_name              3644 non-null   object 
 7   num_critic_for_reviews     3649 non-null   float64
 8   duration                   3648 non-null   float64
 9   director_facebook_likes    3644 non-null   float64
 10  actor_3_facebook_likes     3644 non-null   float64
 11  actor_2_name               3648 non-null   object 
 12  actor_1_facebook_likes     3649 non-null   float64
 13  gross               

In [26]:
# ## Hücre 6: Temizlik ve Nihai Düzenlemeler
# Kullanılacak kolonların belirlenmesi
columns_to_keep = [
    'movieId', 'title', 'genres_x', 'director_name', 'imdb_score',
    'duration', 'budget', 'language', 'country', 'content_rating', 
    'num_voted_users', 'plot_keywords'
]

# Final veri setini oluşturma
final_data = final_merge[columns_to_keep]

# Kolonların yeniden adlandırılması
final_data.rename(columns={'genres_x': 'genres'}, inplace=True)

# Eksik değerlerin kontrolü ve temizliği
print("\nMissing Values in Final Dataset:")
print(final_data.isnull().sum())

# Eksik değerleri temizleme
final_data = final_data.dropna()




Missing Values in Final Dataset:
movieId              0
title                0
genres               0
director_name        5
imdb_score           0
duration             1
budget             170
language             4
country              0
content_rating      24
num_voted_users      0
plot_keywords        9
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data.rename(columns={'genres_x': 'genres'}, inplace=True)


In [27]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3451 entries, 0 to 3646
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   movieId          3451 non-null   int64  
 1   title            3451 non-null   object 
 2   genres           3451 non-null   object 
 3   director_name    3451 non-null   object 
 4   imdb_score       3451 non-null   float64
 5   duration         3451 non-null   float64
 6   budget           3451 non-null   float64
 7   language         3451 non-null   object 
 8   country          3451 non-null   object 
 9   content_rating   3451 non-null   object 
 10  num_voted_users  3451 non-null   int64  
 11  plot_keywords    3451 non-null   object 
dtypes: float64(3), int64(2), object(7)
memory usage: 350.5+ KB


In [28]:
# ## Hücre 7: Nihai Veri Setinin Kaydedilmesi
final_data.to_csv('final_user_movie_dataset.csv', index=False)
print("\nFinal dataset has been saved as 'final_user_movie_dataset.csv'.")


Final dataset has been saved as 'final_user_movie_dataset.csv'.
