<a href="https://colab.research.google.com/github/tolgaerdogmus/KETS/blob/main/movie_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)
pd.set_option('display.expand_frame_repr', False)

In [14]:
df = pd.read_csv('/content/movies.csv', low_memory=False) # DtypeWarning kapatmak icin

In [15]:
df['GENRES'] = df['GENRES'].str.lower()

In [17]:
df.head()

Unnamed: 0,TCONST,ORIGINAL_TITLE,TYPE,AVG_RATING,VOTE_COUNT,GENRES,DIRECTORS,YEAR
0,tt0000001,Carmencita,short,5.7,2063,"documentary,short",William K.L. Dickson,1894-01-01
1,tt0000005,Blacksmith Scene,short,6.2,2799,"comedy,short",William K.L. Dickson,1893-01-01
2,tt0000006,Chinese Opium Den,short,5.1,190,short,William K.L. Dickson,1894-01-01
3,tt0000008,Edison Kinetoscopic Record of a Sneeze,short,5.4,2212,"documentary,short",William K.L. Dickson,1894-01-01
4,tt0177707,Dickson Experimental Sound Film,short,6.7,2589,"music,short",William K.L. Dickson,1894-01-01


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249804 entries, 0 to 249803
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   TCONST          249804 non-null  object 
 1   ORIGINAL_TITLE  249804 non-null  object 
 2   TYPE            249804 non-null  object 
 3   AVG_RATING      249804 non-null  float64
 4   VOTE_COUNT      249804 non-null  int64  
 5   GENRES          249804 non-null  object 
 6   DIRECTORS       249804 non-null  object 
 7   YEAR            249804 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 15.2+ MB


In [None]:
df['TYPE'].value_counts()

Unnamed: 0_level_0,count
TYPE,Unnamed: 1_level_1
tvEpisode,121696
movie,87876
short,11159
tvMovie,10701
tvSeries,8329
tvMiniSeries,3223
video,2789
videoGame,1987
tvSpecial,1783
tvShort,261


In [None]:
##################################################################################################
# FILTRELEME YONTEMI ILE TAVSIYE KODLARI
##################################################################################################
def recommend_top(df, genre='comedy', media_type='movie', count=1, vote_threshold = 500):
    # Filter the dataset by the selected genre and type
    # GENRE ve TYPE e gore filtrele
    # VOTE_COUNT u belli bir sayidan fazla olsun
    genre_filter = df['GENRES'].str.contains(genre, case=False, na=False)
    type_filter = df['TYPE'].str.contains(media_type, case=False, na=False)
    vote_count_filter = df['VOTE_COUNT'] > vote_threshold
    filtered_df = df[genre_filter & type_filter & vote_count_filter]

    # Sort the filtered dataset by average rating in descending order and select the top 10
    # Filtrelenmis datasetini AVG_RATING e gore sirala ve en bastan 10 tane getir
    top_10 = filtered_df.sort_values(by='AVG_RATING', ascending=False).head(count)

    # Select relevant columns to display
    top_10_recommendations = top_10[['TCONST', 'ORIGINAL_TITLE', 'TYPE', 'AVG_RATING', 'VOTE_COUNT', 'GENRES']]

    return top_10_recommendations


In [None]:
# Example usage: Recommend top 10 popular movies for the genre 'Horror', vote count > 50k
# Ornek kullanim: en populer 10 adet Horror turu ve oy sayisi 50binin uzerinde
recommend_top(df, 'comedy', 'movie', count= 10, vote_threshold=50000)

Unnamed: 0,TCONST,ORIGINAL_TITLE,TYPE,AVG_RATING,VOTE_COUNT,GENRES
53114,tt0118799,La vita è bella,movie,8.6,749522,"comedy,drama,romance"
774,tt0027977,Modern Times,movie,8.5,261614,"comedy,drama,romance"
72409,tt1853728,Django Unchained,movie,8.5,1714082,"comedy,drama,western"
36848,tt0088763,Back to the Future,movie,8.5,1321558,"adventure,comedy,sci-fi"
773,tt0021749,City Lights,movie,8.5,197622,"comedy,drama,romance"
775,tt0032553,The Great Dictator,movie,8.4,239065,"comedy,drama,war"
126943,tt1187043,3 Idiots,movie,8.4,439489,"comedy,drama"
58406,tt0114709,Toy Story,movie,8.3,1078844,"adventure,animation,comedy"
15101,tt0057012,Dr. Strangelove or: How I Learned to Stop Worr...,movie,8.3,521844,"comedy,war"
228988,tt9052870,Chhichhore,movie,8.3,64481,"comedy,drama,romance"


In [None]:
def recommend_most_popular_per_genre(df):
    # Create an empty list to store recommendations
    # Tavsiye icin bos bir liste tanimla
    recommendations = []

    # Get all unique genres
    # Tum virgul ile ayrilmis genreleri tek tek al
    all_genres = set(genre for sublist in df['GENRES'].dropna().str.split(',') for genre in sublist)

    for genre in all_genres:
        # Filter the dataset by the selected genre
        # genre basina veri setini filtrele
        genre_filter = df['GENRES'].str.contains(genre, case=False, na=False)
        filtered_df = df[genre_filter]

        if not filtered_df.empty:
            # Get the most popular movie for this genre
            # Genre icin en vote_countu ve avg_rating i yuksekleri diz ve birinci elemani al
            most_popular = filtered_df.sort_values(by=['VOTE_COUNT', 'AVG_RATING'], ascending=False).iloc[0]
            recommendations.append(most_popular)

    # Create a DataFrame for the recommendations
    # Data frame e cevir
    recommendations_df = pd.DataFrame(recommendations)

    # Ensure the DataFrame has the required columns
    # Gereken kolonlarin olup olmadigini kontrol et
    if not recommendations_df.empty:
        # Select relevant columns to display, ensuring all columns exist
        # Gosterilecek kolonlari sec
        columns_to_display = ['TCONST', 'ORIGINAL_TITLE', 'TYPE', 'AVG_RATING', 'VOTE_COUNT', 'GENRES']
        recommendations_df = recommendations_df[[
            col for col in columns_to_display if col in recommendations_df.columns
        ]]

    return recommendations_df


In [None]:
print(recommend_most_popular_per_genre(df))

           TCONST                                  ORIGINAL_TITLE          TYPE  AVG_RATING  VOTE_COUNT                         GENRES
221050  tt8420184                                  The Last Dance  tvMiniSeries       9.100      155339  biography,documentary,history
11479   tt0043014                                    Sunset Blvd.         movie       8.400      238539                drama,film-noir
94918   tt1345836                           The Dark Knight Rises         movie       8.400     1844288          action,drama,thriller
60414   tt0910970                                          WALL·E         movie       8.400     1212166     adventure,animation,family
51769   tt0408236  Sweeney Todd: The Demon Barber of Fleet Street         movie       7.300      388932           drama,horror,musical
70378   tt0114369                                           Se7en         movie       8.600     1818770            crime,drama,mystery
121678  tt1286537                                      

In [20]:
#Text alanlarını birleştirme
df['combined_features'] = df['ORIGINAL_TITLE'] + ' ' + df['GENRES'] + ' ' + df['DIRECTORS']
# Replace periods (.) with empty strings and commas (,) with spaces
df['combined_features'] = df['combined_features'].str.replace('.', '', regex=False)
df['combined_features'] = df['combined_features'].str.replace(',', ' ', regex=False)
df['combined_features'] = df['combined_features'].str.lower()

In [21]:
#########################################################
# Shrink dataframe for cosine sim - BURADA KIRPMAK ZORUNDA KALDİM
filt_df = df[(df['VOTE_COUNT'] > 2000) & (df['TYPE'] == 'movie')]

In [22]:
# reset index cunku out of bounds hatasi veriyor sonra
filt_df = filt_df.reset_index(drop=True)
filt_df.shape

(23740, 9)

In [None]:
filt_df.head()

Unnamed: 0,TCONST,ORIGINAL_TITLE,TYPE,AVG_RATING,VOTE_COUNT,GENRES,DIRECTORS,YEAR
0,tt0004972,The Birth of a Nation,movie,6.1,26531,"drama,history,war",D.W. Griffith,1915-01-01
1,tt0006864,Intolerance: Love's Struggle Throughout the Ages,movie,7.7,16868,"drama,history",D.W. Griffith,1916-01-01
2,tt0009968,Broken Blossoms or The Yellow Man and the Girl,movie,7.2,11140,"drama,romance",D.W. Griffith,1919-01-01
3,tt0010806,True Heart Susie,movie,6.9,2009,"comedy,drama,romance",D.W. Griffith,1919-01-01
4,tt0011841,Way Down East,movie,7.3,5937,"drama,romance",D.W. Griffith,1920-01-01


In [24]:
##################################################################################################
# ICERIK TEMELLI FILTRELEME YONTEMI ILE TAVSIYE KODLARI - GENRES
##################################################################################################
# GENRES KISMININ MATEMATIKSEL OLARAK TEMSILI ICIN METIN VEKTORLESTIRME

# Tek basina anlam tasimayan ingilizce kelimeleri cikar orn: and, or, of vs.
tfidf = TfidfVectorizer(stop_words='english')

# df[df['GENRES'].isnull()] # bos yok

# TF-IDF Matrisinin olusturulmasi
tfidf_matrix = tfidf.fit_transform(filt_df['combined_features'])

In [25]:
tfidf_matrix.shape


(23740, 29736)

In [26]:
tfidf.get_feature_names_out()

array(['000', '007', '03', ..., 'üçüncü', 'þig', 'þór'], dtype=object)

In [27]:
# Cosine Similarity Matrisinin Olusturulmasi
cosine_sim = cosine_similarity(tfidf_matrix)

In [28]:
# Benzerliklere gore onerilerin yapilmasi
indices = pd.Series(filt_df.index, index=filt_df['ORIGINAL_TITLE'])

In [29]:
# Ayni isimdekileri sil sadece sonuncusunu birak
indices = indices[~indices.index.duplicated(keep='last')]


In [30]:
movie_index = indices['Se7en']

In [31]:
def get_similar_movies(movie_index, cosine_sim, df, top_n=5):
    if 0 <= movie_index < len(cosine_sim):
        similarity_scores = list(enumerate(cosine_sim[movie_index]))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        movie_indices = [i for i, _ in similarity_scores[1:top_n+1]]  # Exclude the movie itself
        return df.iloc[movie_indices]
    else:
        return f"Index {movie_index} is out of bounds."

# Example usage
movie_index = indices['Se7en'] # Replace with the index of the movie you want to find similarities for
similar_movies_df = get_similar_movies(movie_index, cosine_sim, df)

print(similar_movies_df)

         TCONST                      ORIGINAL_TITLE   TYPE  AVG_RATING  VOTE_COUNT           GENRES       DIRECTORS        YEAR                                  combined_features
9703  tt0138792  Sellaisena kuin sinä minut halusit  movie       6.800         261            drama     Teuvo Tulio  1944-01-01  sellaisena kuin sinä minut halusit drama teuvo...
9707  tt0030350           The Law West of Tombstone  movie       5.700         260          western     Glenn Tryon  1938-01-01      the law west of tombstone western glenn tryon
9709  tt0030391                    Love on the Wing  short       6.400         168  animation,short  Norman McLaren  1939-01-01    love on the wing animation short norman mclaren
9712  tt0032408                                Dots  short       6.300        1036  animation,short  Norman McLaren  1940-01-01                dots animation short norman mclaren
9705  tt0139687                 Unelma karjamajalla  movie       6.600         134    drama,romance     T