<a href="https://colab.research.google.com/github/tolgaerdogmus/KETS/blob/main/movie_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



# Notebook-wide pandas display settings: show every column, render floats
# with three decimals, and keep wide frames on a single unwrapped line.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.3f' % value
pd.options.display.width = 500
pd.options.display.expand_frame_repr = False

In [14]:
# Load the movie dataset (the path is the Colab /content upload directory).
df = pd.read_csv('/content/movies.csv', low_memory=False) # low_memory=False is used here to silence the DtypeWarning

In [15]:
# Normalize genre text to lowercase so later genre comparisons see a single casing.
df['GENRES'] = df['GENRES'].str.lower()

In [17]:
# Preview the first rows to sanity-check the load.
df.head()

In [18]:
# Summary of columns, dtypes and non-null counts.
df.info()

In [None]:
# How many rows there are for each distinct TYPE value.
df['TYPE'].value_counts()

In [None]:
##################################################################################################
# RECOMMENDATIONS VIA SIMPLE FILTERING
##################################################################################################
def recommend_top(df, genre='comedy', media_type='movie', count=1, vote_threshold = 500):
    """Return the highest-rated titles for one genre and one media type.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'GENRES' (comma-separated genre names),
        'TYPE', 'VOTE_COUNT', 'AVG_RATING', 'TCONST' and 'ORIGINAL_TITLE'.
    genre : str
        Genre to look for. Matched case-insensitively against whole
        comma-separated entries of 'GENRES', so 'music' does not also
        select 'musical'.
    media_type : str
        Value of 'TYPE' to keep. Compared case-insensitively for equality,
        so 'movie' does not also select 'tvMovie'.
    count : int
        Maximum number of rows to return.
    vote_threshold : int
        Only rows with 'VOTE_COUNT' strictly greater than this are kept.

    Returns
    -------
    pandas.DataFrame
        Up to ``count`` rows sorted by 'AVG_RATING' descending, restricted to
        the columns TCONST, ORIGINAL_TITLE, TYPE, AVG_RATING, VOTE_COUNT, GENRES.
    """
    import re  # local import: only needed to escape the user-supplied genre

    # Match the genre as a complete list entry: it must be delimited by the
    # start/end of the string or by commas (optional surrounding whitespace).
    # re.escape keeps characters such as '-' or '+' from being read as regex syntax.
    genre_pattern = r'(?:^|,)\s*' + re.escape(genre.strip()) + r'\s*(?:,|$)'
    genre_filter = df['GENRES'].str.contains(genre_pattern, case=False, na=False, regex=True)

    # Exact (case-insensitive) comparison; missing TYPE values never match a real type.
    type_filter = df['TYPE'].fillna('').str.strip().str.lower().eq(media_type.strip().lower())

    vote_count_filter = df['VOTE_COUNT'] > vote_threshold
    filtered_df = df[genre_filter & type_filter & vote_count_filter]

    # Best average rating first, then keep the requested number of rows.
    top_rated = filtered_df.sort_values(by='AVG_RATING', ascending=False).head(count)

    # Only the columns that are useful to display.
    return top_rated[['TCONST', 'ORIGINAL_TITLE', 'TYPE', 'AVG_RATING', 'VOTE_COUNT', 'GENRES']]


In [None]:
# Example usage: recommend the 10 highest-rated 'comedy' movies
# that have more than 50,000 votes.
recommend_top(df, 'comedy', 'movie', count= 10, vote_threshold=50000)

In [None]:
def recommend_most_popular_per_genre(df):
    """Return the most-voted title for every genre present in ``df``.

    'GENRES' is treated as a comma-separated list. Each entry is stripped and
    lowercased, and a title only competes in the genres it actually lists
    (whole-entry match, so a 'musical' title is not a candidate for 'music').
    Within a genre the winner is the row with the highest 'VOTE_COUNT', ties
    broken by the highest 'AVG_RATING'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'GENRES', 'VOTE_COUNT' and 'AVG_RATING'. Rows whose
        'GENRES' is missing are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per genre, ordered alphabetically by genre so the output is
        the same on every run. The same title appears more than once when it
        wins several genres. Only the display columns that exist in ``df``
        are returned (TCONST, ORIGINAL_TITLE, TYPE, AVG_RATING, VOTE_COUNT,
        GENRES); the original row labels are kept as the index.
    """
    columns_to_display = ['TCONST', 'ORIGINAL_TITLE', 'TYPE', 'AVG_RATING', 'VOTE_COUNT', 'GENRES']
    token_col = '_genre_token'  # temporary helper column, dropped before returning

    # Rank once up front: after this, the first row seen for a genre is its winner.
    ranked = df.dropna(subset=['GENRES']).sort_values(
        by=['VOTE_COUNT', 'AVG_RATING'], ascending=False
    )

    # One row per (title, genre) pair.
    per_genre = ranked.assign(
        **{token_col: ranked['GENRES'].str.lower().str.split(',')}
    ).explode(token_col)
    per_genre = per_genre.assign(**{token_col: per_genre[token_col].str.strip()})

    # Drop empty entries produced by stray commas such as 'drama,'.
    per_genre = per_genre[per_genre[token_col].fillna('') != '']

    # Keep the top-ranked row of each genre, then order by genre name for stable output.
    winners = per_genre.drop_duplicates(subset=token_col, keep='first').sort_values(
        by=token_col, kind='stable'
    )

    return winners[[col for col in columns_to_display if col in winners.columns]]


In [None]:
# Show the most-voted title for each genre found in the dataset.
print(recommend_most_popular_per_genre(df))

In [20]:
# Combine the text fields (title, genres, directors) into one document per title.
# fillna('') matters: string concatenation propagates NaN, so a single missing
# field would turn the whole document into NaN, which TfidfVectorizer rejects.
df['combined_features'] = (
    df['ORIGINAL_TITLE'].fillna('') + ' ' + df['GENRES'].fillna('') + ' ' + df['DIRECTORS'].fillna('')
)
# Remove periods, turn commas into spaces, and lowercase the result.
df['combined_features'] = (
    df['combined_features']
    .str.replace('.', '', regex=False)
    .str.replace(',', ' ', regex=False)
    .str.lower()
)

In [21]:
#########################################################
# Shrink the dataframe before computing cosine similarity (the author notes the
# data had to be trimmed here): keep only movies with more than 2000 votes.
filt_df = df[(df['VOTE_COUNT'] > 2000) & (df['TYPE'] == 'movie')]

In [22]:
# Reset to a fresh 0..n-1 index; per the author, skipping this caused an
# out-of-bounds error in a later step.
filt_df = filt_df.reset_index(drop=True)
filt_df.shape

In [None]:
# Preview the filtered, re-indexed frame.
filt_df.head()

In [24]:
##################################################################################################
# CONTENT-BASED FILTERING RECOMMENDATIONS - TEXT FEATURES
##################################################################################################
# Text vectorization: represent each title's text numerically.
# Note: the matrix below is fit on 'combined_features', not on GENRES alone.

# Drop English stop words that carry no meaning on their own, e.g. and, or, of.
tfidf = TfidfVectorizer(stop_words='english')

# Author's note: GENRES was checked for nulls (df[df['GENRES'].isnull()]) and none were found.

# Build the TF-IDF matrix.
tfidf_matrix = tfidf.fit_transform(filt_df['combined_features'])

In [25]:
# Dimensions of the TF-IDF matrix.
tfidf_matrix.shape


In [26]:
# Inspect the feature names (terms) the vectorizer learned.
tfidf.get_feature_names_out()

In [27]:
# Build the pairwise cosine-similarity matrix between all TF-IDF rows.
cosine_sim = cosine_similarity(tfidf_matrix)

In [28]:
# Lookup used for similarity-based recommendations: title -> row index in filt_df.
indices = pd.Series(filt_df.index, index=filt_df['ORIGINAL_TITLE'])

In [29]:
# Drop duplicated titles, keeping only the last occurrence of each.
indices = indices[~indices.index.duplicated(keep='last')]


In [30]:
# Look up the row index stored for the title 'Se7en'.
movie_index = indices['Se7en']

In [31]:
def get_similar_movies(movie_index, cosine_sim, df, top_n=5):
    """Return the ``top_n`` rows of ``df`` most similar to the row at ``movie_index``.

    Parameters
    ----------
    movie_index : int
        Row position of the query title in ``cosine_sim``.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the scores of item ``i``
        against every item.
    df : pandas.DataFrame
        Frame that is looked up by position (``iloc``) with the positions taken
        from ``cosine_sim``; it must be row-aligned with the data the matrix
        was computed from.
    top_n : int
        Number of neighbours to return; values <= 0 give an empty result.

    Returns
    -------
    pandas.DataFrame or str
        The matching rows ordered from most to least similar, or the message
        ``"Index <movie_index> is out of bounds."`` when ``movie_index`` is not
        a valid row of ``cosine_sim``.
    """
    if not 0 <= movie_index < len(cosine_sim):
        return f"Index {movie_index} is out of bounds."

    scores = cosine_sim[movie_index]

    # Exclude the query row by position rather than assuming it sorts first:
    # rows with identical features tie with it, and an all-zero vector does
    # not even score highest against itself.
    candidates = [i for i in range(len(scores)) if i != movie_index]

    # list.sort is stable, so tied scores keep their original row order.
    candidates.sort(key=lambda i: scores[i], reverse=True)

    return df.iloc[candidates[:max(top_n, 0)]]

# Example usage
movie_index = indices['Se7en']  # Row index of the query title; use any title present in `indices`
# Pass filt_df, not df: cosine_sim and indices were both built from filt_df,
# so the positions returned only line up with that frame.
similar_movies_df = get_similar_movies(movie_index, cosine_sim, filt_df)

print(similar_movies_df)