In [1]:
# --- Imports (standard library first, then third-party) ---
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

# --- Plot styling ---
sns.set(style="darkgrid")
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and 3.8 removed the old names, so use whichever spelling this install has.
for _style in ("seaborn-v0_8-pastel", "seaborn-pastel"):
    if _style in plt.style.available:
        plt.style.use(_style)
        break

# --- pandas display options ---
pd.set_option('display.max_columns', None)  # never truncate columns
pd.set_option('display.float_format', lambda x: '%.3f' % x)  # 3 decimals
pd.set_option('display.width', 500)
pd.set_option('display.max_rows', 200)

# Silences every Python warning for the rest of the session (including
# deprecation notices) -- remove this line when debugging.
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv('movies.csv')
df.head()

In [3]:
df=df[df['IS_ADULT']!=1]

In [4]:
df=df.drop(columns=['Unnamed: 0', 'POPULARITY', 'POSTER_PATH', 'IS_ADULT'])

In [5]:
df.describe([0, 0.05, 0.50, 0.75, 0.85, 0.90, 0.95, 0.99, 1]).T

In [6]:
df[df['VOTE_COUNT'] > 10000].count()

In [7]:
df = df[(df['VOTE_COUNT'] > 5000) & (df['TYPE'] == 'movie')]

In [8]:
df.head()

In [9]:
def check_detail(dataframe):
    """Print a labelled, section-by-section overview of ``dataframe``.

    Every section is computed up front, then printed as a banner line
    followed by the value: shape, columns, index, dtypes, duplicated-row
    count, unique counts, whether anything is missing, missing counts per
    column, and a transposed ``describe`` with custom percentiles.
    A closing ``LIST END`` banner is printed last. Returns ``None``.
    """
    sections = [
        ('SHAPE', dataframe.shape),
        ('COLUMNS', dataframe.columns),
        ('INDEX', dataframe.index),
        ('VALUE TYPES', dataframe.dtypes),
        ('DUPLICATED VALUES', dataframe.duplicated().sum()),
        ('NUMBER OF UNIQUE VALUES', dataframe.nunique()),
        ('ANY MISSING VALUES', dataframe.isnull().values.any()),
        ('MISSING VALUES', dataframe.isnull().sum()),
        ('DESCRIBE.T', dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T),
    ]
    rule = '---------------------------'
    for heading, content in sections:
        print(f'{rule} {heading} {rule}')
        print(content)
    print(f'{rule} {"LIST END"} {rule}')
   

# Print the overview report for the filtered frame.
check_detail(df)

In [10]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and cardinal.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer unique values than this is treated
        as categorical.
    car_th : int, default 20
        An object column with more unique values than this is treated as
        cardinal and left out of the categorical list.

    Returns
    -------
    tuple of (cat_cols, num_cols, cat_but_car)
        ``cat_cols``: object columns that are not cardinal, followed by the
        numeric-but-categorical columns. ``num_cols``: non-object columns
        that are not numeric-but-categorical. ``cat_but_car``: cardinal
        object columns. A short summary of the counts is printed as well.
    """
    object_cols, non_object_cols = [], []
    num_but_cat, cat_but_car = [], []

    # One pass over the columns; each list keeps the frame's column order.
    for name in dataframe.columns:
        column = dataframe[name]
        distinct = column.nunique()
        if column.dtype == 'O':
            object_cols.append(name)
            if distinct > car_th:
                cat_but_car.append(name)
        else:
            non_object_cols.append(name)
            if distinct < cat_th:
                num_but_cat.append(name)

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f'Observations: {n_rows}')
    print(f'Variables: {n_cols}')
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car
def identify_check(dataframe, cat_cols=None, num_cols=None, cat_but_car=None):
    """Print the categorical, numerical and cardinal column lists.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame the column lists describe. It is only read when at least one
        of the lists below is not supplied.
    cat_cols, num_cols, cat_but_car : list, optional
        Column lists to print. Any list left as ``None`` is derived from
        ``dataframe`` with ``grab_col_names`` (default thresholds), so the
        function no longer depends on same-named notebook globals.

    Returns
    -------
    None
    """
    if cat_cols is None or num_cols is None or cat_but_car is None:
        # Function-scope imports keep this helper self-contained.
        import contextlib
        import io

        # grab_col_names prints its own summary; silence it so this function
        # emits exactly the six lines below.
        with contextlib.redirect_stdout(io.StringIO()):
            derived_cat, derived_num, derived_car = grab_col_names(dataframe)
        cat_cols = derived_cat if cat_cols is None else cat_cols
        num_cols = derived_num if num_cols is None else num_cols
        cat_but_car = derived_car if cat_but_car is None else cat_but_car

    print('-' * 80)
    print(f'Categorical : {cat_cols}')
    print('-' * 80)
    print(f'Numerical : {num_cols}')
    print('-' * 80)
    print(f'Categorical but Cardinal : {cat_but_car}')

# Classify the columns of the filtered frame, then print each group.
cat_cols, num_cols, cat_but_car = grab_col_names(dataframe=df)
identify_check(dataframe=df)

In [11]:
# Donut chart of how often each TYPE value occurs.
df['TYPE'].value_counts().plot.pie(
    autopct="%.0f%%",
    figsize=(6, 6),
    pctdistance=0.8,
    wedgeprops={"width": 0.4},
)
plt.show()

In [12]:
df.replace('\\N', np.nan, inplace=True)

In [13]:
df.isnull().sum()

In [14]:
df = df[df['OVERVIEW'].notna()]

In [15]:
df.head(5)

In [16]:
df.isnull().sum()

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
df["OVERVIEW"].head()

In [19]:
tfidf = TfidfVectorizer(stop_words='english')

In [20]:
tfidf_matrix = tfidf.fit_transform(df['OVERVIEW'])

In [21]:
tfidf_matrix.shape

In [22]:
tfidf.get_feature_names()

In [23]:
tfidf_matrix.toarray()

In [24]:
cosine_sim = cosine_similarity(tfidf_matrix,
                               tfidf_matrix)

In [25]:
cosine_sim.shape

In [26]:
cosine_sim[1]

In [27]:
indices = pd.Series(df.index, index=df['ORIGINAL_TITLE'])

In [28]:
indices = indices[~indices.index.duplicated(keep='last')]

In [29]:
movie_index = indices['Way Down East']

In [30]:
cosine_sim[movie_index]

In [31]:
similarity_scores = pd.DataFrame(cosine_sim[movie_index],
                                 columns=["score"])

In [32]:
movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index

In [33]:
df['ORIGINAL_TITLE'].iloc[movie_indices]

In [34]:
def content_based_recommender(title, cosine_sim, dataframe):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Row ``i`` of ``cosine_sim`` is matched to the ``i``-th row of
    ``dataframe`` by position, so the lookup is correct whatever the
    frame's index labels are (for example after rows were filtered out).

    Parameters
    ----------
    title : str
        Value to look up in ``dataframe['ORIGINAL_TITLE']``. If the title
        occurs more than once, the last occurrence is used.
    cosine_sim : array-like
        Similarity matrix; ``cosine_sim[i]`` must hold one score for every
        row of ``dataframe``, in row order.
    dataframe : pandas.DataFrame
        Frame with an ``ORIGINAL_TITLE`` column.

    Returns
    -------
    pandas.Series
        Up to ten ``ORIGINAL_TITLE`` values, most similar first, never
        including the queried row itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``ORIGINAL_TITLE``.
    ValueError
        If a row of ``cosine_sim`` does not have one score per frame row.
    """
    # Build title -> row position (not index label) so it lines up with cosine_sim.
    positions = pd.Series(range(len(dataframe)), index=dataframe['ORIGINAL_TITLE'])
    positions = positions[~positions.index.duplicated(keep='last')]

    # Locate the row position of the requested title.
    if title not in positions.index:
        raise KeyError(f"{title!r} not found in ORIGINAL_TITLE")
    movie_index = positions.loc[title]

    # Similarity of the requested title against every row.
    similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns=["score"])
    if len(similarity_scores) != len(dataframe):
        raise ValueError(
            "cosine_sim does not match dataframe: "
            f"{len(similarity_scores)} scores for {len(dataframe)} rows"
        )

    # Drop the title itself by position (it need not sort first when scores
    # tie or its own score is 0), then take the top 10 with a stable sort.
    movie_indices = (
        similarity_scores
        .drop(index=movie_index)
        .sort_values("score", ascending=False, kind="mergesort")
        .head(10)
        .index
    )
    return dataframe['ORIGINAL_TITLE'].iloc[movie_indices]

In [35]:
content_based_recommender('Way Down East', cosine_sim, df)

In [36]:
content_based_recommender('Orphans of the Storm', cosine_sim, df)