In [7]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing

# --- Plot styling ---------------------------------------------------------
sns.set(style="darkgrid")
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so "seaborn-pastel" raises on a
# current install. Use whichever spelling this Matplotlib actually ships.
_pastel_style = (
    "seaborn-v0_8-pastel"
    if "seaborn-v0_8-pastel" in plt.style.available
    else "seaborn-pastel"
)
plt.style.use(_pastel_style)

# --- pandas display options -----------------------------------------------
pd.set_option('display.max_columns', None)   # never truncate columns
pd.set_option('display.float_format', lambda x: '%.3f' % x)  # 3 decimal places
pd.set_option('display.width', 500)
pd.set_option('display.max_rows', 200)

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings("ignore")

In [8]:
# Load the raw movie table from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='movies.csv')
df.head(n=5)

In [9]:
# Keep only the rows whose IS_ADULT flag is not 1.
# (df['IS_ADULT'].value_counts() shows how the flag is distributed.)
df = df.loc[df['IS_ADULT'].ne(1)]

In [10]:
# Remove the columns that the rest of the notebook never reads.
df = df.drop(
    columns=[
        'Unnamed: 0',
        'POPULARITY',
        'OVERVIEW',
        'POSTER_PATH',
        'IS_ADULT',
    ]
)

In [11]:
# Preview the frame after the IS_ADULT filter and the column drop above.
df.head()

In [12]:
def check_detail(dataframe):
    """Print a sectioned overview of ``dataframe``.

    Every section is computed up front and then printed under a dashed header,
    in this order: shape, columns, index, dtypes, number of duplicated rows,
    unique-value counts per column, whether any value is missing, missing-value
    counts per column, and the transposed ``describe`` table at the
    0/5/50/95/99/100 percentiles. A final "LIST END" header closes the report.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    rule = '---------------------------'
    null_mask = dataframe.isnull()
    # Build all sections before printing so that a failure in any one of them
    # aborts the report before partial output is written.
    sections = [
        ('SHAPE', dataframe.shape),
        ('COLUMNS', dataframe.columns),
        ('INDEX', dataframe.index),
        ('VALUE TYPES', dataframe.dtypes),
        ('DUPLICATED VALUES', dataframe.duplicated().sum()),
        ('NUMBER OF UNIQUE VALUES', dataframe.nunique()),
        ('ANY MISSING VALUES', null_mask.values.any()),
        ('MISSING VALUES', null_mask.sum()),
        ('DESCRIBE.T', dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T),
    ]
    for title, content in sections:
        print(f'{rule} {title} {rule}')
        print(content)
    print(f'{rule} LIST END {rule}')
   

# Summarise the freshly loaded and trimmed frame.
check_detail(df)

In [13]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numerical and cardinal groups.

    Classification rules:

    * object-dtype columns are categorical, unless they have more than
      ``car_th`` distinct values, in which case they are "categorical but cardinal";
    * non-object columns with fewer than ``cat_th`` distinct values are treated
      as categorical ("numeric but categorical");
    * every other non-object column is numerical.

    A short summary of the group sizes is printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer distinct values than this is categorical.
    car_th : int, default 20
        An object column with more distinct values than this is cardinal.

    Returns
    -------
    tuple[list, list, list]
        ``(cat_cols, num_cols, cat_but_car)`` — column names, each list in
        original column order (object columns precede numeric-but-categorical
        ones inside ``cat_cols``).
    """
    object_cols, non_object_cols = [], []
    num_but_cat, cat_but_car = [], []

    # One pass over the columns: record the dtype family and apply the thresholds.
    for name in dataframe.columns:
        series = dataframe[name]
        distinct = series.nunique()
        if series.dtype == 'O':
            object_cols.append(name)
            if distinct > car_th:
                cat_but_car.append(name)
        else:
            non_object_cols.append(name)
            if distinct < cat_th:
                num_but_cat.append(name)

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_vars = dataframe.shape[0], dataframe.shape[1]
    print(f'Observations: {n_rows}')
    print(f'Variables: {n_vars}')
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

In [14]:
def identify_check(dataframe, col_groups=None):
    """Print the categorical, numerical and cardinal column lists.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Kept for backward compatibility; the function does not read it.
    col_groups : tuple of three lists, optional
        ``(cat_cols, num_cols, cat_but_car)`` as returned by ``grab_col_names``.
        When omitted, the notebook-level variables ``cat_cols``, ``num_cols``
        and ``cat_but_car`` are used (the original behaviour), which raises
        ``NameError`` if they have not been assigned yet.

    Returns
    -------
    None
        Output goes to standard output only.
    """
    if col_groups is None:
        # Legacy path: rely on the globals produced by a prior grab_col_names call.
        col_groups = (cat_cols, num_cols, cat_but_car)
    categorical, numerical, cardinal = col_groups

    separator = '-' * 80
    print(separator)
    print(f'Categorical : {categorical}')
    print(separator)
    print(f'Numerical : {numerical}')
    print(separator)
    print(f'Categorical but Cardinal : {cardinal}')

# Classify the columns once; identify_check then reports the three resulting lists.
cat_cols, num_cols, cat_but_car = grab_col_names(dataframe=df)
identify_check(dataframe=df)

In [15]:
# Donut chart of how the rows are distributed across TYPE values.
(
    df['TYPE']
    .value_counts()
    .plot.pie(
        autopct="%.0f%%",
        figsize=(6, 6),
        pctdistance=0.8,
        wedgeprops={"width": 0.4},  # ring width < 1 turns the pie into a donut
    )
)
plt.show()

In [16]:
# Treat the literal string \N as a missing value. Rebinding `df` instead of
# mutating it with inplace=True keeps the step explicit and avoids modifying
# a frame that earlier cells have already displayed.
df = df.replace('\\N', np.nan)

In [17]:
# Missing values per column after the \N placeholders were converted to NaN.
df.isna().sum()

In [18]:
# Discard every row that has at least one missing value. Rebinding `df`
# (rather than inplace=True) yields a fresh frame instead of mutating the old one.
df = df.dropna()

In [19]:
# check_detail is already defined earlier in this notebook with an identical body,
# so it is reused here rather than redefined. Re-run the summary now that \N
# placeholders have been turned into NaN and rows with missing values dropped.
check_detail(dataframe=df)


In [20]:
# Donut chart of the raw GENRES strings (each distinct string is its own slice).
(
    df['GENRES']
    .value_counts()
    .plot.pie(
        autopct="%.0f%%",
        figsize=(6, 6),
        pctdistance=0.8,
        wedgeprops={"width": 0.4},
    )
)
plt.show()

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words matrix over the GENRES strings.
temp = df.GENRES.dropna()
# The token pattern accepts word characters and hyphens, so a hyphenated genre
# stays a single token instead of being split at the hyphen.
vec = CountVectorizer(token_pattern='(?u)\\b[\\w-]+\\b', analyzer='word').fit(temp)
bag_of_genres = vec.transform(temp)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. list() keeps the original list type.
unique_genres = list(vec.get_feature_names_out())
np.array(unique_genres)

In [22]:
# One row per film, one column per genre token, cell = token count.
GENRES = pd.DataFrame(
    bag_of_genres.toarray(),
    index=temp.index,
    columns=unique_genres,
)
# Column totals relative to the number of films, largest first.
sorted_genres_perc = 100 * GENRES.sum().sort_values(ascending=False) / len(GENRES)

plt.figure(figsize=(15, 8))
sns.barplot(
    x=sorted_genres_perc.to_numpy(),
    y=sorted_genres_perc.index,
    orient="h",
)
plt.xlabel("Percentage of Films (%)")
plt.show()

In [23]:
# Text used for similarity: title, genres and directors joined by spaces,
# then periods removed, commas turned into spaces, and everything lower-cased.
df['combined_features'] = (
    (df['ORIGINAL_TITLE'] + ' ' + df['GENRES'] + ' ' + df['DIRECTORS'])
    .str.replace('.', '', regex=False)
    .str.replace(',', ' ', regex=False)
    .str.lower()
)
df.head()

In [24]:
# Restrict to rows of TYPE 'movie' that received more than 2000 votes.
df = df.loc[df['VOTE_COUNT'].gt(2000) & df['TYPE'].eq('movie')]

In [26]:
# Earlier filtering leaves gaps in the index. Renumber the rows 0..n-1 so that
# index labels coincide with row positions; later cells use df.index values as
# positions into the similarity matrix and with df.iloc.
df = df.reset_index(drop=True)
df.shape

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Peek at the text that will be fed to the TF-IDF vectorizer.
df["combined_features"].head()

In [29]:
# Learn the vocabulary from the combined text and encode every row with it.
tfidf = TfidfVectorizer()
corpus = df['combined_features']
tfidf_matrix = tfidf.fit_transform(corpus)

In [30]:
# Shape of the fitted TF-IDF matrix (presumably rows = films, columns = vocabulary terms).
tfidf_matrix.shape

In [31]:
# Vocabulary learned by the vectorizer. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is the supported replacement.
tfidf.get_feature_names_out()

In [32]:
# Densify only the first few rows for inspection; converting the whole sparse
# matrix just to display it allocates a full dense array for no benefit.
tfidf_matrix[:5].toarray()

In [33]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix
# (with a single argument the matrix is compared against itself).
cosine_sim = cosine_similarity(tfidf_matrix)

In [34]:
# Only the last expression of a cell is displayed, so a bare `cosine_sim.shape`
# on the first line was silently discarded; print it explicitly.
print(cosine_sim.shape)
# Similarity of the row at position 1 to every row.
cosine_sim[1]

In [36]:
# Making recommendations based on the similarities:
# map each ORIGINAL_TITLE to its row label, and for titles that occur more
# than once keep only the last occurrence.
indices = (
    pd.Series(df.index, index=df['ORIGINAL_TITLE'])
    .loc[lambda by_title: ~by_title.index.duplicated(keep='last')]
)

# Row label of the query film.
movie_index = indices['Se7en']

In [37]:
def get_similar_movies(movie_index, cosine_sim, df, top_n=5):
    """Return the ``top_n`` rows of ``df`` most similar to the row at ``movie_index``.

    Parameters
    ----------
    movie_index : int
        Position of the query film; must satisfy ``0 <= movie_index < len(cosine_sim)``.
    cosine_sim : sequence of sequences of float
        Square similarity table; ``cosine_sim[i][j]`` is the similarity between
        rows ``i`` and ``j``. Row positions are expected to match ``df.iloc``.
    df : pandas.DataFrame
        Frame the recommendations are taken from (indexed positionally).
    top_n : int, default 5
        Maximum number of rows to return; values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        The most similar rows in descending similarity order (ties keep their
        original row order), or an explanatory message string when
        ``movie_index`` is out of bounds.
    """
    if not 0 <= movie_index < len(cosine_sim):
        return f"Index {movie_index} is out of bounds."

    # Exclude the query film by position. Merely skipping the first item of the
    # sorted list is wrong whenever another row ties (or, through rounding,
    # exceeds) the film's similarity with itself: the film would then be
    # recommended to itself and a genuine match dropped.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[movie_index])
        if position != movie_index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    best_positions = [position for position, _ in candidates[:max(top_n, 0)]]
    return df.iloc[best_positions]

In [38]:
# Look up the query film (swap in any title present in `indices`) and list its neighbours.
movie_index = indices['Se7en']
similar_movies_df = get_similar_movies(
    movie_index=movie_index,
    cosine_sim=cosine_sim,
    df=df,
)

print(similar_movies_df)