In [1]:
import pandas
import scipy
import scipy.sparse
import sklearn
import sklearn.feature_extraction.text
import sklearn.neighbors
import sklearn.preprocessing

In [None]:
# Columns used by the feature-engineering cells below; every other CSV column is dropped.
key_cols = [
    'title',
    'genres',
    'overview',
    'keywords',
    'runtime',
    'budget',
    'production_countries',
    'production_companies',
]

# Read the raw movie table and keep only the columns listed above.
data = pandas.read_csv("movie_dataset.csv").loc[:, key_cols]

In [None]:
def _split_labels(raw: str) -> list:
    """Split a comma-separated string into a list of stripped, non-empty labels."""
    stripped = (piece.strip() for piece in raw.split(','))
    return [label for label in stripped if label]


# Missing values become '' first, so they yield an empty label list.
# NOTE(review): assumes both columns hold comma-separated labels — confirm against the CSV.
genre_lists = data['genres'].fillna('').apply(_split_labels)
country_lists = data['production_countries'].fillna('').apply(_split_labels)

# Multi-hot encode the per-movie label lists (one indicator column per distinct label).
genres_encoder = sklearn.preprocessing.MultiLabelBinarizer()
genres_data = genres_encoder.fit_transform(genre_lists)

country_encoder = sklearn.preprocessing.MultiLabelBinarizer()
country_data = country_encoder.fit_transform(country_lists)

# TF-IDF encode the free-text columns; missing text is treated as an empty document.
overview_encoder = sklearn.feature_extraction.text.TfidfVectorizer()
overview_data = overview_encoder.fit_transform(data['overview'].fillna(''))

# Each text column gets its own vectorizer. Fitting the keywords with
# `overview_encoder` would overwrite the overview vocabulary and leave
# `keyword_encoder` unfitted.
keyword_encoder = sklearn.feature_extraction.text.TfidfVectorizer()
keyword_data = keyword_encoder.fit_transform(data['keywords'].fillna(''))

# Numeric features: take a private copy, fill gaps with each column's own median,
# then rescale with MinMaxScaler (default feature range).
numeric_cols = ['runtime', 'budget']
numeric_data = data[numeric_cols].copy()
numeric_data = numeric_data.fillna(numeric_data.median())

scaler = sklearn.preprocessing.MinMaxScaler()
numeric_data_normalized = scaler.fit_transform(numeric_data)



In [None]:
# Relative importance of each feature family when the blocks are combined;
# a larger value makes that family contribute more to the stacked feature vector.
weights = dict(
    genres=2.0,
    country=1.1,
    keyword=1.5,
    overview=1.3,
    numeric=0.9,
)

# Scale every feature family by ITS OWN weight before stacking.
# The binarizer outputs are wrapped in CSR so all blocks are sparse.
weighted_genres = scipy.sparse.csr_matrix(genres_data * weights["genres"])
# Built from country_data (not genres_data), so country features actually enter the model.
weighted_country = scipy.sparse.csr_matrix(country_data * weights["country"])

# The TF-IDF blocks use the "keyword" / "overview" weights rather than the genres weight.
weighted_keywords = keyword_data * weights["keyword"]
weighted_overview = overview_data * weights["overview"]

weighted_numeric = scipy.sparse.csr_matrix(numeric_data_normalized * weights["numeric"])
# Backward-compatible alias: earlier revisions rebound `numeric_data` to this sparse matrix.
numeric_data = weighted_numeric

# One row per movie, feature blocks side by side. CSR is requested explicitly so
# rows can be sliced (`full_data[i]`) regardless of the scipy version's default format.
full_data = scipy.sparse.hstack(
    [
        weighted_genres,
        weighted_country,
        weighted_keywords,
        weighted_overview,
        weighted_numeric,
    ],
    format="csr",
)

In [None]:
# Nearest-neighbour index over the stacked feature matrix, compared by cosine distance.
knn_settings = {
    "n_neighbors": 20,
    "metric": "cosine",
    "algorithm": "auto",
}
model = sklearn.neighbors.NearestNeighbors(**knn_settings)

model.fit(full_data)

In [None]:
def recommend_movies(movie_title: str, data: pandas.DataFrame, feature_data, n_recommendations = 20):
    match = data[data["title"].str.lower() == movie_title.lower()]

    if len(match) == 0:
        print("Try a new title")
        return
    
    movie_index = match.index[0]

    movie_vector = feature_data[movie_index]

    distances, indices = model.kneighbors(movie_vector)

    distances = distances.flatten()
    indices = indices.flatten()