# Imports

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

import nltk
import re
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

import ast
import datetime

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

import pickle



# Functions

In [49]:
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


def clean_numeric(x):
    try:
        return float(x)
    except:
        return np.nan
    

def get_month(x):
    try:
        return month_order[int(str(x).split('-')[1]) - 1]
    except:
        return np.nan

def get_day(x):
    try:
        year, month, day = (int(i) for i in x.split('-'))
        answer = datetime.date(year, month, day).weekday()
        return day_order[answer]
    except:
        return np.nan
    

def feature_engineering(df):
    df['belongs_to_collection'] = df['belongs_to_collection'].apply(lambda x: 0 if x == np.nan else 1)
    for genre in genres_train:
        df['is_' + str(genre)] = df['genres'].apply(lambda x: 1 if genre in x else 0)
    df['genres'] = df['genres'].apply(lambda x: len(x))
    df['homepage'] = df['homepage'].apply(lambda x: 0 if x == np.nan else 1)
    df['is_english'] = df['original_language'].apply(lambda x: 1 if x=='en' else 0)
    df = df.drop('original_language', axis=1)
    df = df.drop('overview', axis=1)
    df = df.drop('poster_path', axis=1)
    for comp in ['Warner Bros','Universal Pictures','Paramount Pictures',	'Twentieth Century Fox Film Corporation']:
        df['is_' + str(comp)] = df['production_companies'].apply(lambda x: 1 if comp in x else 0)
    for country in ['United States of America','United Kingdom']:
        df['is_' + str(country)] = df['production_countries'].apply(lambda x: 1 if country in x else 0)
    df = df.drop(['production_companies','production_countries'], axis=1)
    df = df.drop('release_date', axis=1)
    df = df.drop('status', axis=1)
    df = df.drop('tagline', axis=1)
    df = df.drop('title', axis=1)
    df = df.drop('video', axis=1)
#    df = pd.get_dummies(df, prefix='is')
    df['runtime'] = df['runtime'].fillna(df['runtime'].mean())
    df['vote_average'] = df['vote_average'].fillna(df['vote_average'].mean())
    for day in day_order:
        df['is_' + str(day)] = df['day'].apply(lambda x: 1 if day in x else 0)
    for month in month_order:
        df['is_' + str(month)] = df['month'].apply(lambda x: 1 if month in x else 0)
    df = df.drop(['day','month'], axis=1)
    return df


def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower().strip()
    # Remove punctuation also numbers
    text = re.sub('[^a-z]+', ' ', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Re-join tokens into a string
    return ' '.join(lemmatized_text)
    


# Read Data

In [50]:
# Reading Data
df = pd.read_csv('movies_metadata_small.csv')

# Preprocessing

In [51]:
# Preprocessing

df.drop(columns=["Unnamed: 0"], inplace=True)

df = df.drop(['imdb_id'], axis=1)
df = df.drop('original_title', axis=1)
df['revenue'] = df['revenue'].replace(0, np.nan)
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')
df['budget'] = df['budget'].replace(0, np.nan)
df['return'] = df['revenue'] / df['budget']
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)
df = df.drop('adult', axis=1)

df['production_countries'] = df['production_countries'].fillna('[]').apply(ast.literal_eval)
df['production_countries'] = df['production_countries'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

df['production_companies'] = df['production_companies'].fillna('[]').apply(ast.literal_eval)
df['production_companies'] = df['production_companies'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


df['popularity'] = df['popularity'].apply(clean_numeric).astype('float')
df['vote_count'] = df['vote_count'].apply(clean_numeric).astype('float')
df['vote_average'] = df['vote_average'].apply(clean_numeric).astype('float')

df['vote_average'] = df['vote_average'].replace(0, np.nan)

df['day'] = df['release_date'].apply(get_day)
df['month'] = df['release_date'].apply(get_month)

df['spoken_languages'] = df['spoken_languages'].fillna('[]').apply(ast.literal_eval).apply(lambda x: len(x) if isinstance(x, list) else np.nan)

df['runtime'] = df['runtime'].astype('float')

df['year'] = df['year'].replace('NaT', np.nan)
df['year'] = df['year'].apply(clean_numeric)

df['genres'] = df['genres'].fillna('[]').apply(ast.literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])



s = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_rgf = df.drop('genres', axis=1).join(s)
genres_train = gen_rgf['genre'].drop_duplicates()


data = df.copy()
clean_df = feature_engineering(data)

# Vectorization

In [52]:
df['combined_text'] = df['overview'].fillna('') + ' ' + df['tagline'].fillna('')

# Apply text preprocessing
df['processed_text'] = df['combined_text'].apply(preprocess_text)

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)

# Fit and transform the processed text
tfidf_matrix = vectorizer.fit_transform(df['processed_text'])

# New Dataset

In [5]:
new_df = pd.read_csv("movies_features.csv")

new_df.head()

Unnamed: 0.1,Unnamed: 0,UMAP Component 1,UMAP Component 2,id,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15
0,0,12.306741,2.355496,862,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,12.601574,4.976632,8844,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2,13.944426,5.778309,15602,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
3,3,12.983399,7.098469,31357,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,4,4.362645,13.562076,11862,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9082 entries, 0 to 9081
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        9082 non-null   int64  
 1   UMAP Component 1  9082 non-null   float64
 2   UMAP Component 2  9082 non-null   float64
 3   id                9082 non-null   int64  
 4   cluster_0         9082 non-null   bool   
 5   cluster_1         9082 non-null   bool   
 6   cluster_2         9082 non-null   bool   
 7   cluster_3         9082 non-null   bool   
 8   cluster_4         9082 non-null   bool   
 9   cluster_5         9082 non-null   bool   
 10  cluster_6         9082 non-null   bool   
 11  cluster_7         9082 non-null   bool   
 12  cluster_8         9082 non-null   bool   
 13  cluster_9         9082 non-null   bool   
 14  cluster_10        9082 non-null   bool   
 15  cluster_11        9082 non-null   bool   
 16  cluster_12        9082 non-null   bool   


# Combining TF-IDF Vector with Numerical Features

In [8]:
# feature_array = clean_df.values

feature_array = new_df.values

# Combine Features
# combined_features = np.hstack((feature_array, tfidf_matrix.toarray()))

# Imputing nan's with zeros
combined_features = np.nan_to_num(feature_array)

# Train your recommender model, for example, using Nearest Neighbors
model = NearestNeighbors(n_neighbors=41, algorithm='auto', metric='cosine')
model.fit(combined_features)


In [9]:
array_filename = 'combined_features_array_v2.pkl'
with open(array_filename, 'wb') as array_file:
    pickle.dump(combined_features, array_file)

In [10]:
# Save the model to a file
model_filename = 'nearest_neighbors_model_v2.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)


In [11]:
# Example function to get recommendations for a given movie
def get_recommendations(movie_index):
    distances, indices = model.kneighbors([combined_features[movie_index]])
    return indices

# Example usage: get recommendations for movie at index 0
recommendations = get_recommendations(0)
print(recommendations)

[[ 0  5 14 16 10  1  8 20 18 26 32 22 37 11  6 19 23  2 25  9 13 44 12 56
   4 27 46  3 36 29 38 41 53 67 40  7 39 49 72 51 88]]


In [56]:
# Example function to get recommendations for a given movie
def get_recommendations(movie_index):
    distances, indices = model.kneighbors([combined_features[movie_index]])
    recommendations = list(zip(indices[0], distances[0]))
    return recommendations

# Example usage: get recommendations for movie at index 0
recommendations = get_recommendations(0)
print(recommendations)

[(0, 0.0), (1511, 5.745491082898013e-09), (1138, 1.740554846918485e-07), (3195, 2.378076876530244e-07), (6452, 2.6034964173415887e-07)]
