#### Step 1: Data Preprocessing and Cleaning

In [1]:
import pandas as pd
import numpy as np

# Load the dataset
url = 'https://raw.githubusercontent.com/siddhantbhattarai/Machine_Learning_Bootcamp_2024/refs/heads/main/Datasets/movies.csv'
movies_df = pd.read_csv(url)
movies_df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,,,


In [2]:
# Clean 'Votes' column (remove commas and convert to numeric)
movies_df['VOTES'] = movies_df['VOTES'].replace(',', '', regex=True).astype(float)

In [3]:
# Convert 'YEAR' column to numeric (handle ranges or missing years)
movies_df['YEAR'] = pd.to_numeric(movies_df['YEAR'].str.extract(r'(\d{4})')[0], errors='coerce')

In [4]:
# Fill missing values in 'RATING' and 'RunTime' columns with mean
movies_df['RATING'] = movies_df['RATING'].fillna(movies_df['RATING'].mean())
movies_df['RunTime'] = movies_df['RunTime'].fillna(movies_df['RunTime'].mean())

In [5]:
# Drop rows with too many missing values (like 'Gross')
movies_df = movies_df.drop(columns=['Gross'])

In [6]:
# One-hot encode 'GENRE' (turning categorical genres into numerical features)
movies_df['GENRE'] = movies_df['GENRE'].str.strip()  # Remove leading/trailing spaces
genres = movies_df['GENRE'].str.get_dummies(sep=',')

In [7]:
# Merge the one-hot encoded genre back to the dataframe
movies_df = pd.concat([movies_df, genres], axis=1)

In [8]:
from sklearn.impute import SimpleImputer

# Impute missing numeric values using the mean for 'VOTES', 'RunTime', and 'RATING'
imputer = SimpleImputer(strategy='mean')

# Select columns with numeric data
numeric_columns = ['RATING', 'VOTES', 'RunTime']

# Apply the imputer to fill missing values
movies_df[numeric_columns] = imputer.fit_transform(movies_df[numeric_columns])

# Check if there are any remaining NaN values
print(movies_df.isna().sum())

MOVIES         0
YEAR         748
GENRE         80
RATING         0
ONE-LINE       0
            ... 
Sport          0
Talk-Show      0
Thriller       0
War            0
Western        0
Length: 62, dtype: int64


In [9]:
# Fill missing values in 'YEAR' with the median year
median_year = movies_df['YEAR'].median()
movies_df['YEAR'] = movies_df['YEAR'].fillna(median_year)

# Fill missing values in 'GENRE' with 'Unknown'
movies_df['GENRE'] = movies_df['GENRE'].fillna('Unknown')

# Check again for missing values
print(movies_df.isna().sum())

MOVIES       0
YEAR         0
GENRE        0
RATING       0
ONE-LINE     0
            ..
Sport        0
Talk-Show    0
Thriller     0
War          0
Western      0
Length: 62, dtype: int64


In [10]:
# Preview the cleaned data
movies_df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Action,Adventure,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,Blood Red Sky,2021.0,"Action, Horror, Thriller",6.1,\nA woman with a mysterious illness is forced ...,\n Director:\nPeter Thorwarth\n| \n Star...,21062.0,121.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Masters of the Universe: Revelation,2021.0,"Animation, Action, Adventure",5.0,\nThe war for Eternia begins again in what may...,"\n \n Stars:\nChris Wood, \nSara...",17870.0,25.0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,The Walking Dead,2010.0,"Drama, Horror, Thriller",8.2,\nSheriff Deputy Rick Grimes wakes up from a c...,"\n \n Stars:\nAndrew Lincoln, \n...",885805.0,44.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rick and Morty,2013.0,"Animation, Adventure, Comedy",9.2,\nAn animated series that follows the exploits...,"\n \n Stars:\nJustin Roiland, \n...",414849.0,23.0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,Army of Thieves,2021.0,"Action, Crime, Horror",6.921176,"\nA prequel, set before the events of Army of ...",\n Director:\nMatthias Schweighöfer\n| \n ...,15124.062722,68.688539,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Strip extra spaces and newline characters in the text columns
movies_df['ONE-LINE'] = movies_df['ONE-LINE'].str.strip()
movies_df['STARS'] = movies_df['STARS'].str.strip()

In [12]:
# If 'VOTES' still has commas or incorrect formatting, replace them
movies_df['VOTES'] = movies_df['VOTES'].replace(',', '', regex=True).astype(float)

In [13]:
# Strip extra spaces or newline characters in the 'GENRE' column
movies_df['GENRE'] = movies_df['GENRE'].str.strip()

# If the genres still have extra characters, apply a split or cleaning function
movies_df['GENRE'] = movies_df['GENRE'].str.replace('\n', '')

In [14]:
# Fill missing values in 'YEAR' with the median year
median_year = movies_df['YEAR'].median()
movies_df['YEAR'] = movies_df['YEAR'].fillna(median_year)

# Fill missing genres with 'Unknown'
movies_df['GENRE'] = movies_df['GENRE'].fillna('Unknown')

# Convert all other numeric columns to the correct type if needed
movies_df['RunTime'] = movies_df['RunTime'].fillna(movies_df['RunTime'].mean())

# Final check for missing or improperly formatted data
print(movies_df.isna().sum())

MOVIES       0
YEAR         0
GENRE        0
RATING       0
ONE-LINE     0
            ..
Sport        0
Talk-Show    0
Thriller     0
War          0
Western      0
Length: 62, dtype: int64


In [15]:
# Strip extra spaces and newline characters
movies_df['STARS'] = movies_df['STARS'].str.strip()

# Remove any extra newline characters within the string
movies_df['STARS'] = movies_df['STARS'].str.replace('\n', '')

# Optionally, split multiple stars into a list (if they are separated by commas or other delimiters)
# Assuming stars are separated by commas in your dataset:
movies_df['STARS'] = movies_df['STARS'].str.split(',')

# Preview the cleaned 'STARS' column
movies_df[['STARS']].head()

Unnamed: 0,STARS
0,[Director:Peter Thorwarth| Stars:Peri Baum...
1,"[Stars:Chris Wood, Sarah Michelle Gellar, Le..."
2,"[Stars:Andrew Lincoln, Norman Reedus, Meliss..."
3,"[Stars:Justin Roiland, Chris Parnell, Spence..."
4,[Director:Matthias Schweighöfer| Stars:Mat...


In [16]:
# Preview the cleaned data
movies_df.head()

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Action,Adventure,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,Blood Red Sky,2021.0,"Action, Horror, Thriller",6.1,A woman with a mysterious illness is forced in...,[Director:Peter Thorwarth| Stars:Peri Baum...,21062.0,121.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Masters of the Universe: Revelation,2021.0,"Animation, Action, Adventure",5.0,The war for Eternia begins again in what may b...,"[Stars:Chris Wood, Sarah Michelle Gellar, Le...",17870.0,25.0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,The Walking Dead,2010.0,"Drama, Horror, Thriller",8.2,Sheriff Deputy Rick Grimes wakes up from a com...,"[Stars:Andrew Lincoln, Norman Reedus, Meliss...",885805.0,44.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Rick and Morty,2013.0,"Animation, Adventure, Comedy",9.2,An animated series that follows the exploits o...,"[Stars:Justin Roiland, Chris Parnell, Spence...",414849.0,23.0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,Army of Thieves,2021.0,"Action, Crime, Horror",6.921176,"A prequel, set before the events of Army of th...",[Director:Matthias Schweighöfer| Stars:Mat...,15124.062722,68.688539,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Step 2: Feature Engineering
- Now, we'll extract features for the machine learning models.

In [17]:
# Select the relevant features for the recommendation system
features_df = movies_df[['RATING', 'VOTES', 'RunTime'] + genres.columns.tolist()]

# Normalize the features (for algorithms that rely on distance measures)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_df)

# Create a new DataFrame with scaled features
features_df_scaled = pd.DataFrame(features_scaled, columns=features_df.columns)

#### Step 3: Data Splitting
- We need to split the data into training and testing sets to evaluate the performance of our models.

In [18]:
from sklearn.model_selection import train_test_split

# Split the dataset into 80% training and 20% testing
X_train, X_test = train_test_split(features_df_scaled, test_size=0.2, random_state=42)

# We won't need a target column since it's a recommendation system

#### Step 4: Model Selection
- We will apply four machine learning algorithms to create a movie recommendation system:

**1. K-Nearest Neighbors (KNN) for Collaborative Filtering**

In [19]:
from sklearn.neighbors import NearestNeighbors

# Fit the KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(X_train)

In [20]:
# Make sure you're passing a DataFrame with feature names to kneighbors
def knn_recommend(movie_index, n_recommendations=5):
    # Select the movie from the training set
    movie_features = X_train.iloc[[movie_index]]
    
    # Get distances and indices of the nearest neighbors
    distances, indices = knn.kneighbors(movie_features, n_neighbors=n_recommendations+1)
    
    # Return the recommended movies (excluding the input movie itself)
    return movies_df.iloc[indices[0][1:]]

# Example: Recommend movies for the first movie in the test set
knn_recommend(0)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Action,Adventure,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
3064,Losers,2019.0,"Documentary, Sport",7.7,"In a ""winning is everything"" society, how do w...","[Stars:Michael Bentt, Surya Bonaly, Harold P...",1551.0,30.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4148,Oh Yuck!,2017.0,Family,7.4,"Oh Yuck is a fast-paced, fact-laden, laugh out...","[Stars:Kayne Tremills, Nia Roam, David Colli...",30.0,68.688539,0,0,...,0,0,0,0,0,0,0,0,0,0
334,Bodyguard,2018.0,"Crime, Drama, Thriller",8.1,A contemporary thriller featuring the Royalty ...,"[Stars:Richard Madden, Sophie Rundle, Vincen...",103187.0,60.0,0,0,...,0,0,0,0,0,0,0,0,0,0
6345,Wraith,2018.0,Sci-Fi,6.921176,"Follows resident of New York City, who is tryi...",[Director:Sebastián Hofmann],15124.062722,68.688539,0,0,...,0,0,0,1,0,0,0,0,0,0
6940,Brigada Costa del Sol,2019.0,"Action, Adventure, Drama",8.2,Add a Plot,[Director:Marco A. Castillo| Stars:Hugo Si...,12.0,68.688539,0,1,...,0,0,0,0,0,0,0,0,0,0


**2. Matrix Factorization using SVD (Singular Value Decomposition)**

In [21]:
from sklearn.decomposition import TruncatedSVD

# Apply SVD for collaborative filtering
svd = TruncatedSVD(n_components=20, random_state=42)
X_svd_train = svd.fit_transform(X_train)

# Recommend movies based on latent features
def svd_recommend(movie_index, n_recommendations=5):
    movie_latent_factors = X_svd_train[movie_index]
    distances = np.dot(X_svd_train, movie_latent_factors)
    top_indices = np.argsort(-distances)[1:n_recommendations+1]
    return movies_df.iloc[top_indices]

# Example: Recommend movies using SVD
svd_recommend(0)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Action,Adventure,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
6366,Wild Guys,2018.0,"Adventure, Comedy, Drama",6.921176,"A show about Luxury, Weird Stuff, Entertainmen...",[],15124.062722,68.688539,0,0,...,0,0,0,0,0,0,0,0,0,0
4673,Buried by the Bernards,2021.0,Reality-TV,7.3,"In 2017, Ryan Bernard opened the doors to R Be...","[Stars:Debbie Bernard, Deja Bernard, Raegan ...",312.0,68.688539,0,0,...,0,1,0,0,0,0,0,0,0,0
6099,The Night of the Wild Boar,2016.0,"Crime, Horror, Mystery",4.3,While looking for answers about her boyfriend'...,[Director:Ramiro Tenorio| Stars:Catalina Z...,226.0,75.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2612,Nae Dwie Teriuseu,2018.0,"Comedy, Drama, Romance",7.5,Go Ae Rin suddenly loses her husband. A myster...,"[Stars:So Ji-seob, In-sun Jung, Im She-mi, ...",993.0,35.0,0,0,...,0,0,0,0,0,0,0,0,0,0
6878,Greenleaf,2016.0,Drama,8.1,Grace is told to merge the white church of Fai...,[Director:Crystle Roberson| Stars:Merle Da...,33.0,42.0,0,0,...,0,0,0,0,0,0,0,0,0,0


**3. Content-Based Filtering (using genre and metadata)**

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate similarity based on content (genre and metadata)
cosine_sim = cosine_similarity(X_train)

# Recommend movies based on similarity of content
def content_based_recommend(movie_index, n_recommendations=5):
    sim_scores = list(enumerate(cosine_sim[movie_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i[0] for i in sim_scores[1:n_recommendations+1]]
    return movies_df.iloc[top_indices]

# Example: Recommend similar movies
content_based_recommend(0)

Unnamed: 0,MOVIES,YEAR,GENRE,RATING,ONE-LINE,STARS,VOTES,RunTime,Action,Adventure,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
3064,Losers,2019.0,"Documentary, Sport",7.7,"In a ""winning is everything"" society, how do w...","[Stars:Michael Bentt, Surya Bonaly, Harold P...",1551.0,30.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4148,Oh Yuck!,2017.0,Family,7.4,"Oh Yuck is a fast-paced, fact-laden, laugh out...","[Stars:Kayne Tremills, Nia Roam, David Colli...",30.0,68.688539,0,0,...,0,0,0,0,0,0,0,0,0,0
334,Bodyguard,2018.0,"Crime, Drama, Thriller",8.1,A contemporary thriller featuring the Royalty ...,"[Stars:Richard Madden, Sophie Rundle, Vincen...",103187.0,60.0,0,0,...,0,0,0,0,0,0,0,0,0,0
6345,Wraith,2018.0,Sci-Fi,6.921176,"Follows resident of New York City, who is tryi...",[Director:Sebastián Hofmann],15124.062722,68.688539,0,0,...,0,0,0,1,0,0,0,0,0,0
6940,Brigada Costa del Sol,2019.0,"Action, Adventure, Drama",8.2,Add a Plot,[Director:Marco A. Castillo| Stars:Hugo Si...,12.0,68.688539,0,1,...,0,0,0,0,0,0,0,0,0,0


#### Step 5: Evaluation

In [24]:
from sklearn.metrics import mean_squared_error

# Assuming knn_predictions are the predicted ratings, ensure they are in the correct format
knn_predictions, _ = knn.kneighbors(X_test[:10], n_neighbors=5)

# Ensure the shapes match (X_test[:10] is a subset of test data)
# knn_predictions would need to be transformed into an array of ratings

# Example: If we're using the 'RATING' column to compare
actual_ratings = movies_df.loc[X_test.index[:10], 'RATING']

# Calculate Mean Squared Error between predicted ratings and actual ratings
mse = mean_squared_error(actual_ratings, knn_predictions[:, 0])  # Assuming you compare to the closest movie
print(f'Mean Squared Error for KNN: {mse}')

Mean Squared Error for KNN: 52.80604500613215
