### Importing libraries


In [1]:
import pandas as pd
import numpy as np
import ast
from ast import literal_eval
from pymongo import MongoClient

### Reading all the csv files

In [2]:
credits = pd.read_csv("credits.csv")
keywords = pd.read_csv("keywords.csv")
links = pd.read_csv("links.csv")
ratings = pd.read_csv("ratings_small.csv")
movie_data = pd.read_csv("movies_metadata.csv", low_memory=False)

In [3]:
# Convert the 'id' column values to integers for better data consistency and numeric operations.
movie_data["id"] = pd.to_numeric(movie_data['id'], errors='coerce',downcast="integer")

# Clean 'imdb_id' by removing prefix 'tt' and convert to integers.
movie_data["imdb_id"] = movie_data['imdb_id'].str[2:]
movie_data["imdb_id"] = pd.to_numeric(movie_data['imdb_id'], errors='coerce',downcast="integer")

In [4]:
# Remove rows with missing values in the 'id' column for data completeness.
movie_data.dropna(subset=["id"], inplace=True)
movie_data = movie_data.merge(credits,on=["id"],how="left")
movie_data = movie_data.merge(keywords,on=["id"],how="left")
movie_data.drop(["imdb_id"],axis=1).merge(links,left_on="id",right_on="movieId",how="inner")
movie_data.merge(links,left_on="imdb_id",right_on="imdbId",how="inner").shape
movie_data.drop_duplicates().shape
movie_data.dropna(subset=["cast","crew","keywords","popularity"],inplace=True)

# Convert 'budget' column to numeric and replace 0 values with NaN
movie_data['budget'] = pd.to_numeric(movie_data['budget'], errors='coerce')
movie_data['budget'] = movie_data['budget'].replace(0, np.nan)

# Replace 0 values in 'revenue' column with NaN
movie_data['revenue'] = movie_data['revenue'].replace(0, np.nan)

# Modify ratings to drop 'userId' & 'timestamp'
ratings.drop(columns=['userId', 'timestamp'], inplace=True)

# Calculate average rating based on movieId
ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
ratings['rating'] = ratings['rating'].round(1)

# Convert 'id' column to numeric
movie_data["id"] = pd.to_numeric(movie_data['id'], errors='coerce', downcast="integer")

# Merge ratings data
movie_data = movie_data.merge(ratings, left_on="id", right_on="movieId", how="left")
movie_data.drop(columns=['movieId'], inplace=True)

### Converting column with json strings to lists of dictionaries 

In [5]:
movie_data['genres'] = movie_data['genres'].fillna('[]').apply(ast.literal_eval).apply(lambda x: [i for i in x] if isinstance(x, list) else [])
movie_data['belongs_to_collection'] = movie_data['belongs_to_collection'].fillna("[]").apply(ast.literal_eval).apply(lambda x: x if isinstance(x, dict) else np.nan)
movie_data['production_companies'] = movie_data['production_companies'].apply(ast.literal_eval)
movie_data['production_companies'] = movie_data['production_companies'].fillna("[]").apply(lambda x: [i for i in x] if isinstance(x, list) else [])
movie_data['production_countries'] = movie_data['production_countries'].fillna('[]').apply(ast.literal_eval)
movie_data['production_countries'] = movie_data['production_countries'].apply(lambda x: [i for i in x] if isinstance(x, list) else [])
movie_data['spoken_languages'] = movie_data['spoken_languages'].apply(ast.literal_eval)
movie_data['spoken_languages'] = movie_data['spoken_languages'].fillna('[]').apply(lambda x: [i for i in x] if isinstance(x, list) else [])
movie_data['keywords'] = movie_data['keywords'].apply(ast.literal_eval)
movie_data['keywords'] = movie_data['keywords'].apply(lambda x: [i for i in x] if isinstance(x, list) else [])
movie_data['cast'] = movie_data['cast'].apply(ast.literal_eval)
movie_data['crew'] = movie_data['crew'].apply(ast.literal_eval)
movie_data['crew'] = movie_data['crew'].apply(lambda x: [i for i in x] if isinstance(x, list) else [])
movie_data['cast'] = movie_data['cast'].apply(lambda x: [i for i in x] if isinstance(x, list) else [])
movie_data['cast'] = movie_data['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)
movie_data['crew'] = movie_data['crew'].apply(lambda x: x[:3] if len(x) >= 3 else x)


In [6]:
denormalized_data = movie_data.to_dict(orient='records')


### Connecting to MongoDB 

In [14]:
# Connect to MongoDB
Connection_string = 'mongodb+srv://itsmeparag14:sarsamba99@cluster0.rh4e6ak.mongodb.net/'
client = MongoClient(Connection_string)
db = client['Lab2']
collection = db['Movies']

In [15]:
collection.insert_many(denormalized_data)

InsertManyResult([ObjectId('663070bee7ea526e38e99a34'), ObjectId('663070bee7ea526e38e99a35'), ObjectId('663070bee7ea526e38e99a36'), ObjectId('663070bee7ea526e38e99a37'), ObjectId('663070bee7ea526e38e99a38'), ObjectId('663070bee7ea526e38e99a39'), ObjectId('663070bee7ea526e38e99a3a'), ObjectId('663070bee7ea526e38e99a3b'), ObjectId('663070bee7ea526e38e99a3c'), ObjectId('663070bee7ea526e38e99a3d'), ObjectId('663070bee7ea526e38e99a3e'), ObjectId('663070bee7ea526e38e99a3f'), ObjectId('663070bee7ea526e38e99a40'), ObjectId('663070bee7ea526e38e99a41'), ObjectId('663070bee7ea526e38e99a42'), ObjectId('663070bee7ea526e38e99a43'), ObjectId('663070bee7ea526e38e99a44'), ObjectId('663070bee7ea526e38e99a45'), ObjectId('663070bee7ea526e38e99a46'), ObjectId('663070bee7ea526e38e99a47'), ObjectId('663070bee7ea526e38e99a48'), ObjectId('663070bee7ea526e38e99a49'), ObjectId('663070bee7ea526e38e99a4a'), ObjectId('663070bee7ea526e38e99a4b'), ObjectId('663070bee7ea526e38e99a4c'), ObjectId('663070bee7ea526e38e99a