In [19]:
import pandas as pd
import json
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pickle

### Loading the JSON files into a DataFrame

In [20]:
def combine_json_to_dataframe(directory_path, file_pattern='*.json'):
    """
    Load every JSON file matching a pattern and stack them into one DataFrame.

    Each matching file is parsed with ``json.load`` and the parsed object is
    used as-is as a single record (one row per file); nothing is flattened or
    unwrapped.

    Files are read in sorted path order. ``glob.glob`` makes no promise about
    ordering, so without the sort the row order -- and therefore any seeded
    split performed on the resulting frame -- could differ between runs or
    machines.

    Args:
        directory_path: Directory that is searched for files.
        file_pattern: Glob pattern joined onto ``directory_path``.

    Returns:
        A DataFrame with one row per successfully parsed file. An empty
        DataFrame is returned when no file matches the pattern. Files that
        cannot be read or parsed are reported with ``print`` and skipped.
    """
    search_path = os.path.join(directory_path, file_pattern)
    # Sort so the resulting row order is deterministic across filesystems.
    all_json_files = sorted(glob.glob(search_path))

    if not all_json_files:
        print(f"No files found matching pattern '{file_pattern}' in '{directory_path}'")
        return pd.DataFrame()

    data_list = []
    for file_path in all_json_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                movie_data = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: file could not be opened/read.
            # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
            # Best effort by design: report the bad file and keep going.
            print(f"Error processing file {file_path}: {e}")
            continue
        data_list.append(movie_data)

    return pd.DataFrame(data_list)


In [21]:
# Build the movies DataFrame from the JSON files in the data directory
json_directory = '../data/all'
movies_df = combine_json_to_dataframe(
    directory_path=json_directory,
    file_pattern='*.json',
)

In [22]:
# Sanity check: number of loaded movies and the fields each record carries
print("Total rows (movies): {}\n".format(len(movies_df)))
print(movies_df.columns)

Total rows (movies): 10000

Index(['id', 'title', 'description', 'summary', 'image', 'url',
       'datePublished', 'duration', 'genre', 'keywords', 'aggregateRating',
       'actors', 'directors', 'creators', 'trailer', 'review'],
      dtype='object')


### Preparing the dataset

In [23]:
# Collect every distinct genre label that appears in any movie's genre list
genre_pool = set()
for genres in movies_df['genre']:
    genre_pool |= set(genres)

# Alphabetical order for a stable, readable listing
all_genres = sorted(genre_pool)
print("Unique genres:", len(all_genres))
print(all_genres)

Unique genres: 23
['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']


In [24]:
# Build one free-text field per movie by joining description and summary.
# Missing parts (None/NaN) are skipped: a movie without a description no longer
# raises on `NaN + " "`, and a movie missing both gets "" instead of NaN.
# Rows where the description is present produce exactly the same text as before.
movies_df['text'] = [
    " ".join(part for part in (description, summary) if pd.notnull(part))
    for description, summary in zip(movies_df['description'], movies_df['summary'])
]

In [25]:
# Multi-label target: learn the genre vocabulary, then encode each movie's
# genre list as a row of 0/1 indicators (one column per genre)
mlb = MultiLabelBinarizer()
mlb.fit(movies_df['genre'])
y = mlb.transform(movies_df['genre'])
print(y)

[[1 0 1 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [26]:
# Movie ids travel alongside the features so rows can be traced back later
ids = movies_df['id']
# Features are the combined text; targets are the binarized genres in `y`
X = movies_df['text']

# 80/20 split with a fixed seed; the three inputs are split with the same shuffle
X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
    X,
    y,
    ids,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Save the dataframe, the training/test datasets and the fitted binarizer
# to pickle files (written in the order listed below)
pickle_targets = {
    '../data/movies_df.pkl': movies_df,
    '../data/train_data.pkl': (X_train, y_train, ids_train),
    '../data/test_data.pkl': (X_test, y_test, ids_test),
    '../data/mlb.pkl': mlb,
}

for pickle_path, payload in pickle_targets.items():
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)