In [1]:
import os
import pandas as pd
import json
import re
import zipfile

# Rotten Tomatoes dataset
Source: https://www.kaggle.com/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset?select=rotten_tomatoes_movies.csv

In [4]:
# The dataset lives three directory levels above the notebook's working
# directory, under Movies_dataset/rotten_tomatoes.
parent_dir = os.path.dirname(os.getcwd())
grandparent_dir = os.path.dirname(parent_dir)
base_dir = os.path.dirname(grandparent_dir)
data_dir = os.path.join(base_dir, 'Movies_dataset', 'rotten_tomatoes')

# Load the per-movie table into a DataFrame.
movie_data_file = os.path.join(data_dir, 'rotten_tomatoes_movies.csv')
movie_data = pd.read_csv(movie_data_file)

In [7]:
# Display the loaded frame to inspect its columns and a sample of rows.
movie_data

Unnamed: 0,rotten_tomatoes_link,movie_title,movie_info,critics_consensus,content_rating,genres,directors,authors,actors,original_release_date,...,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count
0,m/0814255,Percy Jackson & the Olympians: The Lightning T...,"Always trouble-prone, the life of teenager Per...",Though it may seem like just another Harry Pot...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","Logan Lerman, Brandon T. Jackson, Alexandra Da...",2010-02-12,...,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76
1,m/0878835,Please Give,Kate (Catherine Keener) and her husband Alex (...,Nicole Holofcener's newest might seem slight i...,R,Comedy,Nicole Holofcener,Nicole Holofcener,"Catherine Keener, Amanda Peet, Oliver Platt, R...",2010-04-30,...,Sony Pictures Classics,Certified-Fresh,87.0,142.0,Upright,64.0,11574.0,44,123,19
2,m/10,10,"A successful, middle-aged Hollywood songwriter...",Blake Edwards' bawdy comedy may not score a pe...,R,"Comedy, Romance",Blake Edwards,Blake Edwards,"Dudley Moore, Bo Derek, Julie Andrews, Robert ...",1979-10-05,...,Waner Bros.,Fresh,67.0,24.0,Spilled,53.0,14684.0,2,16,8
3,m/1000013-12_angry_men,12 Angry Men (Twelve Angry Men),Following the closing arguments in a murder tr...,Sidney Lumet's feature debut is a superbly wri...,NR,"Classics, Drama",Sidney Lumet,Reginald Rose,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",1957-04-13,...,Criterion Collection,Certified-Fresh,100.0,54.0,Upright,97.0,105386.0,6,54,0
4,m/1000079-20000_leagues_under_the_sea,"20,000 Leagues Under The Sea","In 1866, Professor Pierre M. Aronnax (Paul Luk...","One of Disney's finest live-action adventures,...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,Earl Felton,"James Mason, Kirk Douglas, Paul Lukas, Peter L...",1954-01-01,...,Disney,Fresh,89.0,27.0,Upright,74.0,68918.0,5,24,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17707,m/zoot_suit,Zoot Suit,Mexican-American gangster Henry Reyna (Daniel ...,,R,"Drama, Musical & Performing Arts",Luis Valdez,Luis Valdez,"Daniel Valdez, Edward James Olmos, Charles Aid...",1981-10-02,...,MCA Universal Home Video,Rotten,56.0,9.0,Upright,74.0,1195.0,2,5,4
17708,m/zootopia,Zootopia,From the largest elephant to the smallest shre...,The brilliantly well-rounded Zootopia offers a...,PG,"Action & Adventure, Animation, Comedy","Byron Howard, Rich Moore, Jared Bush","Jared Bush, Phil Johnston","J.K. Simmons, Kristen Bell, Octavia Spencer, A...",2016-03-04,...,Walt Disney Animation Studios,Certified-Fresh,98.0,291.0,Upright,92.0,101511.0,50,285,7
17709,m/zorba_the_greek,Zorba the Greek,Traveling to inspect an abandoned mine his fat...,,NR,"Action & Adventure, Art House & International,...",,,"Anthony Quinn, Alan Bates, Irene Papas, Lila K...",1964-12-17,...,Fox,Fresh,80.0,10.0,Upright,86.0,7146.0,0,8,2
17710,m/zulu,Zulu,"In 1879, the Zulu nation hands colonial Britis...",Zulu patiently establishes a cast of colorful ...,PG,"Classics, Drama","Cy Endfield, Cyril Endfield","Cy Endfield, John Prebble","Stanley Baker, Jack Hawkins, Ulla Jacobsson, J...",1964-06-17,...,Paramount Pictures,Fresh,96.0,23.0,Upright,91.0,30193.0,6,22,1


In [32]:
def parse_id(link):
    """Return ``link`` with its first two characters removed.

    The links in the loaded table look like ``m/zulu``, so this drops the
    leading ``m/``.  The slice is unconditional: no check is made that the
    removed characters really are ``m/``.
    """
    prefix_length = 2
    return link[prefix_length:]

def simplify_name(name):
    """Reduce a movie title to a short, punctuation-light lookup key.

    Only spaces, ASCII letters, ASCII digits, apostrophes, ampersands,
    hyphens and colons are kept (so accented letters are dropped).  The
    part before the first colon is then returned, i.e. the main title
    without its subtitle.

    Args:
        name: The movie title as a string.

    Returns:
        The stripped text before the first colon, or the whole cleaned
        title when nothing precedes the colon.
    """
    # The hyphen sits last in the class so it is a literal '-'.  Written as
    # "&-:" it is a range from '&' to ':' and silently also keeps
    # ( ) * + , . and / in the title.
    name_cleaned = re.sub(r"[^ A-Za-z0-9'&:-]", "", name).strip()
    name_simplified = name_cleaned.split(":", 1)[0].strip()
    # A title that starts with a colon has an empty head; fall back to the
    # full cleaned title rather than returning an empty key.
    return name_simplified or name_cleaned

def get_genre(uniq_id, column, column_inv_index):
    """Split a comma-separated genre field and record it in an inverted index.

    Args:
        uniq_id: Identifier appended to the index entry of every genre found.
        column: The raw genre field.  Anything that is not a string (for
            example a NaN from a missing cell) is treated as "no genres".
        column_inv_index: Dict mapping genre -> list of ids.  It is updated
            in place and also returned.

    Returns:
        A ``(genres, column_inv_index)`` tuple where ``genres`` is the
        alphabetically sorted list of distinct, lower-cased, stripped genre
        names found in ``column``.
    """
    if not isinstance(column, str):
        return [], column_inv_index

    # De-duplicate via a set, drop empty tokens left by stray commas, and
    # sort so the result does not depend on set iteration order (which
    # varies between interpreter runs for strings).
    genres = sorted({part.strip().lower() for part in column.split(',')} - {''})
    for genre in genres:
        column_inv_index.setdefault(genre, []).append(uniq_id)
    return genres, column_inv_index

# Lookup structures built from the movie table:
#   movie_id_to_info          id -> dict of selected fields for that movie
#   inv_movie_genre           genre -> list of ids
#   inv_name_simplified       simplified title -> list of ids
#   inv_movie_content_rating  content rating -> list of ids
#   audience_count_ranked     ids ordered by audience_count, largest first
movie_id_to_info = dict()
inv_movie_genre = dict()
inv_name_simplified = dict()
inv_movie_content_rating = dict()
audience_counts = []  # (id, audience_count) pairs, in table order

used_columns = ['rotten_tomatoes_link', 'movie_title', 'content_rating', 'movie_info',
                'critics_consensus', 'genres', 'audience_count']

# Iterate the needed columns in parallel (positionally), so the loop does not
# depend on the frame having a default 0..n-1 index.
for link, name, content_rating, description, critics, genre_field, audience_count in zip(
        *(movie_data[col] for col in used_columns)):
    uniq_id = parse_id(link)

    name_simp = simplify_name(name)
    inv_name_simplified.setdefault(name_simp, []).append(uniq_id)

    inv_movie_content_rating.setdefault(content_rating, []).append(uniq_id)

    genres, inv_movie_genre = get_genre(uniq_id, genre_field, inv_movie_genre)
    audience_counts.append((uniq_id, audience_count))

    # NOTE(review): cells that were empty in the CSV presumably arrive here as
    # NaN and are stored unchanged - confirm that consumers of these dicts
    # expect that.
    info = {'name': name, 'simplified_name': name_simp, 'description': description, 'genre': genres,
            'critics_consensus': critics, 'content_rating': content_rating, 'audience_count': audience_count}
    movie_id_to_info[uniq_id] = info


def _audience_rank_key(item):
    """Sort key for (id, audience_count) pairs that ranks missing counts lowest."""
    count = item[1]
    # Every comparison against NaN is False, so sorting on raw NaN keys gives
    # an ill-defined order.  Map missing counts to -inf so they reliably end
    # up last in the descending ranking.
    return float('-inf') if pd.isna(count) else count


# The sort is stable, so movies with equal counts keep their table order.
audience_count_ranked = [movie_id for movie_id, _count in
                         sorted(audience_counts, key=_audience_rank_key, reverse=True)]

In [36]:
# Number of distinct genre keys in the inverted genre index.
len(inv_movie_genre)

21

In [35]:
# Number of distinct content-rating keys in the inverted index.
len(inv_movie_content_rating)

6

In [37]:
# Write each lookup structure to its own JSON file in the current working
# directory.
out_dir = os.getcwd()
json_outputs = {
    'inv_movie_genre.json': inv_movie_genre,
    'inv_movie_content_rating.json': inv_movie_content_rating,
    'movie_id_to_info.json': movie_id_to_info,
    'movie_id_ranked_by_audience_number.json': audience_count_ranked,
}
for file_name, payload in json_outputs.items():
    with open(os.path.join(out_dir, file_name), 'w') as json_file:
        json.dump(payload, json_file)