In [12]:
import pandas as pd

# Load the dataset
file_path = 'Datasets/title.akas.tsv'  # Replace with the actual path to your .tsv file
df = pd.read_csv(file_path, sep='\t', dtype=str)

# Filter titles with language 'en'
df_filtered = df[df['language'] == 'en']

# Display the filtered data
print(df_filtered.head())

       titleId ordering                                title region language  \
37   tt0000005        2                     Blacksmith Scene     CA       en   
88   tt0000010        2  La sortie de l'usine Lumière à Lyon     CA       en   
119  tt0000012       21  The Arrival of a Train at La Ciotat    XWW       en   
129  tt0000012        4               The Arrival of a Train    XEU       en   
131  tt0000012        6               The Arrival of a Train    XWW       en   

           types attributes isOriginalTitle  
37   imdbDisplay         \N               0  
88   imdbDisplay         \N               0  
119  alternative         \N               0  
129  imdbDisplay         \N               0  
131  imdbDisplay         \N               0  


In [4]:
print(df_filtered.info())


<class 'pandas.core.frame.DataFrame'>
Index: 554932 entries, 37 to 49827960
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   titleId          554932 non-null  object
 1   ordering         554932 non-null  object
 2   title            554930 non-null  object
 3   region           554932 non-null  object
 4   language         554932 non-null  object
 5   types            554932 non-null  object
 6   attributes       554932 non-null  object
 7   isOriginalTitle  554932 non-null  object
dtypes: object(8)
memory usage: 38.1+ MB
None


In [7]:
print(df_filtered['title'].isnull().sum())

2


In [5]:
df_filtered = df_filtered.dropna(subset=['title'])

In [6]:
print(df_filtered['title'].isnull().sum())

0


In [9]:
def format_title_for_rt(title):
    return title.lower().replace(' ', '-')

# Apply the function to create Rotten Tomatoes formatted title
df_filtered['rt_title'] = df_filtered['title'].apply(format_title_for_rt)

In [10]:
# Group by titleId and keep the first occurrence of each unique titleId
df_cleaned = df_filtered.groupby('titleId', as_index=False).first()

# Display the cleaned dataset
print(df_cleaned)

          titleId ordering                                        title  \
0       tt0000005        2                             Blacksmith Scene   
1       tt0000010        2          La sortie de l'usine Lumière à Lyon   
2       tt0000012       21          The Arrival of a Train at La Ciotat   
3       tt0000013        4  The Photographical Congress Arrives in Lyon   
4       tt0000016        4                        Boat Leaving the Port   
...           ...      ...                                          ...   
302674  tt9916216        2                 Kalyanam Mudhal Kadhal Varai   
302675  tt9916362       11                             Coven of Sisters   
302676  tt9916428        4                          The Secret of China   
302677  tt9916634        2                                       Eugène   
302678  tt9916706        2                              Dankyavar Danka   

       region language        types attributes isOriginalTitle  \
0          CA       en  imdbDispl

In [13]:
# Load datasets

titles = pd.read_csv('Datasets/title.basics.tsv', sep='\t', low_memory=False)
crew = pd.read_csv('Datasets/title.crew.tsv', sep='\t', low_memory=False)
ratings = pd.read_csv('Datasets/title.ratings.tsv', sep='\t', low_memory=False)


# Step 1: Filter for English movies released after 1960
titles_filtered = titles[(titles['startYear'] > '1960') & (titles['titleType'] == 'movie') & (titles['isAdult'] == '0')]

# Fill NaN values in originalTitle with an empty string
titles_filtered.loc[:, 'originalTitle'] = titles_filtered['originalTitle'].fillna('')

# Check for English titles
english_movies = titles_filtered[titles_filtered['originalTitle'].str.contains('[a-zA-Z]', regex=True)]

# Step 2: Merge with crew and ratings datasets
merged_data = english_movies.merge(crew, left_on='tconst', right_on='tconst', how='left')
merged_data = merged_data.merge(ratings, left_on='tconst', right_on='tconst', how='left')

# Step 3: Extract necessary columns
final_dataset = merged_data[['tconst', 'originalTitle', 'startYear', 'genres', 'runtimeMinutes', 'directors', 'averageRating', 'numVotes']]

# Step 4: Clean up the final dataset
# Replace '\N' with NaN and fill NaN values in runtimeMinutes
final_dataset['runtimeMinutes'] = final_dataset['runtimeMinutes'].replace('\\N', pd.NA).fillna(0).astype(int)

# Display the final dataset
print(final_dataset.head())

# Save the dataset if needed
final_dataset.to_csv('english_movies_after_1960.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['runtimeMinutes'] = final_dataset['runtimeMinutes'].replace('\\N', pd.NA).fillna(0).astype(int)


      tconst                originalTitle startYear                  genres  \
0  tt0011801             Tötet nicht mehr      2019            Action,Crime   
1  tt0013274  Istoriya grazhdanskoy voyny      2021             Documentary   
2  tt0015414       La tierra de los toros      2000                      \N   
3  tt0015724                Dama de noche      1993   Drama,Mystery,Romance   
4  tt0035423               Kate & Leopold      2001  Comedy,Fantasy,Romance   

   runtimeMinutes            directors  averageRating  numVotes  
0               0            nm0681726            NaN       NaN  
1              94  nm0412842,nm0895048            6.7      74.0  
2              60            nm0615736            5.4      17.0  
3             102            nm0529960            6.3      31.0  
4             118            nm0003506            6.4   90221.0  


In [24]:
# Load the datasets
akas = pd.read_csv('Datasets/title.akas.tsv', sep='\t', low_memory=False)
titles = pd.read_csv('Datasets/title.basics.tsv', sep='\t', low_memory=False)
crew = pd.read_csv('Datasets/title.crew.tsv', sep='\t', low_memory=False)
ratings = pd.read_csv('Datasets/title.ratings.tsv', sep='\t', low_memory=False)
names = pd.read_csv('Datasets/name.basics.tsv', sep='\t', low_memory=False)

# Step 1: Filter for English movies released after 2000
titles_filtered = titles[(titles['startYear'] > '2010') & 
                         (titles['titleType'] == 'movie')]

# Step 2: Filter for English language movies (handling NaN values)
english_movies = titles_filtered[titles_filtered['originalTitle'].str.contains('[a-zA-Z]', regex=True, na=False)]

# Step 3: Include only movies from the specified regions
# Merge with the akas dataset to filter by region and include isOriginalTitle
akas_filtered = akas[akas['region'].isin(['UK', 'US'])]  # 'IN' for India in IMDb region codes
akas_filtered = akas_filtered[['titleId', 'region', 'isOriginalTitle']]  # Select only necessary columns

# Merge akas_filtered with english_movies to include region and isOriginalTitle
english_movies = english_movies.merge(akas_filtered, left_on='tconst', right_on='titleId', how='left')

# Step 4: Merge with crew and ratings datasets
merged_data = english_movies.merge(crew, left_on='tconst', right_on='tconst', how='left')
merged_data = merged_data.merge(ratings, left_on='tconst', right_on='tconst', how='left')

# Step 5: Get director names
merged_data = merged_data.explode('directors')
merged_data = merged_data.merge(names, left_on='directors', right_on='nconst', how='left')

# Step 6: Create final dataset
final_dataset = merged_data[['tconst', 'originalTitle', 'startYear', 'genres', 'runtimeMinutes', 
                             'primaryName', 'averageRating', 'numVotes', 'isOriginalTitle', 'isAdult', 'region']]
final_dataset.rename(columns={'primaryName': 'director'}, inplace=True)

# Step 7: Clean up the final dataset
# Handle the runtimeMinutes field, converting to int and replacing invalid entries
final_dataset['runtimeMinutes'] = pd.to_numeric(final_dataset['runtimeMinutes'], errors='coerce').fillna(0).astype(int)

# Step 8: Remove rows with any NaN values
final_dataset_cleaned = final_dataset.dropna()

# Display the cleaned final dataset
print(final_dataset_cleaned.head())


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset.rename(columns={'primaryName': 'director'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['runtimeMinutes'] = pd.to_numeric(final_dataset['runtimeMinutes'], errors='coerce').fillna(0).astype(int)


       tconst               originalTitle startYear  \
4   tt0069049  The Other Side of the Wind      2018   
16  tt0112502                     Bigfoot      2017   
17  tt0116991         Mariette in Ecstasy      2019   
18  tt0120589                Predestinado      2022   
55  tt0137204             Joe Finds Grace      2017   

                        genres  runtimeMinutes           director  \
4                        Drama             122       Orson Welles   
16             Horror,Thriller               0           Mc Jones   
17                       Drama             101        John Bailey   
18             Biography,Drama             108  Gustavo Fernández   
55  Adventure,Animation,Comedy              83   Anthony Harrison   

    averageRating  numVotes  isOriginalTitle isAdult region  
4             6.7    8156.0              0.0       0     US  
16            4.7      42.0              0.0       0     US  
17            7.1      81.0              0.0       0     US  
18    

In [15]:
# Remove duplicate rows based on all columns
final_dataset_cleaned = final_dataset_cleaned.drop_duplicates()

# If you want to remove duplicates based on specific columns (e.g., 'originalTitle' and 'startYear'):
# final_dataset_cleaned = final_dataset_cleaned.drop_duplicates(subset=['originalTitle', 'startYear'])

# Display the cleaned dataset without duplicates
print(final_dataset_cleaned.head())


       tconst                originalTitle startYear                  genres  \
4   tt0035423               Kate & Leopold      2001  Comedy,Fantasy,Romance   
7   tt0036606  Another Time, Another Place      1983               Drama,War   
9   tt0038687           Let There Be Light      1980         Documentary,War   
12  tt0042423        The Dungeon of Harrow      1964                  Horror   
14  tt0044932            Out of the Shadow      1961                 Mystery   

    runtimeMinutes         director  averageRating  numVotes  isOriginalTitle  \
4              118    James Mangold            6.4   90221.0              0.0   
7              118  Michael Radford            6.4     362.0              0.0   
9               58      John Huston            7.5    2048.0              0.0   
12              86      Pat Boyette            3.4     646.0              0.0   
14              61   Michael Winner            5.0     240.0              0.0   

   isAdult region  
4        0  

In [20]:
print(final_dataset_cleaned.info())

<class 'pandas.core.frame.DataFrame'>
Index: 119154 entries, 4 to 394770
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           119154 non-null  object 
 1   originalTitle    119154 non-null  object 
 2   startYear        119154 non-null  object 
 3   genres           119154 non-null  object 
 4   runtimeMinutes   119154 non-null  int64  
 5   director         119154 non-null  object 
 6   averageRating    119154 non-null  float64
 7   numVotes         119154 non-null  float64
 8   isOriginalTitle  119154 non-null  float64
 9   isAdult          119154 non-null  object 
 10  region           119154 non-null  object 
dtypes: float64(3), int64(1), object(7)
memory usage: 10.9+ MB
None


In [23]:
import requests
from bs4 import BeautifulSoup
import json
import re

# Helper function to format movie title for Rotten Tomatoes URL
def format_movie_title_for_url(title):
    title = title.replace('&', 'and')  # Replace '&' with 'and'
    title = re.sub(r'[^a-zA-Z0-9\s]', '', title)  # Remove special characters
    title = title.replace(' ', '_').lower()       # Replace spaces with underscores and make lowercase
    return title

# Function to scrape critic rating and number of votes from Rotten Tomatoes
def scrape_rotten_tomatoes(movie_title):
    base_url = "https://www.rottentomatoes.com/m/"
    formatted_title = format_movie_title_for_url(movie_title)
    url = base_url + formatted_title
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Error fetching {movie_title}: Status code {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try JSON-LD script
        json_ld_script = soup.find('script', type='application/ld+json')
        if json_ld_script:
            json_ld_data = json.loads(json_ld_script.string)
            # Look for the aggregate rating
            if 'aggregateRating' in json_ld_data:
                critic_score = json_ld_data['aggregateRating']['ratingValue']
                voter_count = json_ld_data['aggregateRating'].get('ratingCount', None)
                return critic_score, voter_count

        # Try fallback methods if JSON-LD doesn't provide the data
        critic_score_elem = soup.find('score-board')
        if critic_score_elem:
            critic_score = critic_score_elem.get('tomatometerscore')
            voter_count = critic_score_elem.get('tomatometerreviewcount')

            # Return valid data if found
            if critic_score and voter_count:
                return critic_score, voter_count

        # If nothing is found, return None values
        print(f"No valid score found for {movie_title}")
        return None, None

    except Exception as e:
        print(f"Error scraping {movie_title}: {e}")
        return None, None

# Add new columns for critic ratings and number of voters
final_dataset_cleaned['criticsRating'] = None
final_dataset_cleaned['voterCount'] = None

# Filter for movies released after 2010
movies_after_2010 = final_dataset_cleaned[final_dataset_cleaned['startYear'] > '2010']

# Scrape Rotten Tomatoes for each movie (limit to first 20 for testing)
unsuccessful_count = 0  # Initialize counter for unsuccessful scrapes
for index, row in movies_after_2000.head(20).iterrows():
    movie_title = row['originalTitle']
    critics_rating, voter_count = scrape_rotten_tomatoes(movie_title)

    if critics_rating is not None and voter_count is not None:
        final_dataset_cleaned.at[index, 'criticsRating'] = critics_rating
        final_dataset_cleaned.at[index, 'voterCount'] = voter_count
    else:
        unsuccessful_count += 1  # Increment the counter for unsuccessful scrapes

# Display the count of unsuccessful scrapes
print(f"Number of unsuccessful web scraping attempts: {unsuccessful_count}")

# Display the updated dataset
print(final_dataset_cleaned.head(20))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset_cleaned['criticsRating'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset_cleaned['voterCount'] = None


No valid score found for Vazir
No valid score found for Vazir
Error fetching Mariette in Ecstasy: Status code 404
Error fetching Predestinado: Status code 404
Error fetching Joe Finds Grace: Status code 404
No valid score found for Housesitter: The Night They Saved Siegfried's Brain
No valid score found for Nine Ball
No valid score found for Nine Ball
No valid score found for Blood Type
Error fetching Az ember tragédiája: Status code 404
Error fetching Heartland of Darkness: Status code 404
Error fetching Heartland of Darkness: Status code 404
Error fetching Reverse Heaven: Status code 404
No valid score found for Mysteries
No valid score found for Holy Hollywood
No valid score found for The Perfect Shadow
No valid score found for The Perfect Shadow
Number of unsuccessful web scraping attempts: 19
        tconst                                      originalTitle startYear  \
4    tt0069049                         The Other Side of the Wind      2018   
14   tt0111596                   

In [25]:
import requests
from bs4 import BeautifulSoup
import json
import re

# Helper function to format movie title for Rotten Tomatoes URL
def format_movie_title_for_url(title):
    title = title.replace('&', 'and')  # Replace '&' with 'and'
    title = re.sub(r'[^a-zA-Z0-9\s]', '', title)  # Remove special characters
    title = title.replace(' ', '_').lower()       # Replace spaces with underscores and make lowercase
    return title

# Function to scrape critic rating and number of votes from Rotten Tomatoes
def scrape_rotten_tomatoes(movie_title):
    base_url = "https://www.rottentomatoes.com/m/"
    formatted_title = format_movie_title_for_url(movie_title)
    url = base_url + formatted_title
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 404:
            print(f"Error fetching {movie_title}: Status code 404 (Page not found)")
            return None, None
        elif response.status_code != 200:
            print(f"Error fetching {movie_title}: Status code {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try JSON-LD script
        json_ld_script = soup.find('script', type='application/ld+json')
        if json_ld_script:
            json_ld_data = json.loads(json_ld_script.string)
            # Look for the aggregate rating
            if 'aggregateRating' in json_ld_data:
                critic_score = json_ld_data['aggregateRating']['ratingValue']
                voter_count = json_ld_data['aggregateRating'].get('ratingCount', None)
                return critic_score, voter_count

        # Try fallback methods if JSON-LD doesn't provide the data
        critic_score_elem = soup.find('score-board')
        if critic_score_elem:
            critic_score = critic_score_elem.get('tomatometerscore')
            voter_count = critic_score_elem.get('tomatometerreviewcount')

            # Return valid data if found
            if critic_score and voter_count:
                return critic_score, voter_count

        # If nothing is found, return None values
        print(f"No valid score found for {movie_title}")
        return None, None

    except Exception as e:
        print(f"Error scraping {movie_title}: {e}")
        return None, None

# Add new columns for critic ratings and number of voters
final_dataset_cleaned['criticsRating'] = None
final_dataset_cleaned['voterCount'] = None

# Filter for movies released after 2010
movies_after_2010 = final_dataset_cleaned[final_dataset_cleaned['startYear'] > '2010']

# Scrape Rotten Tomatoes for each movie (limit to first 20 for testing)
unsuccessful_count = 0  # Initialize counter for unsuccessful scrapes
for index, row in movies_after_2010.head(20).iterrows():
    movie_title = row['originalTitle']
    critics_rating, voter_count = scrape_rotten_tomatoes(movie_title)

    if critics_rating is not None and voter_count is not None:
        final_dataset_cleaned.at[index, 'criticsRating'] = critics_rating
        final_dataset_cleaned.at[index, 'voterCount'] = voter_count
    else:
        unsuccessful_count += 1  # Increment the counter for unsuccessful scrapes

# Display the count of unsuccessful scrapes
print(f"Number of unsuccessful web scraping attempts: {unsuccessful_count}")

# Display the updated dataset
print(final_dataset_cleaned.head(20))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset_cleaned['criticsRating'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset_cleaned['voterCount'] = None


Error fetching Mariette in Ecstasy: Status code 404 (Page not found)
Error fetching Predestinado: Status code 404 (Page not found)
Error fetching Joe Finds Grace: Status code 404 (Page not found)
No valid score found for Housesitter: The Night They Saved Siegfried's Brain
No valid score found for Nine Ball
No valid score found for Blood Type
Error fetching Az ember tragédiája: Status code 404 (Page not found)
Error fetching Heartland of Darkness: Status code 404 (Page not found)
Error fetching Heartland of Darkness: Status code 404 (Page not found)
Error fetching Reverse Heaven: Status code 404 (Page not found)
No valid score found for Mysteries
No valid score found for Holy Hollywood
No valid score found for The Perfect Shadow
No valid score found for The Perfect Shadow
Error fetching San Jie Cao: Status code 404 (Page not found)
Error fetching 50 Feet of String: Status code 404 (Page not found)
No valid score found for Black Star: Autobiography of a Close Friend
Number of unsuccessfu

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Helper function to format movie title for Rotten Tomatoes URL
def format_movie_title_for_url(title):
    title = title.replace('&', 'and')  # Replace '&' with 'and'
    title = re.sub(r'[^a-zA-Z0-9\s]', '', title)  # Remove special characters
    title = title.replace(' ', '_').lower()       # Replace spaces with underscores and make lowercase
    return title

# Function to scrape critic rating and audience rating from Rotten Tomatoes
def scrape_rotten_tomatoes(movie_title):
    base_url = "https://www.rottentomatoes.com/m/"
    formatted_title = format_movie_title_for_url(movie_title)
    url = base_url + formatted_title
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 404:
            print(f"Error fetching {movie_title}: Status code 404 (Page not found)")
            return None, None
        elif response.status_code != 200:
            print(f"Error fetching {movie_title}: Status code {response.status_code}")
            return None, None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Try to extract critic score
        critic_score_button = soup.find('rt-button', {'slot': 'criticsScore'})
        critic_score = None
        if critic_score_button:
            critic_score = critic_score_button.get_text(strip=True)

        # Try to extract audience score
        audience_score_button = soup.find('rt-button', {'slot': 'audienceScore'})
        audience_score = None
        if audience_score_button:
            audience_score = audience_score_button.get_text(strip=True)

        # If nothing is found, return None values
        if not critic_score and not audience_score:
            print(f"No valid score found for {movie_title}")
            return None, None

        return critic_score, audience_score

    except Exception as e:
        print(f"Error scraping {movie_title}: {e}")
        return None, None



# Add new columns for critic ratings and audience ratings
final_dataset_cleaned['criticsRating'] = None
final_dataset_cleaned['audienceRating'] = None

# Filter for movies released after 2010 (modify this as needed)
movies_after_2010 = final_dataset_cleaned[final_dataset_cleaned['startYear'] > '2010']

# Scrape Rotten Tomatoes for each movie (limit to first 20 for testing)
unsuccessful_count = 0  # Initialize counter for unsuccessful scrapes
for index, row in movies_after_2010.iterrows():
    movie_title = row['originalTitle']
    critics_rating, audience_rating = scrape_rotten_tomatoes(movie_title)

    if critics_rating is not None and audience_rating is not None:
        final_dataset_cleaned.at[index, 'criticsRating'] = critics_rating
        final_dataset_cleaned.at[index, 'audienceRating'] = audience_rating
    else:
        unsuccessful_count += 1  # Increment the counter for unsuccessful scrapes

# Display the count of unsuccessful scrapes
print(f"Number of unsuccessful web scraping attempts: {unsuccessful_count}")

# Display the updated dataset (optional)
print(final_dataset_cleaned.head(20))

# Save the updated dataset to a CSV (optional)
final_dataset_cleaned.to_csv('updated_dataset_with_rt_ratings.csv', index=False)


In [34]:
print(final_dataset_cleaned['criticsRating'])

4          83%
16            
17        None
18        None
55        None
          ... 
356455    None
356460    None
356464    None
356466    None
356467    None
Name: criticsRating, Length: 77880, dtype: object


In [36]:
print(final_dataset_cleaned[final_dataset_cleaned['criticsRating'].notna()].count())

tconst             721
originalTitle      721
startYear          721
genres             721
runtimeMinutes     721
director           721
averageRating      721
numVotes           721
isOriginalTitle    721
isAdult            721
region             721
criticsRating      721
voterCount           1
audienceRating     721
dtype: int64


In [38]:
print(final_dataset_cleaned[final_dataset_cleaned['criticsRating'] != 'None'])

           tconst                  originalTitle startYear  \
4       tt0069049     The Other Side of the Wind      2018   
16      tt0112502                        Bigfoot      2017   
17      tt0116991            Mariette in Ecstasy      2019   
18      tt0120589                   Predestinado      2022   
55      tt0137204                Joe Finds Grace      2017   
...           ...                            ...       ...   
356455  tt9915872  Boku no kanojo wa mahoutsukai      2019   
356460  tt9916170                       O Ensaio      2019   
356464  tt9916190                      Safeguard      2020   
356466  tt9916362                       Akelarre      2020   
356467  tt9916428   Hong xing zhao yao Zhong guo      2019   

                            genres  runtimeMinutes           director  \
4                            Drama             122       Orson Welles   
16                 Horror,Thriller               0           Mc Jones   
17                           Drama  

In [39]:
print(final_dataset_cleaned.loc[final_dataset_cleaned['criticsRating'] != 'None', 'criticsRating'])

4          83%
16            
17        None
18        None
55        None
          ... 
356455    None
356460    None
356464    None
356466    None
356467    None
Name: criticsRating, Length: 77880, dtype: object


In [43]:
print(final_dataset_cleaned.loc[final_dataset_cleaned['criticsRating'].notna() & (final_dataset_cleaned['criticsRating'] != 'None') & (final_dataset_cleaned['criticsRating'] != ''), 'criticsRating'])


4         83%
240       80%
241       47%
242       47%
243       47%
         ... 
150270    75%
150271    75%
150313    47%
150332    65%
150333    65%
Name: criticsRating, Length: 7080, dtype: object


In [45]:
count = final_dataset_cleaned.loc[final_dataset_cleaned['audienceRating'].notna() & (final_dataset_cleaned['audienceRating'] != 'None') & (final_dataset_cleaned['audienceRating'] != '')].shape[0]
print(f"Count of valid critics ratings: {count}")

Count of valid critics ratings: 10842


In [46]:
print(final_dataset_cleaned.loc[final_dataset_cleaned['audienceRating'].notna() & (final_dataset_cleaned['audienceRating'] != 'None') & (final_dataset_cleaned['audienceRating'] != ''), 'audienceRating'])

4         58%
16        26%
116       86%
202       11%
233       38%
         ... 
150322    40%
150324    40%
150329    14%
150332    82%
150333    82%
Name: audienceRating, Length: 10842, dtype: object


In [47]:
final_dataset_cleaned.head(100)

Unnamed: 0,tconst,originalTitle,startYear,genres,runtimeMinutes,director,averageRating,numVotes,isOriginalTitle,isAdult,region,criticsRating,voterCount,audienceRating
4,tt0069049,The Other Side of the Wind,2018,Drama,122,Orson Welles,6.7,8156.0,0.0,0,US,83%,99,58%
16,tt0112502,Bigfoot,2017,"Horror,Thriller",0,Mc Jones,4.7,42.0,0.0,0,US,,,26%
17,tt0116991,Mariette in Ecstasy,2019,Drama,101,John Bailey,7.1,81.0,0.0,0,US,,,
18,tt0120589,Predestinado,2022,"Biography,Drama",108,Gustavo Fernández,7.2,417.0,0.0,0,US,,,
55,tt0137204,Joe Finds Grace,2017,"Adventure,Animation,Comedy",83,Anthony Harrison,8.6,294.0,0.0,0,US,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,tt0433035,Real Steel,2011,"Action,Drama,Sci-Fi",127,Shawn Levy,7.1,362971.0,0.0,0,US,60%,,73%
405,tt0433035,Real Steel,2011,"Action,Drama,Sci-Fi",127,Shawn Levy,7.1,362971.0,0.0,0,US,60%,,73%
406,tt0433035,Real Steel,2011,"Action,Drama,Sci-Fi",127,Shawn Levy,7.1,362971.0,0.0,0,US,60%,,73%
407,tt0433397,Satin,2011,"Comedy,Drama,Music",84,Christopher Olness,4.5,272.0,0.0,0,US,,,59%
