In [1]:
import pandas as pd

amazonprime_df = pd.read_csv(r'datasources\amazonprime.csv')
hotstar_df = pd.read_csv(r'datasources\hotstar.csv')
netflix_df = pd.read_csv(r'datasources\netflix.csv')


In [2]:
amazonprime_df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,tm87233,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,130,"['drama', 'family', 'fantasy', 'romance', 'com...",['US'],,tt0038650,8.6,467766.0,27.611,8.261
1,tm143047,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,,69,"['comedy', 'war']",['US'],,tt0023969,7.8,60933.0,9.013,7.357
2,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['drama', 'romance', 'comedy']",['US'],,tt0032599,7.8,60244.0,14.759,7.433
3,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family']",['US'],26.0,tt0850645,8.5,1149.0,15.424,7.6
4,tm5012,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,,133,"['western', 'drama', 'romance', 'action']",['US'],,tt0040724,7.8,32210.0,12.4,7.4


In [3]:
hotstar_df.head()

Unnamed: 0,hotstar_id,title,description,genre,year,age_rating,running_time,seasons,episodes,type
0,1000087439,Sambha - Aajcha Chawa,A young man sets off on a mission to clean up ...,Action,2012,U/A 16+,141.0,,,movie
1,1260023113,Cars Toon: Mater And The Ghostlight,Mater is haunted by a mysterious blue light th...,Animation,2006,U,7.0,,,movie
2,1260103188,Kanmani Rambo Khatija,"Unlucky since birth, Rambo finds hope when he ...",Romance,2022,U/A 16+,157.0,,,movie
3,1260126754,Butterfly,While trying to rescue her sister's kids from ...,Thriller,2022,U/A 16+,136.0,,,movie
4,1260018228,Sister Act,"Rene, a lounge singer, decides to stay at a Ch...",Comedy,1992,U/A 7+,100.0,,,movie


In [4]:
netflix_df.head()


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [5]:
def normalize_columns(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace(r'[^\w]', '', regex=True)
    return df

amazonprime_df = normalize_columns(amazonprime_df)
hotstar_df = normalize_columns(hotstar_df)
netflix_df = normalize_columns(netflix_df)


In [20]:
from difflib import SequenceMatcher

def column_similarity(col1, col2):
    return SequenceMatcher(None, col1, col2).ratio()

def match_columns(df1, df2, threshold=0.7):
    matched_columns = {}
    for col1 in df1.columns:
        best_match = None
        best_score = 0
        for col2 in df2.columns:
            similarity_score = column_similarity(col1, col2)
            if similarity_score > best_score and similarity_score >= threshold:
                best_match = col2
                best_score = similarity_score
        if best_match:
            matched_columns[col1] = best_match
    return matched_columns


In [21]:
amazon_hotstar_mapping = match_columns(amazonprime_df, hotstar_df)
hotstar_netflix_mapping = match_columns(hotstar_df, netflix_df)
amazon_netflix_mapping = match_columns(amazonprime_df, netflix_df)


In [22]:
amazon_hotstar_mapping

{'title': 'title',
 'type': 'type',
 'description': 'description',
 'runtime': 'running_time',
 'genres': 'genre',
 'seasons': 'seasons'}

In [23]:
hotstar_netflix_mapping

{'title': 'title',
 'description': 'description',
 'age_rating': 'rating',
 'type': 'type'}

In [25]:
amazon_netflix_mapping

{'title': 'title',
 'type': 'type',
 'description': 'description',
 'release_year': 'release_year'}

In [26]:
dfs = [amazonprime_df, hotstar_df, netflix_df]  
mappings = [amazon_hotstar_mapping, hotstar_netflix_mapping]  
sources = ['is_amazon', 'is_hotstar', 'is_netflix'] 


In [48]:
def automerge(dfs, mappings, sources):
    final_df = dfs[0].copy()
    final_df[sources[0]] = 1 
    for i in range(1, len(dfs)):
        df_to_merge = dfs[i].rename(columns=mappings[i-1]).copy()
        df_to_merge[sources[i]] = 1  
        final_df = pd.merge(final_df, df_to_merge, on='title', how='outer', suffixes=('', f'_df{i}'))

        for col in final_df.columns:
            if f'{col}_df{i}' in final_df.columns:
                final_df[col] = final_df[col].combine_first(final_df[f'{col}_df{i}'])
                final_df = final_df.drop(columns=[f'{col}_df{i}'])

    for source in sources:
        final_df[source] = final_df[source].fillna(0).astype(int)

    return final_df

In [27]:
#fix 
import pandas as pd

def automerge(dfs, mappings, sources):
    # Initialize the final DataFrame with the first data source
    final_df = dfs[0].copy()
    final_df[sources[0]] = 1  # Mark presence in the first data source
    
    for i in range(1, len(dfs)):
        # Rename columns of the next DataFrame according to mappings and add source column
        df_to_merge = dfs[i].rename(columns=mappings[i-1]).copy()
        df_to_merge[sources[i]] = 1  # Mark presence in this data source
        
        # Merge with the final DataFrame
        final_df = pd.merge(final_df, df_to_merge, on='title', how='outer', suffixes=('', f'_df{i}'))
        
        # Combine columns and remove duplicates
        for col in final_df.columns:
            if f'{col}_df{i}' in final_df.columns:
                final_df[col] = final_df[col].combine_first(final_df[f'{col}_df{i}'])
                final_df = final_df.drop(columns=[f'{col}_df{i}'])

    # Fill missing values in source columns with 0
    for source in sources:
        final_df[source] = final_df[source].fillna(0).astype(int)
    
    # Select the row with the least missing data for each movie (grouped by 'title')
    final_df['missing_count'] = final_df.isnull().sum(axis=1)  # Count missing values per row
    final_df = final_df.sort_values(by='missing_count').drop_duplicates(subset='title', keep='first')
    final_df = final_df.drop(columns=['missing_count'])  # Remove helper column
    
    return final_df


In [28]:
merged = automerge(dfs, mappings, sources)

In [29]:
merged = merged.drop(columns=[col for col in merged.columns if 'tmdb' in col])


In [30]:
merged.columns

Index(['id', 'title', 'type', 'description', 'release_year',
       'age_certification', 'runtime', 'genres', 'production_countries',
       'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'is_amazon',
       'hotstar_id', 'genre', 'year', 'age_rating', 'running_time', 'episodes',
       'is_hotstar', 'show_id', 'director', 'cast', 'country', 'date_added',
       'rating', 'duration', 'listed_in', 'is_netflix'],
      dtype='object')

In [52]:
pip install sentence_transformers

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\ayush\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





In [31]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

#model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model = SentenceTransformer('nli-distilroberta-base-v2')


columns = ['id', 'title', 'type', 'description', 'release_year',
       'age_certification', 'runtime', 'genres', 'production_countries',
       'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'is_amazon',
       'hotstar_id', 'genre', 'year', 'age_rating', 'running_time', 'episodes',
       'is_hotstar', 'show_id', 'director', 'cast', 'country', 'date_added',
       'rating', 'duration', 'listed_in', 'is_netflix']

# Step 1: Create embeddings for the column names
column_embeddings = model.encode(columns)

# Step 2: Calculate the cosine similarity matrix between columns
similarity_matrix = cosine_similarity(column_embeddings)

# Step 3: Define a similarity threshold for column matching
threshold = 0.6
semantically_similar_columns = []

# Step 4: Identify column pairs with similarity above the threshold
for i in range(len(columns)):
    for j in range(i+1, len(columns)):
        if similarity_matrix[i][j] >= threshold:
            semantically_similar_columns.append((columns[i], columns[j], similarity_matrix[i][j]))

# Step 5: Print the semantically similar columns and their similarity scores
for col1, col2, score in semantically_similar_columns:
    print(f"{col1} <--> {col2}: Similarity = {score:.2f}")


  from tqdm.autonotebook import tqdm, trange


id <--> type: Similarity = 0.64
id <--> imdb_id: Similarity = 0.70
id <--> show_id: Similarity = 0.71
title <--> type: Similarity = 0.60
title <--> description: Similarity = 0.77
type <--> description: Similarity = 0.65
type <--> runtime: Similarity = 0.62
type <--> rating: Similarity = 0.60
release_year <--> year: Similarity = 0.72
release_year <--> date_added: Similarity = 0.67
age_certification <--> age_rating: Similarity = 0.75
runtime <--> cast: Similarity = 0.61
runtime <--> duration: Similarity = 0.63
imdb_id <--> imdb_score: Similarity = 0.70
imdb_id <--> imdb_votes: Similarity = 0.63
imdb_id <--> show_id: Similarity = 0.64
imdb_score <--> imdb_votes: Similarity = 0.72
hotstar_id <--> is_hotstar: Similarity = 0.88
hotstar_id <--> show_id: Similarity = 0.60
age_rating <--> rating: Similarity = 0.70
running_time <--> duration: Similarity = 0.61


In [32]:
# BASED ON ABOVE SEMANTIC MATCHING
merged['rating'] = merged['rating'].combine_first(merged['age_rating'])
merged['age_certification'] = merged['age_certification'].combine_first(merged['age_rating'])
merged = merged.drop(columns=['age_rating'])
merged['runtime'] = merged['duration'].combine_first(merged['running_time']).combine_first(merged['runtime'])
merged = merged.drop(columns=['duration', 'running_time'])
id_columns_to_remove = [col for col in merged.columns if 'id' in col and col != 'imdb_id']
merged = merged.drop(columns=id_columns_to_remove)
merged = merged.drop(columns=['imdb_votes']) #not required

In [33]:
merged['year'] = merged['release_year'].combine_first(merged['year']).combine_first(merged['date_added'])
merged = merged.drop(columns=['release_year', 'date_added'])

if 'production_countries' in merged.columns and 'country' in merged.columns:
    merged['production_countries'] = merged['production_countries'].combine_first(merged['country'])

merged = merged.drop(columns=['country'], errors='ignore')

#after manually checking "listed_in"
if 'genres' in merged.columns and 'listed_in' in merged.columns:
    merged['genres'] = merged['genres'].combine_first(merged['listed_in'])
merged = merged.drop(columns=['listed_in'], errors='ignore')


  merged['year'] = merged['release_year'].combine_first(merged['year']).combine_first(merged['date_added'])


In [34]:
merged['rating'] = merged['rating'].combine_first(merged['age_certification'])
merged = merged.drop(columns=['age_certification'])

In [35]:
merged = merged.drop(columns=["seasons", "episodes",]) #columns not required

In [36]:
merged.columns

Index(['title', 'type', 'description', 'runtime', 'genres',
       'production_countries', 'imdb_id', 'imdb_score', 'is_amazon', 'genre',
       'year', 'is_hotstar', 'director', 'cast', 'rating', 'is_netflix'],
      dtype='object')

In [37]:
merged

Unnamed: 0,title,type,description,runtime,genres,production_countries,imdb_id,imdb_score,is_amazon,genre,year,is_hotstar,director,cast,rating,is_netflix
6663,Euphoria,MOVIE,"The story of two sisters on a journey, where t...",110 min,"['drama', 'european']","['DE', 'SE', 'GB']",tt5698320,5.8,1,Drama,2018.0,1,Valeria Golino,"Riccardo Scamarcio, Valerio Mastandrea, Isabel...",R,1
11355,Lakshya,MOVIE,"An aimless, jobless, irresponsible grown man j...",185 min,"['drama', 'war', 'action', 'romance']",['IN'],tt0323013,7.8,1,Crime,2004.0,1,Farhan Akhtar,"Amitabh Bachchan, Hrithik Roshan, Preity Zinta...",TV-PG,1
17982,Signal,SHOW,Detectives from the present and a detective f...,1 Season,"['scifi', 'thriller', 'crime', 'drama', 'fanta...",['KR'],tt5332206,8.5,1,Drama,2016.0,1,,"Kim Hye-su, Lee Je-hoon, Cho Jin-woong, Jang H...",TV-MA,1
16823,Rocky,MOVIE,"When world heavyweight boxing champion, Apollo...",120 min,"['drama', 'sport']",['US'],tt0075148,8.1,1,Action,1976.0,1,John G. Avildsen,"Sylvester Stallone, Talia Shire, Burt Young, C...",PG,1
7537,Game,MOVIE,Four strangers are invited by the reclusive Ka...,135 min,"['thriller', 'action', 'crime', 'drama', 'euro...",['IN'],tt1772872,5.2,1,Thriller,2011.0,1,Abhinay Deo,"Abhishek Bachchan, Kangana Ranaut, Anupam Kher...",TV-14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5620,Dhada,movie,"As Viswa tries to woo Rhea, the daughter of a ...",141.0,,,,,0,Action,2011.0,1,,,U/A 13+,0
5622,Dhairyam,movie,Seenu and Mallika are in love but hail from di...,147.0,,,,,0,Romance,2005.0,1,,,U/A 13+,0
5625,Dhana 51,movie,"With an aim to become an honest cop, Dhana tea...",146.0,,,,,0,Drama,2005.0,1,,,U/A 13+,0
10456,Kabaddi,movie,Praveen trains to be a kabaddi player. When th...,129.0,,,,,0,Drama,2009.0,1,,,U/A 7+,0


In [38]:
merged.to_csv(r"automated_matched2.csv")

In [39]:
import pandas as pd

# CODE TO CHECK IF DUPLICATE ROW ISSUE RESOLVED
file1 = 'automated_matched.csv' 
file2 = 'automated_matched2.csv' 

df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)

print("Size of BEFORE (rows, columns):", df1.shape)
print("Size of AFTER (rows, columns):", df2.shape)

column_name = 'title'  
if column_name in df1.columns and column_name in df2.columns:
    unique_names_df1 = df1[column_name].nunique()
    unique_names_df2 = df2[column_name].nunique()

    print(f"Number of unique names in '{column_name}' of BEFORE:", unique_names_df1)
    print(f"Number of unique names in '{column_name}' of AFTER:", unique_names_df2)
else:
    print(f"The column '{column_name}' does not exist in one or both DataFrames.")


Size of BEFORE (rows, columns): (24672, 17)
Size of AFTER (rows, columns): (24307, 17)
Number of unique names in 'title' of BEFORE: 24307
Number of unique names in 'title' of AFTER: 24307


In [42]:
nans_per_column = df2.isnull().sum()
print("NaN values per column:\n", nans_per_column)

NaN values per column:
 Unnamed: 0                  0
title                       0
type                        0
description               141
runtime                  2188
genres                   6309
production_countries     6797
imdb_id                 14278
imdb_score              14677
is_amazon                   0
genre                   17630
year                        0
is_hotstar                  0
director                18909
cast                    17238
rating                   6727
is_netflix                  0
dtype: int64


USING API FOR MISSING DATA

In [43]:
import pandas as pd
df = pd.read_csv(r"automated_matched2.csv")

In [44]:
df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')


In [45]:
df["year"]

0        2018
1        2004
2        2016
3        1976
4        2011
         ... 
24302    2011
24303    2005
24304    2005
24305    2009
24306    2022
Name: year, Length: 24307, dtype: Int64

In [46]:
nan_counts = df.isna().sum()
nan_counts

Unnamed: 0                  0
title                       0
type                        0
description               141
runtime                  2188
genres                   6309
production_countries     6797
imdb_id                 14278
imdb_score              14677
is_amazon                   0
genre                   17630
year                        0
is_hotstar                  0
director                18909
cast                    17238
rating                   6727
is_netflix                  0
dtype: int64

In [47]:
df = df.drop(columns=["Unnamed: 0"])

In [48]:
# Normalize the 'type' column
df['type'] = df['type'].str.lower().apply(
    lambda x: 'Movie' if 'movie' in x else ('Tv Show' if 'show' in x or 'tv show' in x else x.title())
)

df

Unnamed: 0,title,type,description,runtime,genres,production_countries,imdb_id,imdb_score,is_amazon,genre,year,is_hotstar,director,cast,rating,is_netflix
0,Euphoria,Movie,"The story of two sisters on a journey, where t...",110 min,"['drama', 'european']","['DE', 'SE', 'GB']",tt5698320,5.8,1,Drama,2018,1,Valeria Golino,"Riccardo Scamarcio, Valerio Mastandrea, Isabel...",R,1
1,Lakshya,Movie,"An aimless, jobless, irresponsible grown man j...",185 min,"['drama', 'war', 'action', 'romance']",['IN'],tt0323013,7.8,1,Crime,2004,1,Farhan Akhtar,"Amitabh Bachchan, Hrithik Roshan, Preity Zinta...",TV-PG,1
2,Signal,Tv Show,Detectives from the present and a detective f...,1 Season,"['scifi', 'thriller', 'crime', 'drama', 'fanta...",['KR'],tt5332206,8.5,1,Drama,2016,1,,"Kim Hye-su, Lee Je-hoon, Cho Jin-woong, Jang H...",TV-MA,1
3,Rocky,Movie,"When world heavyweight boxing champion, Apollo...",120 min,"['drama', 'sport']",['US'],tt0075148,8.1,1,Action,1976,1,John G. Avildsen,"Sylvester Stallone, Talia Shire, Burt Young, C...",PG,1
4,Game,Movie,Four strangers are invited by the reclusive Ka...,135 min,"['thriller', 'action', 'crime', 'drama', 'euro...",['IN'],tt1772872,5.2,1,Thriller,2011,1,Abhinay Deo,"Abhishek Bachchan, Kangana Ranaut, Anupam Kher...",TV-14,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24302,Dhada,Movie,"As Viswa tries to woo Rhea, the daughter of a ...",141.0,,,,,0,Action,2011,1,,,U/A 13+,0
24303,Dhairyam,Movie,Seenu and Mallika are in love but hail from di...,147.0,,,,,0,Romance,2005,1,,,U/A 13+,0
24304,Dhana 51,Movie,"With an aim to become an honest cop, Dhana tea...",146.0,,,,,0,Drama,2005,1,,,U/A 13+,0
24305,Kabaddi,Movie,Praveen trains to be a kabaddi player. When th...,129.0,,,,,0,Drama,2009,1,,,U/A 7+,0


In [49]:
df.to_csv("updated_auto2.csv")

In [53]:
#My API Key
api_key = "c0130ed03393870771f01685ef2ae381"

import requests
import pandas as pd
import time
from tqdm import tqdm

file_path = 'updated_auto.csv'
df = pd.read_csv(file_path)
# Base URL for TMDB API
base_url = 'https://api.themoviedb.org/3'
image_base_url = 'https://image.tmdb.org/t/p/w500'  # URL format for poster images

# Function to fetch details from TMDB
def fetch_tmdb_data(title, is_movie):
    search_type = 'movie' if is_movie else 'tv'
    search_url = f"{base_url}/search/{search_type}?api_key={api_key}&query={title}"
    response = requests.get(search_url)
    time.sleep(0.03)
    if response.status_code != 200:
        print(f"Failed to fetch data for '{title}'. Status Code: {response.status_code}")
        return None
    
    results = response.json().get('results', [])
    if not results:
        return None  # Return None if no results found to indicate row deletion
    
    # Return the first result's ID for further detail lookup
    return results[0]['id'], results[0].get('overview'), results[0].get('poster_path')

# Function to fetch detailed data from TMDB using movie/show ID
def fetch_detailed_tmdb_data(id, is_movie):
    detail_type = 'movie' if is_movie else 'tv'
    detail_url = f"{base_url}/{detail_type}/{id}?api_key={api_key}&append_to_response=credits"
    response = requests.get(detail_url)
    time.sleep(0.03)
    if response.status_code != 200:
        print(f"Failed to fetch detailed data for ID '{id}'. Status Code: {response.status_code}")
        return {}
    
    details = response.json()
    return details

# Initialize new column for poster URLs if it doesn't exist
if 'poster_url' not in df.columns:
    df['poster_url'] = None

# Iterate over rows with a progress bar and delay to avoid API rate limits
rows_to_delete = []  # Keep track of indices of rows to delete

for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):

    # Check for rows with missing essential information
    missing_data = pd.isnull(row[['description', 'runtime', 'genres', 'production_countries', 
                                  'imdb_id', 'imdb_score', 'genre', 'director', 'cast', 'rating', 'poster_url']])
    
    if missing_data.any():
        is_movie = row['type'].strip().lower() == 'movie'
        title = row['title']
        
        # Fetch data from TMDB
        tmdb_data = fetch_tmdb_data(title, is_movie)
        
        # If no data is found, mark row for deletion
        if not tmdb_data:
            rows_to_delete.append(index)
            continue  # Skip to the next row

        # Unpack basic details and ID
        id, overview, poster_path = tmdb_data
        
        # Update basic fields
        df.at[index, 'description'] = overview
        if poster_path:
            df.at[index, 'poster_url'] = f"{image_base_url}{poster_path}"
        
        # Fetch detailed data for the specific movie/show ID
        tmdb_details = fetch_detailed_tmdb_data(id, is_movie)
        if tmdb_details:
            # Update additional fields
            # Update the runtime field safely
            episode_run_time = tmdb_details.get('episode_run_time', [])
            df.at[index, 'runtime'] = tmdb_details.get('runtime') or (episode_run_time[0] if episode_run_time else None)
            df.at[index, 'genres'] = ', '.join([genre['name'] for genre in tmdb_details.get('genres', [])])
            df.at[index, 'production_countries'] = ', '.join([country['name'] for country in tmdb_details.get('production_countries', [])])
            df.at[index, 'imdb_id'] = tmdb_details.get('imdb_id')
            df.at[index, 'imdb_score'] = tmdb_details.get('vote_average')
            
            # Credits (cast and director)
            credits = tmdb_details.get('credits', {})
            df.at[index, 'director'] = ', '.join(crew['name'] for crew in credits.get('crew', []) if crew['job'] == 'Director')
            df.at[index, 'cast'] = ', '.join([member['name'] for member in credits.get('cast', [])[:5]])
            
            # Rating (certification) if available
            if 'content_ratings' in tmdb_details:
                us_rating = next((rating['rating'] for rating in tmdb_details['content_ratings']['results'] if rating['iso_3166_1'] == 'US'), None)
                df.at[index, 'rating'] = us_rating
            elif 'release_dates' in tmdb_details:
                us_rating = next((release['certification'] for release in tmdb_details['release_dates']['results'] if release['iso_3166_1'] == 'US' and release['release_dates']), None)
                df.at[index, 'rating'] = us_rating
        
        # Delay to respect API rate limits
        

# Delete rows marked for deletion
df.drop(index=rows_to_delete, inplace=True)

# Save the updated DataFrame to a new CSV file
output_file = 'updated_auto_filled_with_posters2.csv'
df.to_csv(output_file, index=False)
print(f"Data processing complete. The updated file has been saved as '{output_file}'.")
print(f"{len(rows_to_delete)} rows were deleted due to missing search results.")


Processing Rows:  42%|████▏     | 10280/24672 [3:43:47<5:13:17,  1.31s/it]


ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [54]:
import requests
import pandas as pd
import time
from tqdm import tqdm

# TMDB API Key
api_key = "c0130ed03393870771f01685ef2ae381"
base_url = 'https://api.themoviedb.org/3'
image_base_url = 'https://image.tmdb.org/t/p/w500'

# Function to fetch basic TMDB data
def fetch_tmdb_data(title, is_movie):
    search_type = 'movie' if is_movie else 'tv'
    search_url = f"{base_url}/search/{search_type}?api_key={api_key}&query={title}"
    response = requests.get(search_url)
    time.sleep(0.03)  # Avoid rate limits
    if response.status_code != 200:
        print(f"Failed to fetch data for '{title}'. Status Code: {response.status_code}")
        return None
    
    results = response.json().get('results', [])
    if not results:
        return None  # No results found
    
    return results[0]['id'], results[0].get('overview'), results[0].get('poster_path')

# Function to fetch detailed TMDB data
def fetch_detailed_tmdb_data(id, is_movie):
    detail_type = 'movie' if is_movie else 'tv'
    detail_url = f"{base_url}/{detail_type}/{id}?api_key={api_key}&append_to_response=credits"
    response = requests.get(detail_url)
    time.sleep(0.03)  # Avoid rate limits
    if response.status_code != 200:
        print(f"Failed to fetch detailed data for ID '{id}'. Status Code: {response.status_code}")
        return {}
    
    return response.json()

# Function to process a single row
def process_row(row):
    is_movie = row['type'].strip().lower() == 'movie'
    title = row['title']

    # Fetch basic data
    tmdb_data = fetch_tmdb_data(title, is_movie)
    if not tmdb_data:
        return None  # Mark for deletion if no data found

    # Unpack basic details and fetch detailed data
    id, overview, poster_path = tmdb_data
    tmdb_details = fetch_detailed_tmdb_data(id, is_movie)

    # Update row dictionary
    row['description'] = overview
    if poster_path:
        row['poster_url'] = f"{image_base_url}{poster_path}"
    if tmdb_details:
        # Update runtime
        episode_run_time = tmdb_details.get('episode_run_time', [])
        row['runtime'] = tmdb_details.get('runtime') or (episode_run_time[0] if episode_run_time else None)
        # Update genres and production countries
        row['genres'] = ', '.join([genre['name'] for genre in tmdb_details.get('genres', [])])
        row['production_countries'] = ', '.join([country['name'] for country in tmdb_details.get('production_countries', [])])
        # Update IMDb data
        row['imdb_id'] = tmdb_details.get('imdb_id')
        row['imdb_score'] = tmdb_details.get('vote_average')
        # Update director and cast
        credits = tmdb_details.get('credits', {})
        row['director'] = ', '.join(crew['name'] for crew in credits.get('crew', []) if crew['job'] == 'Director')
        row['cast'] = ', '.join([member['name'] for member in credits.get('cast', [])[:5]])
        # Update rating
        if 'content_ratings' in tmdb_details:
            us_rating = next((rating['rating'] for rating in tmdb_details['content_ratings']['results'] if rating['iso_3166_1'] == 'US'), None)
            row['rating'] = us_rating
        elif 'release_dates' in tmdb_details:
            us_rating = next((release['certification'] for release in tmdb_details['release_dates']['results'] if release['iso_3166_1'] == 'US' and release['release_dates']), None)
            row['rating'] = us_rating

    return row  # Return updated row

# Function to process a chunk of data
def process_chunk(chunk):
    processed_rows = []
    rows_to_delete = []

    for index, row in chunk.iterrows():
        updated_row = process_row(row)
        if updated_row is None:
            rows_to_delete.append(index)
        else:
            processed_rows.append(updated_row)

    # Return the processed rows and rows to delete
    return pd.DataFrame(processed_rows), rows_to_delete

# Main function to process the data in chunks
def process_dataframe_in_chunks(file_path, output_file, chunk_size=500):
    # Load the DataFrame
    df = pd.read_csv(file_path)

    # Initialize new column for poster URLs if it doesn't exist
    if 'poster_url' not in df.columns:
        df['poster_url'] = None

    processed_chunks = []
    rows_to_delete = []

    # Process the DataFrame in chunks
    for start in tqdm(range(0, len(df), chunk_size), desc="Processing Chunks"):
        end = min(start + chunk_size, len(df))
        chunk = df.iloc[start:end]

        # Process the chunk
        processed_chunk, chunk_deletions = process_chunk(chunk)
        processed_chunks.append(processed_chunk)
        rows_to_delete.extend(chunk_deletions)

    # Combine all processed chunks into a single DataFrame
    final_df = pd.concat(processed_chunks, ignore_index=True)

    # Drop rows marked for deletion
    final_df.drop(index=rows_to_delete, inplace=True, errors='ignore')

    # Save the final DataFrame to a new CSV file
    final_df.to_csv(output_file, index=False)
    print(f"Data processing complete. The updated file has been saved as '{output_file}'.")
    print(f"{len(rows_to_delete)} rows were deleted due to missing search results.")

# Run the processing function
file_path = 'updated_auto2.csv'
output_file = 'updated_auto_filled_with_posters_chunks.csv'
process_dataframe_in_chunks(file_path, output_file, chunk_size=1000)


Processing Chunks:  48%|████▊     | 12/25 [6:44:33<5:19:49, 1476.14s/it]

Failed to fetch detailed data for ID '720699'. Status Code: 500


Processing Chunks:  92%|█████████▏| 23/25 [13:36:51<1:20:26, 2413.16s/it]

Failed to fetch detailed data for ID '1323114'. Status Code: 404


Processing Chunks: 100%|██████████| 25/25 [14:04:26<00:00, 2026.68s/it]  


Data processing complete. The updated file has been saved as 'updated_auto_filled_with_posters_chunks.csv'.
2365 rows were deleted due to missing search results.


In [55]:
# Load the CSV file into a pandas DataFrame
file_path = 'updated_auto_filled_with_posters.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,title,type,description,runtime,genres,production_countries,imdb_id,imdb_score,is_amazon,genre,year,is_hotstar,director,cast,rating,is_netflix,poster_url
0,22,'89,Movie,Teens in cabin get murdered by cult,24.0,,,,8.3,0,,2017,0,,,TV-PG,1,
1,23,'Allo 'Allo!,Tv Show,The misadventures of hapless cafe owner René A...,30.0,"Comedy, War & Politics",United Kingdom,,7.745,1,,1984,0,,"Gorden Kaye, Carmen Silvera, Vicki Michelle, R...",,0,https://image.tmdb.org/t/p/w500/m42lb4UxkpaFil...
2,24,'Neath Brooklyn Bridge,Movie,The East Side Kids find a young girl in the ap...,61.0,"Comedy, Drama, Romance, War",United States of America,tt0034420,6.8,1,,1942,0,Wallace Fox,"Leo Gorcey, Bobby Jordan, Huntz Hall, Gabriel ...",,0,https://image.tmdb.org/t/p/w500/xNrOAj4rrD1FNY...
3,25,'Neath Canadian Skies,Movie,Canadian Mountie investigates a murder posing ...,41.0,"Crime, Western",United States of America,tt0038277,4.0,1,,1946,0,B. Reeves Eason,"Russell Hayden, Inez Cooper, Douglas Fowley, C...",,0,https://image.tmdb.org/t/p/w500/ghWMBGUF1LLfYR...
4,26,'Neath the Arizona Skies,Movie,"Chris Morrell, the guardian of half-Indian gir...",52.0,"Action, Western",United States of America,tt0024805,4.5,1,,1934,0,Harry L. Fraser,"John Wayne, Sheila Terry, Shirley Jean Rickert...",,0,https://image.tmdb.org/t/p/w500/nc6apiqjIWP4Cd...


In [56]:

# Delete the 'genre' column from the DataFrame
df = df.drop(columns=['genre'], errors='ignore')

# Delete rows where 'poster_url' column has empty values
df = df[df['poster_url'].notna() & (df['poster_url'] != '')]

# Save the updated DataFrame to a new CSV file
output_file_path = 'final_data_fixed.csv'
df.to_csv(output_file_path, index=False)


In [58]:
df.shape

(21053, 17)