In [1]:
import pandas

In [None]:
from tqdm.auto import tqdm

In [33]:
df = pandas.read_csv('../data/movies_2024-09-10.csv', keep_default_na=False)

In [34]:
drop_col = ['recommendations', 'backdrop_path', 'poster_path']

In [35]:
df.drop(drop_col, axis=1, inplace=True)

In [36]:
df.dtypes

id                        int64
title                    object
genres                   object
original_language        object
overview                 object
popularity              float64
production_companies     object
release_date             object
budget                  float64
revenue                 float64
runtime                  object
status                   object
tagline                  object
vote_average            float64
vote_count              float64
credits                  object
keywords                 object
dtype: object

In [47]:
documents = df.to_dict(orient='records')

In [45]:
df.release_date.replace('', '1970-01-01', inplace=True)

In [48]:
documents[500]

{'id': 967493,
 'title': 'Inside The Walking Dead Season 11 (Part 1)',
 'genres': '',
 'original_language': 'en',
 'overview': 'Join host Clarke Wolfe for the ultimate backstage pass to the final season of The Walking Dead (Part 1) featuring never-before-seen footage access to your favorite cast and crew and sneak peeks of what’s coming next.',
 'popularity': 114.827,
 'production_companies': '',
 'release_date': '1970-01-01',
 'budget': 0.0,
 'revenue': 0.0,
 'runtime': '0.0',
 'status': 'Released',
 'tagline': 'Join host Clarke Wolfe for the ultimate backstage pass to the final season of The Walking Dead (Part 1), featuring never-before-seen footage, access to your favorite cast and crew, and sneak peeks of what’s coming next.',
 'vote_average': 10.0,
 'vote_count': 4.0,
 'credits': 'Clarke Wolfe',
 'keywords': ''}

In [49]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')

In [50]:
es_client.indices.delete(index='movies')

ObjectApiResponse({'acknowledged': True})

In [51]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "id": {"type": "integer", "null_value": 0},
            "title": {"type": "text"},
            "genres": {"type": "text"},
            "original_language": {"type": "keyword"},
            "overview": {"type": "text"},
            "popularity": {"type": "float"},
            "production_companies": {"type": "text"},
            "release_date": {"type": "date", "format": "yyyy-MM-dd", "null_value": "1970-01-01"},
            "budget": {"type": "float"},
            "revenue": {"type": "float"},
            "runtime": {"type": "float"},
            "status": {"type": "keyword"},
            "tagline": {"type": "text"},
            "vote_average": {"type": "float"},
            "vote_count": {"type": "float"},
            "credits": {"type": "text"},
            "keywords": {"type": "text"},
        }
    }
}



index_name = "movies"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'})

In [52]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 3294/722359 [03:16<11:54:06, 16.78it/s]


KeyboardInterrupt: 

In [44]:
es_client.index(index=index_name, document=documents[500])

BadRequestError: BadRequestError(400, 'mapper_parsing_exception', "failed to parse field [release_date] of type [date] in document with id 'vnGp25EBRgEXlikCwjUp'. Preview of field's value: ''")

{'id': 967493,
 'title': 'Inside The Walking Dead Season 11 (Part 1)',
 'genres': '',
 'original_language': 'en',
 'overview': 'Join host Clarke Wolfe for the ultimate backstage pass to the final season of The Walking Dead (Part 1) featuring never-before-seen footage access to your favorite cast and crew and sneak peeks of what’s coming next.',
 'popularity': 114.827,
 'production_companies': '',
 'release_date': '',
 'budget': 0.0,
 'revenue': 0.0,
 'runtime': '0.0',
 'status': 'Released',
 'tagline': 'Join host Clarke Wolfe for the ultimate backstage pass to the final season of The Walking Dead (Part 1), featuring never-before-seen footage, access to your favorite cast and crew, and sneak peeks of what’s coming next.',
 'vote_average': 10.0,
 'vote_count': 4.0,
 'credits': 'Clarke Wolfe',
 'keywords': ''}

In [None]:
es_client.index(index=index_name, document=documents[12])

In [10]:
documents[54]

{'id': 862553,
 'title': 'Bar Fight',
 'genres': 'Comedy',
 'original_language': 'en',
 'overview': 'Nina and Allen split everything in their lives 50/50 after their break-up to avoid any drama—everything that is except for their favorite local bar. Nina and Allen must compete in a ridiculous tavern-style custody battle for their prized watering hole in which lines will be drawn sides will be chosen and beers will be drunken.',
 'popularity': 626.541,
 'production_companies': nan,
 'release_date': '2022-11-11',
 'budget': 0.0,
 'revenue': 0.0,
 'runtime': 84.0,
 'status': 'Released',
 'tagline': 'Time to settle the score one drink at a time.',
 'vote_average': 7.3,
 'vote_count': 18.0,
 'credits': 'Melissa Fumero-Luka Jones-Rachel Bloom-Julian Gant-Dot-Marie Jones-Vik Sahay',
 'keywords': nan}

In [12]:
import math

def preprocess_data(item):
    processed = {}
    for key, value in item.items():
        if isinstance(value, float) and math.isnan(value):
            if key in ['tagline', 'keywords']:
                processed[key] = None
            elif key in ['budget', 'revenue', 'runtime', 'vote_average']:
                processed[key] = 0.0
            elif key == 'vote_count':
                processed[key] = 0
        elif key in ['genres', 'production_companies', 'credits'] and isinstance(value, str):
            processed[key] = value.split('-')
        else:
            processed[key] = value
    return processed

# Example usage:
original_item = {'id': 956101, 'title': 'The Eighth Clause', 'genres': 'Thriller', 'original_language': 'la', 'overview': 'Kat and Borja appear to be a perfect couple but as in every marriage they keep secrets lies and infidelities that will come to light the night an unexpected visitor arrives.', 'popularity': 2259.303, 'production_companies': 'SDB Films-El Hombre Orquesta', 'release_date': '2022-04-29', 'budget': 0.0, 'revenue': 0.0, 'runtime': 0.0, 'status': 'Released', 'tagline': float('nan'), 'vote_average': 4.6, 'vote_count': 10.0, 'credits': 'Maite Perroni-Manuel Vega-Óscar Jaenada-Jessica Coch-Paulina Dávila-Christian Meier-Michel Duval-Mario Tardón', 'keywords': float('nan')}

processed_item = preprocess_data(original_item)
print(processed_item)

{'id': 956101, 'title': 'The Eighth Clause', 'genres': ['Thriller'], 'original_language': 'la', 'overview': 'Kat and Borja appear to be a perfect couple but as in every marriage they keep secrets lies and infidelities that will come to light the night an unexpected visitor arrives.', 'popularity': 2259.303, 'production_companies': ['SDB Films', 'El Hombre Orquesta'], 'release_date': '2022-04-29', 'budget': 0.0, 'revenue': 0.0, 'runtime': 0.0, 'status': 'Released', 'tagline': None, 'vote_average': 4.6, 'vote_count': 10.0, 'credits': ['Maite Perroni', 'Manuel Vega', 'Óscar Jaenada', 'Jessica Coch', 'Paulina Dávila', 'Christian Meier', 'Michel Duval', 'Mario Tardón'], 'keywords': None}
