# Index building, ideas and setup, actual code in the .py files

# Imports and setup

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from functools import reduce
import itertools

from bs4 import BeautifulSoup
from requests import get
import re

# Getting the 1000 movie dataset

1000 movies and 50 movies per page = 20 pages total

In [2]:
NUM_PAGES = 20

Attributes of the movie to get: 
* Movie title
* Genre
* Director name + stars

In [3]:
base_url = 'https://www.imdb.com/search/title?groups=top_1000&sort=user_rating&page={0}'

In [4]:
all_movie_info = []
for i in tqdm(range(1,NUM_PAGES+1)):
    # get url
    url = base_url.format(i)
    
    response = get(url)

    html_soup = BeautifulSoup(response.text,'html.parser')    
    movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
    
    # process all the movies for this page
    for j,movie in enumerate(movie_containers):
        movie_res = {}
        
        # title
        movie_title = movie.h3.a.text
        movie_res['title_pretty'] = movie_title
        movie_res['title_lower'] = movie_title.lower()

        # genres
        raw_genre_list = movie.find('p', class_ = 'text-muted').\
            find('span', class_ = 'genre').text[1:].split(',')
        movie_res['genres'] = list(map(lambda x: x.lower().lstrip().rstrip(),raw_genre_list))


        director_stars_info = movie.findAll('p', class_='')[-1]
        if 'Director' not in director_stars_info.text or 'Stars' not in director_stars_info.text:
            print(f'''i, j, movie = {(i,j,movie_title)}. Could not find Directors or Stars''')

        # Get people info, no need to distinguish director from stars
        people = list(map(lambda x: x.text.lower(), director_stars_info.findAll('a')))

        movie_res['people'] = people
        
        all_movie_info.append(movie_res)

100%|██████████| 20/20 [00:24<00:00,  1.22s/it]


In [5]:
df_movies_raw = pd.DataFrame.from_dict(all_movie_info)

In [6]:
df_movies_raw.notnull().sum()

genres          1000
people          1000
title_lower     1000
title_pretty    1000
dtype: int64

In [7]:
df_movies_raw.head()

Unnamed: 0,genres,people,title_lower,title_pretty
0,[drama],"[frank darabont, tim robbins, morgan freeman, ...",the shawshank redemption,The Shawshank Redemption
1,"[crime, drama]","[francis ford coppola, marlon brando, al pacin...",the godfather,The Godfather
2,"[action, crime, drama]","[christopher nolan, christian bale, heath ledg...",the dark knight,The Dark Knight
3,"[crime, drama]","[francis ford coppola, al pacino, robert de ni...",the godfather: part ii,The Godfather: Part II
4,"[crime, drama]","[quentin tarantino, john travolta, uma thurman...",pulp fiction,Pulp Fiction


# Build index

How many unique words are there?

In [8]:
unique_words = set()

for g_list in df_movies_raw.genres:
    for g in g_list:
        unique_words.add(g)

print(len(unique_words))
            
for p_list in df_movies_raw.people:
    for name in p_list:
        for name_part in name.split(' '):
            unique_words.add(name_part)

print(len(unique_words))
                
for title in df_movies_raw.title_lower:
    for word in title.split(' '):
        word = re.sub(r'[^\w\s]','',word)
        if word:
            unique_words.add(word)
        
print(len(unique_words))

21
4316
5680


5k keywords should be ok for an index that I'm planning. Plan for index:
* keywords are parsed from genre, people, and titles
* for each keyword, get list of indices (from results above) of titles with info containing this keyword

In [9]:
keyword_map = {}
def add_to_dict(k_map,itm,i):
    if itm not in keyword_map.keys():
        k_map[itm] = {i}
    else:
        k_map[itm].add(i)

In [10]:
for i,row in tqdm(df_movies_raw.iterrows()):
    for g in row['genres']:
        add_to_dict(keyword_map,g,i)
    
    for name in row['people']:
        for name_part in name.split(' '):
            add_to_dict(keyword_map,name_part,i)
    
    for word in row['title_lower'].split(' '):
        word = re.sub(r'[^\w\s]','',word)
        if word:
            add_to_dict(keyword_map,word,i)

1000it [00:00, 8164.99it/s]


In [11]:
titles = df_movies_raw.title_pretty.values

In [12]:
titles[list(keyword_map['nolan'])]

array(['The Dark Knight', 'The Prestige', 'Dunkirk', 'Inception',
       'Memento', 'Following', 'Batman Begins', 'Interstellar',
       'The Dark Knight Rises'], dtype=object)

In [13]:
def get_movies(search_str):
    list_of_terms = search_str.lower().split(' ')
    movies_to_intersect = []
    for term in list_of_terms:
        candidate_set = keyword_map.get(term,None)
        if candidate_set:
            movies_to_intersect.append(candidate_set)

    final_indices = reduce(lambda x,y: x & y, movies_to_intersect)

    return titles[list(final_indices)]

In [14]:
search_str = 'spielberg hanks '
get_movies(search_str)

array(['Bridge of Spies', 'Saving Private Ryan', 'Catch Me If You Can'],
      dtype=object)