In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from PyMovieDb import IMDB
import json
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup

# Notebook-wide plotting defaults: large figures and a larger base font.
plt.rcParams['figure.figsize'] = (15, 10)
plt.rcParams['font.size'] = 16

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

In [2]:
# Genre names that are scraped, in the order they are processed.
genres = [
    'Action', 'Adventure', 'Animation', 'Biography',
    'Comedy', 'Crime', 'Documentary', 'Drama',
    'Family', 'Fantasy', 'Film Noir', 'History',
    'Horror', 'Music', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Short Film', 'Sport',
    'Superhero', 'Thriller', 'War', 'Western',
]

# Shared PyMovieDb client used by the scraping cells.
imdb = IMDB()

# IDs of movies already collected; get_genre_movies filters against this set
# and adds to it, so a movie is kept only for the first genre that returns it.
processed_ids = set()

# Upper bound on the number of result pages requested per genre.
max_requests = 50

In [19]:
def get_genre_movies(genre, max_requests=max_requests):
    """Collect popular movies of one genre that have not been collected yet.

    Requests pages from ``imdb.popular_movies`` (start ids 1, 51, 101, ...)
    until more than 250 new movies have been gathered or ``max_requests``
    pages have been requested. Movies whose id is already in the module-level
    ``processed_ids`` set are dropped; the ids of the returned movies are then
    added to that set.

    Parameters
    ----------
    genre : str
        Genre name forwarded to ``imdb.popular_movies``.
    max_requests : int
        Maximum number of pages to request for this genre.

    Returns
    -------
    pandas.DataFrame
        One row per collected movie plus a ``genre`` column. When nothing
        could be collected the frame has no rows (the original raised
        ``KeyError`` on ``res['id']`` in that case).
    """
    page_size = 50
    collected = []
    page = 0
    while len(collected) <= 250 and page < max_requests:
        try:
            movies = imdb.popular_movies(genre=genre, start_id=page * page_size + 1)
            # processed_ids does not change inside this loop, so filtering
            # each page as it arrives is equivalent to re-filtering the
            # accumulated result after every page.
            collected.extend(
                movie for movie in json.loads(movies)['results']
                if movie.get('id') not in processed_ids
            )
        except Exception as e:
            # Best effort: report the failed page and move on to the next one.
            print(e)
        page += 1

    # Build the frame once; DataFrame.append no longer exists in pandas >= 2.0
    # and re-creating the frame on every page was wasteful anyway.
    res = pd.DataFrame(collected)
    if 'id' in res.columns:
        processed_ids.update(res['id'].to_list())
    res['genre'] = genre
    return res

def get_all_movies(genres):
    """Scrape every genre and stack the per-genre results into one frame.

    Parameters
    ----------
    genres : iterable of str
        Genre names, each passed to ``get_genre_movies``.

    Returns
    -------
    pandas.DataFrame
        The per-genre frames concatenated in input order with a fresh
        0..n-1 index; an empty frame when ``genres`` is empty.
    """
    frames = [get_genre_movies(genre) for genre in tqdm(genres, desc='genres')]
    if not frames:
        # pd.concat raises ValueError on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # One concat at the end instead of growing the frame inside the loop
    # (DataFrame.append was removed in pandas 2.0).
    return pd.concat(frames, ignore_index=True)

In [109]:
# Scrape every genre in `genres`; the recorded run below took about 14 minutes.
df = get_all_movies(genres)

genres: 100%|███████████████████████████████████| 24/24 [14:19<00:00, 35.80s/it]


In [114]:
# Checkpoint the raw scrape so later cells can start from disk instead of re-scraping.
df.to_csv('movies_draft.csv')

In [5]:
# Reload the draft scrape; the first CSV column is used as the index.
df = pd.read_csv('movies_draft.csv', index_col=0)

In [6]:
# Display the loaded frame (the rendered output abbreviates the middle rows).
df

Unnamed: 0,id,name,year,url,poster,genre
0,tt1016150,Im Westen nichts Neues,2022.0,https://www.imdb.com/title/tt1016150/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BYTE1Mm...,Action
1,tt6443346,Qora Adam,2022.0,https://www.imdb.com/title/tt6443346/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BYzZkOG...,Action
2,tt1630029,Avatar: The Way of Water,2022.0,https://www.imdb.com/title/tt1630029/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BM2VhYj...,Action
3,tt9114286,Qora Pantera 2,2022.0,https://www.imdb.com/title/tt9114286/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BNTM4Nj...,Action
4,tt14641788,Enola Holmes 2,2022.0,https://www.imdb.com/title/tt14641788/?ref_=ad...,https://m.media-amazon.com/images/M/MV5BMDI1NW...,Action
...,...,...,...,...,...,...
6245,tt0049593,Pardners,1956.0,https://www.imdb.com/title/tt0049593/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BZDYyZm...,Western
6246,tt0050923,Saddle the Wind,1958.0,https://www.imdb.com/title/tt0050923/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BOTEyND...,Western
6247,tt0065026,Gli specialisti,1969.0,https://www.imdb.com/title/tt0065026/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BZGQxNW...,Western
6248,tt0064379,The Good Guys and the Bad Guys,1969.0,https://www.imdb.com/title/tt0064379/?ref_=adv...,https://m.media-amazon.com/images/M/MV5BZGIxMD...,Western


In [9]:
# Start with an all-None description column for the cells below to fill.
# NOTE: re-running this cell resets every description to None.
df['description'] = None

### Fetch movie descriptions: first with PyMovieDb, then with requests + BeautifulSoup for any that are still missing

In [25]:
# First pass: look up each movie by id through PyMovieDb.
# Iterating over df.index (rather than range(len(df))) keeps the .loc lookups
# valid even when the frame does not have a default 0..n-1 index.
for i in tqdm(df.index, desc='movies'):
    # Skip rows that already have a description so the cell can be re-run to
    # resume. pd.isna covers both None and the NaN that a CSV reload produces.
    if not pd.isna(df.loc[i, 'description']):
        continue
    try:
        movie = json.loads(imdb.get_by_id(df.loc[i, 'id']))
        df.loc[i, 'description'] = movie.get('description', None)
    except Exception as e:
        # One bad lookup must not abort a multi-hour run; report it and leave
        # the description missing.
        print(f"{df.loc[i, 'id']}: {e}")

movies: 100%|█████████████████████████████| 6250/6250 [3:43:38<00:00,  2.15s/it]


In [26]:
# Checkpoint the descriptions gathered so far.
df.to_csv('movies_with_descriptions.csv')

In [76]:
# Second pass: for rows still without a description, download the movie page
# and read its <meta name="description"> tag.
for i in tqdm(df.index, desc='movies'):
    # pd.isna covers both None and the NaN that a CSV reload produces.
    if not pd.isna(df.loc[i, 'description']):
        continue
    try:
        # A timeout keeps one stalled connection from hanging the whole loop.
        response = requests.get(df.loc[i, 'url'], timeout=30)
    except requests.RequestException as e:
        print(f"{df.loc[i, 'id']}: {e}")
        continue
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    meta = soup.find('meta', {'name': 'description'})
    # find() returns None when the tag is absent; leave the description missing then.
    if meta is not None:
        df.loc[i, 'description'] = meta.get('content', None)

movies: 100%|█████████████████████████████| 6250/6250 [1:27:30<00:00,  1.19it/s]


In [77]:
# Overwrite the checkpoint with the descriptions added by the HTML pass.
df.to_csv('movies_with_descriptions.csv')

In [3]:
# Reload the scraped movies with descriptions; the first CSV column is used as the index.
df = pd.read_csv('movies_with_descriptions.csv', index_col=0)

In [7]:
# Poster URL of the first movie whose description contains the given text.
# na=False treats missing descriptions as non-matches (otherwise the mask
# contains NaN and boolean indexing raises); regex=False matches the text literally.
df[
    df['description'].str.contains(
        'The story of Michelle Payne, the first ', na=False, regex=False
    )
].iloc[0].poster

'https://m.media-amazon.com/images/M/MV5BNWJlNzA4OGQtYjJjNS00ZDljLTgwOGEtYzU4ODhiMDFhYTllXkEyXkFqcGdeQXVyNTE1NjY5Mg@@._V1_UY98_CR0,0,67,98_AL_.jpg'

In [60]:
# Number of collected movies per genre, listed alphabetically by genre name.
df['genre'].value_counts().sort_index()

Action         300
Adventure      259
Animation      281
Biography      293
Comedy         257
Crime          297
Documentary    288
Drama          269
Family         274
Fantasy        288
Film Noir      256
History        298
Horror         260
Music          288
Musical        269
Mystery        258
Romance        260
Sci-Fi         272
Short Film     234
Sport          259
Superhero      259
Thriller       255
War            257
Western        278
Name: genre, dtype: int64