In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import re
import urllib.request
from bs4 import BeautifulSoup
import csv

# Matching IMDb ID to Movielens 100K dataset by movielens title

## IMDb dataset

In [None]:
# Fetch the IMDb "title basics" dump and unpack it next to the notebook.
# -O pins the output name so a re-run overwrites the archive instead of creating
# 'title.basics.tsv.gz.1'; -f lets gunzip replace an existing title.basics.tsv
# (without it a second run stops because the target already exists), and -k
# keeps the downloaded archive.
!wget -O title.basics.tsv.gz https://datasets.imdbws.com/title.basics.tsv.gz
!gunzip -kf title.basics.tsv.gz

In [None]:
# Show the first lines of the unpacked TSV to check its header and column layout.
!head title.basics.tsv

In [None]:
# Every column is read as text: IMDb writes the literal string '\N' for missing
# values even in numeric-looking columns (see endYear / runtimeMinutes in the
# preview), and startYear is later compared against a string release year.
imdb_dtype={
  'tconst': str,
  'titleType': str,
  'primaryTitle': str,
  'originalTitle': str,
  'isAdult': str,
  'startYear': str,
  'endYear': str,
  'runtimeMinutes': str,
  'genres': str
}

# The IMDb dumps are UTF-8 encoded. Decoding them as latin-1 turns every
# accented character into mojibake, so such titles can never equal the
# MovieLens titles they are matched against.
# QUOTE_NONE: the file is plain tab-separated with no quoting convention, so a
# double quote inside a title must be treated as an ordinary character rather
# than as the start of a quoted (possibly multi-line) field.
imdb = pd.read_csv('title.basics.tsv',
                   sep='\t',
                   encoding='utf-8',
                   quoting=csv.QUOTE_NONE,
                   dtype=imdb_dtype
                   )

In [None]:
# Display the loaded IMDb table (the notebook renders a truncated head/tail view).
imdb

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
8370243,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
8370244,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
8370245,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
8370246,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


## Movielens 100k dataset

In [None]:
# Download and unpack the MovieLens 100K archive.
# -O keeps the archive name stable across re-runs (plain wget would save a
# second copy as 'ml-100k.zip.1'); -o makes unzip overwrite already extracted
# files instead of stopping at an interactive "replace?" prompt.
!wget -O ml-100k.zip https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

In [None]:
# !head ml-100k/u.item

In [None]:
# Load the MovieLens item table (pipe-separated, no header row).
names = ['id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror', 'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western']

# Free-text columns are parsed as str; the id and the 0/1 genre indicator
# columns are parsed as int.
text_columns = ('title', 'release_date', 'video_release_date', 'imdb_url')
movies_dtype = {col: (str if col in text_columns else int) for col in names}

# Read the file and key the frame by the MovieLens id.
movies = pd.read_csv(
    'ml-100k/u.item',
    names=names,
    dtype=movies_dtype,
    sep='|',
    encoding='latin-1',
).set_index('id')

In [None]:
# Display the MovieLens item table as loaded, indexed by MovieLens id.
movies

Unnamed: 0_level_0,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# !grep 'movie.*Four Rooms.*1995' title.basics.tsv

## Extract movie title and release year from original movielens title

In [None]:
# Create the two derived columns as empty strings; rows whose title cannot be
# parsed by the next step keep these empty values.
for derived_column in ('title2', 'release_year'):
  movies[derived_column] = ''

In [None]:
# Split a MovieLens title such as 'Seven (Se7en) (1995)' into its parts:
#   group 1 - the text before the first parenthesis ('Seven ')
#   group 2 - the digits inside the last ' (....)' group ('1995')
# The pattern is a raw string so that '\(' and '\d' reach the regex engine
# unchanged; in a normal string literal they are invalid escape sequences that
# newer Python versions warn about.
TITLE_YEAR_RE = re.compile(r'([^()]*).* \((\d*)\)')

# Only the title column is needed, so iterate over that Series instead of
# materialising every full row with iterrows().
for ind, raw_title in movies['title'].items():
  match = TITLE_YEAR_RE.search(str(raw_title))
  if match:
    # Collapse double spaces; leading/trailing blanks are stripped in the next cell.
    movies.at[ind, 'title2'] = match.group(1).replace('  ', ' ')
    # Kept as a string because it is compared with IMDb's string startYear.
    movies.at[ind, 'release_year'] = match.group(2)

In [None]:
# Trim surrounding whitespace left over from the title/year split.
movies['title2'] = movies['title2'].astype(str).str.strip()

In [None]:
# Convert title2 from form of 'Usual Suspects, The' to 'The Usual Suspects'.
#
# Only a known article standing alone after the LAST comma is moved to the
# front. Splitting on the first comma and moving everything behind it scrambles
# titles that merely contain a comma, e.g.
# 'To Wong Foo, Thanks for Everything! Julie Newmar' must stay as it is.
# The article list covers English plus a few common foreign articles; extend it
# if further catalogue-style titles turn up among the unmatched rows.
TRAILING_ARTICLE_RE = r'^(.*), (The|An|A|Il|Lo|Le|La|Les|Un|Une|El|Los|Las|Der|Die|Das)\s*$'

movies['title2'] = movies['title2'].str.replace(
    TRAILING_ARTICLE_RE,
    r'\2 \1',            # '<article> <rest of title>'
    regex=True,
    flags=re.IGNORECASE,
)

In [None]:
# Display the table again to inspect the derived title2 / release_year columns.
movies

Unnamed: 0_level_0,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western,title2,release_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Toy Story,1995
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,GoldenEye,1995
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,Four Rooms,1995
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,Get Shorty,1995
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,Copycat,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,Mat' i syn,1997
1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,B. Monkey,1998
1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,Sliding Doors,1998
1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,You So Crazy,1994


## Matching IMDb ID by title and release year

In [None]:
# Start with an empty IMDb id for every movie; '' marks "no match found" and is
# what the later unmatched-rows filter looks for.
movies['imdb_id']=''

In [None]:
# i=0
# for ind, m in movies.iterrows():
#   if i >= 10:
#     break
#   imdb_row = !grep 'movie.*$m['title2'].*$m['release_year']' title.basics.tsv | head -n 1
#   if len(imdb_row) > 0:
#     match = re.search('(tt\d*)', imdb_row[0])
#     if match:
#       movies.at[ind,'imdb_id'] = match.group(1)
#   i+=1

In [None]:
# Look up an IMDb id (tconst) for every MovieLens movie by exact title + year.
#
# The titleType restriction does not depend on the movie being looked up, so it
# is applied once up front; the year pre-filter likewise discards every IMDb
# row that cannot match any MovieLens release year. Each per-movie comparison
# then scans this much smaller candidate table instead of the full dump.
imdb_candidates = imdb[
    imdb['titleType'].isin(['movie', 'short'])
    & imdb['startYear'].isin(movies['release_year'].unique())
]

for ind, m in movies.iterrows():
  # A title matches if it equals either the primary or the original IMDb title.
  same_title = (imdb_candidates['primaryTitle'] == m['title2']) | (imdb_candidates['originalTitle'] == m['title2'])
  same_year = imdb_candidates['startYear'] == m['release_year']
  hits = imdb_candidates.loc[same_title & same_year, 'tconst']
  if len(hits) > 0:
    # Several IMDb rows may match; keep the first one in file order.
    movies.at[ind, 'imdb_id'] = hits.iloc[0]

In [None]:
# List the movies for which the automatic title/year lookup found no IMDb id.
movies[movies['imdb_id'] == '']

Unnamed: 0_level_0,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western,title2,release_year,imdb_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
11,Seven (Se7en) (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Se7en%20(1995),0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,Seven,1995,
14,"Postino, Il (1994)",01-Jan-1994,,"http://us.imdb.com/M/title-exact?Postino,%20Il...",0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,Il Postino,1994,
40,"To Wong Foo, Thanks for Everything! Julie Newm...",01-Jan-1995,,http://us.imdb.com/M/title-exact?To%20Wong%20F...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Thanks for Everything! Julie Newmar To Wong Foo,1995,
44,Dolores Claiborne (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Dolores%20Cla...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,Dolores Claiborne,1994,
55,"Professional, The (1994)",01-Jan-1994,,http://us.imdb.com/Title?L%E9on+(1994),0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,The Professional,1994,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659,Getting Away With Murder (1996),12-Apr-1996,,http://us.imdb.com/Title?Getting+Away+With+Mur...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Getting Away With Murder,1996,
1667,"Next Step, The (1995)",13-Jun-1997,,http://us.imdb.com/M/title-exact?Next%20Step%2...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,The Next Step,1995,
1671,"Further Gesture, A (1996)",20-Feb-1998,,http://us.imdb.com/M/title-exact?Further+Gestu...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,A Further Gesture,1996,
1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,Mat' i syn,1997,


In [None]:
# imdb[(re.search('The Usual Suspects', str(imdb['primaryTitle'])) != None) & (imdb['titleType'] == 'movie') & (imdb['startYear'] == str(1995))] 
# !grep 'movie.*Shanghai Triad.*1995' title.basics.tsv
# movies[movies['title2'] == 'The Horseman on the Roof']

## Saving the results

In [None]:
# Copy the MovieLens id from the index into a regular column so that it can be
# selected alongside the other columns when exporting.
movies['id'] = movies.index.to_numpy()

In [None]:
# Write the automatic matching result: one row per movie, no index column.
movies.to_csv(
    'imdb_id_auto.csv',
    columns=['id', 'title2', 'release_year', 'imdb_id'],
    index=False,
)

# Matching IMDb URL to Movielens 100K from several sources

## Original Movielens 100K

In [None]:
# Fetch a fresh copy of the MovieLens 100K archive and unpack it.
# rm -f: no error when the archive is not there yet.
# wget -O: always write to 'ml-100k.zip' (otherwise an existing file makes wget
#          save to 'ml-100k.zip.1' and the stale archive gets unpacked).
# unzip -o: overwrite extracted files without the interactive "replace?" prompt.
!rm -f ml-100k.zip
!wget -O ml-100k.zip https://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

--2021-11-04 16:51:28--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.1’


2021-11-04 16:51:29 (9.97 MB/s) - ‘ml-100k.zip.1’ saved [4924029/4924029]

Archive:  ml-100k.zip
replace ml-100k/allbut.pl? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100

In [None]:
# !head ml-100k/u.item

In [None]:
# Load the MovieLens item table again, this time without its original imdb_url
# column (URLs are merged in from other sources below).
names = ['id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror', 'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western']

# str for the free-text columns, int for the id and the genre indicator columns.
movies_dtype = {
    col: (str if col in ('title', 'release_date', 'video_release_date', 'imdb_url') else int)
    for col in names
}

movies = (
    pd.read_csv(
        'ml-100k/u.item',
        names=names,
        dtype=movies_dtype,
        sep='|',
        encoding='latin-1',
    )
    .set_index('id')
    .drop(columns=['imdb_url'])
)

# Report how many movies were loaded.
print('\nmovies:', len(movies), sep='\n')


movies:
1682


In [None]:
# Display the reloaded item table (imdb_url column removed).
movies

Unnamed: 0_level_0,title,release_date,video_release_date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,Toy Story (1995),01-Jan-1995,,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,Mat' i syn (1997),06-Feb-1998,,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1679,B. Monkey (1998),06-Feb-1998,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1680,Sliding Doors (1998),01-Jan-1998,,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1681,You So Crazy (1994),01-Jan-1994,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


## IMDb URLs from babu-thomas's Github repository

In [None]:
# Fetch and unpack the babu-thomas/movielens-posters repository snapshot.
# rm -f: no error when the archive is absent.
# wget -O: always write to 'master.zip' instead of 'master.zip.1' on re-runs.
# unzip -o: overwrite previously extracted files without prompting.
!rm -f master.zip
!wget -O master.zip https://github.com/babu-thomas/movielens-posters/archive/refs/heads/master.zip
!unzip -o master.zip

--2021-11-04 16:51:54--  https://github.com/babu-thomas/movielens-posters/archive/refs/heads/master.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/babu-thomas/movielens-posters/zip/refs/heads/master [following]
--2021-11-04 16:51:55--  https://codeload.github.com/babu-thomas/movielens-posters/zip/refs/heads/master
Resolving codeload.github.com (codeload.github.com)... 140.82.113.9
Connecting to codeload.github.com (codeload.github.com)|140.82.113.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘master.zip.1’

master.zip.1            [ <=>                ] 105.60K   579KB/s    in 0.2s    

2021-11-04 16:51:55 (579 KB/s) - ‘master.zip.1’ saved [108133]

Archive:  master.zip
a55c9ae9bd0545425a2bee7477bdc96fb01289e9
replace movielens-posters-master/.gitignore?

In [None]:
# Load the (movielens_id, imdb_url) pairs shipped with the posters repository.
names = ['movielens_id', 'imdb_url']
url_dtype = dict(movielens_id=int, imdb_url=str)

urls = pd.read_csv(
    'movielens-posters-master/movie_url.csv',
    names=names,
    dtype=url_dtype,
    sep=',',
    encoding='latin-1',
).set_index('movielens_id')

# Report how many URLs this source provides.
print('urls:', len(urls), sep='\n')

urls:
1640


## IMDb IDs from auto detection using Movielens titles and release years

In [None]:
# Replace any local imdb_id_auto.csv (e.g. the one written by the matching
# section above) with the published copy.
# rm -f: stay silent when the file does not exist yet.
!rm -f imdb_id_auto.csv
!wget https://csukas.org/dipterv1/imdb_id_auto.csv

--2021-11-04 16:52:46--  https://csukas.org/dipterv1/imdb_id_auto.csv
Resolving csukas.org (csukas.org)... 5.56.38.113
Connecting to csukas.org (csukas.org)|5.56.38.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57650 (56K) [text/csv]
Saving to: ‘imdb_id_auto.csv’


2021-11-04 16:52:49 (172 KB/s) - ‘imdb_id_auto.csv’ saved [57650/57650]



In [None]:
# Load the automatically matched IMDb ids; the file's own header row is skipped
# and replaced by the explicit column names below.
names = ['id', 'title2', 'release_year', 'imdb_id']
# The id is an int, every other column is text.
imdb_id_dtype = {'id': int, **dict.fromkeys(names[1:], str)}

imdb_ids = pd.read_csv(
    'imdb_id_auto.csv',
    names=names,
    skiprows=1,
    dtype=imdb_id_dtype,
    sep=',',
    encoding='utf-8',
).set_index('id')

# Report how many rows were loaded.
print('imdb_ids:', len(imdb_ids), sep='\n')

imdb_ids:
1682


## Merge datasets and generate IMDb URLs

In [None]:
# Put the three tables side by side, aligned on their shared movie-id index.
source_frames = [movies, urls, imdb_ids]
df = pd.concat(source_frames, axis='columns')

In [None]:
# Display the automatically matched ids (NaN in imdb_id means no match).
imdb_ids

Unnamed: 0_level_0,title2,release_year,imdb_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,1995,tt0114709
2,GoldenEye,1995,tt0113189
3,Four Rooms,1995,tt0113101
4,Get Shorty,1995,tt0113161
5,Copycat,1995,tt0112722
...,...,...,...
1678,Mat' i syn,1997,
1679,B. Monkey,1998,tt0120594
1680,Sliding Doors,1998,tt0120148
1681,You So Crazy,1994,


In [None]:
# Where the posters repository has no URL but the automatic matching found an
# IMDb id, build the URL from that id.
needs_url = df['imdb_url'].isnull() & df['imdb_id'].notnull()
extra_ids = df[needs_url]
df.loc[needs_url, 'imdb_url'] = 'http://www.imdb.com/title/' + extra_ids['imdb_id'] + '/'

In [None]:
# List the movies that still have no IMDb URL from either source.
df[df['imdb_url'].isna()]

Unnamed: 0,title,release_date,video_release_date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western,imdb_url,title2,release_year,imdb_id
139,"Love Bug, The (1969)",01-Jan-1969,,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,,The Love Bug,1969,
189,"Grand Day Out, A (1992)",01-Jan-1992,,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,,A Grand Day Out,1992,
243,Jungle2Jungle (1997),07-Mar-1997,,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,,Jungle2Jungle,1997,
539,Mouse Hunt (1997),01-Jan-1997,,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,,Mouse Hunt,1997,
600,Daniel Defoe's Robinson Crusoe (1996),01-Jan-1996,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,Daniel Defoe's Robinson Crusoe,1996,
624,"Three Caballeros, The (1945)",01-Jan-1945,,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,,The Three Caballeros,1945,
830,Power 98 (1995),17-May-1996,,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,,Power 98,1995,
861,Nosferatu a Venezia (1986),01-Jan-1986,,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,Nosferatu a Venezia,1986,
1056,Cronos (1992),01-Jan-1992,,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,,Cronos,1992,
1153,Backbeat (1993),01-Jan-1993,,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,,Backbeat,1993,


## Matching IMDb IDs manually where they are still missing

In [None]:
# Download the hand-curated id -> IMDb URL list.
# rm -f: the file is usually absent on a first run; plain rm then prints
# "cannot remove ... No such file or directory".
!rm -f imdb_id_manual.csv
!wget https://csukas.org/dipterv1/imdb_id_manual.csv

rm: cannot remove 'imdb_id_manual.csv': No such file or directory
--2021-11-04 16:53:01--  https://csukas.org/dipterv1/imdb_id_manual.csv
Resolving csukas.org (csukas.org)... 5.56.38.113
Connecting to csukas.org (csukas.org)|5.56.38.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1636 (1.6K) [text/csv]
Saving to: ‘imdb_id_manual.csv’


2021-11-04 16:53:01 (380 MB/s) - ‘imdb_id_manual.csv’ saved [1636/1636]



In [None]:
# Load the manually curated IMDb URLs; the file's header row is skipped and the
# URL column is named 'imdb_url2' so it does not clash with df['imdb_url'].
# The dtype key must match that assigned column name: a key for a column that
# does not exist ('imdb_url') has no effect on how the file is parsed.
imdb_id_dtype={
  'id': int,
  'imdb_url2': str,
}

names=  ['id', 'imdb_url2']

imdb_ids2 = pd.read_csv('imdb_id_manual.csv', 
                     sep=',', 
                     encoding='utf-8',
                     dtype=imdb_id_dtype,
                     names=names,
                     skiprows=1
                     )
imdb_ids2.set_index('id', inplace=True)

# Show the first few manual entries.
print('imdb_ids2:')
imdb_ids2.head()

imdb_ids2:


Unnamed: 0_level_0,imdb_url2
id,Unnamed: 1_level_1
114,http://www.imdb.com/title/tt2185063/
1516,https://www.imdb.com/title/tt0111709/
139,http://www.imdb.com/title/tt0064603/
189,http://www.imdb.com/title/tt0104361/
243,http://www.imdb.com/title/tt0119432/


In [None]:
# Append the manual URL column, aligned on the movie-id index.
with_manual_urls = [df, imdb_ids2]
df = pd.concat(with_manual_urls, axis='columns')

In [None]:
# A manually curated URL always wins over whatever URL the row had before.
has_manual_url = df['imdb_url2'].notnull()
extra_ids = df[has_manual_url]
df.loc[has_manual_url, 'imdb_url'] = extra_ids['imdb_url2']

In [None]:
# Remove the helper columns; the final imdb_id is re-derived from imdb_url below.
df.drop(columns=['imdb_id', 'imdb_url2'], inplace=True)

In [None]:
# Check for movies still lacking an IMDb URL after the manual list was applied.
df[df['imdb_url'].isnull()]

Unnamed: 0,title,release_date,video_release_date,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,FilmNoir,Horror,Musical,Mystery,Romance,SciFi,Thriller,War,Western,imdb_url,title2,release_year


In [None]:
def _imdb_id_from_url(url):
  """Return the 'tt...' path segment of an IMDb URL, or '' if there is none."""
  found = re.search('.*/(tt[^/]*)/.*', str(url))
  return found.group(1) if found else ''

# Derive the IMDb id column from the merged URL column.
df['imdb_id'] = df['imdb_url'].map(_imdb_id_from_url)

In [None]:
# Export the merged table; the movie-id index is written as the first column.
export_columns = ['title', 'title2', 'release_year', 'release_date', 'video_release_date', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror', 'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western', 'imdb_id']
df.to_csv('ml100k-imdb.csv', columns=export_columns, index=True)

# Downloading IMDb posters using IMDb IDs

In [None]:
# Replace any local ml100k-imdb.csv (e.g. the one exported above) with the
# published copy.
# rm -f: stay silent when the file does not exist yet.
!rm -f ml100k-imdb.csv
!wget https://csukas.org/dipterv1/ml100k-imdb.csv

--2021-11-04 13:00:09--  https://csukas.org/dipterv1/ml100k-imdb.csv
Resolving csukas.org (csukas.org)... 5.56.38.113
Connecting to csukas.org (csukas.org)|5.56.38.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 187887 (183K) [text/csv]
Saving to: ‘ml100k-imdb.csv’


2021-11-04 13:00:10 (368 KB/s) - ‘ml100k-imdb.csv’ saved [187887/187887]



In [None]:
# Load the merged MovieLens/IMDb table; the file's header row is skipped and
# replaced by the explicit column names below (the first, unnamed CSV column is
# the movie id).
names = ['id', 'title', 'title2', 'release_year', 'release_date', 'video_release_date', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'FilmNoir', 'Horror', 'Musical', 'Mystery', 'Romance', 'SciFi', 'Thriller', 'War', 'Western', 'imdb_id']

# dtype keys have to be the assigned column names, so the id column is keyed as
# 'id' (an empty-string key would refer to no column at all).
# Text columns are str; the id and the genre indicator columns are int.
text_columns = {'title', 'title2', 'release_year', 'release_date', 'video_release_date', 'imdb_id'}
mdf_dtype = {col: (str if col in text_columns else int) for col in names}

mdf = pd.read_csv('ml100k-imdb.csv',
                  sep=',',
                  encoding='utf-8',
                  dtype=mdf_dtype,
                  names=names,
                  skiprows=1
                  )
mdf.set_index('id', inplace=True)

# '\n' (newline), not '\m': the latter is an invalid escape that prints a
# literal backslash.
print('\nmovies:')
print(len(mdf))

\movies:
1682


In [None]:
# Create the poster output directory; -p makes this a no-op when it already
# exists, so the cell can be re-run.
!mkdir -p img

In [None]:
# Download one poster per movie from its IMDb title page into img/<id>.jpg and
# append (movie id, image url) to movie_poster.csv.
imdb_datasheet_url = 'https://www.imdb.com/title/'

# Movies with an id up to this value are skipped, which lets an interrupted run
# be resumed. Set to 0 to process every movie.
RESUME_AFTER_ID = 1515

# TODO: Replace hardcoded extension with extension from string itself
extension = '.jpg'

for movie_id, m in mdf.iterrows():
  if movie_id <= RESUME_AFTER_ID:
    continue

  # A missing id is read back as NaN; concatenating it would raise TypeError.
  if not isinstance(m['imdb_id'], str):
    print(str(movie_id) + ' no IMDb id, skipped')
    continue

  movie_url = imdb_datasheet_url + m['imdb_id'] + '/'
  with urllib.request.urlopen(movie_url) as page_response:
    soup = BeautifulSoup(page_response.read(), 'html.parser')

  # Get url of poster image
  try:
    thumbnail_url = soup.find('div', class_='ipc-poster').div.img['src']
  except (AttributeError, TypeError):
    # No poster element on the page. Report it and move on, instead of falling
    # through to a log line that would show the previous movie's URL (or raise
    # NameError on the first iteration).
    print(str(movie_id) + ' no poster found')
    continue

  # Keep the part of the URL before the first '_' (this drops the size/crop
  # suffix) and re-append the extension.
  # NOTE(review): the prefix already ends in '.', so the result ends in '..jpg'
  # (see the logged URLs). Left unchanged because those URLs were fetched
  # without an HTTP error in the recorded run; confirm before normalising.
  image_url = thumbnail_url.partition('_')[0] + extension
  filename = 'img/' + str(movie_id) + extension

  with urllib.request.urlopen(image_url) as image_response:
    image_bytes = image_response.read()
  with open(filename, 'wb') as out_image:
    out_image.write(image_bytes)
  with open('movie_poster.csv', 'a', newline='') as out_csv:
    csv.writer(out_csv, delimiter=',').writerow([movie_id, image_url])

  print(str(movie_id) + ' ' + image_url)

1516 https://m.media-amazon.com/images/M/MV5BNTA0MTExMDY1NF5BMl5BanBnXkFtZTcwMzI4MjkxMQ@@..jpg
1517 https://m.media-amazon.com/images/M/MV5BZDQ2ZGJlOTAtMmM1Yi00MGNlLWIwMDgtMzkzNTA1NWU3YzhiXkEyXkFqcGdeQXVyMTQxNzMzNDI@..jpg
1518 https://m.media-amazon.com/images/M/MV5BNWFiYzVkN2EtYjhkNi00ZjJiLWJlN2MtN2Y4ZWQ0MTgxZDI2XkEyXkFqcGdeQXVyNjMwMjk0MTQ@..jpg
1519 https://m.media-amazon.com/images/M/MV5BYjRkM2Q5MzAtZTM4Yy00OTEwLTkyZjQtMzliZjA0MzNmZjhiXkEyXkFqcGdeQXVyNzc5MjA3OA@@..jpg
1520 https://m.media-amazon.com/images/M/MV5BY2Y1MGU4MDYtNGIxZi00ZTVkLTg5MWYtOGYzMGE5ZWMyNDk0XkEyXkFqcGdeQXVyMTQxNzMzNDI@..jpg
1521 https://m.media-amazon.com/images/M/MV5BZWI3OWE2ODYtMzE0NC00ZGI4LTk1MTAtZWUxMDU1NDE0M2U3XkEyXkFqcGdeQXVyMTQxNzMzNDI@..jpg
1522 https://m.media-amazon.com/images/M/MV5BMjgyMjYxYzMtOTY4Zi00ZjYxLThkNmEtZmY1NWExZDJjMWU3XkEyXkFqcGdeQXVyNzc5MjA3OA@@..jpg
1523 https://m.media-amazon.com/images/M/MV5BYTQ3YmE2ODItZjY2Yy00MTMxLTliOWQtNWVkMGM5YWY0YTYwXkEyXkFqcGdeQXVyNjMwMjk0MTQ@..jpg
1524 https://m.m

In [None]:
# Inspect the row with id 1516, the first movie handled by the download loop
# (ids up to 1515 are skipped there).
mdf.loc[1516]

title                 Wedding Gift, The (1994)
title2                        The Wedding Gift
release_year                              1994
release_date                       01-Jan-1994
video_release_date                         NaN
unknown                                      0
Action                                       0
Adventure                                    0
Animation                                    0
Childrens                                    0
Comedy                                       0
Crime                                        0
Documentary                                  0
Drama                                        1
Fantasy                                      0
FilmNoir                                     0
Horror                                       0
Musical                                      0
Mystery                                      0
Romance                                      0
SciFi                                        0
Thriller     

In [None]:
# Bundle the downloaded posters (the whole img/ directory) into images.zip.
!zip -r images.zip ./img/

  adding: img/ (stored 0%)
  adding: img/1046.jpg (deflated 2%)
  adding: img/1526.jpg (deflated 16%)
  adding: img/1067.jpg (deflated 3%)
  adding: img/1182.jpg (deflated 1%)
  adding: img/1487.jpg (deflated 1%)
  adding: img/1265.jpg (deflated 0%)
  adding: img/1534.jpg (deflated 0%)
  adding: img/1213.jpg (deflated 1%)
  adding: img/1362.jpg (deflated 20%)
  adding: img/1194.jpg (deflated 0%)
  adding: img/754.jpg (deflated 1%)
  adding: img/1073.jpg (deflated 16%)
  adding: img/940.jpg (deflated 2%)
  adding: img/780.jpg (deflated 0%)
  adding: img/856.jpg (deflated 0%)
  adding: img/965.jpg (deflated 0%)
  adding: img/1168.jpg (deflated 0%)
  adding: img/1425.jpg (deflated 0%)
  adding: img/930.jpg (deflated 3%)
  adding: img/796.jpg (deflated 4%)
  adding: img/1673.jpg (deflated 0%)
  adding: img/1096.jpg (deflated 0%)
  adding: img/1439.jpg (deflated 0%)
  adding: img/1045.jpg (deflated 0%)
  adding: img/948.jpg (deflated 0%)
  adding: img/1560.jpg (deflated 0%)
  adding: img/10