/
videos_download.py
56 lines (47 loc) · 1.92 KB
/
videos_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import argparse
import multiprocessing as mp
import os
from functools import partial
from time import time as timer
from pytube import YouTube
from tqdm import tqdm
# Command-line interface.
# NOTE: args are parsed at import time, so importing this module in another
# program will also trigger argument parsing.
parser = argparse.ArgumentParser()
# Path to a text file with one YouTube video id per line.
parser.add_argument('--input_list', type=str, required=True,
help='List of youtube video ids')
# Videos are saved here as <video_id>.mp4.
parser.add_argument('--output_dir', type=str, default='data/youtube_videos',
help='Location to download videos')
# Size of the multiprocessing pool used for parallel downloads.
parser.add_argument('--num_workers', type=int, default=8,
help='How many multiprocessing workers?')
args = parser.parse_args()
def download_video(output_dir, video_id):
    r"""Download a single YouTube video as ``<output_dir>/<video_id>.mp4``.

    Args:
        output_dir (str): Directory the video file is written to.
        video_id (str): YouTube video id (the ``v=`` URL query parameter).

    Existing files are never re-downloaded. All errors are caught and
    reported on stdout so one bad id does not abort a batch run.
    """
    video_path = '%s/%s.mp4' % (output_dir, video_id)
    if os.path.isfile(video_path):
        print('File exists: %s' % (video_id))
        return
    try:
        yt = YouTube('https://www.youtube.com/watch?v=%s' % (video_id))
        # Prefer the highest-quality adaptive (video-only) mp4 stream;
        # fall back to any progressive mp4 stream.
        stream = yt.streams.filter(subtype='mp4', only_video=True, adaptive=True).first()
        if stream is None:
            stream = yt.streams.filter(subtype='mp4').first()
        if stream is None:
            # Previously a missing stream fell through to stream.download()
            # and surfaced as a confusing AttributeError; report it clearly.
            print('No mp4 stream found for %s' % (video_id))
            return
        stream.download(output_path=output_dir, filename=video_id + '.mp4')
    except Exception as e:
        print(e)
        print('Failed to download %s' % (video_id))
if __name__ == '__main__':
    # Read the list of video ids, skipping blank lines so an empty id is
    # never queued for download (previously '' would be passed to the
    # downloader and fail).
    video_ids = []
    with open(args.input_list) as fin:
        for line in fin:
            video_id = line.strip()
            if video_id:
                video_ids.append(video_id)
    # Create output folder.
    os.makedirs(args.output_dir, exist_ok=True)
    # Bind the output directory so pool workers receive only the video id.
    downloader = partial(download_video, args.output_dir)
    start = timer()
    pool_size = args.num_workers
    print('Using pool size of %d' % (pool_size))
    # imap_unordered yields results as downloads finish; tqdm shows progress.
    with mp.Pool(processes=pool_size) as p:
        _ = list(tqdm(p.imap_unordered(downloader, video_ids), total=len(video_ids)))
    print('Elapsed time: %.2f' % (timer() - start))