___
### Youtube Data Crawling
- video title
- channel name
- script

**::VPN 연결한 상태에서 진행하면 ConnectionError 발생 빈도가 줄어듭니다::**

In [421]:
import os

# pip install necessary library
os.system('pip install --upgrade pip')
library = " ".join(['numpy', 'pandas', 'tqdm', 'requests', 'beautifulsoup4', 'user-agent', 'youtube_transcript_api', 'lxml', 'pyopenssl', 'ndg-httpsclient', 'pyasn1'])
os.system(f'pip install {library}')

import re
import time
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm

import requests
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup
from user_agent import generate_user_agent

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api import TranscriptsDisabled



In [466]:
def scrape_transcripts(url: str) -> tuple:
    ''' 유튜브 스크립트 스크레이핑 함수 
    
    :: input data
        * url: 유튜브 링크
        
    :: output data
        * script: 유튜브 스크립트 
    '''
    
    video_id = url[url.find('v=')+2:]
    try:
        userAgent = generate_user_agent(os=('mac', 'linux'), navigator='chrome', device_type='desktop')
        headers = {'user-agent': userAgent}
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=headers)
        status = 200
        
        for transcript in transcript_list:
            # fetch the actual transcript data
            transcripts = transcript.fetch()

            
            script = ""
            for transcript in transcripts:
                script += transcript['text'] + " "
                
        script = re.sub(' +', ' ', script).strip()

        if script == "":
            status = -1
            script = np.nan
            
    except TranscriptsDisabled:
        # 재생 불가 동영상 or 스크립트 누락 동영상
        status = -1
        script = np.nan
        
    except ConnectionError as e:
        # ConnectionError
        status = 403
        print(e, status, '\ntime sleep: 300sec')
        script = np.nan
        time.sleep(300)
        
    return status, video_id, script

In [467]:
def parsing_url(url: str) -> tuple:
    ''' Parsing url & scraping tite, channel_name'''
    
    try:
        # url pasing & user agent
        userAgent = generate_user_agent(os=('mac', 'linux'), navigator='chrome', device_type='desktop')
        headers = {'user-agent': userAgent}
        html = requests.get(url, headers=headers)
        status = html.status_code
    except ConnectionError as e:
        status = 404
        print(e, status, '\ntime sleep: 300sec')
        title, channel_name = np.nan, np.nan
        time.sleep(300)
    
    if status == 200:
        try:
            # scraping title & channel_name
            soup = BeautifulSoup(html.text, 'lxml')
            title = soup.find('title').text.replace('- YouTube', '').strip()
            channel_name = soup.find('div', 'watch-main-col').find('link', {'itemprop': 'name'})['content']   
            
            # thumbnail image
            if soup.find('link', {'rel': 'image_src'}) == None:
                thumbnail_img = np.nan
            else:
                thumbnail_img = soup.find('link', {'rel': 'image_src'})['href']

            # youtuber profile image
            reg = re.compile(r'https\:\/\/yt3\.ggpht\.com\/[a-zA-Z0-9\-\_\/]+\=s48\-c\-k\-c0x00ffffff\-no\-rj')
            if re.search(reg, html.text) == None:
                profile_img = np.nan
            else:
                profile_img = re.search(reg, html.text).group(0)
                
        except AttributeError:
            # scraping failed
            status = -1
            title, channel_name, thumbnail_img, profile_img = np.nan, np.nan, np.nan, np.nan
            
    else:
        title, channel_name, thumbnail_img, profile_img = np.nan, np.nan, np.nan, np.nan
        
    return status, title, channel_name, thumbnail_img, profile_img

In [468]:
def crawling_youtube(urls: list) -> pd.DataFrame:
    ''' 유튜브 데이터 크롤링 실행함수 
    
    :: input data
        * urls: 유튜브 링크 리스트
        
    :: output data
        * scrape_df: 데이터 크롤링 완료 데이터프레임
    '''
    # url dedup
    urls = list(set(urls))
    
    scrapes, error = [], []
    for url in tqdm(urls):

        ''' status
            * 200: url pasing successful
            * 404: url pasing failed
            * 403: url pasing failed
            *  -1: scraping failed
        '''
        status, title, channel_name, thumbnail_img, profile_img = parsing_url(url)
        status, video_id, script = scrape_transcripts(url)

        if (status == 404) | (status == 403):
            error.append(url)
        else:
            scrapes.append([url, title, video_id, channel_name, script, thumbnail_img, profile_img, status])
    
    # error: Re Crawling
    for url in tqdm(error):
        status, title, channel_name, thumbnail_img, profile_img = parsing_url(url)
        script = scrape_transcripts(url)
        scrapes.append([url, title, video_id, channel_name, script, thumbnail_img, profile_img, status])
        
    scrape_df = pd.DataFrame(scrapes, columns=['url', 'title', 'video_id', 'youtuber', 'script', 'thumbnail', 'youtuber_profile', 'status'])
    
    # # Extract only English scripts
    # reg_eng = re.compile('[^a-z^A-Z^0-9^가-힣^\.^!^,^\?^\'^\[^\]^\-^\(^\) ]')
    # scrape_df = scrape_df[scrape_df.script.str.contains(reg_eng, na=False)==False].reset_index(drop=True)
    # _scrape_df = scrape_df[scrape_df.script.str.contains(reg_eng, na=False)].reset_index(drop=True)
    
    return scrape_df

In [469]:
urls = '{input data: 유튜브 링크 url 리스트}'

df = pd.read_excel('/Users/mycelebs_95/Downloads/Beauty_full_final.xlsx')
urls = df.URL.unique().tolist()

In [470]:
# Start Crawling 
scrape_df = crawling_youtube(urls)

  0%|          | 0/921 [00:00<?, ?it/s]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')) 403 
time sleep: 300sec
('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')) 404 
time sleep: 300sec
('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')) 404 
time sleep: 300sec


  0%|          | 0/1 [00:00<?, ?it/s]

In [477]:
urls = scrape_df.loc[scrape_df.status==-1, 'url'].tolist()
scrape_df_att_err = crawling_youtube(urls)

  0%|          | 0/61 [00:00<?, ?it/s]

0it [00:00, ?it/s]

In [None]:
''' Script Preprocessing '''

_scrape_df = scrape_df.copy()

# 영문, 특수기호(,.!?')만 추출
reg_eng = re.compile('[^a-zA-Z0-9\.\,\?\!\']')
_scrape_df.loc[:, 'script'] = scrape_df.script.str.replace(reg_eng, ' ').str.replace(' +', ' ')

# 특수기호와 영문 빈도 계산 및 비율 비교를 통한 필터링
_scrape_df_copy = _scrape_df.copy()
_scrape_df_copy.loc[:, 'en'] = _scrape_df_copy.script.str.count('[a-zA-Z]')
_scrape_df_copy.loc[:, 'sp_0'] = _scrape_df_copy.script.str.count('\.')
_scrape_df_copy.loc[:, 'sp_1'] = _scrape_df_copy.script.str.count('\,')
_scrape_df_copy.loc[:, 'sp_2'] = _scrape_df_copy.script.str.count('\?')
_scrape_df_copy.loc[:, 'sp_3'] = _scrape_df_copy.script.str.count('\!')

drop_index = []
for i in range(len(_scrape_df_copy)):
    counts = _scrape_df_copy.iloc[i, -5:].tolist()
    
    if str(counts[0]) == 'nan':
        pass
    else:
        if counts.index(max(counts)) != 0:
            drop_index.append(i)
        elif counts[0] <= sum(counts[1:]) * 13:
            drop_index.append(i)
        else:
            pass

_scrape_df_copy_ = _scrape_df_copy.drop(drop_index).sort_values('youtuber').reset_index(drop=True)

# Upload table into Database
columns = ['url', 'video_title', 'thumbnail', 'youtuber', 'youtuber_profile', 'script']
upload_df =  _scrape_df_copy_.loc[:, columns]

In [9]:
import os
import re
import sys
import time
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from user_agent import generate_user_agent
from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

import requests
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup
from user_agent import generate_user_agent

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api import TranscriptsDisabled

import socket
import warnings
warnings.filterwarnings("ignore")

from src.crawling.crawler import get_url, scroll_down
from src.crawling.crawler_youtube import CrawlingYoutube

In [43]:
def get_url(url, window=True, image=True):
    ''' Set up webdriver, useragent & Get url '''
    
    wd = None
    socket.setdefaulttimeout(30)
    error = []
    attempts = 0 # url parsing 시도횟수
    # 10번 이상 parsing 실패시 pass
    while attempts < 10:
        try:  
            attempts += 1
            # user agent
            options = Options() 
            userAgent = generate_user_agent(os=('mac', 'linux'), navigator='chrome', device_type='desktop')
            options.add_argument('window-size=1920x1080')
            options.add_argument("--disable-gpu")
            options.add_argument('--disable-extensions')
            if not window:
                options.add_argument('headless')
            if not image:
                options.add_argument('--blink-settings=imagesEnabled=false')
            options.add_argument(f'user-agent={userAgent}')

            # web driver 
            wd = webdriver.Chrome(ChromeDriverManager().install(), chrome_options=options)
            wd.get(url)
            wd.implicitly_wait(5)
            break

        # 예외처리
        except Exception as e:
            print(f'\n\nError: {str(e)}\n\n')
            time.sleep(300)
            try:
                wd.quit()
            except:
                pass
            wd = None
    return wd

def scroll_down(wd):
    ''' 
    Scroll down to the bottom of the page 
    ** 데스크탑 웹 페이지에서만 사용가능 **
    '''
    
    prev_height = wd.execute_script("return document.body.scrollHeight")
    while True:
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # wd.implicitly_wait(5)
        time.sleep(1)
        current_height = wd.execute_script("return document.body.scrollHeight")

        if prev_height == current_height:
            break
        prev_height = current_height

In [122]:
# searching_word = 'ulta haul'
searching_word = 'sephora makeup'

searching_word = searching_word.replace(' ', '+')
filter_query = 'sp=EgYIBBABKAE%253D'
search_url = f'https://www.youtube.com/results?search_query={searching_word}&{filter_query}'
wd  = get_url(search_url, window=False, image=False)

try:
    xpath_filter = '//*[@id="container"]/ytd-toggle-button-renderer/a'
    WebDriverWait(wd, 15).until(EC.element_to_be_clickable((By.XPATH, xpath_filter)))
    # wd.find_element_by_xpath(xpath_filter).click()
except TimeoutException as e:
    print(e)



Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


In [123]:
# scroll down
cnt = 0
while True:
    height = wd.execute_script("return document.body.scrollHeight")
    wd.find_element_by_tag_name('body').send_keys(Keys.END)
    time.sleep(5)
    if int(height) == 0:
        cnt += 1
        if cnt == 50:
            break

In [124]:
# scraping video id (href)
soup = BeautifulSoup(wd.page_source, 'lxml')
url_a = soup.find_all('a', 'yt-simple-endpoint style-scope ytd-video-renderer')
urls = []
for a in tqdm(url_a):
    _id = a['href']
    url = f'https://www.youtube.com{_id}'
    urls.append(url)
wd.quit()

100%|██████████| 424/424 [00:00<00:00, 655263.41it/s]


In [2]:
# searching_word = 'ulta haul'
searching_word = 'sephora makeup'

def crawling_url(searching_word):
    searching_word = searching_word.replace(' ', '+')
    filter_query = 'sp=EgYIBBABKAE%253D'
    search_url = f'https://www.youtube.com/results?search_query={searching_word}&{filter_query}'
    wd = get_url(search_url, window=False, image=False)

    try:
        xpath_filter = '//*[@id="container"]/ytd-toggle-button-renderer/a'
        WebDriverWait(wd, 15).until(EC.element_to_be_clickable((By.XPATH, xpath_filter)))
    except TimeoutException:
        wd.quit()
        wd = None
    except Exception as e:
        print(e)
        wd = None
        
    if wd == None:
        pass
    else:
        # page scroll down
        wd = scroll_down(wd, sleep_time=5, check_count=50)
        
        # scraping video id (href)
        soup = BeautifulSoup(wd.page_source, 'lxml')
        url_a = soup.find_all('a', 'yt-simple-endpoint style-scope ytd-video-renderer')
        urls = []
        for a in url_a:
            _id = a['href']
            url = f'https://www.youtube.com{_id}'
            urls.append(url)
    return urls

In [10]:
crw = CrawlingYoutube()
makeup = ['sephora makeup', 'ulta makeup', 'sephora makeup tutorial', 'ulta makeup tutorial'] + ['sephora haul', 'sephora sale', 'ulta haul', 'ulta sale', 'sephora review', 'ulta review']

urls, error = [], []
for s in tqdm(makeup):
    url, status = crw.crawling_url(searching_word)
    if status == 1:
        urls += url
    else:
        error.append(s)

error_ = []
for s in tqdm(error):
    urls += crw.crawling_url(searching_word)[0]
    error_.append(s)

  0%|          | 0/10 [00:00<?, ?it/s]

Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache
 10%|█         | 1/10 [04:22<39:23, 262.59s/it]

Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache
 20%|██        | 2/10 [08:42<34:48, 261.12s/it]

Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache
 30%|███       | 3/10 [13:02<30:22, 260.41s/it]

Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] 

In [12]:
len(set(urls))

466

___
### Test: Crawling Youtube Data

In [4]:
from tqdm.auto import tqdm
from src.crawling.crawler import *
from src.crawling.crawler_youtube import CrawlingYoutube

In [5]:
crw = CrawlingYoutube()

makeup = ['sephora makeup', 'ulta makeup', 'sephora makeup tutorial', 'ulta makeup tutorial']
info = ['sephora haul', 'sephora sale', 'ulta haul', 'ulta sale', 'sephora review', 'ulta review']
skincare = ['sephora skincare', 'ulta skincare']
fragrance = ['sephora fragrance', 'ulta fragrance']        
searching_words = makeup + info + skincare + fragrance

In [8]:
urls, error_word = [], []
for word in tqdm(searching_words):
    url, status = crw.crawling_url(word)
    
    if status == 1:
        urls += url
    else:
        error_word.append(word)

  0%|          | 0/14 [00:00<?, ?it/s]



Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/mycelebs_95/.wdm/drivers/chromedriver/mac6

In [13]:
urls = list(set(urls))
urls

['https://www.youtube.com/watch?v=-MHUrmlE1TQ',
 'https://www.youtube.com/watch?v=Ze637H39a6s',
 'https://www.youtube.com/shorts/aT7weMFi-rk',
 'https://www.youtube.com/watch?v=zlHBymgX_0Q',
 'https://www.youtube.com/watch?v=9iL6P4oF5Xg',
 'https://www.youtube.com/watch?v=cnblLm1dzwA',
 'https://www.youtube.com/watch?v=_1iYWNdrHzw',
 'https://www.youtube.com/watch?v=3rLm_aRjaoQ',
 'https://www.youtube.com/watch?v=RJ_rO5osxWE',
 'https://www.youtube.com/watch?v=KdutCtLeAEQ',
 'https://www.youtube.com/watch?v=RPhy6Q52DIA',
 'https://www.youtube.com/watch?v=Sl6dk7zaWgQ',
 'https://www.youtube.com/watch?v=j9WJBwE1CKI',
 'https://www.youtube.com/watch?v=CgS4Nc0sDOE',
 'https://www.youtube.com/watch?v=ifa0ztghOQw',
 'https://www.youtube.com/watch?v=hvUKD4iTGy4',
 'https://www.youtube.com/watch?v=ctTSH_HOBg0',
 'https://www.youtube.com/watch?v=LM91i5bnYSg',
 'https://www.youtube.com/watch?v=nGNXjU2yPu0',
 'https://www.youtube.com/watch?v=eN1PmKH9ABk',
 'https://www.youtube.com/watch?v=NgOuUN6

In [15]:
scrapes, error_url = [], []
for url in tqdm(urls):
    status, video_id, script = crw.scrape_transcripts(url)
    status, title, channel_name, thumbnail_img, profile_img = crw.parsing_url(url)
    
    if (status == 404) | (status == 403) | (status == -1):
        error_url.append(url)
        
    else:
        scrapes.append([url, title, video_id, channel_name, script, thumbnail_img, profile_img, status])

  0%|          | 0/1036 [00:00<?, ?it/s]

In [20]:
scrape_df = pd.DataFrame(scrapes, columns=['url', 'video_title', 'video_id', 'youtuber', 'script', 'thumbnail', 'youtuber_profile', 'status'])
scrape_df

Unnamed: 0,url,video_title,video_id,youtuber,script,thumbnail,youtuber_profile,status
0,https://www.youtube.com/watch?v=-MHUrmlE1TQ,fenty skin cherry treat conditioning + strengt...,-MHUrmlE1TQ,Alex Greyson,hi everyone my name is alex if you're new to m...,https://i.ytimg.com/vi/-MHUrmlE1TQ/hqdefault.jpg,https://yt3.ggpht.com/WDJFjMzZuGGHS-YYxW_3GvcV...,200
1,https://www.youtube.com/watch?v=Ze637H39a6s,【Switch Sports】美女の真剣勝負。【ホロライブ/紫咲シオン】,Ze637H39a6s,Shion Ch. 紫咲シオン,[Music] [Applause] good morning good morning t...,https://i.ytimg.com/vi/Ze637H39a6s/hqdefault.jpg,https://yt3.ggpht.com/AyUL9W0ltc_aJr_MysuZBx8h...,200
2,https://www.youtube.com/shorts/aT7weMFi-rk,60 Seconds With Val Garland #ValGarland #Nikki...,ttps://www.youtube.com/shorts/aT7weMFi-rk,British Vogue,,https://i.ytimg.com/vi/aT7weMFi-rk/hq2.jpg,https://yt3.ggpht.com/lb79Cav62zXkSba7QKsyFA28...,200
3,https://www.youtube.com/watch?v=zlHBymgX_0Q,REVIEW-DR VANITA RATTAN SKINCARE - (No Affilia...,zlHBymgX_0Q,Skincare by Tahwah,in today's video i'm going to be reviewing dr ...,https://i.ytimg.com/vi/zlHBymgX_0Q/maxresdefau...,https://yt3.ggpht.com/smARDGFlrABy-sSUXZsp7Q15...,200
4,https://www.youtube.com/watch?v=9iL6P4oF5Xg,आखिरकार जेठालाल को गड़ा इलेक्ट्रॉनिक्स वापिस मि...,9iL6P4oF5Xg,Filmitelly,sweeten your mouth I bring good news your shop...,https://i.ytimg.com/vi/9iL6P4oF5Xg/maxresdefau...,https://yt3.ggpht.com/tjkllHcIuRoUdTJf-kRSkGVF...,200
...,...,...,...,...,...,...,...,...
1031,https://www.youtube.com/watch?v=n0YSg_G9txM,Y Yves Saint Laurent – Perfume Masculino – Eau...,n0YSg_G9txM,LCB PERFUMES,e [Aplausos] [Música] [Aplausos] esse perfume ...,https://i.ytimg.com/vi/n0YSg_G9txM/maxresdefau...,https://yt3.ggpht.com/ytc/AKedOLRGGexCyM8jmcNR...,200
1032,https://www.youtube.com/watch?v=807kZSnugXE,Exodo 1-7 | Moises El Libertador,807kZSnugXE,Oasis de Amor,el aprendizaje de la palabra para los que se e...,https://i.ytimg.com/vi/807kZSnugXE/hqdefault.jpg,https://yt3.ggpht.com/ytc/AKedOLSB0DqbK5D475yD...,200
1033,https://www.youtube.com/watch?v=l8DIQsK2ynU,SUB) IKEA 2022 New Items Kitchenware Recommend...,l8DIQsK2ynU,Jins'Mommy,मैं आज आईकेईए गया मुझे आईकेईए नीला बैग पसंद है...,https://i.ytimg.com/vi/l8DIQsK2ynU/maxresdefau...,https://yt3.ggpht.com/i8T6LKdMoD0nh23JtNtS-_XJ...,200
1034,https://www.youtube.com/watch?v=NZi1gjz7xgs,"Recordando el 31 de mayo pasado, presente y pe...",NZi1gjz7xgs,CIP tv,muy buenas noches colegas y amigos y bienvenid...,https://i.ytimg.com/vi/NZi1gjz7xgs/hqdefault.jpg,https://yt3.ggpht.com/ytc/AKedOLRi7hXIdSHBp3H8...,200


In [25]:
_scrape_df = scrape_df[scrape_df.video_title.notnull() & scrape_df.script.notnull()].reset_index(drop=True)

# 영문, 특수기호(,.!?')만 추출
reg_eng = re.compile('[^a-zA-Z0-9\.\,\?\!\']')
_scrape_df.loc[:, 'script'] = _scrape_df.script.str.replace(reg_eng, ' ').str.replace(' +', ' ')

# 특수기호와 영문 빈도 계산 및 비율 비교를 통한 필터링
_scrape_df_copy = _scrape_df.copy()
_scrape_df_copy.loc[:, 'en'] = _scrape_df_copy.script.str.count('[a-zA-Z]')
_scrape_df_copy.loc[:, 'num'] = _scrape_df_copy.script.str.count('[0-9]')
_scrape_df_copy.loc[:, 'sp_0'] = _scrape_df_copy.script.str.count('\.') 
_scrape_df_copy.loc[:, 'sp_1'] = _scrape_df_copy.script.str.count('\,')
_scrape_df_copy.loc[:, 'sp_2'] = _scrape_df_copy.script.str.count('\?')
_scrape_df_copy.loc[:, 'sp_3'] = _scrape_df_copy.script.str.count('\!')

drop_index = []
for i in range(len(_scrape_df_copy)):
    counts = _scrape_df_copy.iloc[i, -6:].tolist()

    if str(counts[0]) == 'nan':
        pass
    else:
        if counts.index(max(counts)) != 0:
            drop_index.append(i)
        elif counts[0] <= sum(counts[1:]) * 10:
            drop_index.append(i)
        else:
            pass

_scrape_df_copy_ = _scrape_df_copy.drop(drop_index).sort_values('youtuber').reset_index(drop=True)

# Upload table into Database
columns = ['url', 'video_title', 'thumbnail', 'youtuber', 'youtuber_profile', 'script']
upload_df =  _scrape_df_copy_.loc[:, columns]

In [26]:
upload_df

Unnamed: 0,url,video_title,thumbnail,youtuber,youtuber_profile,script
0,https://www.youtube.com/watch?v=ip3gDWXj38s,In Search Of Golden | SHIMANO,https://i.ytimg.com/vi/ip3gDWXj38s/maxresdefau...,#RideShimano,https://yt3.ggpht.com/ytc/AKedOLQ4aSePzYIKQvlC...,Valokuvaajana etsin aina vaikuttavia n kymi . ...
1,https://www.youtube.com/watch?v=qulrHuPYM3g,skincare routine ♡ hydrated and glowy skin - s...,https://i.ytimg.com/vi/qulrHuPYM3g/maxresdefau...,02dolls,https://yt3.ggpht.com/Hkei20kxFm3IzLsmhdACWVYt...,est tout le monde aujourd'hui on se retrouve p...
2,https://www.youtube.com/watch?v=Ien54o_tCvU,Drone aids in arrest of armed robbery suspect ...,https://i.ytimg.com/vi/Ien54o_tCvU/hqdefault.jpg,11Alive,https://yt3.ggpht.com/ytc/AKedOLR_q1u-FxCmrzGp...,robbery suspects are cornered in broad dayligh...
3,https://www.youtube.com/watch?v=EX0yPDL4TRM,"SHOCKING VIDEO: Teen, officer and driver injur...",https://i.ytimg.com/vi/EX0yPDL4TRM/maxresdefau...,6abc Philadelphia,https://yt3.ggpht.com/bVeSeSOK0hwA_Iq4_LuXDVNz...,surveillance video showing police flying throu...
4,https://www.youtube.com/watch?v=x9im22SzvCQ,"'Watch out, I have a killer': Man, puppy recov...",https://i.ytimg.com/vi/x9im22SzvCQ/maxresdefau...,6abc Philadelphia,https://yt3.ggpht.com/bVeSeSOK0hwA_Iq4_LuXDVNz...,residents in cherry hill new jersey are up in ...
...,...,...,...,...,...,...
903,https://www.youtube.com/watch?v=_HCbP1SZjJ0,👾Aritzia haul - 2022 Spring/Summer👾,https://i.ytimg.com/vi/_HCbP1SZjJ0/maxresdefau...,👽My littlespace👽,https://yt3.ggpht.com/WNiDi4HMYBYsuuta-U5KXLMd...,hello friends today we are going to do this au...
904,https://www.youtube.com/watch?v=34HcyPLZpS0,🦧Aritzia try on haul (50% Off) 🦧,https://i.ytimg.com/vi/34HcyPLZpS0/maxresdefau...,👽My littlespace👽,https://yt3.ggpht.com/WNiDi4HMYBYsuuta-U5KXLMd...,hello friends today we are going to do an eras...
905,https://www.youtube.com/watch?v=cKbVKtPtKNY,🐋Lululemon try on haul (like a cloud bra/align...,https://i.ytimg.com/vi/cKbVKtPtKNY/maxresdefau...,👽My littlespace👽,https://yt3.ggpht.com/WNiDi4HMYBYsuuta-U5KXLMd...,hello friends today we are going to do another...
906,https://www.youtube.com/watch?v=3NASrEUoMe8,LA MER- IS IT REALLY LONG WEAR?,https://i.ytimg.com/vi/3NASrEUoMe8/hqdefault.jpg,💄💕XAStrAxBEauTYx,https://yt3.ggpht.com/qOMz1XmdoYlVhZ2Sfvjf0maD...,hi guys how are y'all doing i hope you are ver...


In [22]:
upload_df

Unnamed: 0,url,video_title,thumbnail,youtuber,youtuber_profile,script
0,https://www.youtube.com/watch?v=ip3gDWXj38s,In Search Of Golden | SHIMANO,https://i.ytimg.com/vi/ip3gDWXj38s/maxresdefau...,#RideShimano,https://yt3.ggpht.com/ytc/AKedOLQ4aSePzYIKQvlC...,Valokuvaajana etsin aina vaikuttavia n kymi . ...
1,https://www.youtube.com/watch?v=qulrHuPYM3g,skincare routine ♡ hydrated and glowy skin - s...,https://i.ytimg.com/vi/qulrHuPYM3g/maxresdefau...,02dolls,https://yt3.ggpht.com/Hkei20kxFm3IzLsmhdACWVYt...,est tout le monde aujourd'hui on se retrouve p...
2,https://www.youtube.com/watch?v=Ien54o_tCvU,Drone aids in arrest of armed robbery suspect ...,https://i.ytimg.com/vi/Ien54o_tCvU/hqdefault.jpg,11Alive,https://yt3.ggpht.com/ytc/AKedOLR_q1u-FxCmrzGp...,robbery suspects are cornered in broad dayligh...
3,https://www.youtube.com/watch?v=YXAX7pOQco8,나스닥 0.73% 하락! 23억 투자자가 말하는 2022년 하반기 투자 전략! (T...,https://i.ytimg.com/vi/YXAX7pOQco8/hqdefault.jpg,5마일_미국주식으로 경제적 자유,https://yt3.ggpht.com/sKz7wGPADBcvQ-k-KRU8QKo8...,6 9 4 100cm 500 11 16 2 5 4 77 5811 5 1 smp 5...
4,https://www.youtube.com/watch?v=vDoB1PNs0e8,Worker attacked at 30th Street Station shares ...,https://i.ytimg.com/vi/vDoB1PNs0e8/maxresdefau...,6abc Philadelphia,https://yt3.ggpht.com/bVeSeSOK0hwA_Iq4_LuXDVNz...,custodian at 30th street station who was attac...
...,...,...,...,...,...,...
941,https://www.youtube.com/watch?v=34HcyPLZpS0,🦧Aritzia try on haul (50% Off) 🦧,https://i.ytimg.com/vi/34HcyPLZpS0/maxresdefau...,👽My littlespace👽,https://yt3.ggpht.com/WNiDi4HMYBYsuuta-U5KXLMd...,hello friends today we are going to do an eras...
942,https://www.youtube.com/watch?v=cKbVKtPtKNY,🐋Lululemon try on haul (like a cloud bra/align...,https://i.ytimg.com/vi/cKbVKtPtKNY/maxresdefau...,👽My littlespace👽,https://yt3.ggpht.com/WNiDi4HMYBYsuuta-U5KXLMd...,hello friends today we are going to do another...
943,https://www.youtube.com/watch?v=_HCbP1SZjJ0,👾Aritzia haul - 2022 Spring/Summer👾,https://i.ytimg.com/vi/_HCbP1SZjJ0/maxresdefau...,👽My littlespace👽,https://yt3.ggpht.com/WNiDi4HMYBYsuuta-U5KXLMd...,hello friends today we are going to do this au...
944,https://www.youtube.com/watch?v=3NASrEUoMe8,LA MER- IS IT REALLY LONG WEAR?,https://i.ytimg.com/vi/3NASrEUoMe8/hqdefault.jpg,💄💕XAStrAxBEauTYx,https://yt3.ggpht.com/qOMz1XmdoYlVhZ2Sfvjf0maD...,hi guys how are y'all doing i hope you are ver...
