In [1]:
import asyncio
import re
import aiohttp
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [2]:
data=pd.read_csv('Data/tv_shows.csv')

In [3]:
info=data[['Title','Year']]
scrap_names=[x.replace(' ','%20') for x in data.Title]

In [5]:
scrap_names[:10]

['Breaking%20Bad',
 'Stranger%20Things',
 'Money%20Heist',
 'Sherlock',
 'Better%20Call%20Saul',
 'The%20Office',
 'Black%20Mirror',
 'Supernatural',
 'Peaky%20Blinders',
 'Avatar:%20The%20Last%20Airbender']

In [6]:
class ProjectMainSpider:
    """
    This is the constructor class to which you can pass a bunch of parameters. 
    These parameters are stored to the class instance variables so that the
    class functions can access them later.
    
    url_pattern: the regex pattern of the web urls to scape
    pages_to_scrape: how many pages to scrape
    sleep_interval: the time interval in seconds to delay between requests. If <0, requests will not be delayed.
    content_parser: a function reference that will extract the intended info from the scraped content.
    """
    def __init__(self, url_pattern, pages_to_scrape=10, sleep_interval=True, get_agent=None):
        self.url_pattern = url_pattern
        self.pages_to_scrape = pages_to_scrape
        self.sleep_interval = sleep_interval
        self.get_agent=get_agent
        self.content_parser = None
        self.fails=[]
        self.fails2=[]
        self.links=[]
        self.titles=[]
        self.country=[]
        self.language=[]
        self.runtime=[]
        self.genre=[]
        self.creator=[]
        self.episode=[]
        self.soups=[]


    async def fetch_headers(self,url,headers):
        async with aiohttp.ClientSession() as s, s.get(url,headers=headers) as res:
            ret=await res.read()
            status=res.status
            return status,ret
    """
    Scrape the content of a single url.
    """
    
    async def scrape_url(self,url,comb):
        agent=self.get_agent()
        headers = {
            'user-agent': agent,
            'authority': 'www.imdb.com',
            'accept': 'application/json, text/plain, */*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9,es;q=0.8,fr;q=0.7,gl;q=0.6',
            'referer':'https://www.google.com'
        } 
        try:
            status,content= await self.fetch_headers(url,headers)
            if status<300:
                self.soups.append(content)
                result = self.content_parser(content,comb)
            elif status in range(400,500):
                print('Error {} in "{}", request failed because the resource either does not exist or is forbidden.'
                      .format(status,url))
                try:
                    self.fails.append(comb.Title)
                except:
                    self.fails2.append(url)
            else:
                print('Error {}, in "{}", request failed because the response server encountered an error.'
                      .format(status,url))
                try:
                    self.fails.append(comb.Title)
                except:
                    self.fails2.append(url)
        except aiohttp.ServerTimeoutError:
            print('Error= Timeout.'.format(url))
            try:
                self.fails.append(comb.Title)
            except:
                self.fails2.append(url)
        except aiohttp.TooManyRedirects:
            print('Error= TooManyRedirects, in {}.'.format(url))
            try:
                self.fails.append(comb.Title)
            except:
                self.fails2.append(url)
        except aiohttp.ClientSSLError:
            print('Error= SSLError, in {}.'.format(url))
            try:
                self.fails.append(comb.Title)
            except:
                self.fails2.append(url)
        except aiohttp.ClientError as e:
            print('Error= {}, in {}.'.format(e,url))
            try:
                self.fails.append(comb.Title)
            except:
                self.fails2.append(url)
    
    def random_interval(self):
        delays = [1, 2, 3, 4, 5, 7]
        perm = np.random.RandomState()
        #return random x in [0.5,n+0.5]
        return(perm.permutation(range(np.random.choice(delays)*100))[0]+5)/100
    """
    After the class is instantiated, call this function to start the scraping jobs.
    This function uses a FOR loop to call `scrape_url()` for each url to scrape.
    """
    async def kickstart(self,parser,main=True):
        self.content_parser=parser
        self.fails=[]
        self.fails2=[]
        #Get links from search
        if main:
            #send requests in n groups of n where n=sqrt(total requests)
            for j in range(int(np.sqrt(self.pages_to_scrape))):
                await asyncio.wait([self.scrape_url(self.url_pattern.format(scrap_names[i]),info.loc[i]) for i in range(self.pages_to_scrape)
                                   if i%(int(np.sqrt(self.pages_to_scrape)))==j])
                if self.sleep_interval:
                    await asyncio.sleep(self.random_interval())
                #Progress indicator
                print('Group {} finished from {}'.format(j+1,int(np.sqrt(self.pages_to_scrape))))
        #Get info from each tv show
        else:
            #send requests in n groups of n where n=sqrt(total requests)
            for j in range(int(np.sqrt(len(self.links)))):
                await asyncio.wait([self.scrape_url(i[0],i[1]) for x,i in enumerate(self.links) 
                                    if x%(int(np.sqrt(len(self.links))))==j])
                if self.sleep_interval:
                    await asyncio.sleep(self.random_interval())
                    #Progress indicator
                print('Group {} finished from {}'.format(j+1,int(np.sqrt(len(self.links)))))
            #Save data in DF
            data=zip(self.titles,self.country,self.language,self.runtime,self.genre,self.creator,self.episode)
            self.df=pd.DataFrame(data,columns=['Title','Country','Language','Runtime','Genre','Creator','Episodes'])
        print('Done')
            
    def async_kickstart(self,parser,main=True):
        try:
            loop=asyncio.get_event_loop()
            loop.run_until_complete(self.kickstart(parser,main))
        except:
            pass
        
                
    def links_parser(self,content,comb):
        soup=BeautifulSoup(content)
        try:
            temp_links=[]
            for x in soup.find_all('td',{'class':'result_text'}):
                temp_links.append([x.a['href'], x.text])
            for x,y in temp_links:
                if re.findall(str(comb.Year),y):
                    link=x
                    break
            self.links.append(['https://www.imdb.com'+x,comb.Title])
        except:
            self.fails.append(comb.Title)
    def data_parser(self,content,name):
        soup=BeautifulSoup(content)
        try:
            details=soup.select('#titleDetails')[0].text
            details=re.sub('\xa0|\n',' ',details)
        except:
            details='none'
        all_dets=re.findall('Country: *((?:\S*(?: *\| *)?)*)|Language: *((?:\S*(?: *\| *)?)*)|Runtime: *(\S*)',details)
        try:
            story_line=soup.select('#titleStoryLine')[0].text
            story_line=re.sub('\xa0|\n',' ',story_line)
        except:
            story_line='none'
        try:
            country=all_dets[0][0]
            country=re.sub(' ','',country)
            country=re.sub('\|',',',country)
        except:
            country=np.NaN
        country
        try:
            language=all_dets[1][1]
            language=re.sub(' ','',language)
            language=re.sub('\|',',',language)
        except:
            language=np.NaN
        try:
            runtime=all_dets[2][2]
        except:
            runtime=np.NaN
        try:
            genre=re.findall('Genres: *((?:\S*(?: \| *)?)*)',story_line)[0]
            genre=re.sub(' ','',genre)
            genre=re.sub('\|',',',genre)
        except:
            genre=np.NaN
        try:
            creators=soup.find('div',{'class':'credit_summary_item'})
            creator=','.join([x.text for x in creators.find_all('a')])
        except:
            creator=np.NaN
        try:
            episode=soup.select('.bp_content .bp_sub_heading')[0].text
            episode=re.findall('\d+',episode)[0]
        except:
            episode=np.NaN
        self.titles.append(name)
        self.country.append(country)
        self.language.append(language)
        self.runtime.append(runtime)
        self.genre.append(genre)
        self.creator.append(creator)
        self.episode.append(episode)


def random_agent():
    with open('agents.txt',encoding="utf8") as f:
        lines = f.readlines()
        perm = np.random.RandomState()
        index = perm.permutation(len(lines) - 1)[0]
        agent = lines[int(index)].strip()
        return agent

In [7]:
URL_PATTERN = 'https://www.imdb.com/find?q={}&s=tt&ttype=tv&ref_=fn_tv'
PAGES_TO_SCRAPE = len(scrap_names)
my_main_spider = ProjectMainSpider(URL_PATTERN, PAGES_TO_SCRAPE,get_agent=random_agent,sleep_interval=True)
#get links
my_main_spider.async_kickstart(my_main_spider.links_parser)

Error= , in https://www.imdb.com/find?q=En%20La%20Boca%20Del%20Lobo&s=tt&ttype=tv&ref_=fn_tv.
Group 1 finished from 74
Group 2 finished from 74
Error= [WinError 10053] Se ha anulado una conexión establecida por el software en su equipo host, in https://www.imdb.com/find?q=Golden%20Time&s=tt&ttype=tv&ref_=fn_tv.
Group 3 finished from 74
Group 4 finished from 74
Error= , in https://www.imdb.com/find?q=Miami%20Ink&s=tt&ttype=tv&ref_=fn_tv.
Error= , in https://www.imdb.com/find?q=Nightmare%20in%20Suburbia&s=tt&ttype=tv&ref_=fn_tv.
Error= , in https://www.imdb.com/find?q=The%20Mindy%20Project&s=tt&ttype=tv&ref_=fn_tv.
Error= , in https://www.imdb.com/find?q=On%20Children&s=tt&ttype=tv&ref_=fn_tv.
Error= , in https://www.imdb.com/find?q=Jake%20and%20the%20Never%20Land%20Pirates&s=tt&ttype=tv&ref_=fn_tv.
Error= , in https://www.imdb.com/find?q=Hunter%20Street&s=tt&ttype=tv&ref_=fn_tv.
Error= , in https://www.imdb.com/find?q=Longmire&s=tt&ttype=tv&ref_=fn_tv.
Group 5 finished from 74
Group 6 f

In [18]:
info=data[data.Title.isin(my_main_spider.fails)][['Title','Year']]
info.reset_index(inplace=True,drop=True)
scrap_names=[x.replace(' ','%20') for x in data[data.Title.isin(my_main_spider.fails)].Title]
my_main_spider.pages_to_scrape = len(scrap_names)

In [24]:
len(my_main_spider.fails)

363

In [22]:
len(my_main_spider.links)

5249

In [19]:
my_main_spider.async_kickstart(my_main_spider.links_parser)

Group 1 finished from 19
Group 2 finished from 19
Group 3 finished from 19
Group 4 finished from 19
Group 5 finished from 19
Group 6 finished from 19
Group 7 finished from 19
Group 8 finished from 19
Group 9 finished from 19
Group 10 finished from 19
Group 11 finished from 19
Group 12 finished from 19
Group 13 finished from 19
Group 14 finished from 19
Group 15 finished from 19
Group 16 finished from 19


In [25]:
#get data
my_main_spider.async_kickstart(my_main_spider.data_parser,main=False)

Group 1 finished from 72
Group 2 finished from 72
Group 3 finished from 72
Group 4 finished from 72
Group 5 finished from 72
Group 6 finished from 72
Group 7 finished from 72
Group 8 finished from 72
Group 9 finished from 72
Group 10 finished from 72
Group 11 finished from 72
Group 12 finished from 72
Group 13 finished from 72
Group 14 finished from 72
Group 15 finished from 72
Group 16 finished from 72
Group 17 finished from 72
Group 18 finished from 72
Group 19 finished from 72
Group 20 finished from 72
Group 21 finished from 72
Group 22 finished from 72
Group 23 finished from 72
Group 24 finished from 72
Group 25 finished from 72
Group 26 finished from 72
Group 27 finished from 72
Group 28 finished from 72
Group 29 finished from 72
Group 30 finished from 72
Group 31 finished from 72
Group 32 finished from 72
Group 33 finished from 72
Group 34 finished from 72
Group 35 finished from 72
Group 36 finished from 72
Group 37 finished from 72
Group 38 finished from 72
Group 39 finished fro

Task exception was never retrieved
future: <Task finished coro=<ProjectMainSpider.scrape_url() done, defined at <ipython-input-6-8afc616ac529>:40> exception=TimeoutError()>
Traceback (most recent call last):
  File "<ipython-input-6-8afc616ac529>", line 51, in scrape_url
    status,content= await self.fetch_headers(url,headers)
  File "<ipython-input-6-8afc616ac529>", line 33, in fetch_headers
    ret=await res.read()
  File "C:\Users\Usuario_Asignado\Anaconda3\envs\ironhack_conda\lib\site-packages\aiohttp\client_reqrep.py", line 973, in read
    self._body = await self.content.read()
  File "C:\Users\Usuario_Asignado\Anaconda3\envs\ironhack_conda\lib\site-packages\aiohttp\streams.py", line 358, in read
    block = await self.readany()
  File "C:\Users\Usuario_Asignado\Anaconda3\envs\ironhack_conda\lib\site-packages\aiohttp\streams.py", line 380, in readany
    await self._wait('readany')
  File "C:\Users\Usuario_Asignado\Anaconda3\envs\ironhack_conda\lib\site-packages\aiohttp\streams.

Group 41 finished from 72
Group 42 finished from 72
Group 43 finished from 72
Group 44 finished from 72
Error= , in https://www.imdb.com/title/tt7985540/?ref_=fn_tv_tt_1.
Error= , in https://www.imdb.com/title/tt2226342/?ref_=fn_tv_tt_1.
Group 45 finished from 72
Group 46 finished from 72
Group 47 finished from 72
Group 48 finished from 72
Group 49 finished from 72
Group 50 finished from 72
Group 51 finished from 72
Group 52 finished from 72
Group 53 finished from 72
Group 54 finished from 72
Group 55 finished from 72
Group 56 finished from 72
Group 57 finished from 72
Group 58 finished from 72
Group 59 finished from 72
Group 60 finished from 72
Group 61 finished from 72
Group 62 finished from 72
Group 63 finished from 72
Group 64 finished from 72
Group 65 finished from 72
Group 66 finished from 72
Group 67 finished from 72
Group 68 finished from 72
Group 69 finished from 72
Group 70 finished from 72
Group 71 finished from 72
Group 72 finished from 72
Done


In [33]:
my_main_spider.links=[x for x in my_main_spider.links if x[0] in my_main_spider.fails2]

In [35]:
my_main_spider.async_kickstart(my_main_spider.data_parser,main=False)

Group 1 finished from 1
Done


In [36]:
my_main_spider.df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5248 entries, 0 to 5247
Data columns (total 7 columns):
Title       5248 non-null object
Country     5184 non-null object
Language    5070 non-null object
Runtime     3292 non-null object
Genre       5118 non-null object
Creator     4994 non-null object
Episodes    4920 non-null object
dtypes: object(7)
memory usage: 287.1+ KB


In [38]:
my_main_spider.df.to_csv('Data/extra_tv_shows.csv',index=False,)