
# Simple API Example with Requests

In [3]:
import json
import requests
response = requests.get('https://jsonplaceholder.typicode.com/todos')
results = response.json()
results

[{'userId': 1, 'id': 1, 'title': 'delectus aut autem', 'completed': False},
 {'userId': 1,
  'id': 2,
  'title': 'quis ut nam facilis et officia qui',
  'completed': False},
 {'userId': 1, 'id': 3, 'title': 'fugiat veniam minus', 'completed': False},
 {'userId': 1, 'id': 4, 'title': 'et porro tempora', 'completed': True},
 {'userId': 1,
  'id': 5,
  'title': 'laboriosam mollitia et enim quasi adipisci quia provident illum',
  'completed': False},
 {'userId': 1,
  'id': 6,
  'title': 'qui ullam ratione quibusdam voluptatem quia omnis',
  'completed': False},
 {'userId': 1,
  'id': 7,
  'title': 'illo expedita consequatur quia in',
  'completed': False},
 {'userId': 1,
  'id': 8,
  'title': 'quo adipisci enim quam ut ab',
  'completed': True},
 {'userId': 1,
  'id': 9,
  'title': 'molestiae perspiciatis ipsa',
  'completed': False},
 {'userId': 1,
  'id': 10,
  'title': 'illo est ratione doloremque quia maiores aut',
  'completed': True},
 {'userId': 1,
  'id': 11,
  'title': 'vero rerum

In [4]:
import pandas as pd
data = pd.DataFrame(results)
data.head()

Unnamed: 0,completed,id,title,userId
0,False,1,delectus aut autem,1
1,False,2,quis ut nam facilis et officia qui,1
2,False,3,fugiat veniam minus,1
3,True,4,et porro tempora,1
4,False,5,laboriosam mollitia et enim quasi adipisci qui...,1


# A bit complex requests

In [25]:
response = requests.get('https://api.github.com/events')
# data = pd.DataFrame(response.json())
# data.head()
response.json()

{'message': "API rate limit exceeded for 195.134.167.217. (But here's the good news: Authenticated requests get a higher rate limit. Check out the documentation for more details.)",
 'documentation_url': 'https://developer.github.com/v3/#rate-limiting'}

In [15]:
dict(data.actor[0:2])

{0: {'id': 45290401,
  'login': 'vmlankub',
  'display_login': 'vmlankub',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/vmlankub',
  'avatar_url': 'https://avatars.githubusercontent.com/u/45290401?'},
 1: {'id': 60173845,
  'login': 'Razarsho',
  'display_login': 'Razarsho',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/Razarsho',
  'avatar_url': 'https://avatars.githubusercontent.com/u/60173845?'}}

In [20]:
nested_columns = ['actor', 'org', 'payload', 'repo']
flattened = pd.DataFrame(dict(data.actor)).transpose()
flattened.columns = ['actor'+'_'+ col for col in flattened.columns]
data = pd.concat([data, flattened], axis=1)
data = data.drop(columns='actor', axis=1)
data.head()

Unnamed: 0,created_at,id,org,payload,public,repo,type,actor_avatar_url,actor_display_login,actor_gravatar_id,actor_id,actor_login,actor_url,actor_avatar_url.1,actor_display_login.1,actor_gravatar_id.1,actor_id.1,actor_login.1,actor_url.1
0,2020-01-22T08:52:29Z,11331681619,"{'id': 57382670, 'login': 'urlib', 'gravatar_i...","{'push_id': 4530344976, 'size': 1, 'distinct_s...",True,"{'id': 235262995, 'name': 'urlib/1e9-9', 'url'...",PushEvent,https://avatars.githubusercontent.com/u/45290401?,vmlankub,,45290401,vmlankub,https://api.github.com/users/vmlankub,https://avatars.githubusercontent.com/u/45290401?,vmlankub,,45290401,vmlankub,https://api.github.com/users/vmlankub
1,2020-01-22T08:52:29Z,11331681616,,"{'action': 'closed', 'number': 1, 'pull_reques...",True,"{'id': 235530864, 'name': 'Razarsho/da', 'url'...",PullRequestEvent,https://avatars.githubusercontent.com/u/60173845?,Razarsho,,60173845,Razarsho,https://api.github.com/users/Razarsho,https://avatars.githubusercontent.com/u/60173845?,Razarsho,,60173845,Razarsho,https://api.github.com/users/Razarsho
2,2020-01-22T08:52:29Z,11331681614,,{'action': 'started'},True,"{'id': 88464704, 'name': 'PanJiaChen/vue-eleme...",WatchEvent,https://avatars.githubusercontent.com/u/6790273?,chenglong-do,,6790273,chenglong-do,https://api.github.com/users/chenglong-do,https://avatars.githubusercontent.com/u/6790273?,chenglong-do,,6790273,chenglong-do,https://api.github.com/users/chenglong-do
3,2020-01-22T08:52:29Z,11331681608,,"{'ref': None, 'ref_type': 'repository', 'maste...",True,"{'id': 235533284, 'name': 'Gimindika/go-basics...",CreateEvent,https://avatars.githubusercontent.com/u/52320237?,Gimindika,,52320237,Gimindika,https://api.github.com/users/Gimindika,https://avatars.githubusercontent.com/u/52320237?,Gimindika,,52320237,Gimindika,https://api.github.com/users/Gimindika
4,2020-01-22T08:52:29Z,11331681595,"{'id': 9841374, 'login': 'SkillsFundingAgency'...","{'push_id': 4530344969, 'size': 1, 'distinct_s...",True,"{'id': 157214972, 'name': 'SkillsFundingAgency...",PushEvent,https://avatars.githubusercontent.com/u/36482610?,thelious1974,,36482610,thelious1974,https://api.github.com/users/thelious1974,https://avatars.githubusercontent.com/u/36482610?,thelious1974,,36482610,thelious1974,https://api.github.com/users/thelious1974


In [21]:
def flatten(data, col_list):
    for column in col_list:
        flattened = pd.DataFrame(dict(data[column])).transpose()
        columns = [str(col) for col in flattened.columns]
        flattened.columns = [column+'_'+ colname for colname in columns]
        data = pd.concat([data, flattened], axis=1)
        data = data.drop(columns=column, axis=1)
    return data


In [22]:
nested_columns = ['actor', 'org', 'payload', 'repo']
flat = flatten(data, nested_columns)
flat.head()

KeyError: 'actor'

In [26]:
from pandas.io.json import json_normalize
results = response.json()
flattened_data = json_normalize(results)
flattened_data.head()

Unnamed: 0,documentation_url,message
0,https://developer.github.com/v3/#rate-limiting,API rate limit exceeded for 195.134.167.217. (...


# Web Scraping 

In [28]:
url = 'https://www.reuters.com/article/us-shazam-m-a-apple-eu/eu-clears-apples-purchase-of-shazam-idUSKCN1LM1TZ'
html = requests.get(url).content
html[0:300]

b'<!--[if !IE]> This has been served from cache <![endif]-->\n<!--[if !IE]> Request served from apache server: prodie--i-033b54ceb277a312e <![endif]-->\n<!--[if !IE]> Cached on Wed, 22 Jan 2020 09:47:33 GMT and will expire on Wed, 22 Jan 2020 10:02:33 GMT <![endif]-->\n<!--[if !IE]> token: bb09831f-9252-'

In [29]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
soup

<!--[if !IE]> This has been served from cache <![endif]--><!--[if !IE]> Request served from apache server: prodie--i-033b54ceb277a312e <![endif]--><!--[if !IE]> Cached on Wed, 22 Jan 2020 09:47:33 GMT and will expire on Wed, 22 Jan 2020 10:02:33 GMT <![endif]--><!--[if !IE]> token: bb09831f-9252-457c-b7fd-7734f8f53b67 <![endif]--><!--[if !IE]> App Server /prodie--i-0f16a7b4f3b0ef730/ <![endif]--><!DOCTYPE html>
<html data-edition="BETAUS" lang="en">
<head>
<title>
                EU clears Apple's purchase of Shazam - Reuters</title>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta charset="utf-8"/><meta content="on" http-equiv="x-dns-prefetch-control"/><link href="//s1.reutersmedia.net" rel="dns-prefetch"/><link href="//s2.reutersmedia.net" rel="dns-prefetch"/><link href="//s3.reutersmedia.net" rel="dns-prefetch"/><link href="//s4.reutersmedia.net" rel="dns-prefetch"/><link href="//static.reuters.com" rel="dns-prefetch"/><link href="//www.googletagservices.com" rel="dns-pre

In [34]:
tags = ['h1', 'h2','h3','h4', 'h5','h6','h7','p']
text = [i.text for i in soup.find_all(tags)]
text

["EU clears Apple's purchase of Shazam",
 '2 Min Read',
 'BRUSSELS (Reuters) - The European Union approved Apple’s planned acquisition of British music discovery app Shazam on Thursday, saying an EU antitrust investigation showed it would not harm competition in the bloc. ',
 'The deal, announced in December last year, would help the iPhone maker better compete with Spotify, the industry leader in music streaming services. Shazam identifies songs when a smartphone is pointed at an audio source. ',
 '“After thoroughly analyzing Shazam’s user and music data, we found that their acquisition by Apple would not reduce competition in the digital music streaming market,” EU competition commissioner Margrethe Vestager said in a statement. ',
 '“Data is key in the digital economy. We must therefore carefully review transactions which lead to the acquisition of important sets of data, including potentially commercially sensitive ones,” she added. ',
 'The European Commission opened a full-scale 

In [35]:
url = 'https://en.wikipedia.org/wiki/List_of_European_countries_by_life_expectancy'
html = requests.get(url).content
soup = BeautifulSoup(html, 'lxml')
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of European countries by life expectancy - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XigDlApAICAAAE0RRiQAAAAA","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_European_countries_by_life_expectancy","wgTitle":"List of European countries by life expectancy","wgCurRevisionId":924434866,"wgRevisionId":924434866,"wgArticleId":22175559,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgC

In [44]:
table = soup.find_all('table', {'class':'sortable wikitable'})[0]
table

<table class="sortable wikitable">
<tbody><tr bgcolor="#efefef">
<th>Rank
</th>
<th>Country</th>
<th><a href="/wiki/List_of_countries_by_life_expectancy" title="List of countries by life expectancy">Life expectancy</a><sup class="reference" id="cite_ref-:0_1-1"><a href="#cite_note-:0-1">[1]</a></sup>
</th></tr>
<tr>
<td>1
</td>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="800" data-file-width="1000" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Flag_of_Monaco.svg/19px-Flag_of_Monaco.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Flag_of_Monaco.svg/29px-Flag_of_Monaco.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Flag_of_Monaco.svg/38px-Flag_of_Monaco.svg.png 2x" width="19"/> </span><a href="/wiki/Monaco" title="Monaco">Monaco</a><sup class="reference" id="cite_ref-2"><a href="#cite_note-2">[2]</a></sup>
</td>
<td>89.4
</td></tr>
<tr>
<td>2
</td>
<td><span class="flagicon"><

In [65]:
rows = table.find_all('tr')
rows = [list(filter(None, row.text.strip().split('\n'))) for row in rows]
rows

[['Rank', 'Country', 'Life expectancy[1]'],
 ['1', '\xa0Monaco[2]', '89.4'],
 ['2', '\xa0San Marino[3]', '83.4'],
 ['3', '\xa0\xa0Switzerland', '83.0'],
 ['4', '\xa0Spain', '82.8'],
 ['5', '\xa0Liechtenstein', '82.7'],
 ['6', '\xa0Italy', '82.5'],
 ['7', '\xa0Norway', '82.5'],
 ['8', '\xa0Iceland', '82.5'],
 ['9', '\xa0Luxembourg', '82.3'],
 ['10', '\xa0France', '82.3'],
 ['11', '\xa0Sweden', '82.2'],
 ['12', '\xa0Malta', '81.8'],
 ['13', '\xa0Finland', '81.8'],
 ['14', '\xa0Ireland', '81.6'],
 ['15', '\xa0Netherlands', '81.5'],
 ['16', '\xa0Portugal', '81.1'],
 ['17', '\xa0Greece', '81.0'],
 ['18', '\xa0United Kingdom', '81.0'],
 ['19', '\xa0Belgium', '81.0'],
 ['20', '\xa0Austria', '80.9'],
 ['21', '\xa0Slovenia', '80.8'],
 ['22', '\xa0Denmark', '80.7'],
 ['23', '\xa0Germany', '80.6'],
 ['24', '\xa0Cyprus', '80.5'],
 ['25', '\xa0Albania', '78.3'],
 ['26', '\xa0Czech Republic', '78.3'],
 ['27', '\xa0Croatia', '78.0'],
 ['28', '\xa0Estonia', '77.7'],
 ['29', '\xa0Poland', '77.5'],
 ['3

In [68]:
colnames = rows[0]
data = rows[1:]
df = pd.DataFrame(data, columns=colnames)
df.set_index('Rank',drop=True)

Unnamed: 0_level_0,Country,Life expectancy[1]
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Monaco[2],89.4
2,San Marino[3],83.4
3,Switzerland,83.0
4,Spain,82.8
5,Liechtenstein,82.7
6,Italy,82.5
7,Norway,82.5
8,Iceland,82.5
9,Luxembourg,82.3
10,France,82.3


In [119]:
url = 'https://www.zalando.fr/api/catalog/articles?categories=promo-enfant&limit=84&offset=84&sort=sale'
response = requests.get(url, headers = headers)
response

<Response [200]>

In [72]:
headers = {
'cookie': 'Zalando-Client-Id=480b38a1-e6eb-45b4-9fe9-cace41cfc1db; _abck=36335C81B17CEB1BDF07B5ABCC7D7C99~0~YAAQBv4BF2Nj6X5sAQAA/n7kqAK/0t79lfMy3JgxLRex014E5E2iQjaFAIV6Fi2tLGTsk4wMynFOrVjodKivrgT+79f8hK/fAkAyiELhqbStYW9tN85UNm3uSxtWFXcXumQ5Vwrxa50yikQxINOdnahzRTw3o7TEmSleBWt6pOj61x+6vcyCs5sx+IKgl56ajYdBnph+gYQIHb4jKwgW+sV4ij4/r1/HeygP1HPmQ1f7Yd73toKPrXwwSjdJuqGWyww1xtonstbG4PToBXMOocOoENC++5FiKqE/Wql4FGZe0kB4jN87bQo=~-1~-1~-1; _gcl_au=1.1.2080958251.1575048981; _ga=GA1.2.861778358.1575048979; PESSIONID=1n1h8ucifh5dw-z1e0tc0036m76m-fac1ym; BIGipServerzalando_http=404832266.20480.0000; sqt_cap=1579618684054; _gid=GA1.2.845371904.1579618686; bm_sz=10FAC9E75BA783369F4A9D9A6087237D~YAAQfiMVAjnx7shvAQAAkuLXzAYEEP8/Je1jpSewRVBWDvTzT9kW7jdvkb1LruOf/GeGzV5GUUc56+/4d1Yupt1vWjAus9vcGtFPGkIcUtdPUympNg8nrvtXgV2Tr42jCrWoMguONLezm0UC23hxWzhg7iDtVhye+qmYh/drh5HSsQWj3Q+Rk+09G4X9OYwp; JSESSIONID=CADB0BE0854BED8DB4B1FBC6F003B419.jvm_http24_p0120; CUSTOMER=GzMrtc9tEOfZpFAyxieacLhWylFaA1x8G7aAkL5vDn4VJm8bMDP7rUir5p66F6WKvqI97yD30d5kEVJoQ9MAQNRBL8P1dBOFlyUz4TljIP+ZQOzYazkaCsaUFHipRignzKFAhghfAGn8ZJKSenooKhUmbxswM/utSKvmnroXpYrZhANsrwcR3SONJ/VPpnLBqllaH+wjciz2po7D21ZXjF4nY0EC6900RzTq6kA1Bs0=; bm_mi=AC3D16683E70F9ABA5B5F667C65CEB7B~IVLCKSosHOhmHTKu0RyQPQan6xeCGYVV321bh5WfPzNQ0nvfs73c74GARBtQ6XJj412wdwQswf7zBCfRQrSfjjjf5E/05XmuEJJ5K6rfY3Zys8Y6e4VzYkCCg+sulD6mx99EbiiF1S7t5lcoBiXQL/2i9cRlIdt5vMk5XqVIT0POg7tryYEcudM1/IjBAjGxrWTEVLoV+TxJdsnA0hMQvk55m5HTRJs5SlSn3tieWLTDLGtb0I/6n52DkPqqI/qKLW/OqoW4Wo8K0P6oNLGFSKn0q7DNLfX0W5mF66IHqQQ=; ak_bmsc=450446C427E171592F1A6374E44A0E8E0215237EB03F0000E126285EA1FD0D2C~plUFH++dWBggjEirxaRhq9w1LDFRyiN1TUx99Ez7NOaWP2lgrUlqrBQhudfzqpG46cSLEjj/5kDBdYZNyGNuS1/DqQOdkfUf7wYyj5LlovRxmhYag7o4dzVeHK5d/ESYyUAcoRZ9RdIxOrd/6ZqdjOstBegCD/Q0sOyhDlap7NBCEY5GF7VVQWuhUoKwD3ze+0oyYoBCeOoTOTrd2l2Yj3nw8KlIw82bHSQUOiF/uKkKTshBEUoBEd4iR9hszeV0oC; ncx=k; frsx=AAAAAGjWv7SySil-RjssXcS22vG2bNJTvxRVqZ1WU0BosqPHfZpG9pLlpAWuGDYQ4gbSVzBdzOAOvyS0zZI8DCRm2Dw5SLdNBV-yWsAfe-vxeW9A-GJfJmmxt4K12ANIwnDMZ87gmQCHu408vfb1cn0=',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
'x-xsrf-token': 'AAAAAGjWv7SySil-RjssXcS22vG2bNJTvxRVqZ1WU0BosqPHfZpG9pLlpAWuGDYQ4gbSVzBdzOAOvyS0zZI8DCRm2Dw5SLdNBV-yWsAfe-vxeW9A-GJfJmmxt4K12ANIwnDMZ87gmQCHu408vfb1cn0=',
'x-zalando-catalog-nakadi-context': '%7B%22previous_categories%22%3A%5B%22promo-enfant%22%5D%2C%22previous_selected_filters%22%3A%5B%5D%2C%22preselected_filters%22%3A%5B%5D%7D',
'x-zalando-octopus-tests': '%5B%7B%22testName%22%3A%22count-test%22%2C%22testVariant%22%3A%22default_count%22%2C%22testFeedbackId%22%3A%2200000000-0000-0000-0000-000000000000%3A__EMPTY__%22%7D%2C%7B%22testName%22%3A%22default-sorting-variant-sales-test%22%2C%22testVariant%22%3A%22sorting-default%22%2C%22testFeedbackId%22%3A%2200000000-0000-0000-0000-000000000000%3A__EMPTY__%22%7D%5D'
}

In [120]:
result = response.json()
result

{'total_count': 26650,
 'pagination': {'page_count': 318, 'current_page': 2, 'per_page': 84},
 'sort': 'sale',
 'articles': [{'sku': 'KIJ23D008-N11',
   'name': 'KONRUBY BUTTON  - Combinaison - olive',
   'price': {'original': '49,95\xa0€',
    'promotional': '14,95\xa0€',
    'has_different_prices': True,
    'has_different_original_prices': False,
    'has_different_promotional_prices': True,
    'has_discount_on_selected_sizes_only': False},
   'sizes': ['6a', '7a', '8a', '9a', '10a', '12a', '13a', '14a'],
   'url_key': 'kids-only-konruby-button-combinaison-olive-kij23d008-n11',
   'media': [{'path': 'KI/J2/3D/00/8N/11/KIJ23D008-N11@2.jpg',
     'role': 'DEFAULT',
     'packet_shot': False}],
   'brand_name': 'Kids ONLY',
   'is_premium': False,
   'family_articles': [],
   'flags': [{'key': 'campaign',
     'value': '-15% EXTRA',
     'tracking_value': 'fr_aw19_eoss_wave_1_2019_50'},
    {'key': 'discountRate',
     'value': 'Jusqu’à -70%',
     'tracking_value': 'discount rate'}],

In [121]:
flat_data = json_normalize(result)
flat_data

Unnamed: 0,articles,articlesToShow,breadcrumbs,carouselTeaser,categoryTree,collection,contentPositions.entry-point-teasers,contentPositions.in-cat-carousel,contentPositions.in-cat-carousel-fullwidth,contentPositions.in-cat-carousel-mobile,...,upperInCatTeaser,variants.fullWidthCatalog,variants.hideCategories,variants.mobileLightFilters,variants.myBrandsFilter,variants.premiumCatalog,variants.salesBannerCountdown,variants.topTeaserPlaceholder,variants.truncatedCount,wishlist
0,"[{'sku': 'KIJ23D008-N11', 'name': 'KONRUBY BUT...",84,"[{'items': [{'label': 'Enfant', 'url_key': 'en...",,"[{'label': 'Soldes', 'id': '9574', 'url_key': ...",,"[7, 14, 20, 26]",9,8,6,...,,False,False,True,True,False,False,False,False,


In [128]:
flat_data1 = json_normalize(flat_data['articles'][0])
flat_data1.shape

(84, 18)

In [87]:
84*5

420

# Loop within the pages

In [131]:
import time
import requests

flat_data_final = pd.DataFrame()
for k in range(5):
    response = requests.get(f'https://www.zalando.fr/api/catalog/articles?categories=promo-enfant&limit=84&offset={k*84}&sort=sale', headers=headers)
    time.sleep(4)
    result = response.json()
    flat_data = json_normalize(result)
    flat_data1 = json_normalize(flat_data['articles'][0])
    flat_data_final = flat_data_final.append(flat_data1)

In [130]:
for i in range(4):
    print('late by 4 secs')
    time.sleep(4)

late by 4 secs
late by 4 secs
late by 4 secs
late by 4 secs


In [132]:
flat_data_final.shape

(420, 18)

# IMDB

In [133]:
url = 'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=1&ref_=adv_nxt'
html = requests.get(url).content
soup = BeautifulSoup(html, 'lxml')
soup

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>IMDb "Top 250"
(Sorted by IMDb Rating Descending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
</script>
<link href="ht

In [145]:
movies = [i.text for i in soup.select('h3.lister-item-header>a')]


In [139]:
Rank = [int(i.text.strip('.')) for i in soup.select('span.lister-item-index')]

In [147]:
Year = [int(i.text.replace('(','').replace(')','')) for i in soup.select('span.lister-item-year')]

In [150]:
imdb = pd.DataFrame()
imdb['Movies'] = movies
imdb['Rank'] = Rank
imdb['Year'] = Year
imdb.head()
imdb.set_index('Rank')

Unnamed: 0_level_0,Movies,Year
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Les évadés,1994
2,Le parrain,1972
3,The Dark Knight: Le chevalier noir,2008
4,"Le parrain, 2ème partie",1974
5,Le seigneur des anneaux: Le retour du roi,2003
6,Pulp Fiction,1994
7,La liste de Schindler,1993
8,Douze hommes en colère,1957
9,Inception,2010
10,Fight Club,1999


In [152]:
Movies1 = []
Rank1 = []
Year1 = []
for k in range(1,202,50):
    url = f'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start={k}&ref_=adv_nxt'
    print(url)
    html = requests.get(url)
    time.sleep(2)
    html = html.content
    soup = BeautifulSoup(html, 'lxml')
    movies = [i.text for i in soup.select('h3.lister-item-header>a')]
    Movies1.append(movies)
    Rank = [int(i.text.strip('.')) for i in soup.select('span.lister-item-index')]
    Rank1.append(Rank)
    Year = [int(i.text.strip('I) (')) for i in soup.select('span.lister-item-year')]
    if Year is None:
        Year1.append(0)
    else:
        Year1.append(Year)
    

In [156]:
imdb = pd.DataFrame()
imdb['Titles'] = [j for i in Movies1 for j in i]
imdb['Rank'] = [j for i in Rank1 for j in i]
imdb['Year'] = [j for i in Year1 for j in i]
imdb.head()

Unnamed: 0,Titles,Rank,Year
0,Entre le ciel et l'enfer,51,1963
1,Psychose,52,1960
2,Casablanca,53,1942
3,Le dictateur,54,1940
4,Les temps modernes,55,1936


In [158]:
for k in range(1,202,50):
    url = f'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start={k}&ref_=adv_nxt'
    print(url)

https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=1&ref_=adv_nxt
https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=51&ref_=adv_nxt
https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=101&ref_=adv_nxt
https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=151&ref_=adv_nxt
https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=201&ref_=adv_nxt
