# Web Scraping - Monthly Trip data in Bulk

In [1]:
#import dependencies
import bs4
from bs4 import BeautifulSoup as bs
import requests

import os
import time
import pandas as pd
import numpy as np
import csv
import zipfile

In [2]:
url= 'https://s3.amazonaws.com/tripdata/index.html'

### Inspect page to find which tags or classes where information reside
1) Right-click and inspect index.html

2) Go to "Network" tab, make sure "All" is selected on ribbon

3) Click "index.html" => go to section "Request Headers"

4) Copy element "User-Agent" from the bottim section

In [3]:
response= requests.get(url, headers={
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"    
})

In [4]:
with requests.get(url) as response: 
    soup=bs(response.content, "lxml") ## "html5lib" works as well
    print(soup)

<html>
<head>
<!--

  Amazon S3 Bucket listing.


  Copyright (C) 2008 Francesco Pasqualini

      This program is free software: you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published by
      the Free Software Foundation, either version 3 of the License, or
      (at your option) any later version.

      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      GNU General Public License for more details.

      You should have received a copy of the GNU General Public License
      along with this program.  If not, see <http://www.gnu.org/licenses/>.

  -->
<!--

  Modified by Nolan Lawson!  (http://nolanlawson.com).  I'm keeping the spirit of the
  GPL alive by issuing this with the same license!

  -->
<title>Bucket loading...</title>
<link href="//netdna

In [5]:
table= soup.find_all("div", class_ ="container")
print(table)

## NOTE: 
#  class="hide-while-loading table table-striped" has running with some javascript, while inspecting the UI, 
#  zip file names are resided in tag <td> <a href=""> </a> </td>

[<div class="container">
<h1 id="h1-title">Bucket loading...</h1>
<table class="hide-while-loading table table-striped">
<thead>
<tr>
<th>Name</th>
<th>Date Modified</th>
<th>Size</th>
<th>Type</th>
</tr>
</thead>
<tbody id="tbody-content">
</tbody>
</table>
</div>]


## Move URL directory one level up 

--'https://s3.amazonaws.com/tripdata/'

In [6]:
tripdata_url='https://s3.amazonaws.com/tripdata/'

In [7]:
def get_soup(tripdata_url):
    return bs(requests.get(tripdata_url).content, "lxml")

print(get_soup(tripdata_url).prettify())

<?xml version="1.0" encoding="UTF-8"?>
<html>
 <body>
  <listbucketresult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
   <name>
    tripdata
   </name>
   <prefix>
   </prefix>
   <marker>
   </marker>
   <maxkeys>
    1000
   </maxkeys>
   <istruncated>
    false
   </istruncated>
   <contents>
    <key>
     201306-citibike-tripdata.zip
    </key>
    <lastmodified>
     2018-04-30T13:18:55.000Z
    </lastmodified>
    <etag>
     "b520a12de58eea58a3586f89bfcfbd9d-2"
    </etag>
    <size>
     16785103
    </size>
    <storageclass>
     STANDARD
    </storageclass>
   </contents>
   <contents>
    <key>
     201307-201402-citibike-tripdata.zip
    </key>
    <lastmodified>
     2017-01-18T22:23:25.000Z
    </lastmodified>
    <etag>
     "7b3b260b2ab2e5349320121d04bd821c-22"
    </etag>
    <size>
     178262576
    </size>
    <storageclass>
     STANDARD
    </storageclass>
   </contents>
   <contents>
    <key>
     201307-citibike-tripdata.zip
    </key>
    <lastmodified>

In [8]:
#Declare file format to parse and append all ".zip" files
filetype='.zip'
zip_file_list=[]
dup_file = '201307-201402-citibike-tripdata.zip'


for link in get_soup(tripdata_url).find_all('key'):
    file_link=link.text
    if filetype in file_link:
#         print(file_link)
        zip_file_list.append(file_link)

In [9]:
#Print Zip_file_list & Exam the list to see if any DUPLICATED or UNWANTED files
zip_file_list

['201306-citibike-tripdata.zip',
 '201307-201402-citibike-tripdata.zip',
 '201307-citibike-tripdata.zip',
 '201308-citibike-tripdata.zip',
 '201309-citibike-tripdata.zip',
 '201310-citibike-tripdata.zip',
 '201311-citibike-tripdata.zip',
 '201312-citibike-tripdata.zip',
 '201401-citibike-tripdata.zip',
 '201402-citibike-tripdata.zip',
 '201403-citibike-tripdata.zip',
 '201404-citibike-tripdata.zip',
 '201405-citibike-tripdata.zip',
 '201406-citibike-tripdata.zip',
 '201407-citibike-tripdata.zip',
 '201408-citibike-tripdata.zip',
 '201409-citibike-tripdata.zip',
 '201410-citibike-tripdata.zip',
 '201411-citibike-tripdata.zip',
 '201412-citibike-tripdata.zip',
 '201501-citibike-tripdata.zip',
 '201502-citibike-tripdata.zip',
 '201503-citibike-tripdata.zip',
 '201504-citibike-tripdata.zip',
 '201505-citibike-tripdata.zip',
 '201506-citibike-tripdata.zip',
 '201507-citibike-tripdata.zip',
 '201508-citibike-tripdata.zip',
 '201509-citibike-tripdata.zip',
 '201510-citibike-tripdata.zip',
 '2

In [10]:
dup_file = '201307-201402-citibike-tripdata.zip'
file_list =[]
for i in zip_file_list: 
    if i.find("JC"):
        file_list.append(i)
        
        if i in dup_file: 
            file_list.remove(i)
            
file_list

['201306-citibike-tripdata.zip',
 '201307-citibike-tripdata.zip',
 '201308-citibike-tripdata.zip',
 '201309-citibike-tripdata.zip',
 '201310-citibike-tripdata.zip',
 '201311-citibike-tripdata.zip',
 '201312-citibike-tripdata.zip',
 '201401-citibike-tripdata.zip',
 '201403-citibike-tripdata.zip',
 '201404-citibike-tripdata.zip',
 '201405-citibike-tripdata.zip',
 '201406-citibike-tripdata.zip',
 '201407-citibike-tripdata.zip',
 '201408-citibike-tripdata.zip',
 '201409-citibike-tripdata.zip',
 '201410-citibike-tripdata.zip',
 '201411-citibike-tripdata.zip',
 '201412-citibike-tripdata.zip',
 '201501-citibike-tripdata.zip',
 '201502-citibike-tripdata.zip',
 '201503-citibike-tripdata.zip',
 '201504-citibike-tripdata.zip',
 '201505-citibike-tripdata.zip',
 '201506-citibike-tripdata.zip',
 '201507-citibike-tripdata.zip',
 '201508-citibike-tripdata.zip',
 '201509-citibike-tripdata.zip',
 '201510-citibike-tripdata.zip',
 '201511-citibike-tripdata.zip',
 '201512-citibike-tripdata.zip',
 '201601-c

## Loop through the file_list 
    1) Loop through the "file_list" 
    2) open each zip file
    3) unzip the file
    4) extract .csv from each zip file
    5) remove zipped file before next iteration
    6) print execution time 
    
    https://docs.python.org/3/library/zipfile.html
    

In [11]:
start_time = time.time()

for file in file_list:
    
    """  Concatenate two variables to get each zip file's url, 
         we do this because we were unable to extract the zip 
         file url from html tag, "<table> </table> " """
    file_url = tripdata_url + file  
    
    #open and download files
    with open(file, "wb") as openfile:
            response = requests.get(file_url)
            openfile.write(response.content)
            
    # class <zipfile.ZipFile> is for reading and writting ZIP files
    with zipfile.ZipFile(file, "r") as zip_file:
        zip_file.extractall("tripdata")   
       
    # remove zipped file before next iteration
    os.remove(file)

elapsed_time = round(((time.time() - start_time)/60),6)
print ("Execution time: " + str(elapsed_time) + " minutes")   
    

Execution time: 6.614868 minutes


## Standarize file names in tripdata folder

In [21]:
# rename files 
directory = 'tripdata/'
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith('.csv'):
        new_filename1 = filename.replace(' ','').replace('-','_').lower()
        new_filename= new_filename1.strip('_')
        os.rename(os.path.join(directory, filename), os.path.join(directory, new_filename))

In [None]:
# # rename files 
# directory = 'tripdata/'
# for file in os.listdir(directory):
#     filename = os.fsdecode(file)
#     if filename.endswith('.csv'):
#         new_filename = filename.replace(' ','').lower().split('ci', 1)[0].strip('-').replace('-','_')
#         os.rename(os.path.join(directory, filename), os.path.join(directory, new_filename + '.csv'))
    

In [None]:
# #load into DataFrames into dfs dictionary
# directory = 'tripdata/'
# dfs = {}

# for file in os.listdir(directory):
#     filename = os.fsdecode(file)
#     if filename.endswith('.csv'):
#         dfs[filename.split('.')[0]] = pd.read_csv(os.path.join(directory, filename)) 

In [None]:
directory = 'tripdata/'
# keyword = ''
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print(filename)
#     grep - r 'JC'
 