In [13]:
%rest -f

UsageError: Line magic function `%rest` not found.


# USCIS i-485 analysis 
This is test code to pre-process data from the following websites.

https://www.immihelp.com/i-485-tracker/
https://www.trackitt.com/usa-immigration-trackers/i485-eb

The websites above display the tracker data, so let's try to download it.

Steps
1. Analyze the website and see how the data is stored
2. Use requests and bs4 to download the data if possible
3. Save it as a CSV file (which can be opened in Excel)

Step 1: Analyze the website and see how the data is stored 

Why: This step is important because, without understanding the structure of the HTML, we would not be able to parse it with Beautiful Soup.
On this website the data is stored in a table with the class "c-Sticky-table", structured as follows:
c-Sticky-table
- thead - tr - <th> text </th>
- tbody 
  - tr - <td> text </td>
  - tr - <td> text </td> .... 

Based on the above, it is good to perform the following
1. Obtain the header by parsing the HTML as table > thead > th
2. Obtain the data by iterating over each tr and collecting the text from each td

# Web scraping portion

## Import necessary libraries

In [14]:
# standard library 
import requests
import csv, re
import time 
import cProfile
import pstats

# external library 
from bs4 import BeautifulSoup
from tqdm import tqdm 


In [15]:
# Root URL of the tracker; get_data appends "<page>/" to it to address each page
base_url = "https://www.immihelp.com/i-485-tracker/"


In [16]:
def get_last_page(session, url):
    """Return the number of the last page of the paginated table at ``url``.

    The page is fetched with ``get_soup`` and the element whose ``title`` is
    "Last Page" is looked up. The final path segment of its ``href``
    (e.g. ``"/i-485-tracker/100/"`` -> ``100``) is taken as the page count.

    Args:
        session (requests.Session): session used to fetch the page
        url (string): target url

    Returns:
        last_page_num (int): last page number of the table

    Raises:
        ValueError: if no "Last Page" link is present on the page, or if the
            last path segment of its href is not an integer.
    """
    soup = get_soup(session, url)
    last_page_link = soup.find(title="Last Page", href=True)
    if last_page_link is None:
        # Fail with a message that names the page instead of an opaque
        # "'NoneType' object is not subscriptable" on the next line.
        raise ValueError(f"could not find a 'Last Page' link on {url}")

    # href looks like "/i-485-tracker/100/": drop the surrounding slashes and
    # keep the final path segment, which is the page number as a string.
    last_segment = last_page_link['href'].strip('/').split('/')[-1]
    last_page_num = int(last_segment)
    print(f"last page of the website is p.{last_page_num}")
    return last_page_num

In [17]:
def get_header(header_table):
    """Return the column titles found in the given table header.

    Each ``th`` element's text is cleaned the same way ``get_data`` cleans the
    data cells (newlines removed, surrounding whitespace stripped), so the
    titles can be used directly as dictionary keys and CSV column names.

    Args:
        header_table (table object from bs4): element whose ``th`` descendants
            hold the column titles (``get_data`` passes ``table.thead``)

    Returns:
        list: header list, one string per ``th`` in document order
    """
    headers = [
        col.text.replace("\n", "").strip()
        for col in header_table.find_all("th")
    ]
    print("Obtained the header")
    return headers

In [18]:
def get_table(session, url, target):
    """Fetch ``url`` and look up the element carrying the CSS class ``target``.

    Args:
        session: session object forwarded to ``get_soup``
        url (string): page to download
        target (string): class name to search for

    Returns:
        The result of ``find(class_=target)`` on the parsed page.
    """
    page = get_soup(session, url)
    return page.find(class_=target)

In [19]:
def get_soup(session, url, timeout=30):
    """Download ``url`` and return its HTML parsed with ``html.parser``.

    Args:
        session (requests.Session): session used to issue the GET request
        url (string): page to download
        timeout (int or float): seconds to wait for the server before giving
            up; without it a stalled connection would block indefinitely

    Returns:
        BeautifulSoup: parsed document

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply,
        so an error page is never parsed as if it were tracker data.
    """
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

In [24]:
def _parse_row(row, headers):
    """Map one ``tr`` element to a ``{header: cell text}`` dictionary."""
    # Pair cells with headers positionally; strip "\n" and surrounding spaces
    # so the stored values are clean single-line strings.
    return {
        header: col.text.replace("\n", "").strip()
        for col, header in zip(row.find_all("td"), headers)
    }


def get_data():
    """Scrape every page of the tracker table and return its rows.

    Opens one ``requests.Session``, asks ``get_last_page`` for the page count,
    reads the column titles from the ``thead`` of the table on ``base_url``,
    then walks pages ``1..last_page`` (``base_url + "<page>/"``) and converts
    each ``tr`` of the table body into a dictionary keyed by column title.
    Rows that contain no ``td`` cells are skipped, so the result never
    contains empty dictionaries.

    Returns:
        list: one ``{header: text}`` dict per table row, in page order
    """
    target_table = "c-Sticky-table"
    data = []
    with requests.Session() as s:
        print("Session initialized")
        last_page = get_last_page(s, base_url)

        # Column titles are read once, from the table on the landing page
        table = get_table(s, base_url, target_table)
        headers = get_header(table.thead)

        for page in range(1, last_page + 1):
            if page % 10 == 0:
                print(f"working on page {page}")
            current_page = base_url + f"{page}/"
            body_table = get_table(s, current_page, target_table).tbody

            for row in body_table.find_all("tr"):
                t_row = _parse_row(row, headers)
                # A row without any td cells yields {}; keeping it would
                # produce blank CSV lines (or no columns at all if it came first).
                if t_row:
                    data.append(t_row)
    return data

In [21]:
def to_csv(data, file_name='test.csv'):
    """Write a list of row dictionaries to a UTF-8 CSV file.

    The column order is the order in which keys first appear across all rows,
    so a row that carries a key missing from the first row still gets its own
    column; rows lacking a key are written with an empty cell for it.

    Args:
        data (list): row dictionaries, e.g. the result of ``get_data``
        file_name (string): path of the CSV file to create or overwrite

    Raises:
        ValueError: if ``data`` is empty (raised before the file is opened,
            so an existing file is not truncated).
    """
    if not data:
        raise ValueError("no rows to write")

    # dict.fromkeys de-duplicates while preserving first-seen order
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(file_name, 'w', encoding='utf-8', newline="") as data_file:
        dict_writer = csv.DictWriter(data_file, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(data)

In [22]:
def main():
    """Run the pipeline: collect the rows with ``get_data`` and pass them to ``to_csv``."""
    to_csv(get_data())
# profile = cProfile.Profile()
# profile.runcall(get_data)
# ps = pstats.Stats(profile)
# ps.print_stats()

In [25]:
# Run the scraper only when this code is executed directly, not when imported
if __name__ == "__main__":
    main()

Session initialized
last page of the website is p.101
Obtained the header
working on page 10
working on page 20
working on page 30
working on page 40
working on page 50
working on page 60
working on page 70
working on page 80
working on page 90
working on page 100
