In [10]:
import re
import zipfile
from datetime import datetime

from bs4 import BeautifulSoup
from prettytable import PrettyTable

# Substrings that mark a position as one to leave out of the table.
# NOTE(review): matching is by substring, so 'intern' also matches titles such
# as 'International ...' — confirm that is intended.
_EXCLUDED_TITLE_KEYWORDS = ('intern', 'internship', 'student', 'contract', 'freelance', 'part-time')

# Separator between the job title and whatever follows " at " in the heading,
# matched regardless of case so detection and splitting agree.
_EMPLOYER_SEPARATOR = re.compile(r" at ", re.IGNORECASE)


def _parse_profile_date(text):
    """Parse 'Jan 2020' or '2020' into a datetime; return None if neither format matches."""
    for fmt in ('%b %Y', '%Y'):
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def process_html(html):
    """Print a table of job titles and date intervals found in a profile page.

    Looks for the ``<section id="experience-section">`` element and, for each
    ``<li class="pv-profile-section__list-item">`` inside it, extracts the
    title and the date range, normalising the interval to ``YYYY.MM – YYYY.MM``.

    Entries are skipped when they have no date-range element, when the
    date-range element has fewer than two ``<span>`` children, when the title
    contains one of the excluded keywords, or when a date cannot be parsed.

    Args:
        html: The page markup as a string.

    Returns:
        True if the experience section was found (the table is printed, even
        if it ends up empty); False otherwise.
    """
    soup = BeautifulSoup(html, 'html.parser')
    experiences = soup.find('section', {'id': 'experience-section'})
    if not experiences:
        return False

    job_table = PrettyTable(["Title", "Time Interval"])

    for job in experiences.find_all('li', {'class': 'pv-profile-section__list-item'}):
        title_tag = job.find('h3', {'class': 't-16'})
        title = title_tag.get_text().strip() if title_tag else ""

        # Keep only the text before the first " at " (any letter case).
        title = _EMPLOYER_SEPARATOR.split(title, maxsplit=1)[0].strip()

        date_range_element = job.find('h4', {'class': 'pv-entity__date-range'})
        if not date_range_element:
            continue

        # The date text is read from the second <span>; skip the entry rather
        # than crash the whole run when the markup has fewer spans.
        spans = date_range_element.find_all('span')
        if len(spans) < 2:
            continue
        time_interval = spans[1].get_text().strip()

        lowered_title = title.lower()
        if any(keyword in lowered_title for keyword in _EXCLUDED_TITLE_KEYWORDS):
            continue

        date_range = time_interval.split(' – ')
        start_date_str = date_range[0].strip()
        end_date_str = date_range[1].strip() if len(date_range) > 1 else ''

        start_date = _parse_profile_date(start_date_str)
        if start_date is None:
            continue

        if end_date_str and end_date_str != 'Present':
            end_date = _parse_profile_date(end_date_str)
            if end_date is None:
                continue
        else:
            # A missing end date or 'Present' is treated as ending now.
            end_date = datetime.now()

        interval_str = f"{start_date.strftime('%Y.%m')} – {end_date.strftime('%Y.%m')}"
        job_table.add_row([title, interval_str])

    print(job_table)
    return True

# Walk every profile page in the archive, print its job table, and keep a
# running count of the pages in which an experience section was found.
with zipfile.ZipFile('linkedin_profiles.zip', 'r') as zip_file:
    # Skip directory entries and anything under the __MACOSX folder; filter
    # once so the total and the loop always agree.
    profile_names = [
        name for name in zip_file.namelist()
        if not (name.endswith("/") or name.startswith("__MACOSX"))
    ]
    total_files = len(profile_names)
    successful_files = 0

    # `position` is the 1-based index of the file being processed; it advances
    # on every file, whether or not process_html succeeds.
    for position, file_name in enumerate(profile_names, start=1):
        with zip_file.open(file_name, 'r') as html_file:
            # Undecodable bytes are replaced rather than raising.
            html = html_file.read().decode('utf-8', errors='replace')

        print(f"Processing file {position} of {total_files}: {file_name}")
        print(f"Content preview: {html[:100]}\n")

        if process_html(html):
            successful_files += 1
        print(f"\nTotal successfully processed files so far: {successful_files}\n")


Processing file 1 of 447: linkedin_profiles/profile_jswong65.htm
Content preview: <html lang="en" class="theme theme--mercado artdeco windows"><head>
    <script type="application/j

+-----------------------------+-------------------+
|            Title            |   Time Interval   |
+-----------------------------+-------------------+
|      Software Engineer      | 2021.07 – 2023.03 |
|    Software Engineer III    | 2019.01 – 2021.07 |
| Graduate Teaching Assistant | 2013.08 – 2018.12 |
|        Summer Analyst       | 2018.05 – 2018.08 |
|      Research Associate     | 2015.06 – 2015.08 |
+-----------------------------+-------------------+

Total successfully processed files so far: 1

Processing file 2 of 447: linkedin_profiles/profile_sunnychu.htm
Content preview: <html lang="en" class="theme theme--mercado artdeco windows"><head>
    <script type="application/j

+------------------------------+-------------------+
|            Title             |   Time Interval   |
+------------