# YouTube Watch History Cleaning

In [None]:
# Placeholder cell: the code that loads and parses the watch-history HTML export is in the next cell.

In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define the path to the watch history file
watch_history_file = '../data/watch-history-light.html'

# Placeholder stored when a field cannot be extracted from an entry
_MISSING_VALUE = 'N/A'

# Class strings are passed to BeautifulSoup exactly as they were written inline before
_ENTRY_CLASS = 'outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp'
_CONTENT_CLASS = 'content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1'

# Compiled once here instead of being rebuilt for every text node of every entry
_TIMESTAMP_PATTERN = re.compile(
    r'\w{3} \d{1,2}, \d{4}, \d{1,2}:\d{2}:\d{2}\s+(AM|PM)\s+UTC'
)

# Column order of the resulting DataFrame (matches the dict built per entry)
_COLUMNS = ['title', 'video_url', 'channel_name', 'timestamp_utc']


def _extract_timestamp(content_cell):
    """Return the raw timestamp text of one content cell, or 'N/A'.

    Strategies are tried in order:
    1. a direct text child of the cell that matches ``_TIMESTAMP_PATTERN``;
    2. the text node immediately following the last ``<br>`` in the cell;
    3. the last non-empty string anywhere in the cell, if it contains 'UTC'.
    """
    for text_node in content_cell.find_all(string=True, recursive=False):
        clean_text = text_node.strip()
        if _TIMESTAMP_PATTERN.match(clean_text):
            return clean_text

    br_tags = content_cell.find_all('br')
    if br_tags:
        after_last_br = br_tags[-1].next_sibling
        # The sibling may be a Tag (or None) rather than text. Text nodes are
        # str subclasses, so this check avoids calling .strip() on a Tag,
        # which previously raised TypeError.
        if isinstance(after_last_br, str) and after_last_br.strip():
            return after_last_br.strip()

    raw_text_parts = list(content_cell.stripped_strings)
    potential_timestamp = raw_text_parts[-1] if raw_text_parts else ''
    if 'UTC' in potential_timestamp:
        return potential_timestamp
    return _MISSING_VALUE


# Open and read the HTML file
with open(watch_history_file, 'r', encoding='utf-8') as f:
    html_content = f.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find all watch history entries
entries = soup.find_all('div', class_=_ENTRY_CLASS)

data = []
for entry in entries:
    # Entries without a content cell, a video link or a <br> are skipped
    content_cell = entry.find('div', class_=_CONTENT_CLASS)
    if not content_cell:
        continue

    # Extract video title and URL (the first link in the cell)
    video_link = content_cell.find('a')
    if not video_link:
        continue
    title = video_link.text.strip()
    # .get() keeps one anchor without an href from aborting the whole parse
    video_url = video_link.get('href', _MISSING_VALUE)

    # Extract channel name: the first link that follows the first <br>
    br_tag = content_cell.find('br')
    if not br_tag:
        continue
    channel_link = br_tag.find_next_sibling('a')
    channel_name = channel_link.text.strip() if channel_link else _MISSING_VALUE

    # Extract the timestamp and collapse every whitespace run to one plain
    # space. The pattern above accepts arbitrary whitespace (\s+) before
    # AM/PM and UTC, so without this the literal ' UTC' removal below could
    # miss the suffix.
    timestamp_str = ' '.join(_extract_timestamp(content_cell).split())
    timestamp_utc = timestamp_str.replace(' UTC', '')

    data.append({
        'title': title,
        'video_url': video_url,
        'channel_name': channel_name,
        'timestamp_utc': timestamp_utc
    })

# Passing the columns explicitly keeps them present even when no entry was
# parsed, so later column lookups do not fail on an empty history.
df = pd.DataFrame(data, columns=_COLUMNS)
df.head()

## Data Cleaning

In [None]:
# Summarise column dtypes and non-null counts before any conversion
df.info()

# Parse the timestamp strings; values that cannot be parsed become NaT
parsed_timestamps = pd.to_datetime(df['timestamp_utc'], errors='coerce')
df['timestamp_utc'] = parsed_timestamps

# Show the rows whose timestamp is NaT after the conversion
unparsed_mask = df['timestamp_utc'].isna()
print("Rows with NaT in timestamp_utc after conversion:")
print(df[unparsed_mask])

# Count missing values in each column
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Drop exact duplicate rows, modifying df in place
df.drop_duplicates(inplace=True)

# Report the size that remains
print(f"\nShape after dropping duplicates: {df.shape}")

# Preview the cleaned result (last expression of the cell, so it is rendered)
print("\nCleaned DataFrame head:")
df.head()

## Save Cleaned Data

In [None]:
# Destination of the cleaned dataset
output_path = '../data/cleaned_watch_history.csv'

# Write the DataFrame as CSV, leaving out the index column
df.to_csv(path_or_buf=output_path, index=False)

# Tell the user where the file was written
print(f"Cleaned data saved to {output_path}")