<a href="https://colab.research.google.com/github/who0liebo0lie/ArticleTraffic/blob/main/Scrape_%26_Evaluate_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests beautifulsoup4 pandas nltk

import requests
from bs4 import BeautifulSoup
import json
import time
from urllib.parse import urljoin
import re
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Try to download NLTK data
try:
    import nltk
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    from nltk.tokenize import sent_tokenize
    NLTK_AVAILABLE = True
except:
    print("NLTK not available - using simple sentence splitting")
    NLTK_AVAILABLE = False

print("Setup complete!")

Setup complete!


In [2]:
class JWStruggleAnalysisScraper:
    def __init__(self, base_url="https://wol.jw.org"):
        self.base_url = base_url
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Struggle indicators
        self.struggle_indicators = [
            'struggled', 'difficulty', 'challenge', 'problem', 'issue', 'obstacle',
            'hardship', 'trial', 'suffering', 'pain', 'hurt', 'difficult',
            'troubled', 'worried', 'anxious', 'depressed', 'overwhelmed',
            'addiction', 'abuse', 'divorce', 'death', 'illness', 'financial',
            'unemployment', 'rejection', 'loneliness', 'fear', 'doubt',
            'temptation', 'persecution', 'opposition', 'criticism', 'failure',
            'lost', 'confused', 'helpless', 'hopeless', 'discouraged',
            'disappointed', 'frustrated', 'angry', 'bitter', 'resentful'
        ]

        # Struggle categories
        self.struggle_categories = {
            'health_mental': ['depression', 'anxiety', 'mental', 'emotional', 'psychological', 'stressed', 'overwhelmed', 'panic', 'bipolar', 'ptsd'],
            'health_physical': ['illness', 'disease', 'cancer', 'disability', 'chronic', 'pain', 'medical', 'surgery', 'accident', 'injury'],
            'addiction': ['alcohol', 'drugs', 'gambling', 'smoking', 'substance', 'addiction', 'addicted', 'dependency'],
            'relationships': ['divorce', 'marriage', 'family', 'spouse', 'children', 'parents', 'relationship', 'domestic', 'abuse', 'cheating'],
            'financial': ['money', 'financial', 'debt', 'unemployment', 'job', 'poverty', 'homeless', 'foreclosure', 'bankruptcy'],
            'spiritual': ['faith', 'doubt', 'belief', 'god', 'prayer', 'bible', 'congregation', 'disfellowship', 'guilt', 'sin'],
            'social': ['rejection', 'persecution', 'discrimination', 'bullying', 'loneliness', 'isolation', 'friends', 'social'],
            'loss_grief': ['death', 'died', 'funeral', 'grief', 'mourning', 'loss', 'bereaved', 'widow', 'orphan'],
            'education_career': ['school', 'education', 'career', 'work', 'college', 'university', 'study', 'academic', 'professional'],
            'identity_purpose': ['identity', 'purpose', 'meaning', 'direction', 'confused', 'lost', 'self-worth', 'confidence']
        }

    def get_page_content(self, url):
        """Fetch and parse webpage content"""
        try:
            print(f"  Fetching: {url}")
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            print(f"  Error fetching {url}: {e}")
            return None

    def simple_sentence_split(self, text):
        """Simple sentence splitting when NLTK is not available"""
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if len(s.strip()) > 10]

    def extract_personal_experiences(self, soup):
        """Extract sections that contain personal experiences and struggles"""
        experience_data = {
            'struggle_sentences': [],
            'personal_stories': [],
            'quotes': []
        }

        # Get all text content
        full_text = soup.get_text()

        # Split into sentences
        if NLTK_AVAILABLE:
            try:
                sentences = sent_tokenize(full_text)
            except:
                sentences = self.simple_sentence_split(full_text)
        else:
            sentences = self.simple_sentence_split(full_text)

        # Find struggle sentences
        for sentence in sentences:
            if len(sentence) < 20:
                continue

            sentence_lower = sentence.lower()

            # Check for struggle indicators
            for indicator in self.struggle_indicators:
                if indicator in sentence_lower:
                    experience_data['struggle_sentences'].append({
                        'sentence': sentence,
                        'indicator': indicator
                    })
                    break

        # Extract quotes
        quotes = soup.find_all(['blockquote', 'q'])
        for quote in quotes:
            quote_text = quote.get_text(strip=True)
            if len(quote_text) > 30:
                quote_lower = quote_text.lower()

                struggle_indicators_found = []
                for indicator in self.struggle_indicators:
                    if indicator in quote_lower:
                        struggle_indicators_found.append(indicator)

                if struggle_indicators_found:
                    experience_data['quotes'].append({
                        'text': quote_text,
                        'indicators': struggle_indicators_found
                    })

        # Find first-person narratives
        first_person_patterns = [
            r'I (was|am|had|have|felt|experienced|struggled|faced)',
            r'My (life|family|marriage|health|faith|experience)',
            r'When I (was|felt|experienced|faced|struggled)'
        ]

        for sentence in sentences:
            if len(sentence) < 20:
                continue

            for pattern in first_person_patterns:
                if re.search(pattern, sentence, re.IGNORECASE):
                    sentence_lower = sentence.lower()
                    for indicator in self.struggle_indicators:
                        if indicator in sentence_lower:
                            experience_data['personal_stories'].append({
                                'sentence': sentence,
                                'indicator': indicator
                            })
                            break
                    break

        return experience_data

    def categorize_struggles(self, experience_data):
        """Categorize struggles into predefined categories"""
        categorized = {category: [] for category in self.struggle_categories.keys()}
        categorized['uncategorized'] = []

        all_experiences = (
            experience_data['struggle_sentences'] +
            experience_data['personal_stories'] +
            experience_data['quotes']
        )

        for experience in all_experiences:
            text = experience.get('sentence', experience.get('text', ''))
            text_lower = text.lower()
            categorized_flag = False

            for category, keywords in self.struggle_categories.items():
                for keyword in keywords:
                    if keyword in text_lower:
                        categorized[category].append({
                            'text': text,
                            'matched_keyword': keyword,
                            'original_data': experience
                        })
                        categorized_flag = True
                        break
                if categorized_flag:
                    break

            if not categorized_flag:
                categorized['uncategorized'].append({
                    'text': text,
                    'original_data': experience
                })

        return categorized

    def extract_article_struggles(self, article_url):
        """Extract struggles from a single article"""
        soup = self.get_page_content(article_url)
        if not soup:
            return None

        article_data = {
            'url': article_url,
            'scraped_at': datetime.now().isoformat()
        }

        # Extract title
        title = soup.find('h1')
        if title:
            article_data['title'] = title.get_text(strip=True)
        else:
            article_data['title'] = 'Unknown Title'

        # Extract experiences
        experiences = self.extract_personal_experiences(soup)
        article_data['experiences'] = experiences

        # Categorize struggles
        categorized = self.categorize_struggles(experiences)
        article_data['struggle_categories'] = categorized

        # Count total struggles
        total_struggles = sum(len(struggles) for struggles in categorized.values())
        article_data['total_struggles'] = total_struggles

        return article_data

    def get_article_list(self, url):
        """Get article URLs from main page"""
        soup = self.get_page_content(url)
        if not soup:
            return []

        articles = []

        # Find article links
        all_links = soup.find_all('a', href=True)

        for link in all_links:
            try:
                article_title = link.get_text(strip=True)
                article_url = urljoin(self.base_url, link['href'])

                # Basic filtering
                if (len(article_title) > 10 and
                    "from our readers" not in article_title.lower() and
                    '/wol/d/' in article_url):

                    articles.append({
                        'title': article_title,
                        'url': article_url
                    })
            except:
                continue

        # Remove duplicates
        seen_urls = set()
        unique_articles = []
        for article in articles:
            if article['url'] not in seen_urls:
                seen_urls.add(article['url'])
                unique_articles.append(article)

        return unique_articles[:50]  # Limit to first 50 for safety

    def scrape_struggles(self, url, max_articles=5):
        """Main scraping function"""
        print(f"Starting scrape of {url}")
        print(f"Maximum articles: {max_articles}")
        print("-" * 50)

        # Get article list
        print("Getting article list...")
        articles = self.get_article_list(url)

        if not articles:
            print("No articles found!")
            return []

        print(f"Found {len(articles)} potential articles")

        # Limit articles
        if max_articles:
            articles = articles[:max_articles]

        # Scrape each article
        dataset = []
        for i, article in enumerate(articles, 1):
            print(f"\nArticle {i}/{len(articles)}: {article['title'][:60]}...")

            struggle_data = self.extract_article_struggles(article['url'])

            if struggle_data and struggle_data['total_struggles'] > 0:
                dataset.append(struggle_data)
                print(f"  ✓ Found {struggle_data['total_struggles']} struggles")
            else:
                print(f"  - No struggles found")

            # Delay between requests
            time.sleep(2)

        print(f"\n{'='*50}")
        print(f"SCRAPING COMPLETE")
        print(f"Articles processed: {len(articles)}")
        print(f"Articles with struggles: {len(dataset)}")
        print(f"{'='*50}")

        return dataset


In [3]:
# Configure the scraper
url = "https://wol.jw.org/en/wol/d/r1/lp-e/1200273453"
max_articles = 8  # Start small for testing

# Create scraper and run
scraper = JWStruggleAnalysisScraper()
dataset = scraper.scrape_struggles(url, max_articles=max_articles)

# CELL 4: Process Results
if dataset:
    print("\nProcessing results...")

    # Create analysis dataset
    analysis_data = []
    category_counts = {}

    for article in dataset:
        for category, struggles in article['struggle_categories'].items():
            if category not in category_counts:
                category_counts[category] = 0
            category_counts[category] += len(struggles)

            for struggle in struggles:
                analysis_data.append({
                    'article_title': article['title'],
                    'article_url': article['url'],
                    'struggle_category': category,
                    'struggle_text': struggle['text'],
                    'matched_keyword': struggle.get('matched_keyword', ''),
                    'text_length': len(struggle['text']),
                    'scraped_at': article['scraped_at']
                })

    # Create DataFrame
    df = pd.DataFrame(analysis_data)

    # Display results
    print(f"\nRESULTS SUMMARY:")
    print(f"Total articles with struggles: {len(dataset)}")
    print(f"Total struggle instances: {len(df)}")
    print(f"Average struggles per article: {len(df)/len(dataset):.1f}")

    print(f"\nCATEGORY DISTRIBUTION:")
    for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
        if count > 0:
            percentage = (count / len(df)) * 100
            print(f"  {category}: {count} ({percentage:.1f}%)")

    # Save files
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_filename = f'jw_struggles_{timestamp}.json'
    csv_filename = f'jw_struggles_{timestamp}.csv'

    # Save JSON
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2, ensure_ascii=False)

    # Save CSV
    df.to_csv(csv_filename, index=False, encoding='utf-8')

    print(f"\nFiles saved:")
    print(f"  {json_filename}")
    print(f"  {csv_filename}")

    # Show sample data
    print(f"\nSAMPLE DATA:")
    print(df.head())

    print(f"\nDataset ready for analysis!")

else:
    print("No data collected. Try adjusting the URL or parameters.")


Starting scrape of https://wol.jw.org/en/wol/d/r1/lp-e/1200273453
Maximum articles: 8
--------------------------------------------------
Getting article list...
  Fetching: https://wol.jw.org/en/wol/d/r1/lp-e/1200273453
Found 4 potential articles

Article 1/4: List by Name...
  Fetching: https://wol.jw.org/en/wol/d/r1/lp-e/1200273453#p9
  ✓ Found 29 struggles

Article 2/4: List by Title...
  Fetching: https://wol.jw.org/en/wol/d/r1/lp-e/1200273453#p621
  ✓ Found 29 struggles

Article 3/4: List by Name...
  Fetching: https://wol.jw.org/en/wol/d/r1/lp-e/1200273453#h=9
  ✓ Found 29 struggles

Article 4/4: List by Title...
  Fetching: https://wol.jw.org/en/wol/d/r1/lp-e/1200273453#h=621
  ✓ Found 29 struggles

SCRAPING COMPLETE
Articles processed: 4
Articles with struggles: 4

Processing results...

RESULTS SUMMARY:
Total articles with struggles: 4
Total struggle instances: 116
Average struggles per article: 29.0

CATEGORY DISTRIBUTION:
  relationships: 56 (48.3%)
  spiritual: 28 (24.1%)
 

In [4]:
#Download files to your computer
from google.colab import files
files.download(json_filename)
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
# Example: Load and clean the data
import pandas as pd
df = pd.read_csv('jw_struggles_20250703_175239')

# Remove duplicates and clean text
df = df.drop_duplicates(subset=['struggle_text'])
df['struggle_text'] = df['struggle_text'].str.strip()