## Scraping Wiki articles

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re


In [None]:
def scrape_random_wikipedia_article():
    """Fetch one random English Wikipedia article and extract short sentences.

    The article is requested through ``Special:Random``. Every ``<p>`` element
    longer than 50 characters is split on ``'.'`` and each fragment whose
    stripped length is strictly between 20 and 250 characters is kept.

    Returns:
        tuple: ``(sentences, title_text)`` where ``sentences`` is a list of
        str fragments (without the trailing period) and ``title_text`` is the
        text of the ``h1#firstHeading`` element, or ``"Unknown"`` when that
        element is not present.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            4xx/5xx HTTP status.
    """
    url = "https://en.wikipedia.org/wiki/Special:Random"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on error responses (rate limiting, server errors) so that
    # the text of an error page is never harvested as article sentences.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get the article title for logging
    title = soup.find('h1', {'id': 'firstHeading'})
    title_text = title.get_text() if title else "Unknown"

    sentences = []
    for p in soup.find_all('p'):
        text = p.get_text().strip()
        # Skip very short paragraphs.
        if len(text) <= 50:
            continue
        # Naive sentence split: abbreviations and decimals are split as well.
        for sent in text.split('.'):
            sent = sent.strip()
            if 20 < len(sent) < 250:
                sentences.append(sent)

    return sentences, title_text

print("Scraping random Wikipedia articles...")
print("Target: 25,000 sentences\n")

# Running totals for the collection loop.
all_sentences = []
articles_processed = 0
articles_with_content = 0

# Keep fetching random articles until at least 25,000 sentences are gathered.
# Any failure is reported, followed by a longer pause, and the loop retries.
while len(all_sentences) < 25000:
    try:
        sentences, title = scrape_random_wikipedia_article()
        articles_processed += 1

        if sentences:
            all_sentences += sentences
            articles_with_content += 1

            # Progress is reported only from inside this branch: the current
            # article yielded sentences and the processed count is a
            # multiple of 10.
            if not articles_processed % 10:
                print(
                    f"Articles processed: {articles_processed} | "
                    f"With content: {articles_with_content} | "
                    f"Total sentences: {len(all_sentences)}"
                )

        # Pause between requests to be polite to Wikipedia.
        time.sleep(0.5)

    except Exception as e:
        print(f"Error: {e}")
        time.sleep(1)

# The last article may overshoot the target, so keep only the first 25000.
df = pd.DataFrame(
    {
        'sentence': all_sentences[:25000],
        'tag': 'non_info',
    }
)

print(f"\n✓ Collection complete!")
print(f"✓ Articles processed: {articles_processed}")
print(f"✓ Articles with content: {articles_with_content}")
print(f"✓ Total sentences collected: {len(df)}")

df.to_csv('non_info_data.csv', index=False)
print("✓ Saved to non_info_data.csv")

# Print a few randomly chosen sentences as a quick visual check.
print("\nSample sentences:")
for sent in df['sentence'].sample(n=5):
    print(f"  - {sent}")

In [None]:
def clean_non_info_sentences(df):
    """Clean scraped sentences and report how many rows each rule removed.

    Rules, applied in order to the ``'sentence'`` column:
    1. Drop sentences ending in ":".
    2. Strip invisible unicode characters (BOM, zero-width space, LTR/RTL
       marks) and turn non-breaking spaces into regular spaces; the
       sentences themselves are kept.
    3. Drop sentences containing a coordinate such as "21°23′35″N".
    4. Drop sentences containing 4 or more consecutive characters that are
       neither word characters nor whitespace.
    Finally, leading/trailing whitespace is stripped and whitespace runs are
    collapsed to a single space.

    Args:
        df: DataFrame with a string column named ``'sentence'``. It is not
            modified; filtering works on copies.

    Returns:
        A new DataFrame with the surviving rows (original index labels kept)
        and the cleaned ``'sentence'`` values.
    """
    # Remember the starting size so the final summary can report the real
    # number of removed rows (``df`` is rebound by every filter below).
    initial_size = len(df)
    print(f"Starting size: {initial_size}")

    # 1. Remove sentences ending with ":". ``na=False`` keeps the mask
    # boolean when a sentence is missing, matching the filters below.
    df = df[~df['sentence'].str.endswith(':', na=False)].copy()
    print(f"After removing sentences ending in ':': {len(df)}")

    # 2. Remove unicode characters from sentences (but keep the sentences)
    replacements = {
        '\ufeff': '',   # byte-order mark
        '\xa0': ' ',    # non-breaking space -> regular space
        '\u200b': '',   # zero-width space
        '\u200e': '',   # left-to-right mark
        '\u200f': '',   # right-to-left mark
    }
    for char, substitute in replacements.items():
        df['sentence'] = df['sentence'].str.replace(char, substitute, regex=False)
    print("Unicode characters removed from sentences")

    # 3. Remove sentences that are coordinates (like "21°23′35″N 92°00′02″E")
    coordinate_pattern = r'\d+°\d+′\d+″'
    df = df[~df['sentence'].str.contains(coordinate_pattern, regex=True, na=False)].copy()
    print(f"After removing coordinate sentences: {len(df)}")

    # 4. Remove sentences with 4+ consecutive special characters
    multiple_special_pattern = r'[^\w\s]{4,}'
    df = df[~df['sentence'].str.contains(multiple_special_pattern, regex=True, na=False)].copy()
    print(f"After removing sentences with multiple special characters: {len(df)}")

    # Clean up any extra whitespace created by unicode removal
    df['sentence'] = df['sentence'].str.strip()
    df['sentence'] = df['sentence'].str.replace(r'\s+', ' ', regex=True)

    print(f"\nFinal size: {len(df)}")
    print(f"Total removed: {initial_size - len(df)}")

    return df

# Load the raw scraped sentences and run them through the cleaning rules.
df = pd.read_csv('non_info_data.csv')
df_clean = df.pipe(clean_non_info_sentences)

# Print ten randomly chosen cleaned sentences as a quick visual check.
print("\n=== Sample cleaned sentences ===")
for sent in df_clean['sentence'].sample(n=10):
    print(f"- {sent}")

# Write the cleaned data to its own file, without the index column.
df_clean.to_csv('non_info_data_clean.csv', index=False)
print(f"\n✓ Saved to non_info_data_clean.csv")

In [7]:
# Reload the cleaned sentences from 'non_info_data_clean.csv'.
non_info_clean = pd.read_csv('non_info_data_clean.csv')  # has 'sentence' and 'tag'

In [5]:
# Load the existing dataset that the new rows will be appended to.
# NOTE(review): its column names are not visible here — presumably it has a
# 'sentence' and a 'category' column; verify before concatenating.
data = pd.read_csv('argumentdetection6.csv')

In [8]:
# Rename the label column from 'tag' to 'category'.
# Presumably this matches the label column name used in `data` — confirm.
non_info_clean = non_info_clean.rename(columns={'tag': 'category'})


In [10]:
# Stack the two frames row-wise; ignore_index=True gives the result a fresh
# 0..n-1 index. Columns present in only one frame are filled with NaN.
combined_df = pd.concat([data, non_info_clean], ignore_index=True)

In [12]:
# Show how many rows each category has in the combined frame.
combined_df['category'].value_counts()

evidence    833983
non_info     25221
claim         1662
Name: category, dtype: int64

In [13]:
# Write the combined frame to a new file (without the index column), leaving
# the input file 'argumentdetection6.csv' untouched.
combined_df.to_csv('argumentdetection7.csv', index=False)