# Setup & Environment Check Notebook

This notebook verifies that all packages in the course environment are installed
and working correctly — **without errors or warnings**.

It also demonstrates how to use the project folder structure:

- **data/raw/** – Original data
- **data/processed/** – Cleaned data for analysis
- **tables/** – Final visualizations and outputs

After running this notebook, check these folders to see example outputs.


In [None]:
# 1) Environment setup and package version check

import sys, platform, warnings
from importlib.metadata import version as dist_version, PackageNotFoundError
from pathlib import Path

# Suppress every warning from here on so the checks below print clean output.
warnings.filterwarnings("ignore")

# Project folder layout, relative to the current working directory.
RAW_DATA, PROCESSED_DATA, TABLES = (
    Path("data/raw"),
    Path("data/processed"),
    Path("tables"),
)

# Create each folder (including missing parents); existing folders are accepted.
for folder in (RAW_DATA, PROCESSED_DATA, TABLES):
    folder.mkdir(parents=True, exist_ok=True)

# Show the absolute location of every project folder.
print("Project structure:")
for label, folder in (
    ("Raw data:", RAW_DATA),
    ("Processed data:", PROCESSED_DATA),
    ("Tables/plots:", TABLES),
):
    print(f"  {label:<15} {folder.resolve()}")

# Show which interpreter is executing the notebook.
print(f"\nPython: {platform.python_version()}")
print(f"Executable: {sys.executable}")

# Map module import names -> PyPI distribution names (only where the two differ)
DIST_FOR_MODULE = {
    "bs4": "beautifulsoup4",
    "docx": "python-docx",
    "PIL": "pillow",
    "sklearn": "scikit-learn",
}

def version_of(mod_name: str) -> str:
    """Return the installed version of ``mod_name`` or a friendly status message.

    Args:
        mod_name: Import name of the module (e.g. ``"sklearn"``). Dotted names
            such as ``"pkg.sub"`` resolve to the submodule itself.

    Returns:
        The module's ``__version__`` (converted to ``str``) when it has one;
        otherwise the version recorded in the distribution metadata. Falls back
        to ``"not importable"``, ``"installed (no version metadata)"`` or
        ``"unknown"`` when no version can be determined.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from importlib import import_module

    try:
        # import_module returns the named module itself; __import__("a.b")
        # would hand back the top-level package "a" instead.
        module = import_module(mod_name)
    except Exception:
        # Deliberately broad: a broken install can fail with more than ImportError.
        return "not importable"

    declared = getattr(module, "__version__", None)
    if declared:
        # Some packages expose a tuple or other object; honour the str contract.
        return str(declared)

    # Distribution metadata is keyed by the top-level package, not a submodule.
    top_level = mod_name.partition(".")[0]
    dist_name = DIST_FOR_MODULE.get(top_level, top_level)
    try:
        return dist_version(dist_name)
    except PackageNotFoundError:
        return "installed (no version metadata)"
    except Exception:
        return "unknown"

# Core packages from pyproject.toml, listed by import name (one name per line
# so additions and removals show up as single-line diffs).
modules = [
    "pandas",
    "numpy",
    "scipy",
    "openpyxl",
    "pyarrow",
    "matplotlib",
    "seaborn",
    "wordcloud",
    "wikipediaapi",
    "mwparserfromhell",
    "wikitextparser",
    "mwclient",
    "bs4",
    "lxml",
    "regex",
    "unidecode",
    "nltk",
    "textblob",
    "textstat",
    "dateparser",
    "statsmodels",
    "sklearn",
    "transformers",
    "torch",
    "sentence_transformers",
    "gensim",
    "networkx",
    "PIL",
    "requests",
    "httpx",
    "docx",
    "pdfplumber",
    "tqdm",
    "nbformat",
]

# Print one aligned "name  version" row per package, in list order.
print("\nPackage versions:")
for module_name in modules:
    installed = version_of(module_name)
    print(f"  {module_name:<20} {installed}")

In [None]:
# 2) Demo dataset (simulating Wikipedia article analysis)

import pandas as pd
from io import StringIO

# Small inline CSV that stands in for collected Wikipedia article statistics.
csv_text = """title,views,length,links,categories,date_edited
Python (programming language),1250400,82000,450,12,2025-01-15
Machine learning,890200,65000,380,15,2025-01-20
Natural language processing,420300,48000,290,10,2025-02-01
Data science,670500,55000,320,11,2025-02-10
Artificial intelligence,1580600,95000,520,18,2025-03-05
"""

# Save the untouched input. The encoding is explicit because the default text
# encoding is platform-dependent and pandas reads CSV files as UTF-8.
raw_path = RAW_DATA / "wikipedia_articles_raw.csv"
raw_path.write_text(csv_text, encoding="utf-8")
print(f"‚úì Saved raw data -> {raw_path}")

# Load from the raw file that was just written (not the in-memory string) so
# the raw -> processed workflow is actually exercised, then derive the
# per-1000-character rates.
df = pd.read_csv(raw_path, encoding="utf-8").assign(
    date_edited=lambda d: pd.to_datetime(d["date_edited"]),
    links_per_1000_chars=lambda d: (d["links"] / d["length"]) * 1000,
    categories_per_1000_chars=lambda d: (d["categories"] / d["length"]) * 1000,
)

processed_path = PROCESSED_DATA / "wikipedia_articles_cleaned.csv"
df.to_csv(processed_path, index=False, encoding="utf-8")
print(f"‚úì Saved processed data -> {processed_path}")

df.head()

In [None]:
# 3) Text processing demo (BeautifulSoup, regex, unidecode)

from bs4 import BeautifulSoup
import regex as re
from unidecode import unidecode

# Minimal HTML fragment containing non-ASCII characters and a number.
html = """
<article>
  <h1>Caf√© in Z√ºrich</h1>
  <p>A story about <em>data journalism</em> in 2025.</p>
</article>
"""

# Parse the fragment and pull out the heading and paragraph text.
soup = BeautifulSoup(html, "html.parser")
title = soup.find("h1").get_text()
text = soup.find("p").get_text()

# Transliterate the title to ASCII and mask every run of digits in the text.
title_ascii = unidecode(title)
text_clean = re.compile(r'\d+').sub('[NUM]', text)

print("‚úì Text processing working")
for label, value in (
    ("Original title:", title),
    ("ASCII title:", title_ascii),
    ("Cleaned text:", text_clean),
):
    print(f"  {label:<15} {value}")

In [None]:
# 4) NLP basics (NLTK, TextBlob, TextStat)

import nltk
from textblob import TextBlob
import textstat

# NLTK resources fetched by this check.
NLTK_RESOURCES = [
    "punkt",
    "punkt_tab",
    "brown",
    "averaged_perceptron_tagger",
    "universal_tagset",
]

# nltk.download() returns False when a resource could not be fetched; collect
# those instead of ignoring the result so a failed download is visible here
# rather than surfacing later as a confusing LookupError.
print("Downloading NLTK data (this may take a moment)...")
failed_downloads = [
    resource for resource in NLTK_RESOURCES
    if not nltk.download(resource, quiet=True)
]
if failed_downloads:
    print(f"  WARNING: could not download NLTK data: {', '.join(failed_downloads)}")

sample_text = "Natural language processing is a fascinating field. It combines linguistics and computer science."

# Just use simple word operations, avoid sentiment which needs corpora
blob = TextBlob(sample_text)
word_count = len(blob.words)

flesch_score = textstat.flesch_reading_ease(sample_text)
syllable_count = textstat.syllable_count(sample_text)

print("‚úì NLP packages working")
print(f"  Words: {word_count}")
print(f"  Syllables: {syllable_count}")
print(f"  Flesch reading ease: {round(flesch_score, 1)}")

In [None]:
# 5) Basic visualization (Matplotlib & Seaborn)

import matplotlib.pyplot as plt
import seaborn as sns

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x="length",
    y="views",
    size="links",
    hue="categories",
    sizes=(50, 300),
    ax=ax,
)
ax.set_title("Wikipedia Article Performance")
ax.set_xlabel("Article Length (characters)")
ax.set_ylabel("Views")
fig.tight_layout()

# Write the PNG first, then render the figure inline.
plot_path = TABLES / "article_performance.png"
fig.savefig(plot_path, dpi=150)
print(f"‚úì Saved plot -> {plot_path}")
plt.show()

In [None]:
# 6) Word cloud visualization

from wordcloud import WordCloud

# Build the cloud from all article titles joined into one space-separated string.
text = " ".join(df["title"].tolist())
wc = WordCloud(
    width=800,
    height=400,
    background_color="white",
).generate(text)

# Render the cloud on its own axes with the axis decorations hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
ax.set_title("Wikipedia Article Topics")
fig.tight_layout()

# Save to disk and close the figure, so it is only written, not displayed.
wc_path = TABLES / "wordcloud.png"
fig.savefig(wc_path, dpi=150)
plt.close(fig)
print(f"‚úì Saved wordcloud -> {wc_path}")

In [None]:
# 7) Statistical analysis (Statsmodels)

import statsmodels.api as sm

# Regress views on article length; add_constant supplies the intercept column.
X = sm.add_constant(df["length"])
model = sm.OLS(df["views"], X).fit()

print("‚úì Statistical packages working")
print(f"  OLS R¬≤: {round(model.rsquared, 3)}")
# Look the p-value up by label: pvalues is indexed by regressor name, and
# integer keys on a label-indexed Series are deprecated as positional lookups.
print(f"  p-value for length: {round(model.pvalues['length'], 4)}")

In [None]:
# 8) Machine Learning basics (scikit-learn)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardise the three numeric columns before clustering.
feature_columns = ["length", "links", "categories"]
features = df[feature_columns].to_numpy()
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Two clusters with a fixed seed; the label of each row is stored on the frame.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(features_scaled)
df["cluster"] = cluster_labels

cluster_counts = df["cluster"].value_counts()
print("‚úì Machine learning packages working")
print(f"  Cluster distribution:\n{cluster_counts}")

# Persist the frame including the new cluster column.
clustered_path = PROCESSED_DATA / "articles_with_clusters.csv"
df.to_csv(clustered_path, index=False)
print(f"‚úì Saved clustered data -> {clustered_path}")

In [None]:
# 9) Network analysis demo (NetworkX)

import networkx as nx

# Undirected toy graph built directly from its edge list.
topic_edges = [
    ("Python", "Data Science"),
    ("Python", "Machine Learning"),
    ("Data Science", "Machine Learning"),
    ("Machine Learning", "AI"),
    ("NLP", "AI"),
]
G = nx.Graph(topic_edges)

# Basic size and density figures for the graph.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
density = round(nx.density(G), 2)

print("‚úì Network analysis packages working")
print(f"  Nodes: {node_count}")
print(f"  Edges: {edge_count}")
print(f"  Density: {density}")

In [None]:
# 10) Web requests demo (requests, httpx)

import requests

# A timeout keeps the cell from hanging forever on a stalled connection, and
# catching RequestException routes offline/DNS/timeout failures to the same
# "couldn't reach API" message as a non-200 reply instead of a traceback.
try:
    response = requests.get("https://api.github.com/zen", timeout=10)
except requests.RequestException:
    response = None

if response is not None and response.status_code == 200:
    print("‚úì Web request packages working")
    print(f"  GitHub Zen: {response.text[:50]}...")
else:
    print("‚úì Requests installed (couldn't reach API)")

In [None]:
# 11) Document export (python-docx)

from docx import Document

# Assemble a short Word report: a title, a status line, then summary figures.
doc = Document()
doc.add_heading("Setup Check Report", level=1)
doc.add_paragraph(f"All {len(modules)} required packages are installed and working correctly.")

doc.add_heading("Analysis Summary", level=2)
summary_lines = (
    f"Analyzed {len(df)} Wikipedia articles",
    f"Average views: {df['views'].mean():,.0f}",
    f"Average length: {df['length'].mean():,.0f} characters",
)
for summary_line in summary_lines:
    doc.add_paragraph(summary_line)

# Write the document into the tables folder.
doc_path = TABLES / "setup_report.docx"
doc.save(doc_path)
print(f"‚úì Saved Word document -> {doc_path}")

In [None]:
# 13) Word embeddings and transformers demo

from sentence_transformers import SentenceTransformer
import numpy as np

print("Loading small sentence embedding model (this may take 30 seconds)...")
model = SentenceTransformer('all-MiniLM-L6-v2')  # Tiny, fast model

# The first sentence is the reference; the others are compared against it.
sentences = [
    "Natural language processing is fascinating",
    "Machine learning is powerful",
    "Data science is useful",
    "The weather is nice today"
]

embeddings = model.encode(sentences)

# Cosine similarity between the reference embedding and each remaining one.
reference = embeddings[0]
similarities = [
    np.dot(reference, other) / (np.linalg.norm(reference) * np.linalg.norm(other))
    for other in embeddings[1:]
]

print("‚úì Sentence transformers working")
print("  Model: all-MiniLM-L6-v2 (384-dim embeddings)")
print(f"  Embedding shape: {embeddings[0].shape}")
print(f"\n  Similarity to '{sentences[0]}':")
for sent, sim in zip(sentences[1:], similarities):
    print(f"    - '{sent}': {sim:.3f}")

In [None]:
# 14) Summary

# Files the earlier cells write, grouped by destination folder. The per-folder
# counts printed below are derived from this mapping so they cannot drift
# (the tables folder receives three files, not four).
expected_outputs = {
    RAW_DATA: ["wikipedia_articles_raw.csv"],
    PROCESSED_DATA: ["wikipedia_articles_cleaned.csv", "articles_with_clusters.csv"],
    TABLES: ["article_performance.png", "wordcloud.png", "setup_report.docx"],
}

# Only claim success when every expected file is actually on disk.
missing_outputs = [
    out_dir / file_name
    for out_dir, file_names in expected_outputs.items()
    for file_name in file_names
    if not (out_dir / file_name).is_file()
]

print("\n" + "="*60)
if missing_outputs:
    print("WARNING: some expected output files are missing:")
    for missing_path in missing_outputs:
        print(f"  - {missing_path}")
else:
    print("‚úÖ All checks completed successfully!")
print("="*60)
print("\nGenerated files:")
print(f"  üìÅ {RAW_DATA}/       - {len(expected_outputs[RAW_DATA])} raw data file")
print(f"  üìÅ {PROCESSED_DATA}/ - {len(expected_outputs[PROCESSED_DATA])} processed data files")
print(f"  üìÅ {TABLES}/         - {len(expected_outputs[TABLES])} output files")
print("\nCore capabilities verified:")
print("  ‚úì Data manipulation (pandas, numpy)")
print("  ‚úì Text processing (BeautifulSoup, regex, NLTK)")
print("  ‚úì NLP analysis (TextBlob, TextStat)")
print("  ‚úì Machine learning (scikit-learn)")
print("  ‚úì Word embeddings (sentence-transformers)")
print("  ‚úì Network analysis (NetworkX)")
print("  ‚úì Web APIs (requests)")
print("  ‚úì Visualization (matplotlib, seaborn, wordcloud)")
print("\nYou're ready to start analyzing Wikipedia data!")

In [None]:
# 15) OPTIONAL: Clean up test data

# To use this cleanup code:
# 1. Select all the lines below (starting from "import shutil" to the last print statement)
# 2. Press Ctrl+/ (Windows/Linux) or Cmd+/ (Mac) to uncomment all selected lines
# 3. Run the cell
#
# WARNING: this deletes EVERY file directly inside data/raw, data/processed, and tables,
# not only the files generated by this notebook. Move anything you want to keep first.

# import shutil

# print("Cleaning up test data...")

# # Remove all files from data/raw, data/processed, and tables
# for folder in [RAW_DATA, PROCESSED_DATA, TABLES]:
#     if folder.exists():
#         for file in folder.glob("*"):
#             if file.is_file():
#                 file.unlink()
#                 print(f"  Deleted {file.name}")

# print("\n‚úì Repository cleaned! Folders are now empty.")