In [82]:
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup
import re

import warnings
warnings.filterwarnings("ignore")

In [83]:
# Scrape the vintage-shopping guide: for every <section class="block-wysiwyg">
# that has a <p class="h3"> heading, record the section text and one entry per
# link (store name, link target, and the parenthesised address that follows the
# store name in the section text).
url = "https://www.mtl.org/en/experience/ultimate-guide-vintage-shopping-montreal"

# A timeout keeps the cell from blocking forever if the server never answers.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

soup = BeautifulSoup(response.content, "html.parser")

# sections: heading title -> {"content": full section text, "stores": [store records]}
# stores:   flat list of store records, each tagged with its section title
sections = {}
stores = []

for section in soup.find_all("section", class_="block-wysiwyg"):
    heading = section.find("p", class_="h3")
    if heading is None:
        continue  # Sections without a <p class="h3"> heading are not recorded

    title = heading.get_text(strip=True)
    text = section.get_text(" ", strip=True)  # Extract section text as a whole
    stores_info = []

    for link in section.find_all("a", href=True):
        name = link.get_text(strip=True)
        if not name:
            # With an empty name the pattern below degenerates to "any
            # parenthesised text" and would attach an unrelated address to a
            # nameless link, so such links are skipped.
            continue

        # Use a dedicated name so the page `url` defined above is not overwritten.
        store_url = link["href"]

        # The address is the parenthesised text right after the store name.
        match = re.search(rf"{re.escape(name)}\s*\(([^)]+)\)", text)
        address = match.group(1) if match else np.nan

        record = {"name": name, "url": store_url, "address": address}
        stores_info.append(record)
        stores.append({"section": title, **record})

    sections[title] = {"content": text, "stores": stores_info}

# Create a pandas DataFrame.
# A link without a matched address is treated as "not a store" and dropped.
# Fixing the column list keeps the frame well-formed even if nothing was scraped.
df = (
    pd.DataFrame(stores, columns=["section", "name", "url", "address"])
    .dropna(subset=["address"])
    .reset_index(drop=True)
)
df

Unnamed: 0,section,name,url,address
0,Hand-Picked Cool,Annex x LOCAL,https://www.annexvintage.com/us/,5364 Saint-Laurent Boulevard
1,Hand-Picked Cool,Ex-Voto,https://exvoto.ca/en,6534 Saint-Laurent Boulevard
2,Hand-Picked Cool,LNF,https://www.lnfshop.com/,5319 Park Avenue
3,Hand-Picked Cool,Lazy Vintage,https://www.lazymtl.com/,"1682 Mont-Royal Avenue East, 3730 Ontario Stre..."
4,Hand-Picked Cool,Le Ninety,https://www.instagram.com/le.ninety/?hl=en,4361 Saint-Denis Street
5,Hand-Picked Cool,Bohème Vintage,https://bohemevintage.com/,206 Saint-Viateur West
6,Hand-Picked Cool,Shwap Club,https://www.shwapclub.com/,4710 Saint-Ambroise Street #265
7,Hand-Picked Cool,Common Sort,https://commonsort.com/,3667 Saint-Laurent Boulevard
8,Hand-Picked Cool,Empire Exchange,https://empiremtl.com/,5225 & 6796 Saint-Laurent Boulevard
9,Restored & Upcycled,Citizen Vintage,https://www.citizenvintage.com/,5330 Saint-Laurent Boulevard


In [101]:
# Write the store table to CSV, leaving out the DataFrame's row index.
df.to_csv(path_or_buf='thrift_stores.csv', index=False)

#### Extract Features from 'Content' section

In [87]:
# Method 1: Extracting Keywords
from sklearn.feature_extraction.text import TfidfVectorizer

# Take titles and texts from the same `sections` dict so they always line up.
# Re-querying every "block-wysiwyg" section from `soup` would also return
# sections that have no <p class="h3"> heading (and therefore no entry in
# `sections`), which breaks the title/content pairing.
content_list = [info["content"] for info in sections.values()]

# Convert to DataFrame: one row per guide section
df_content = pd.DataFrame({"section": list(sections), "content": content_list})

# Apply TF-IDF, ignoring English stop words and capping the vocabulary at 20 terms
vectorizer = TfidfVectorizer(stop_words="english", max_features=20)
tfidf_matrix = vectorizer.fit_transform(df_content["content"])
feature_names = vectorizer.get_feature_names_out()

# Create DataFrame with extracted features (one column per vocabulary term)
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
df_features = pd.concat([df_content, df_tfidf], axis=1)

df_features.head()


Unnamed: 0,section,content,avenue,boulevard,denis,east,garments,goods,hand,laurent,little,modern,mont,montréal,pieces,royal,saint,secondhand,selection,store,street,vintage
0,Hand-Picked Cool,Hand-Picked Cool It is generally understood th...,0.156113,0.338155,0.090387,0.210957,0.0,0.0,0.374804,0.338155,0.078056,0.210957,0.090387,0.090387,0.078056,0.090387,0.541048,0.0,0.067631,0.078056,0.234169,0.338155
1,Restored & Upcycled,Restored & Upcycled Another way to approach su...,0.0,0.338451,0.0,0.0,0.316712,0.105571,0.250088,0.338451,0.078125,0.105571,0.0,0.090466,0.15625,0.0,0.338451,0.211142,0.06769,0.078125,0.078125,0.609212
2,Hunter's Paradise,Hunter's Paradise Looking to get lost? There's...,0.258914,0.336499,0.299814,0.174936,0.174936,0.0,0.0,0.336499,0.129457,0.0,0.149907,0.299814,0.0,0.149907,0.560831,0.174936,0.112166,0.129457,0.129457,0.112166
3,Bring it to the Runway,Bring it to the Runway If you're looking to in...,0.143287,0.372449,0.165922,0.193626,0.193626,0.0,0.0,0.372449,0.143287,0.0,0.165922,0.0,0.143287,0.165922,0.496598,0.0,0.12415,0.143287,0.286575,0.372449
4,Unleash the Niche,Unleash the Niche As thrifting continues to ri...,0.350686,0.227886,0.101521,0.0,0.0,0.236943,0.0,0.227886,0.175343,0.0,0.304563,0.0,0.087671,0.304563,0.303847,0.0,0.075962,0.087671,0.087671,0.607695


In [93]:
# Method 2: Sentiment Analysis
from textblob import TextBlob


def _polarity(text):
    """Return the TextBlob sentiment polarity of `text`."""
    return TextBlob(text).sentiment.polarity


# Score every section's text and store the result next to it.
sentiment_scores = df_content["content"].apply(_polarity)
df_content["sentiment"] = sentiment_scores

print(df_content[["section", "sentiment"]])


                      section  sentiment
0            Hand-Picked Cool   0.183482
1         Restored & Upcycled   0.121042
2           Hunter's Paradise   0.127465
3      Bring it to the Runway   0.129924
4           Unleash the Niche   0.116488
5  Home Sweet Secondhand Home   0.411538


In [96]:
# Method 3: Named Entity Recognition (NER)
import spacy

nlp = spacy.load("en_core_web_sm")


def _entity_pairs(text):
    """Run the spaCy pipeline on `text` and return its entities as (text, label) tuples."""
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]


# One list of (entity text, entity label) tuples per section.
df_content["entities"] = df_content["content"].apply(_entity_pairs)

df_content[["section", "entities"]]

Unnamed: 0,section,entities
0,Hand-Picked Cool,"[(seasonal, DATE), (Montréal, ORG), (Mile End'..."
1,Restored & Upcycled,"[(Restored & Upcycled, ORG), (5330, DATE), (Sa..."
2,Hunter's Paradise,"[(Hunter, PERSON), (afternoon, TIME), (Main, E..."
3,Bring it to the Runway,"[(Runway, ORG), (the 30s through the 90s, DATE..."
4,Unleash the Niche,"[(Niche, ORG), (Palmo Goods, FAC), (263, CARDI..."
5,Home Sweet Secondhand Home,"[(Montréal, PERSON), (Style Labo, ORG), (5595,..."
