In [1]:
from bs4 import BeautifulSoup
import requests

host = "https://refactoring.guru/design-patterns/"
image_host = "https://refactoring.guru"

page = requests.get(host + "catalog/")
print(page.status_code)

200


In [2]:
soup = BeautifulSoup(page.content, 'html.parser')

patterns = soup.select(".pattern-card")
pattern_names = [p.text for pattern in patterns for p in pattern.select(".pattern-name")]
pattern_names = set(pattern_names)
pattern_names

{'Abstract Factory',
 'Adapter',
 'Bridge',
 'Builder',
 'Chain of Responsibility',
 'Command',
 'Composite',
 'Decorator',
 'Facade',
 'Factory Method',
 'Flyweight',
 'Iterator',
 'Mediator',
 'Memento',
 'Observer',
 'Prototype',
 'Proxy',
 'Singleton',
 'State',
 'Strategy',
 'Template Method',
 'Visitor'}

In [3]:
# because every pattern pages link suffixed by modified pattern name with changing whitespace to '-'
# we can generate link to every patterns
pattern_links = [host + p.lower().replace(" ", "-") for p in pattern_names]


# save pattern name and links

pattern_link_rels = []
for n, l in zip(pattern_names, pattern_links):
  pattern_link_rel = {}
  pattern_link_rel["pattern_name"] = n;
  pattern_link_rel["link"] = l;
  pattern_link_rels.append(pattern_link_rel)

print(pattern_link_rels)


[{'pattern_name': 'Bridge', 'link': 'https://refactoring.guru/design-patterns/bridge'}, {'pattern_name': 'Abstract Factory', 'link': 'https://refactoring.guru/design-patterns/abstract-factory'}, {'pattern_name': 'Builder', 'link': 'https://refactoring.guru/design-patterns/builder'}, {'pattern_name': 'Iterator', 'link': 'https://refactoring.guru/design-patterns/iterator'}, {'pattern_name': 'Prototype', 'link': 'https://refactoring.guru/design-patterns/prototype'}, {'pattern_name': 'Adapter', 'link': 'https://refactoring.guru/design-patterns/adapter'}, {'pattern_name': 'Memento', 'link': 'https://refactoring.guru/design-patterns/memento'}, {'pattern_name': 'Decorator', 'link': 'https://refactoring.guru/design-patterns/decorator'}, {'pattern_name': 'Factory Method', 'link': 'https://refactoring.guru/design-patterns/factory-method'}, {'pattern_name': 'Composite', 'link': 'https://refactoring.guru/design-patterns/composite'}, {'pattern_name': 'Visitor', 'link': 'https://refactoring.guru/des

In [4]:

# access all the data
patterns_pages = [requests.get(link) for link in pattern_links]

In [5]:
pattern_soups = [BeautifulSoup(p.content, 'html.parser') for p in patterns_pages]

# data we want to retrieve
# - every part of the page design pattern structure
# - for part that has specific steps or list, parse it again
# - for part that emphasize specific keyword, safe it
# - for relations with other design pattern we can create the connection and the text
# - for any part, save the text with BIGTEXT
# - for any part, save the pictures
# - preprocess each paragraph with https://rapidapi.com/MeaningCloud/api/summarization

In [6]:
# see if each segments are the same
names_pages = zip(pattern_names, pattern_soups)

segment = set()
for name, page in names_pages:
  segments = [s.text for s in page.select("article > .section > h2")]
  print(name, ": ", ", ".join(segments), "\n\n")
  if segment == set():
    segment = set(segments)
  elif segment != set(segments):
    print("segment differ ", segment.symmetric_difference(set(segments)),  "\n")
print(list(names_pages))

Bridge :   Intent,  Problem,  Solution,  Structure,  Pseudocode,  Applicability,  How to Implement,  Pros and Cons,  Relations with Other Patterns,  Code Examples 


Abstract Factory :   Intent,  Problem,  Solution,  Structure,  Pseudocode,  Applicability,  How to Implement,  Pros and Cons,  Relations with Other Patterns,  Code Examples,  Extra Content 


segment differ  {' Extra Content'} 

Builder :   Intent,  Problem,  Solution,  Structure,  Pseudocode,  Applicability,  How to Implement,  Pros and Cons,  Relations with Other Patterns,  Code Examples 


Iterator :   Intent,  Problem,  Solution,  Real-World Analogy,  Structure,  Pseudocode,  Applicability,  How to Implement,  Pros and Cons,  Relations with Other Patterns,  Code Examples 


segment differ  {' Real-World Analogy'} 

Prototype :   Intent,  Problem,  Solution,  Real-World Analogy,  Structure,  Pseudocode,  Applicability,  How to Implement,  Pros and Cons,  Relations with Other Patterns,  Code Examples 


segment differ  {

because there is different segment in each page, we will make segment as its own table


In [7]:
segment_entities = set()
segment_pattern_relation = []

names_pages = zip(pattern_names, pattern_soups)

for name,page in names_pages:
  segments = [s.text.strip() for s in page.select("article > .section > h2")]
  segment_pattern_relation.append([name, segments])
  for s in segments:
    segment_entities.add(s.strip())

segment_entities_SQL = list(segment_entities)
segment_entities_SQL = [{"segment_name": s} for s in segment_entities]

segment_pattern_relation_SQL = [{"pattern_name": n, "segment_name": s} for n,ss in segment_pattern_relation for s in ss]


print(segment_pattern_relation_SQL)

[{'pattern_name': 'Bridge', 'segment_name': 'Intent'}, {'pattern_name': 'Bridge', 'segment_name': 'Problem'}, {'pattern_name': 'Bridge', 'segment_name': 'Solution'}, {'pattern_name': 'Bridge', 'segment_name': 'Structure'}, {'pattern_name': 'Bridge', 'segment_name': 'Pseudocode'}, {'pattern_name': 'Bridge', 'segment_name': 'Applicability'}, {'pattern_name': 'Bridge', 'segment_name': 'How to Implement'}, {'pattern_name': 'Bridge', 'segment_name': 'Pros and Cons'}, {'pattern_name': 'Bridge', 'segment_name': 'Relations with Other Patterns'}, {'pattern_name': 'Bridge', 'segment_name': 'Code Examples'}, {'pattern_name': 'Abstract Factory', 'segment_name': 'Intent'}, {'pattern_name': 'Abstract Factory', 'segment_name': 'Problem'}, {'pattern_name': 'Abstract Factory', 'segment_name': 'Solution'}, {'pattern_name': 'Abstract Factory', 'segment_name': 'Structure'}, {'pattern_name': 'Abstract Factory', 'segment_name': 'Pseudocode'}, {'pattern_name': 'Abstract Factory', 'segment_name': 'Applicabili

In each page segment of every pattern usually there is a image, this image can be valuable abstraction of information from the website. We will make the image become a weak entity with the pattern name

In [8]:
pattern_segment_image_relation_SQL = {}
p_s_w = {}
word_sql = {}
import re
names_pages = zip(pattern_names, pattern_soups)

for name, page in names_pages:
  pattern_segment_image_relation_SQL[name] = dict();
  p_s_w[name] = dict();

  for s in page.select("article > .section"):
    segment = s.select("h2")[0].text.strip()
    images = s.select("h2 ~ figure > img")
    paragraphs = s.select("h2 ~ p")
    if len(images) > 0:
      link = image_host + images[0].attrs["src"]
      alt = images[0].attrs["alt"]
      pattern_segment_image_relation_SQL[name][segment] = [link, alt]

    if len(paragraphs) > 0:
      wordCount = dict()
      for p in paragraphs:
        word = map(lambda x: re.sub('[^A-Za-z0-9]+', '', x).lower(),p.text.split())
        for w in word:
          if w in wordCount:
            wordCount[w] += 1
          else:
            wordCount[w] = 1

          if w in word_sql:
            word_sql[w] += 1
          else:
            word_sql[w] = 1

      p_s_w[name][segment] = wordCount


p_s_w_sql = [{
  "pattern_name": p,
  "segment_name": s,
  "word": w,
  "count": p_s_w[p][s][w]
} for p in p_s_w for s in p_s_w[p] for w in p_s_w[p][s]]


word_sql = [{
  "word": w,
  "total_count": word_sql[w]}
 for w in word_sql]

pattern_segment_image_relation_SQL = [
  {
    "pattern_name": p,
    "segment_name": s,
    "link": pattern_segment_image_relation_SQL[p][s][0],
    "alt": pattern_segment_image_relation_SQL[p][s][1],
  }
  for p in pattern_segment_image_relation_SQL for s in pattern_segment_image_relation_SQL[p]
]

print(pattern_segment_image_relation_SQL)




[{'pattern_name': 'Bridge', 'segment_name': 'Intent', 'link': 'https://refactoring.guru/images/patterns/content/bridge/bridge.png?id=bd543d4fb32e11647767301581a5ad54', 'alt': 'Bridge design&nbsp;pattern'}, {'pattern_name': 'Bridge', 'segment_name': 'Problem', 'link': 'https://refactoring.guru/images/patterns/diagrams/bridge/problem-en.png?id=81f8ed6e6f5d673e15203b22a7a3c502', 'alt': 'Bridge pattern problem'}, {'pattern_name': 'Bridge', 'segment_name': 'Solution', 'link': 'https://refactoring.guru/images/patterns/diagrams/bridge/solution-en.png?id=b72caae18c400d6088072f2f3adda7cd', 'alt': 'Solution suggested by the Bridge pattern'}, {'pattern_name': 'Bridge', 'segment_name': 'Pseudocode', 'link': 'https://refactoring.guru/images/patterns/diagrams/bridge/example-en.png?id=e6902e5ed2791ba4adf4ce20f677fcb1', 'alt': 'Structure of the Bridge pattern example'}, {'pattern_name': 'Abstract Factory', 'segment_name': 'Intent', 'link': 'https://refactoring.guru/images/patterns/content/abstract-fac

We can retrieve every word in the data so that we can use it in the future

In [9]:
json_data = {}

json_data["patterns"] = pattern_link_rels
json_data["segments"] = segment_entities_SQL
json_data["pattern_segment"] = segment_pattern_relation_SQL
json_data["words"] = word_sql
json_data["word_locations"] = p_s_w_sql
json_data["images"] = pattern_segment_image_relation_SQL

import json
with open('../data/data_based_on_ER_model.json', 'w', encoding='utf-8') as f:
  json.dump(json_data, f, ensure_ascii=False, indent=4)