In [1]:
from urllib.request import urlopen
from typing import List

from bs4 import BeautifulSoup


def get_n_grams(content: str, n: int) -> List[List[str]]:
    words = content.split(" ")
    gram_start_indices = range(len(words) - n + 1)

    n_grams = [words[i : i + n] for i in gram_start_indices]

    return n_grams

In [3]:
html_url = "http://en.wikipedia.org/wiki/Python_(programming_language)"

with urlopen(html_url) as html:
    bs = BeautifulSoup(html.read(), "html.parser")
content = bs.find("div", {"id": "mw-content-text"}).get_text()  # type: ignore

n_grams = get_n_grams(content, 2)

print(n_grams)
print(f"2-grams count is: {len(n_grams)}")

2-grams count is: 12670


In [18]:
import re


def split_into_words(content: str):
    # 改行文字/引用インデックス(ex. [12])/utf-8文字をフィルタ
    content = re.sub(r"\n|[\[\d+\]]", " ", content)
    content = bytes(content, "utf-8").decode("ascii", "ignore")

    words = content.split(" ")
    return words


def get_n_grams(words: List[str], n: int):
    gram_start_indices = range(len(words) - n + 1)

    n_grams = [words[i : i + n] for i in gram_start_indices]
    return n_grams

In [19]:
html_url = "http://en.wikipedia.org/wiki/Python_(programming_language)"

with urlopen(html_url) as html:
    bs = BeautifulSoup(html.read(), "html.parser")
content = bs.find("div", {"id": "mw-content-text"}).get_text()  # type: ignore

words = split_into_words(content)
n_grams = get_n_grams(words, 2)

print(n_grams)
print(f"2-grams count is: {len(n_grams)}")

2-grams count is: 19848


In [20]:
from string import punctuation, whitespace


def split_into_clean_words(sentence: str):
    def is_valid_word(word: str):
        # 2文字以上の文字列とa(冠詞), I(主格名詞)を許容する
        return len(word) > 1 or word.lower() in ["a", "i"]

    def strip_word(word: str):
        return word.strip(punctuation + whitespace)

    words = sentence.split(" ")

    words = [strip_word(word) for word in words]
    words = [word for word in words if is_valid_word(word)]

    return words


def split_into_clean_sentences(content: str):
    # 改行文字/引用インデックス(ex. [12])/utf-8文字をフィルタ
    content = re.sub(r"\n|[\[\d+\]]", " ", content)
    content = bytes(content, "utf-8").decode("ascii", "ignore")

    sentences = content.split(". ")
    return sentences


def get_n_grams_from_words(words: List[str], n: int):
    gram_start_indices = range(len(words) - n + 1)

    n_grams = [words[i : i + n] for i in gram_start_indices]
    return n_grams


def get_n_grams(content: str, n: int):
    sentences = split_into_clean_sentences(content)
    n_grams: List[List[str]] = []

    for sentence in sentences:
        words = split_into_clean_words(sentence)
        n_grams_for_sentence = get_n_grams_from_words(words, n)
        n_grams += n_grams_for_sentence

    return n_grams

In [8]:
html_url = "http://en.wikipedia.org/wiki/Python_(programming_language)"

with urlopen(html_url) as html:
    bs = BeautifulSoup(html.read(), "html.parser")
content = bs.find("div", {"id": "mw-content-text"}).get_text()  # type: ignore

n_grams = get_n_grams(content, 2)
print(len(n_grams))

10278


In [16]:
from collections import Counter


def split_into_clean_upper_words(sentence: str):
    def is_valid_word(word: str):
        # 2文字以上の文字列とa(冠詞), I(主格名詞)を許容する
        return len(word) > 1 or word.lower() in ["a", "i"]

    def strip_word(word: str):
        return word.strip(punctuation + whitespace)

    # 単語の前処理
    words = sentence.split(" ")
    words = [strip_word(word) for word in words]
    words = [word for word in words if is_valid_word(word)]
    # 単語の連続性を見れればいいので、大文字小文字を識別しない
    words = [word.upper() for word in words]

    return words


def get_flatten_n_grams_from_words(words: List[str], n: int):
    gram_start_indices = range(len(words) - n + 1)

    n_grams = [words[i : i + n] for i in gram_start_indices]
    flatten_n_grams = [" ".join(n_gram) for n_gram in n_grams]

    return flatten_n_grams


def get_n_grams_counter(content: str, n: int):
    n_grams_counter: Counter[str] = Counter()

    sentences = split_into_clean_sentences(content)
    for sentence in sentences:
        words = split_into_clean_upper_words(sentence)
        n_grams_for_sentence = get_flatten_n_grams_from_words(words, n)
        n_grams_counter.update(n_grams_for_sentence)

    return n_grams_counter

In [17]:
n_grams_counter = get_n_grams_counter(content, 2)
print(n_grams_counter)

