From 779a954f397e189d03f7078ef8ab470e68e3dc54 Mon Sep 17 00:00:00 2001 From: Daria Zakharova Date: Mon, 3 Feb 2020 20:32:30 +0300 Subject: [PATCH] Add words frequency function, change structure of tests --- src/vocabulary_generator/__init__.py | 2 +- src/vocabulary_generator/core.py | 33 +++-- tests/test_core.py | 195 ++++++++++++++------------- 3 files changed, 120 insertions(+), 110 deletions(-) diff --git a/src/vocabulary_generator/__init__.py b/src/vocabulary_generator/__init__.py index f70b180..64874a1 100644 --- a/src/vocabulary_generator/__init__.py +++ b/src/vocabulary_generator/__init__.py @@ -1,3 +1,3 @@ -from .core import get_unknown_words +from .core import get_unknown_words, get_words_frequency __version__ = "0.0.1" diff --git a/src/vocabulary_generator/core.py b/src/vocabulary_generator/core.py index 782ea48..745ec8b 100644 --- a/src/vocabulary_generator/core.py +++ b/src/vocabulary_generator/core.py @@ -1,4 +1,5 @@ -from typing import List, Set +from collections import Counter +from typing import List from nltk.stem import WordNetLemmatizer from nltk.corpus import wordnet @@ -15,12 +16,6 @@ def get_all_words_in_text(text: str) -> List[str]: return words -def get_unique_words(words: List[str]) -> Set[str]: - unique_words = set(words) - - return unique_words - - def get_initial_form(word: str) -> str: lemmatizer = WordNetLemmatizer() @@ -40,25 +35,27 @@ def get_pos_tag(word: str) -> str: return tag_dict.get(tag, wordnet.NOUN) -def get_all_words_in_initial_form(words: Set[str]) -> Set[str]: - initial_words = set() +def get_words_in_initial_form(words: List[str]) -> List[str]: + initial_words = [] for word in words: initial_word = get_initial_form(word) - initial_words.add(initial_word) + initial_words.append(initial_word) return initial_words -def get_unique_words_in_initial_form(text: str) -> Set[str]: - words = get_all_words_in_text(text) - unique_words = get_unique_words(words) - words_in_initial_form = get_all_words_in_initial_form(unique_words) +def get_words_frequency(words: List[str]) -> Counter: + return Counter(words) - return words_in_initial_form +def get_unknown_words(text: str, known_words: List[str]) -> List[str]: + words = get_all_words_in_text(text) + words_in_initial_form = get_words_in_initial_form(words) + known_words_in_initial_form = get_words_in_initial_form(known_words) + unknown_words = [] -def get_unknown_words(text: str, known_words: Set[str]) -> Set[str]: - words_from_text = get_unique_words_in_initial_form(text) - unknown_words = words_from_text.difference(known_words) + for word in words_in_initial_form: + if word not in known_words_in_initial_form: + unknown_words.append(word) return unknown_words diff --git a/tests/test_core.py b/tests/test_core.py index 96fbeb9..c9b8a7b 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,140 +1,153 @@ -from vocabulary_generator.core import get_all_words_in_text, get_unique_words, get_initial_form -from vocabulary_generator.core import get_all_words_in_initial_form, get_pos_tag, get_unique_words_in_initial_form -from vocabulary_generator.core import get_unknown_words +from vocabulary_generator.core import get_all_words_in_text, get_initial_form +from vocabulary_generator.core import get_pos_tag, get_words_in_initial_form +from vocabulary_generator.core import get_words_frequency, get_unknown_words -def test_get_all_words_in_text(): - text = 'Hello world' - actual = get_all_words_in_text(text) - expected = ['hello', 'world'] +class TestGetAllWords: + def test_get_all_words_in_text(self): + text = 'Hello world' + actual = get_all_words_in_text(text) + expected = ['hello', 'world'] - assert actual == expected + assert actual == expected + def test_get_all_words_in_empty_text(self): + text = '' + actual = get_all_words_in_text(text) + expected = [] -def test_get_all_words_in_empty_text(): - text = '' - actual = get_all_words_in_text(text) - expected = [] + assert actual == expected - assert actual == expected + def test_get_all_words_in_text_with_spaces(self): + text = ' ' + actual = get_all_words_in_text(text) + expected = [] + assert actual == expected -def test_get_all_words_in_text_with_spaces(): - text = ' ' - actual = get_all_words_in_text(text) - expected = [] - assert actual == expected +class TestInitialForm: + def test_get_initial_form_of_subject(self): + word = 'cats' + actual = get_initial_form(word) + expected = 'cat' + assert actual == expected -def test_get_unique_words_count(): - words = ['first', 'second', 'third', 'fourth'] - actual = len(get_unique_words(words)) - expected = 4 + def test_get_initial_form_of_irregular_verb(self): + word = 'took' + actual = get_initial_form(word) + expected = 'take' - assert actual == expected + assert actual == expected + def test_get_initial_form_of_regular_verb(self): + word = 'tested' + actual = get_initial_form(word) + expected = 'test' -def test_get_unique_words_with_repeat_sequence_count(): - words = ['first', 'second', 'first', 'second'] - actual = len(get_unique_words(words)) - expected = 2 + assert actual == expected - assert actual == expected + def test_get_words_in_initial_form(self): + words = ['been', 'had', 'done', 'languages', 'cities', 'mice'] + actual = get_words_in_initial_form(words) + expected = ['be', 'have', 'do', 'language', 'city', 'mouse'] -def test_get_unique_words_with_empty_list(): - words = [] - actual = get_unique_words(words) - expected = set() + assert actual == expected - assert actual == expected +class TestPosTags: + def test_get_pos_tag_adjective(self): + word = 'small' -def test_get_initial_form_of_subject(): - word = 'cats' - actual = get_initial_form(word) - expected = 'cat' + actual = get_pos_tag(word) + expected = 'a' - assert actual == expected + assert actual == expected + def test_get_pos_tag_noun(self): + word = 'languages' -def test_get_initial_form_of_irregular_verb(): - word = 'took' - actual = get_initial_form(word) - expected = 'take' + actual = get_pos_tag(word) + expected = 'n' - assert actual == expected + assert actual == expected + def test_get_pos_tag_verb(self): + word = 'took' -def test_get_initial_form_of_regular_verb(): - word = 'tested' - actual = get_initial_form(word) - expected = 'test' + actual = get_pos_tag(word) + expected = 'v' - assert actual == expected + assert actual == expected -def test_get_pos_tag_adjective(): - word = 'small' +class TestCountWords: + def test_words_frequency(self): + words = ['been', 'had', 'been', 'had', 'be', 'was'] - actual = get_pos_tag(word) - expected = 'a' + actual = get_words_frequency(words) + expected = { + 'been': 2, + 'had': 2, + 'be': 1, + 'was': 1, + } - assert actual == expected + assert actual == expected + def test_frequency_of_empty_list(self): + words = [] -def test_get_pos_tag_noun(): - word = 'languages' + actual = get_words_frequency(words) + expected = {} - actual = get_pos_tag(word) - expected = 'n' + assert actual == expected - assert actual == expected +class TestUnknownWords: + def test_get_unknown_words(self): + text = 'been had done languages cities mice feet took went' + known_words = ['be', 'do', 'take'] -def test_get_pos_tag_verb(): - word = 'took' + actual = get_unknown_words(text, known_words) + expected = ['have', 'language', 'city', 'mouse', 'foot', 'go'] - actual = get_pos_tag(word) - expected = 'v' + assert actual == expected - assert actual == expected + def test_get_unknown_words_with_not_unknown_words(self): + text = 'been done took' + known_words = ['be', 'do', 'take'] + actual = get_unknown_words(text, known_words) + expected = [] -def test_get_all_words_in_initial_form(): - words = {'been', 'had', 'done', 'languages', 'cities', 'mice'} + assert actual == expected - actual = get_all_words_in_initial_form(words) - expected = {'be', 'have', 'do', 'language', 'city', 'mouse'} + def test_get_unknown_words_with_not_known_words(self): + text = 'been done took' + known_words = [] - assert actual == expected + actual = get_unknown_words(text, known_words) + expected = ['be', 'do', 'take'] + assert actual == expected -def test_get_unique_words_in_initial_form(): - text = 'been had done languages cities mice been had done languages cities mice been had done languages cities mice' + def test_get_unknown_words_with_frequency(self): + text = 'been done took been done took been done took been done' + known_words = [] - actual = get_unique_words_in_initial_form(text) - expected = {'be', 'have', 'do', 'language', 'city', 'mouse'} + actual = get_unknown_words(text, known_words) + expected = ['be', 'do', 'take', 'be', 'do', 'take', 'be', 'do', 'take', 'be', 'do'] - assert actual == expected + assert actual == expected + actual = get_words_frequency(actual) + expected = { + 'be': 4, + 'do': 4, + 'take': 3, + } -def test_get_unknown_words(): - text = 'been had done languages cities mice feet took went' - known_words = {'be', 'do', 'take'} - - actual = get_unknown_words(text, known_words) - expected = {'have', 'language', 'city', 'mouse', 'foot', 'go'} - - assert actual == expected - - -def test_get_unknown_words_with_not_unknown_words(): - text = 'been done took' - known_words = {'be', 'do', 'take'} - - actual = get_unknown_words(text, known_words) - expected = set() - - assert actual == expected + assert actual == expected