Skip to content

Commit

Permalink
Add words frequency function, change structure of tests
Browse files Browse the repository at this point in the history
  • Loading branch information
zakharovadaria committed Feb 3, 2020
1 parent 58b9224 commit 779a954
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 110 deletions.
2 changes: 1 addition & 1 deletion src/vocabulary_generator/__init__.py
@@ -1,3 +1,3 @@
from .core import get_unknown_words
from .core import get_unknown_words, get_words_frequency

__version__ = "0.0.1"
33 changes: 15 additions & 18 deletions src/vocabulary_generator/core.py
@@ -1,4 +1,5 @@
from typing import List, Set
from collections import Counter
from typing import List

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
Expand All @@ -15,12 +16,6 @@ def get_all_words_in_text(text: str) -> List[str]:
return words


def get_unique_words(words: List[str]) -> Set[str]:
    """Return the distinct words contained in *words*.

    Args:
        words: Words to de-duplicate; may be empty.

    Returns:
        A set holding each word exactly once. An empty list yields an
        empty set.
    """
    return set(words)


def get_initial_form(word: str) -> str:
lemmatizer = WordNetLemmatizer()

Expand All @@ -40,25 +35,27 @@ def get_pos_tag(word: str) -> str:
return tag_dict.get(tag, wordnet.NOUN)


def get_words_in_initial_form(words: List[str]) -> List[str]:
    """Reduce every word to its initial form, keeping order and duplicates.

    Args:
        words: Words to convert; may be empty.

    Returns:
        A list with one entry per input word, in input order, each produced
        by ``get_initial_form``.
    """
    # A list (not a set) is returned so repeated words survive and can be
    # counted afterwards.
    return [get_initial_form(word) for word in words]


def get_words_frequency(words: List[str]) -> Counter:
    """Count how many times each word occurs in *words*.

    Args:
        words: Words to tally; duplicates are what gets counted.

    Returns:
        A ``collections.Counter`` mapping each distinct word to its number
        of occurrences. An empty list yields an empty counter.
    """
    return Counter(words)

def get_unknown_words(text: str, known_words: List[str]) -> List[str]:
    """Return the words of *text*, in initial form, that are not known.

    Both the words extracted from ``text`` and ``known_words`` are passed
    through ``get_words_in_initial_form`` before being compared.

    Args:
        text: Raw text to extract words from.
        known_words: Words to exclude from the result.

    Returns:
        The unknown words in the order they occur in the text. Repeated
        words are kept.
    """
    words_in_initial_form = get_words_in_initial_form(get_all_words_in_text(text))
    # Build the lookup once as a set: membership is then O(1) per word
    # instead of a linear scan of the known-word list on every iteration.
    known_words_in_initial_form = set(get_words_in_initial_form(known_words))

    return [
        word
        for word in words_in_initial_form
        if word not in known_words_in_initial_form
    ]
195 changes: 104 additions & 91 deletions tests/test_core.py
@@ -1,140 +1,153 @@
from vocabulary_generator.core import get_all_words_in_text, get_unique_words, get_initial_form
from vocabulary_generator.core import get_all_words_in_initial_form, get_pos_tag, get_unique_words_in_initial_form
from vocabulary_generator.core import get_unknown_words
from vocabulary_generator.core import get_all_words_in_text, get_initial_form
from vocabulary_generator.core import get_pos_tag, get_words_in_initial_form
from vocabulary_generator.core import get_words_frequency, get_unknown_words


class TestGetAllWords:
    """Tests for ``get_all_words_in_text``."""

    def test_get_all_words_in_text(self):
        """Words are returned lower-cased, in text order."""
        text = 'Hello world'
        actual = get_all_words_in_text(text)
        expected = ['hello', 'world']

        assert actual == expected

    def test_get_all_words_in_empty_text(self):
        """An empty string yields no words."""
        text = ''
        actual = get_all_words_in_text(text)
        expected = []

        assert actual == expected

    def test_get_all_words_in_text_with_spaces(self):
        """Whitespace-only text yields no words."""
        text = ' '
        actual = get_all_words_in_text(text)
        expected = []

        assert actual == expected
class TestInitialForm:
    """Tests for ``get_initial_form`` and ``get_words_in_initial_form``."""

    def test_get_initial_form_of_subject(self):
        """A plural noun is reduced to its singular."""
        word = 'cats'
        actual = get_initial_form(word)
        expected = 'cat'

        assert actual == expected

    def test_get_initial_form_of_irregular_verb(self):
        """An irregular past tense is reduced to the base verb."""
        word = 'took'
        actual = get_initial_form(word)
        expected = 'take'

        assert actual == expected

    def test_get_initial_form_of_regular_verb(self):
        """A regular past tense is reduced to the base verb."""
        word = 'tested'
        actual = get_initial_form(word)
        expected = 'test'

        assert actual == expected

    def test_get_words_in_initial_form(self):
        """Each word is converted and the input order is preserved."""
        words = ['been', 'had', 'done', 'languages', 'cities', 'mice']

        actual = get_words_in_initial_form(words)
        expected = ['be', 'have', 'do', 'language', 'city', 'mouse']

        assert actual == expected

class TestPosTags:
    """Tests for ``get_pos_tag``; expected values are the one-letter tags."""

    def test_get_pos_tag_adjective(self):
        """An adjective is tagged ``'a'``."""
        word = 'small'

        actual = get_pos_tag(word)
        expected = 'a'

        assert actual == expected

    def test_get_pos_tag_noun(self):
        """A noun is tagged ``'n'``."""
        word = 'languages'

        actual = get_pos_tag(word)
        expected = 'n'

        assert actual == expected

    def test_get_pos_tag_verb(self):
        """A verb is tagged ``'v'``."""
        word = 'took'

        actual = get_pos_tag(word)
        expected = 'v'

        assert actual == expected


class TestCountWords:
    """Tests for ``get_words_frequency``."""

    def test_words_frequency(self):
        """Each distinct word maps to its number of occurrences."""
        words = ['been', 'had', 'been', 'had', 'be', 'was']

        actual = get_words_frequency(words)
        expected = {
            'been': 2,
            'had': 2,
            'be': 1,
            'was': 1,
        }

        assert actual == expected

    def test_frequency_of_empty_list(self):
        """An empty word list produces an empty mapping."""
        words = []

        actual = get_words_frequency(words)
        expected = {}

        assert actual == expected

class TestUnknownWords:
    """Tests for ``get_unknown_words`` (list in, list out)."""

    def test_get_unknown_words(self):
        """Known words are filtered out after conversion to initial form."""
        text = 'been had done languages cities mice feet took went'
        known_words = ['be', 'do', 'take']

        actual = get_unknown_words(text, known_words)
        expected = ['have', 'language', 'city', 'mouse', 'foot', 'go']

        assert actual == expected

    def test_get_unknown_words_with_not_unknown_words(self):
        """Nothing is returned when every word in the text is known."""
        text = 'been done took'
        known_words = ['be', 'do', 'take']

        actual = get_unknown_words(text, known_words)
        expected = []

        assert actual == expected

    def test_get_unknown_words_with_not_known_words(self):
        """With no known words, every word of the text is returned."""
        text = 'been done took'
        known_words = []

        actual = get_unknown_words(text, known_words)
        expected = ['be', 'do', 'take']

        assert actual == expected

    def test_get_unknown_words_with_frequency(self):
        """Repeated words are kept, so they can be counted afterwards."""
        text = 'been done took been done took been done took been done'
        known_words = []

        actual = get_unknown_words(text, known_words)
        expected = ['be', 'do', 'take', 'be', 'do', 'take', 'be', 'do', 'take', 'be', 'do']

        assert actual == expected

        # Feed the result into the frequency counter to check both together.
        actual = get_words_frequency(actual)
        expected = {
            'be': 4,
            'do': 4,
            'take': 3,
        }

        assert actual == expected

0 comments on commit 779a954

Please sign in to comment.