In [34]:
import math
from typing import List

import spacy

In [2]:
# Load the spaCy pipeline registered under the name "ja_ginza" once and reuse
# it in every tokenising function below (presumably the GiNZA Japanese model,
# which has to be installed separately -- confirm in the environment setup).
nlp = spacy.load("ja_ginza")

In [3]:
# Two short Japanese sample sentences used as input for the token statistics
# computed in the following cells.
text1 = "私が学生で、彼は社会人だ。"
text2 = "私は学生で、彼も学生だ。"

In [19]:
# Total number of tokens (running words)
def get_total(text: str) -> int:
    """Return how many tokens the ``nlp`` pipeline produces for ``text``.

    Every occurrence is counted, so a token that appears twice adds two.
    """
    doc = nlp(text)
    return len(doc)
# Token count of each sample sentence, one value per line.
print(get_total(text1), get_total(text2), sep="\n")

10
10


In [21]:
# Number of distinct tokens (types)
def get_diff(text: str) -> int:
    """Return how many different token surface strings occur in ``text``.

    Tokens are compared by their ``.text`` value, so repeated occurrences of
    the same surface form are counted once.
    """
    seen_surfaces = set()
    for token in nlp(text):
        seen_surfaces.add(token.text)
    return len(seen_surfaces)
# Distinct-token count of each sample sentence, one value per line.
print(get_diff(text1), get_diff(text2), sep="\n")

10
9


In [22]:
# Type-token ratio
def get_diff_rate(text: str) -> float:
    """Return the number of distinct tokens divided by the total token count.

    Raises:
        ZeroDivisionError: if ``get_total(text)`` is 0.
    """
    distinct_count = get_diff(text)
    total_count = get_total(text)
    return distinct_count / total_count
# Type-token ratio of each sample sentence, one value per line.
print(get_diff_rate(text1), get_diff_rate(text2), sep="\n")

1.0
0.9


In [27]:
# Guiraud index
def get_guiraud(text: str) -> float:
    """Return the distinct-token count divided by the square root of the
    total token count.

    Raises:
        ZeroDivisionError: if ``get_total(text)`` is 0.
    """
    type_count = get_diff(text)
    token_count = get_total(text)
    return type_count / math.sqrt(token_count)
# Guiraud index of each sample sentence, one value per line.
print(get_guiraud(text1), get_guiraud(text2), sep="\n")

3.162277660168379
2.846049894151541


In [30]:
# Raw (absolute) frequency
def get_simple_freq(text: str, word: str) -> int:
    """Return how many tokens of ``text`` have a surface string equal to ``word``.

    The comparison is an exact match against each token's ``.text``.
    """
    return sum(1 for token in nlp(text) if token.text == word)
# Raw frequency of "学生" in each sample sentence, one value per line.
print(get_simple_freq(text1, "学生"), get_simple_freq(text2, "学生"), sep="\n")

1
2


In [31]:
# Relative frequency
def get_relative_freq(text: str, word: str) -> float:
    """Return the share of tokens in ``text`` that are equal to ``word``.

    The value is the raw frequency of ``word`` divided by the total token
    count. True division always yields a ``float``, which is why the return
    annotation is ``float`` rather than ``int``.

    Args:
        text: Text to analyse.
        word: Token surface string to look for.

    Returns:
        ``get_simple_freq(text, word) / get_total(text)``.

    Raises:
        ZeroDivisionError: if ``get_total(text)`` is 0.
    """
    return get_simple_freq(text, word) / get_total(text)
# Relative frequency of "学生" in each sample sentence, one value per line.
print(get_relative_freq(text1, "学生"), get_relative_freq(text2, "学生"), sep="\n")

0.1
0.2


In [39]:
# Three English sample sentences used as input for the whitespace-based
# sentence-length and word-length statistics in the following cells.
text3 = "To Sherlock Holmes she is always the woman."
text4 = "I have seldom heard him mention her under any other name."
text5 = "In his eyes she eclipses and predominates the whole of her sex."

In [45]:
# Mean sentence length
def get_mean_sentence_length(texts: List[str]) -> float:
    """Return the average number of whitespace-separated words per text.

    Every "." is deleted from a text before it is split, so a piece made up
    only of periods does not count as a word.

    Raises:
        ZeroDivisionError: if ``texts`` is empty.
    """
    word_counts = [len(sentence.replace(".", "").split()) for sentence in texts]
    return sum(word_counts) / len(texts)

# Mean sentence length over the three English sample sentences.
print(get_mean_sentence_length(texts=[text3, text4, text5]))

10.333333333333334


In [43]:
# Mean word length
def get_mean_word_length(text: str) -> float:
    """Return the average character count of the whitespace-separated words.

    Every "." is deleted from ``text`` before it is split, so periods never
    contribute to a word's length.

    Raises:
        ZeroDivisionError: if no words remain after removing periods.
    """
    words = text.replace(".", "").split()
    return sum(map(len, words)) / len(words)

# Mean word length of the first English sample sentence.
print(get_mean_word_length(text=text3))

4.375


In [84]:
# Four samples of seven integer observations each. Only week1 is used by the
# quartile cells visible below.
# NOTE(review): presumably one value per day of the week -- confirm the source
# and unit of these numbers.
week1 = [79, 100, 94, 51, 49, 44, 31]
week2 = [58, 42, 40, 27, 28, 49, 39]
week3 = [49, 30, 33, 42, 27, 17, 15]
week4 = [16, 20, 30, 20, 14, 11, 10]

In [77]:
# Fractional 0-based positions of the three quartiles in a sorted sample of
# len(week1) values. Each one is the internal division point between rank 1
# and rank n, weighted (first, last) = (3, 1), (2, 2), (1, 3), then shifted by
# -1 to convert from 1-based ranks to 0-based list indices.
internal_equinox1, internal_equinox2, internal_equinox3 = (
    (first_weight * 1 + last_weight * len(week1)) / (first_weight + last_weight) - 1
    for first_weight, last_weight in ((3, 1), (2, 2), (1, 3))
)
print(internal_equinox1, internal_equinox2, internal_equinox3, sep="\n")

1.5
3.0
4.5


In [78]:
# Ascending copy of week1; the original list keeps its order.
sorted_week1 = list(week1)
sorted_week1.sort()
print(sorted_week1)

[31, 44, 49, 51, 79, 94, 100]


In [79]:
# First quartile by linear interpolation between the two order statistics
# that bracket the fractional position internal_equinox1.
q1_lower_idx = math.floor(internal_equinox1)
q1_upper_idx = math.ceil(internal_equinox1)
# The interpolation weight is the fractional part of the position. It equals
# 0.5 for the 7-element sample, but deriving it keeps the result correct for
# other sample sizes, where a hard-coded 0.5 would be wrong.
q1_weight = internal_equinox1 - q1_lower_idx
quantile1 = sorted_week1[q1_lower_idx] + (sorted_week1[q1_upper_idx] - sorted_week1[q1_lower_idx]) * q1_weight
print(quantile1)

46.5

In [82]:
# Third quartile by linear interpolation between the two order statistics
# that bracket the fractional position internal_equinox3.
q3_lower_idx = math.floor(internal_equinox3)
q3_upper_idx = math.ceil(internal_equinox3)
# The interpolation weight is the fractional part of the position. It equals
# 0.5 for the 7-element sample, but deriving it keeps the result correct for
# other sample sizes, where a hard-coded 0.5 would be wrong.
q3_weight = internal_equinox3 - q3_lower_idx
quantile3 = sorted_week1[q3_lower_idx] + (sorted_week1[q3_upper_idx] - sorted_week1[q3_lower_idx]) * q3_weight
print(quantile3)

86.5


In [83]:
# Interquartile range: distance between the third and the first quartile.
quantile_range = quantile3 - quantile1
# Half of the interquartile range (the quartile deviation). Despite the
# "variance" in its name, this is not a variance.
quantile_variance = 0.5 * quantile_range
print(quantile_range, quantile_variance, sep="\n")

40.0
20.0
