In [1]:
"""
Word counter.
"""
from collections import Counter
import pandas as pd


def read_file(path: str, encoding: str = "utf-8") -> str:
    """Read a whole text file into a single string.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            rather than the platform locale, so non-ASCII characters such as
            curly quotes, the ellipsis and the em dash decode identically on
            every machine.

    Returns:
        The complete content of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read()


def clean_text(text: str, unnecessary_char: str = "?!,.;…“”’:-—",
               total_pages: int = 226) -> str:
    """Strip page footers and blank out punctuation.

    Args:
        text: Raw text to clean.
        unnecessary_char: Characters to replace with a single space each.
            Characters that are not listed (for example opening single
            quotes or parentheses) are left untouched.
        total_pages: Number of pages in the document. Footers of the form
            ``"Page <n> of <total_pages>"`` are removed for every ``n`` from
            1 up to and including ``total_pages``.

    Returns:
        The text without page footers and with every character from
        ``unnecessary_char`` replaced by a space.
    """
    # Inclusive upper bound: the footer of the last page must be removed too.
    for page in range(1, total_pages + 1):
        text = text.replace("Page {} of {}".format(page, total_pages), "")
    # One translate pass instead of one full-text replace per character.
    to_space = str.maketrans(unnecessary_char, " " * len(unnecessary_char))
    return text.translate(to_space)


def extract_words_from_text(text: str) -> list:
    """Split text on whitespace and return the pieces in lower case.

    Punctuation is not handled here, so the text should already have been
    cleaned; otherwise it stays attached to the neighbouring word.
    """
    tokens = text.split()
    return list(map(str.lower, tokens))


def is_palindrom(word: str) -> bool:
    """Tell whether ``word`` reads the same forwards and backwards.

    The comparison is exact (case-sensitive); empty and one-character
    strings count as palindromes.
    """
    mirrored = "".join(reversed(word))
    return mirrored == word


In [18]:
# Build a per-word statistics table from the text file.
PATH = "hpv2_text.txt"

hp_content = read_file(PATH)
cleaned_hp_content = clean_text(hp_content)
words = extract_words_from_text(cleaned_hp_content)
counter = Counter(words)

data = (
    pd.DataFrame(list(counter.items()), columns=['words', 'counter'])
    .sort_values(by=['counter'], ascending=False)
    # Callables receive the frame as it is at this point in the chain, so the
    # derived columns do not depend on index alignment with an outer variable.
    .assign(
        length=lambda df: df['words'].str.len(),
        first_letter=lambda df: df['words'].str[0],
        is_palindrom=lambda df: df['words'].map(is_palindrom),
    )
    # Drop one-character tokens.
    .loc[lambda df: df['length'] != 1]
)
data

Unnamed: 0,words,counter,length,first_letter,is_palindrom
3,the,3627,3,t,False
2,and,1925,3,a,False
26,to,1859,2,t,False
64,he,1758,2,h,False
0,harry,1326,5,h,False
...,...,...,...,...,...
3533,blissfully,1,10,b,False
308,weirdos,1,7,w,False
3540,upper,1,5,u,False
3541,flagged,1,7,f,False


In [17]:
# Summarise the word table: number of rows, per-column dtypes and non-null counts, memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 3 to 5804
Data columns (total 5 columns):
words           5782 non-null object
counter         5782 non-null int64
length          5782 non-null int64
first_letter    5782 non-null object
is_palindrom    5782 non-null bool
dtypes: bool(1), int64(2), object(2)
memory usage: 231.5+ KB


In [19]:
# Number of distinct words starting with each character, most frequent first.
by_letter = data.groupby('first_letter')
letter_counts = by_letter.count().sort_values(by='words', ascending=False)
letter_counts['words']

first_letter
s    885
c    468
b    441
p    381
t    371
f    331
d    324
w    270
h    264
m    257
r    253
a    236
g    228
l    216
e    166
n    126
i     98
o     94
u     81
k     61
v     53
j     53
y     36
q     25
‘     23
(     23
z      7
1      6
3      2
2      2
"      1
Name: words, dtype: int64

In [13]:
# Show the word table again (pandas abbreviates long frames, hence the "..." row in the output).
data

Unnamed: 0,words,counter,length,first_letter,is_palindrom
3,the,3627,3,t,False
2,and,1925,3,a,False
26,to,1859,2,t,False
64,he,1758,2,h,False
0,harry,1326,5,h,False
...,...,...,...,...,...
3533,blissfully,1,10,b,False
308,weirdos,1,7,w,False
3540,upper,1,5,u,False
3541,flagged,1,7,f,False
