In [14]:
"""
Word counter.
"""
from collections import Counter
import pandas as pd


def read_file(path: str) -> str:
    with open(path, 'r') as f:
        full_text = f.read()
    return full_text


def clean_text(text: str, unnecessary_char: str = "?!,.;…“”‘’:-—()\"") -> str:
    for i in range(226):
        text = text.replace("Page {} of 226".format(i), "")
    for character in unnecessary_char:
        text = text.replace(character, " ")
    return text


def extract_words_from_text(text: str) -> list:
    # text has to be cleaned from punctuation
    return [word.lower() for word in text.split()]


def is_palindrom(word: str) -> bool:
    return word == word[::-1]


In [15]:
# preparing text for inserting in DataFrame
PATH = "hpv2_text.txt"

hp_content = read_file(PATH)
cleaned_hp_content = clean_text(hp_content)
words = extract_words_from_text(cleaned_hp_content)
counter = Counter(words)

data = pd.DataFrame(counter.items(), columns=['words', 'counter'])
data = (data
        .sort_values(by=['counter'], ascending=False)
        .assign(length=data['words'].apply(len))
        .assign(first_letter=data['words'].apply(lambda x: x[0]))
        .assign(is_palindrom=data['words'].apply(is_palindrom))
       )
        
data.drop(data[data['length'] == 1].index, inplace=True)
data
# data[data['first_letter'] == 's']
# data[data['is_palindrom']==True]


Unnamed: 0,words,counter,length,first_letter,is_palindrom
3,the,3628,3,t,False
2,and,1925,3,a,False
26,to,1860,2,t,False
64,he,1759,2,h,False
0,harry,1327,5,h,False
...,...,...,...,...,...
3480,starry,1,6,s,False
3482,turrets,1,7,t,False
3486,shore,1,5,s,False
3491,curtain,1,7,c,False


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5782 entries, 3 to 5804
Data columns (total 5 columns):
words           5782 non-null object
counter         5782 non-null int64
length          5782 non-null int64
first_letter    5782 non-null object
is_palindrom    5782 non-null bool
dtypes: bool(1), int64(2), object(2)
memory usage: 231.5+ KB


In [16]:
data.groupby('first_letter').sum()

Unnamed: 0_level_0,counter,length,is_palindrom
first_letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,6,18,0.0
2,2,3,0.0
3,3,5,0.0
a,5944,1518,2.0
b,3751,2739,0.0
c,2568,3222,0.0
d,2574,2211,3.0
e,1239,1103,2.0
f,2955,2096,0.0
g,1989,1431,0.0


In [13]:
data

Unnamed: 0,words,counter,length,first_letter,is_palindrom
3,the,3627,3,t,False
2,and,1925,3,a,False
26,to,1859,2,t,False
64,he,1758,2,h,False
0,harry,1326,5,h,False
...,...,...,...,...,...
3533,blissfully,1,10,b,False
308,weirdos,1,7,w,False
3540,upper,1,5,u,False
3541,flagged,1,7,f,False
