In [47]:
# !pip3 install pdpipe

In [51]:
import os
import glob
import pandas as pd
import pdpipe as pdp
# from collections import Counter

In [52]:
def filter_null(df, column):
    """Return the rows of ``df`` whose value in ``column`` is not missing.

    Args:
        df: Input DataFrame.
        column: Label of the column to test with ``notna()``.

    Returns:
        The subset of ``df`` (original index labels kept) where ``column``
        holds a non-missing value.
    """
    has_value = df[column].notna()
    return df[has_value]

def filter_not_in_prefix(df, prefixes, column='word'):
    """Drop the rows whose text starts with one of ``prefixes`` followed by ``-``.

    Args:
        df: Input DataFrame.
        prefixes: Iterable of prefix strings; each is matched as ``prefix + '-'``
            at the start of the value.
        column: Label of the string column to inspect. Defaults to ``'word'``,
            which is the column the original implementation always used.

    Returns:
        The subset of ``df`` whose ``column`` value does not start with any
        hyphenated prefix. Rows with a missing value are kept.
    """
    hyphenated = tuple(prefix + '-' for prefix in prefixes)
    # na=False makes missing values count as "no match"; without it the mask
    # can contain NaN and the ``~`` negation below fails on object columns.
    has_prefix = df[column].str.startswith(hyphenated, na=False)
    return df[~has_prefix]

def save(df, file):
    """Write ``df`` to ``file`` as CSV and hand the same frame back.

    Returning the input makes the function usable as a pass-through step in a
    ``.pipe`` chain. ``to_csv`` is called with its default options.

    Args:
        df: DataFrame to write.
        file: Destination passed straight to ``DataFrame.to_csv``.

    Returns:
        The very same ``df`` object that was passed in.
    """
    df.to_csv(path_or_buf=file)
    return df

def read_file(file, encoding='utf-8'):
    """Read a whole text file and return its contents as one string.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that the result does not depend on the platform's locale encoding;
            pass another value if the file is stored differently.

    Returns:
        The full contents of the file.
    """
    with open(file, 'r', encoding=encoding) as handle:
        return handle.read()

def replace_by_list(text, arr, to=''):
    """Replace every occurrence of each string in ``arr`` with ``to``.

    The substitutions are applied one after another in the order given by
    ``arr``, each one operating on the result of the previous.

    Args:
        text: Input string.
        arr: Iterable of substrings to replace.
        to: Replacement string; the default ``''`` deletes the substrings.

    Returns:
        The string after all substitutions.
    """
    result = text
    for needle in arr:
        result = result.replace(needle, to)
    return result

def load_data_calgary(path):
    """Read every file matching the glob pattern ``path`` into a single string.

    Args:
        path: Glob pattern selecting the files to read.

    Returns:
        The contents of all matching files joined with a single space. The
        result is an empty string when nothing matches.
    """
    # glob.glob returns matches in a filesystem-dependent order; sorting makes
    # the concatenated text reproducible from run to run.
    matches = sorted(glob.glob(path))
    return ' '.join(read_file(file) for file in matches)

def test_to_words(text):
    return pd.DataFrame(text.split(), columns=['word'])

def filter_in(df, column, arr):
    """Remove the rows whose ``column`` value appears in ``arr``.

    Note that, despite the name, this keeps the rows that are NOT in ``arr``.

    Args:
        df: Input DataFrame.
        column: Label of the column to test.
        arr: Collection of values to exclude.

    Returns:
        The subset of ``df`` whose ``column`` value is not a member of ``arr``.
    """
    is_listed = df[column].isin(arr)
    return df[~is_listed]

def filter_length_more_then(df, column, min_length):
    """Keep the rows whose string in ``column`` is longer than ``min_length``.

    Args:
        df: Input DataFrame.
        column: Label of the string column to measure.
        min_length: Exclusive lower bound on the string length.

    Returns:
        The subset of ``df`` where ``len(value) > min_length``.
    """
    lengths = df[column].str.len()
    return df[lengths > min_length]

def get_first_letter(df, column, source='word'):
    """Add a column holding the lowercased first character of each word.

    The input frame is left untouched: the new column is written to a copy.
    Assigning directly into ``df`` would modify the caller's data and, when
    ``df`` is itself a filtered slice of another frame, trigger pandas'
    SettingWithCopyWarning.

    Args:
        df: Input DataFrame.
        column: Label of the column to create (or overwrite) with the letter.
        source: Label of the string column to read from. Defaults to
            ``'word'``, the column the original implementation always used.

    Returns:
        A copy of ``df`` with ``column`` set to the first character of the
        lowercased ``source`` value.
    """
    result = df.copy()
    result[column] = df[source].str.lower().str[0]
    return result

def count(df, column, to):
    """Count the rows of ``df`` for each distinct value of ``column``.

    Args:
        df: Input DataFrame.
        column: Label of the column to group by.
        to: Name given to the resulting count column.

    Returns:
        A DataFrame with one row per group: the group key in ``column`` and
        the number of rows in ``to``.
    """
    group_sizes = df.groupby(column).size()
    return group_sizes.reset_index(name=to)

def sort(df, column):
    """Order ``df`` by ``column`` from largest to smallest and renumber the rows.

    Args:
        df: Input DataFrame.
        column: Label of the column to sort by.

    Returns:
        A new DataFrame sorted in descending order of ``column`` with a fresh
        0..n-1 index (the old index is discarded).
    """
    descending = df.sort_values(by=column, ascending=False)
    return descending.reset_index(drop=True)

def load_data_node(path, usecols=None):
    """Load a semicolon-delimited CSV file into a DataFrame.

    Args:
        path: Location of the CSV, passed straight to ``pandas.read_csv``.
        usecols: Optional subset of columns to load; ``None`` loads all.

    Returns:
        The parsed DataFrame, with missing-value detection switched on.
    """
    read_options = {
        'delimiter': ';',
        'na_filter': True,
        'usecols': usecols,
    }
    return pd.read_csv(path, **read_options)

def rename_columns(df, mapper):
    """Return ``df`` with its columns renamed according to ``mapper``.

    Args:
        df: Input DataFrame.
        mapper: Mapping (or callable) from old column labels to new ones, as
            accepted by ``DataFrame.rename``.

    Returns:
        A new DataFrame with the renamed columns; ``df`` itself is unchanged.
    """
    return df.rename(columns=mapper)

In [55]:
# --- Text corpus: count words by their first letter ---------------------------
# Base folder of the data, and a glob pattern matching every .txt file under calgary/.
path_corpus = '../../OE_data/texts/'
path = path_corpus + 'calgary/*.txt'
# Stop words: whitespace-separated tokens read from a path relative to the working directory.
stop_words = read_file('stop_words.txt').split()
# Column names used by the pipelines below.
word = 'word'
letter = 'letter'
counts = 'text'  # name of the count column produced for the text corpus
# Substrings deleted from the raw text before it is split into words.
dummy_letters = ['#', '.', ';', '!', ':', '(', ')', '?', '\'', '\"', ',', '-']
# First letters excluded from the tally: filter_in drops the rows whose letter is in this list.
vouwels = ['o', 'a', 'e', 'i', 'æ', 'u','á', 'ǽ', 'é', 'í', 'ó', 'ú', 'ý'] #, '-', '('

# Read all matching files, delete the characters in dummy_letters, and build a
# one-column ('word') DataFrame with one whitespace-separated token per row.
load_texts = (
    test_to_words(
        replace_by_list(
            load_data_calgary(path),
            dummy_letters
        )
    )
)
# print(load_texts)

# Drop stop words and tokens of length <= 1, derive each word's lowercased first
# letter, drop the letters listed in vouwels, then count the words per letter and
# order the result from most to least frequent.
pipe_texts = (
    load_texts
    .pipe(filter_in, word, stop_words)
    .pipe(filter_length_more_then, word, 1)
    .pipe(get_first_letter, letter)
    .pipe(filter_in, letter, vouwels)
    .pipe(count, letter, counts)
    .pipe(sort, counts)
#     .pipe(save, 'out_not_in_prefix.csv')
)
# print(pipe_texts)
# pipe_texts.plot()
# print(f'filtred: {len(load_texts) - len(pipe_texts)}')

# --- Dictionary entries: count words by their first letter ---------------------
# NOTE(review): path and counts are reassigned here, overwriting the values that
# the text-corpus pipeline used earlier in this cell.
path = path_corpus + 'BT/node.csv'
column = 'title'  # CSV column holding the entry text; renamed to word below
counts = 'dict'  # name of the count column produced for the dictionary
# In this cell, prefixes is referenced only by the commented-out filter_not_in_prefix step.
prefixes = ['ge', 'on', 'be', 'ofer', 'a', 'for']

# Load only the title column, rename it to 'word', drop missing and
# one-character entries, then apply the same first-letter tally as for the texts.
pipe_node = (
    load_data_node(path, [column])
    .pipe(rename_columns, {column: word})
    .pipe(filter_null, word)
    .pipe(filter_length_more_then, word, 1)
#     .pipe(filter_not_in_prefix, prefixes)
    .pipe(get_first_letter, letter)
    .pipe(filter_in, letter, vouwels)
    .pipe(count, letter, counts)
    .pipe(sort, counts)
#     .pipe(save, 'out_not_in_prefix.csv')
)
# pipeline = pdp.ColDrop('Medals').OneHotEncode('Born')
# pipeline(df)
# print(pipe_node)
# Put the two tallies side by side, keeping only letters present in both (inner
# join on the letter index); as the cell's last expression this is displayed.
pd.concat([pipe_texts.set_index('letter'), pipe_node.set_index('letter')], axis=1, join='inner').reset_index()

Unnamed: 0,letter,text,dict
0,g,13587,9637
1,s,13578,5261
2,w,12025,3473
3,h,11082,4853
4,f,8949,5874
5,m,7875,2088
6,þ,7254,1261
7,b,6989,5083
8,l,4954,1783
9,d,4303,1719
