# Парсинг файлов словарей

## Подготовка данных

### Изучение данных

In [1]:

#блок загрузки бибилиотек
import pandas as pd
import numpy as np
import re
import fitz


### Парсинг файлов словарей

In [2]:
def pdf_parser(filename, category):
    """ Принимаем имя файла и категории
        Возвращаем словарь: ключ - категория, значение - список слов
    """
    with fitz.open('/Users/vadimprimakov/Documents/Yandex_practicum/data/Oxford_CEFR_level/' + filename) as doc:
        pdf_pages = ''
        for page in doc:
            # Чистим страницу от заголовков
            text = re.sub(r'©.*?level\n|©.*?here.\n', '', page.get_text(), flags=re.DOTALL)
            if page.number == 0:
                text = re.sub(r'The.*?\n', '', text, flags=re.DOTALL)
            pdf_pages += text
            # print(r'{}'.format(pdf_pages))
        pdf_pages = re.sub('\. B2','', pdf_pages)
        pdf_pages = re.split(r'A1\n|A2\n|B1\n|B2\n|C1\n', pdf_pages)
    
    pdf_pages = pdf_pages[1:]
    
    # Разбиваем на слова
    word_dikt = {}
    for category, cat in zip(pdf_pages, category):
        words = re.split(r'\n|\xa0', category)
        rezult = []
        for line in words:
            word = re.split(' |,', line)[0]
            word = re.sub('[^a-z]*', '', word.lower())
            if word != '':
                rezult.append(word)
        word_dikt[cat] = rezult
        
        
    return word_dikt

In [3]:
dikt_oxsford_3000 = pdf_parser('The_Oxford_3000_by_CEFR_level.pdf', ['A1', 'A2', 'B1', 'B2'])

for i in dikt_oxsford_3000.keys():
    print(i, len(dikt_oxsford_3000[i]), dikt_oxsford_3000[i][0], dikt_oxsford_3000[i][-1])

A1 907 a yourself
A2 877 ability zero
B1 809 absolutely youth
B2 727 abandon zone


In [4]:
dikt_oxsford_5000 = pdf_parser('The_Oxford_5000_by_CEFR_level.pdf', ['B2', 'C1'])
for i in dikt_oxsford_5000.keys():
    print(i, len(dikt_oxsford_5000[i]), dikt_oxsford_5000[i][0], dikt_oxsford_5000[i][-1])

B2 700 absorb wrist
C1 1315 abolish youngster


In [5]:
dikt_american_3000 = pdf_parser('American_Oxford_3000_by_CEFR_level.pdf', ['A1', 'A2', 'B1', 'B2'])
for i in dikt_american_3000.keys():
    print(i, len(dikt_american_3000[i]), dikt_american_3000[i][0], dikt_american_3000[i][-1])

A1 905 a yourself
A2 872 ability zero
B1 812 absolutely youth
B2 733 abandon zone


In [6]:
dikt_american_5000 = pdf_parser('American_Oxford_5000_by_CEFR_level.pdf', ['B2', 'C1'])
for i in dikt_american_5000.keys():
    print(i, len(dikt_american_5000[i]), dikt_american_5000[i][0], dikt_american_5000[i][-1])

B2 702 absorb wrist
C1 1317 abolish yield


In [7]:
len(dikt_american_5000['C1'])

1317

In [8]:
dikt_c2 = pd.read_csv('/Users/vadimprimakov/Documents/Yandex_practicum/data/dict_c2.csv')
dikt_c2['list'] = dikt_c2['list'].str.lower()
dikt_c2['diff'] = 'C2'
dikt_c2 = dikt_c2.rename(columns = {'list' : 'word'})
dikt_c2

Unnamed: 0,word,diff
0,abdomen,C2
1,aberration,C2
2,abhorrence,C2
3,above all,C2
4,above-mentioned,C2
...,...,...
1201,yoga,C2
1202,youthful,C2
1203,zeal,C2
1204,zealous,C2


In [9]:
# Объеденим словари в один
dikts = [dikt_oxsford_3000,
         dikt_oxsford_5000,
         dikt_american_3000,
         dikt_american_5000
        ]
word_dikt = {}
for dikt in dikts:
    for key in dikt.keys():
        if key in word_dikt:
            word_dikt[key] = set(word_dikt[key]) | set(dikt[key])
        else:
            word_dikt[key] = dikt[key]

In [10]:
word_dikt.keys()
for i in word_dikt.keys():
    print(i, len(word_dikt[i]))

A1 919
A2 905
B1 831
B2 1481
C1 1360


In [11]:
# Упакуем в датафрейм
df_words = pd.DataFrame(columns=['diff','word'])

for clas in word_dikt:
    df_words = pd.concat([df_words, pd.DataFrame({
                                                    'diff':[clas]*len(word_dikt[clas]),
                                                    'word': [_ for _ in word_dikt[clas]]
                                                    })])
df_words = df_words.reset_index(drop=True)
df_words

Unnamed: 0,diff,word
0,A1,friday
1,A1,kilometre
2,A1,show
3,A1,kitchen
4,A1,between
...,...,...
5491,C1,slot
5492,C1,gallon
5493,C1,texture
5494,C1,coalition


In [12]:
df_words = pd.concat([df_words, dikt_c2]) 
df_words

Unnamed: 0,diff,word
0,A1,friday
1,A1,kilometre
2,A1,show
3,A1,kitchen
4,A1,between
...,...,...
1201,C2,yoga
1202,C2,youthful
1203,C2,zeal
1204,C2,zealous


In [13]:
df_words['diff'].value_counts()

B2    1481
C1    1360
C2    1206
A1     919
A2     905
B1     831
Name: diff, dtype: int64

In [14]:
# проверим наличие дублей
df_words[df_words.word.duplicated(keep=False)].sort_values('word')
df_to_drop = df_words[df_words.word.duplicated(keep='last')].sort_values('word')
df_to_drop.index

Int64Index([1159, 2112, 5049, 4115, 2385, 2395, 4412, 3349, 3734, 3120,
            ...
             973, 1875, 1045, 1955, 2166, 2460,  721,  361, 1280,  648],
           dtype='int64', length=626)

In [15]:
# проверим наличие дублей
df_words = df_words.drop(index = df_to_drop.index)

In [16]:
# проверим наличие дублей
df_words[df_words.word.duplicated(keep=False)].sort_values('word')

Unnamed: 0,diff,word


In [17]:
df_words.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5831 entries, 0 to 1205
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   diff    5831 non-null   object
 1   word    5831 non-null   object
dtypes: object(2)
memory usage: 136.7+ KB


In [18]:
# Сохраним в файл
df_words.to_csv('oxford_dikt.csv', index=False)