# Data extraction from the title page of the thesis

### Data

Data is in the same directory, in `diploms` folder.

If `diplomas_pdftotext.csv` file exists, there is no need to extract text from pdf, move on next section

In [1]:
# !sudo apt-get update
# !sudo apt-get install build-essential libpoppler-cpp-dev pkg-config python-dev
# !pip install pdftotext

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# pdftotext
import pdftotext

# pandas
import pandas as pd

import re

# file system
from os import listdir
from os.path import isfile, join

In [4]:
def get_text_from_main_page_pdftotext(filename: str) -> str:
    with open(filename, 'rb') as f:
        pdf = pdftotext.PDF(f)
        main_page = pdf[0]
        
        # look up in second page for info 
        second_page = re.findall(r"(?i)^([\s\S]*)оглавление|содержание", pdf[1])
        if len(second_page) > 0 :
            main_page += second_page[0]
#             print(second_page)

        # look up in third page for info 
        else:
            third_page = re.findall(r"(?i)^([\s\S]*)оглавление|содержание", pdf[2])
            if len(third_page) > 0 :
                main_page += pdf[1]        # add second too
                main_page += third_page[0]
#                 print(main_page)
            
        return main_page

In [5]:
# list all files
path = "diploms"
files = []
for folder in listdir(path):
    files += [join(path, folder, file) for file in listdir(join(path, folder)) if isfile(join(path, folder, file))]

data = pd.DataFrame(columns = ["filename", "text"])

for filename in files:
    try:
        text = get_text_from_main_page_pdftotext(filename)
        data = data.append(pd.Series(
            [filename, text], index = data.columns
        ), ignore_index = True)
        print(f'\x1B[32mSuccess \x1B[0m- {filename}')
    except Exception as e:
        print(f'\x1B[31mFailed \x1B[0m- {filename}\x1B[31m', e)

[32mSuccess [0m- diploms/msu/2019_Ginzburg.pdf
[32mSuccess [0m- diploms/msu/ameridi_a_d.pdf
[32mSuccess [0m- diploms/msu/aristov_g_d.pdf
[32mSuccess [0m- diploms/msu/bagratuni_d_g.pdf
[32mSuccess [0m- diploms/msu/îá¼áÑóÅé_»αÑñºáΘ¿Γá_ñ¿»½«¼á.pdf
[32mSuccess [0m- diploms/msu/2019_Bobrovsky.pdf
[32mSuccess [0m- diploms/msu/antonov_a_s.pdf
[32mSuccess [0m- diploms/msu/Resistance prediction.pdf
[32mSuccess [0m- diploms/msu/azyukina_e_a.pdf
[32mSuccess [0m- diploms/msu/bozheva_a_yu.pdf
[32mSuccess [0m- diploms/msu/anisimov_a_v.pdf
[32mSuccess [0m- diploms/msu/2019_Vostrov.pdf
[32mSuccess [0m- diploms/msu/antonova_e_v1.pdf
[32mSuccess [0m- diploms/msu/aksyonenko__a_a.pdf
[32mSuccess [0m- diploms/msu/Badalyan_master_project.pdf
[32mSuccess [0m- diploms/msu/baranova_p_o.pdf
[32mSuccess [0m- diploms/msu/çδ¬¿¡éîè4»αÑñºáΘ¿Γá.pdf
[32mSuccess [0m- diploms/msu/òá¡ñª∩¡. îáú ñ¿ßßÑα.pdf
[32mSuccess [0m- diploms/msu/bisultanov_ya_h_.pdf
[32mSuccess [0m- diploms/msu

In [6]:
# delete empty documents
data = data[data["text"] != ""]
data["filename"] = data["filename"].str.removeprefix("diploms/")

# save to csv
data.to_csv("diplomas_pdftotext.csv", index = False)

the dataset is ready at `diplomas_pdftotext.csv`

### REGEX

In [7]:
# pandas
import pandas as pd

# regex
import re

# errors
import traceback

In [8]:
df = pd.read_csv("diplomas_pdftotext.csv")
df.head()

Unnamed: 0,filename,text
0,msu/2019_Ginzburg.pdf,ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ БЮД...
1,msu/ameridi_a_d.pdf,МОСКОВСКИЙ ГОСУДАРСТВЕННЫЙ УНИВЕРС...
2,msu/aristov_g_d.pdf,МОСКОВСКИЙ ГОСУДАРСТВЕННЫЙ УНИВЕРСИТЕТ...
3,msu/bagratuni_d_g.pdf,МОСКОВСКИЙ ГОСУДАРСТВЕННЫЙ УНИВЕРСИТЕТ...
4,msu/îá¼áÑóÅé_»αÑñºáΘ¿Γá_ñ¿»½«¼á.pdf,Московский государственный университет имени М...


In [9]:
def modify_title_page(title_page : str) -> str:
    tokenized = [re.split(r'\s', line) for line in re.split(r'\n', title_page)]
    tokenized = [[token for token in line if token] for line in tokenized]
    modified = [' '.join(token for token in line if token) for line in tokenized]
    i = 0
    while i < len(modified):
        if i-1>0 and modified[i] and re.findall(r'[^\W\d_]', modified[i][0]) and modified[i][0].lower() == modified[i][0]:
            new_line = modified[i-1] + ' ' + modified[i]
            modified.pop(i)
            modified.pop(i-1)
            modified.insert(i-1, new_line)
            i =- 1
        i += 1
            
    modified = '\n'.join(line for line in modified)
    return modified

In [10]:
def get_year(title_page : str) -> str:
    year = re.findall(r'(2[0-9]{3})', title_page)
    return year[-1] if year else None

def get_university(title_page : str) -> str:
    if re.search(r"(?i)Российско ?- ?армянский", title_page):
        return 'rau'
    elif re.search(r"(?i)московский физико[\s\n]?-[\s\n\t]*технический институт", title_page):
        return 'mipt'
    elif re.search(r"(?i)Высшая школа экономики", title_page):
        return 'hse'
    elif re.search(r"(?i)дружбы народов", title_page):
        return 'rudn'
    elif re.search(r"(?i)московский государственный", title_page):
        return 'msu'
    university = re.findall('(?i)(?<=).+университет|$', title_page)[0]
    return university.lower() if university else None   

def get_faculty(title_page : str, university : str) -> str:
    if university == 'rau':
        faculty = re.findall(r"(?i)Институт.*", title_page)
    elif university == 'mipt' or university == 'hse':
        faculty = re.findall(r"(?i)Факультет.*", title_page)
    elif university == 'rudn':
        faculty = re.findall(r"(?i)учебный институт: (.*)", title_page)
        if not faculty:
            faculty = re.findall(r"(?i)\s+(.*)\sинститут", title_page)
    elif university == 'msu':
        faculty = re.findall(r"(?i)\s*(.*Факультет.*)", title_page)
    else:
        faculty = re.findall(r"(?i)Факультет.*", title_page)
    faculty = faculty[0].lower() if faculty else None
    return faculty

def get_department(title_page : str) -> str:
    department = re.findall(r"(?i)Кафедра.*", title_page)
    department = department[0].lower() if department else None
    return department

def get_speciality(title_page : str) -> str:
    speciality = re.findall(r"(?i)Специальность:\s*(.*)|Направление:\s*(.*)|([0-9]{2}\.[0-9]{2}\.[0-9]{2}.*)", title_page)
    return ''.join(speciality[0]).lower() if speciality else None

def get_name(line : str) -> str:
    name = re.search(r"([А-Я](?:[а-я]+|\.) ?(?:[А-Я](?:[а-я]+|\.))? ?[А-Я][а-я]+)|([А-Я][а-я]+ [А-Я](?:[а-я]+|\.)? ?(?:[А-Я](?:[а-я]+|\.)))", line)
    return name[0] if name else None

def get_author(title_page : str) -> str:
    title_page = re.sub('\n', ' ', title_page)
    author = re.findall(r'(?i)(?:студент|исполнитель|выполнил).*?(?-i:([А-Я](?:[а-я]+|\.) ?(?:[А-Я](?:[а-я]+|\.))? ?[А-Я][а-я]+)|([А-Я][а-я]+ [А-Я](?:[а-я]+|\.)? ?(?:[А-Я](?:[а-я]+|\.))))', title_page)
    if not author:
        return get_name(title_page)
    if author[0] and author[0][0]:
        return author[0][0]
    elif author[0] and author[0][1]:
        return author[0][1]
    return None

def get_supervisor(title_page : str) -> str:
    title_page = re.sub('\n', ' ', title_page)
    supervisor = re.findall(r'(?i)(?:руководитель).*?(?-i:([А-Я](?:[а-я]+|\.) ?(?:[А-Я](?:[а-я]+|\.))? ?[А-Я][а-я]+)|([А-Я][а-я]+ [А-Я](?:[а-я]+|\.)? ?(?:[А-Я](?:[а-я]+|\.))))', title_page)
    if not supervisor:
        return None
    if supervisor[0] and supervisor[0][0]:
        return supervisor[0][0]
    elif supervisor[0] and supervisor[0][1]:
        return supervisor[0][1]
    return None

def get_title(title_page : str) -> str:
    title = re.findall(r'(?i)тема:\s(.*)', title_page)
    return title[0] if title else None

In [11]:
def get_metadata(title_page : str) -> dict:
    title_page = modify_title_page(title_page)
    year = get_year(title_page)
    university = get_university(title_page)
    faculty = get_faculty(title_page, university)
    department = get_department(title_page)
    speciality = get_speciality(title_page)
    author = get_author(title_page)
    supervisor = get_supervisor(title_page)
    title = get_title(title_page)
    
    return {
        'year' : year,
        'university' : university,
        'faculty' : faculty,
        'department' : department,
        'speciality' : speciality,
        'author' : author,
        'supervisor' : supervisor,
        'title' : title
    }

In [12]:
df_regex = pd.DataFrame(columns=['filename', 'year', 'university', 'faculty', 'department',
       'speciality', 'author', 'supervisor', 'title'])

for j in range(df.shape[0]):
    metadata = get_metadata(df.iloc[j].text)
    metadata.update({'filename': df.iloc[j].filename})
    df_regex = df_regex.append(metadata, ignore_index=True)
df_regex.head()

Unnamed: 0,filename,year,university,faculty,department,speciality,author,supervisor,title
0,msu/2019_Ginzburg.pdf,2019,msu,физический факультет,кафедра общей физики,,Гинзбург Борис Александрович,Поляков П.А.,
1,msu/ameridi_a_d.pdf,2019,msu,юридический факультет,кафедра истории государства и права,,Америди Афина Дмитриевна,Томсинов Владимир Алексеевич,
2,msu/aristov_g_d.pdf,2020,msu,юридический факультет,,,Аристов Георгий Дмитриевич,Щербак Наталия Валериевна,
3,msu/bagratuni_d_g.pdf,2019,msu,юридический факультет,,,М.В.Ломоносова,Бандорин Леонид Евгеньевич,
4,msu/îá¼áÑóÅé_»αÑñºáΘ¿Γá_ñ¿»½«¼á.pdf,2021,msu,факультет вычислительной математики и кибернетики,кафедра системного программирования,,М.В.Ломоносова,Корухова Людмила Сергеевна,


In [13]:
df_regex.to_csv("diplomas_extracted_regex.csv", index = False)

### NER for getting names

In [14]:
import stanza
# stanza.download('ru') 

import numpy as np

df = pd.read_csv("diplomas_pdftotext.csv")

nlp = stanza.Pipeline('ru', processors='tokenize,ner') 

2022-02-15 14:06:20 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| ner       | wikiner   |

2022-02-15 14:06:20 INFO: Use device: cpu
2022-02-15 14:06:20 INFO: Loading: tokenize
2022-02-15 14:06:20 INFO: Loading: ner
2022-02-15 14:06:21 INFO: Done loading processors!


In [15]:
%%time
stanzed = {}
for j in range(df.shape[0]):
    stanzed.update({df.iloc[j].filename : nlp(df.iloc[j].text)})

CPU times: user 7min 52s, sys: 2.6 s, total: 7min 54s
Wall time: 2min 41s


In [16]:
persons = {}
for filename in stanzed:
    persons_doc = []
    doc = stanzed[filename]
    for sent in doc.sentences:
        for ent in sent.ents:
            if ent.type == "PER":
                 persons_doc.append(ent.text)
    persons.update({filename : persons_doc})

In [17]:
df_ner = pd.DataFrame(columns=['filename', 'author', 'supervisor'])

for i in range(df.shape[0]):
    text = df.iloc[i].text
    filename = df.iloc[i].filename
    author, supervisor = "", ""
    for person in persons[filename]:
        person = get_name(person)
        if not person:
            continue
        p_id = text.index(person)
        if not author and re.findall("(?i)студент|выполнил|исполнитель", text[p_id-250:p_id]):
            author = person
        elif not supervisor and re.findall("(?i)руководитель", text[p_id-250:p_id]):
            supervisor = person
            
    dic = {'filename' : filename, 'author' : author, 'supervisor' : supervisor}
    
    df_ner = df_ner.append(dic, ignore_index=True)
df_ner

Unnamed: 0,filename,author,supervisor
0,msu/2019_Ginzburg.pdf,Гинзбург Борис Александрович,Поляков П.А.
1,msu/ameridi_a_d.pdf,Америди Афина Дмитриевна,Томсинов Владимир Алексеевич
2,msu/aristov_g_d.pdf,,Щербак Наталия Валериевна
3,msu/bagratuni_d_g.pdf,,Бандорин Леонид Евгеньевич
4,msu/îá¼áÑóÅé_»αÑñºáΘ¿Γá_ñ¿»½«¼á.pdf,,Корухова Людмила Сергеевна
...,...,...,...
100,ruden/kramarenko_v_o.pdf,Крамаренко В.О.,Ивановская Н.В.
101,ruden/lashina_m_v.pdf,,А.Н. Кирсанов
102,ruden/levicheva_t_k.pdf,,Беликова К.М.
103,ruden/bodnar_v.pdf,Боднарь Валерия,Ситкарева Е.В.


In [18]:
df_ner.to_csv("diplomas_extracted_ner.csv", index = False)

## Extracting title with nn


### Preparing data

In [19]:
df = pd.read_csv("diplomas_pdftotext.csv")
df_answers = pd.read_csv("diplomas_extracted_answers.csv")
df["filename"] = df["filename"].str.removeprefix("diploms/")
df_answers["filename"] = df_answers["filename"].str.removeprefix("diploms/")

In [20]:
df_for_nn = pd.merge(df[["filename", "text"]], df_answers[["filename", "title"]], on="filename")#.drop("filename", axis=1)
df_for_nn = df_for_nn.dropna()
df_for_nn.head()

Unnamed: 0,filename,text,title
0,msu/2019_Ginzburg.pdf,ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ БЮД...,«ОСОБЕННОСТИ МИКРОМАГНИТНОЙ СТРУКТУРЫ ПОСТОЯНН...
1,msu/ameridi_a_d.pdf,МОСКОВСКИЙ ГОСУДАРСТВЕННЫЙ УНИВЕРС...,«Историко-сравнительное исследование правовых ...
2,msu/aristov_g_d.pdf,МОСКОВСКИЙ ГОСУДАРСТВЕННЫЙ УНИВЕРСИТЕТ...,«Оборотоспособность права на получение патента...
3,msu/bagratuni_d_g.pdf,МОСКОВСКИЙ ГОСУДАРСТВЕННЫЙ УНИВЕРСИТЕТ...,«Правовые аспекты реновации жилищного фонда»
4,msu/îá¼áÑóÅé_»αÑñºáΘ¿Γá_ñ¿»½«¼á.pdf,Московский государственный университет имени М...,Маркирование выводимых на печать текстовых док...


In [21]:
rau = df_for_nn[df_for_nn['filename'].apply(lambda x: 'Rau' in x)]
mipt = df_for_nn[df_for_nn['filename'].apply(lambda x: 'MIPT' in x)]
msu = df_for_nn[df_for_nn['filename'].apply(lambda x: 'msu' in x)]
ruden = df_for_nn[df_for_nn['filename'].apply(lambda x: 'ruden' in x)]
hse = df_for_nn[df_for_nn['filename'].apply(lambda x: 'hse' in x)]

In [22]:
def modify_data_for_train(df_nn):
    df_nn_new = pd.DataFrame(columns=["text", "title_begin", "title_end"])
    for i in range(df_nn.shape[0]):
        text = modify_title_page(df_nn.iloc[i]["text"])
        title = modify_title_page(df_nn.iloc[i]["title"]).lower()
        title_begin = " ".join(text.lower().split("\n")).index(title)
        df_nn_new = df_nn_new.append({
            "text" : text,
            "title_begin" : title_begin,
            "title_end" : title_begin + len(title),
        }, ignore_index = True)

    return df_nn_new

# df_nn = modify_data_for_train(df_for_nn)
# df_nn

rau = modify_data_for_train(rau)
mipt = modify_data_for_train(mipt)
msu = modify_data_for_train(msu)
ruden = modify_data_for_train(ruden)
hse = modify_data_for_train(hse)

In [23]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

from sklearn.model_selection import train_test_split

2022-02-15 14:09:05.893105: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-15 14:09:05.893132: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [24]:
# X = df_nn.iloc[:90]["text"].to_numpy()
# y = df_nn.iloc[:90][["title_begin", "title_end"]].to_numpy()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)

X_train = pd.concat([mipt, msu, ruden, hse])["text"].to_numpy()
X_test = rau["text"].to_numpy()
y_train = pd.concat([mipt, msu, ruden, hse])[["title_begin", "title_end"]].to_numpy()
y_test = rau[["title_begin", "title_end"]].to_numpy()

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((62,), (32,), (62, 2), (32, 2))

In [25]:
X_train_1 = np.array([[ord(char) for char in line] for line in X_train], dtype=object)
X_test_1 = np.array([[ord(char) for char in line] for line in X_test], dtype=object)

In [26]:
max_length = max(map(len, X_train_1))

x_train = keras.preprocessing.sequence.pad_sequences(X_train_1, maxlen=max_length, padding='post')
x_test = keras.preprocessing.sequence.pad_sequences(X_test_1, maxlen=max_length, padding='post')

x_train = np.array(x_train).astype(np.float32)
x_test = np.array(x_test).astype(np.float32) 
y_train = np.asarray(y_train).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

In [27]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((62, 3517), (32, 3517), (62, 2), (32, 2))

#### Conv1D

In [28]:
x_train = np.expand_dims(x_train, axis=2)
y_train = np.expand_dims(y_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)
y_test = np.expand_dims(y_test, axis=2)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

((62, 3517, 1), (32, 3517, 1), (62, 2, 1), (32, 2, 1))

In [47]:
# Create the model
model = keras.Sequential()

# model.add(layers.Embedding(100, 4))

model.add(layers.Conv1D(10, 10, activation='relu'))
model.add(layers.MaxPooling1D(pool_size=5))
model.add(layers.Conv1D(10, 10, activation='relu'))
model.add(layers.MaxPooling1D(pool_size=5))
model.add(layers.Flatten())
model.add(layers.Dense(2, activation="relu", 
                        bias_initializer=tf.keras.initializers.GlorotNormal(),
                        ))

optimizer = keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=optimizer, loss="mean_squared_error",
#               metrics=[tf.keras.metrics.RootMeanSquaredError()])
              metrics="mean_squared_error")

model.build(input_shape=x_train.shape)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_4 (Conv1D)           (62, 3508, 10)            110       
                                                                 
 max_pooling1d_4 (MaxPooling  (62, 701, 10)            0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (62, 692, 10)             1010      
                                                                 
 max_pooling1d_5 (MaxPooling  (62, 138, 10)            0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (62, 1380)                0         
                                                                 
 dense_2 (Dense)             (62, 2)                  

In [48]:
history = model.fit(x=x_train, y=y_train, epochs=100, shuffle=True,
           batch_size=8, validation_data=(x_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100


Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [49]:
from datetime import date
date = date.today()
model.save(f'model{date}')

INFO:tensorflow:Assets written to: model2022-02-15/assets


### Evaluating

In [50]:
df_nn = modify_data_for_train(df_for_nn)
X_text = df_nn["text"].to_numpy()
y = df_nn[["title_begin", "title_end"]].to_numpy()
y = np.asarray(y).astype(np.float32)

X = np.array([[ord(char) for char in line] for line in X_text], dtype=object)

X = keras.preprocessing.sequence.pad_sequences(X, maxlen=max_length, padding='post')

X = np.expand_dims(X, axis=2)
y = np.expand_dims(y, axis=2)

In [51]:
results = model.evaluate(X, y, batch_size=32)



In [52]:
predictions = model.predict(X)

In [53]:
def get_titles_from_eval(X, y_hat):
    titles = np.array([])
    y_hat = np.round(y_hat).astype(int)
    for i in range(len(y_hat)):
        titles = np.append(titles, X[i][y_hat[i][0]:y_hat[i][1]]) 
    return titles

titles = get_titles_from_eval(X_text, predictions)
filenames = df_for_nn["filename"]

df_nn_1 = pd.DataFrame({"filename" : filenames, "title" : titles}, index=None, columns=["filename", "title"])

In [54]:
for filename in df["filename"].str.removeprefix("diploms/"):
    if df_nn_1[df_nn_1["filename"] == filename].size == 0:
        df_nn_1 = df_nn_1.append({'filename' : filename, 'title' : ''}, ignore_index=True)

In [55]:
from datetime import date
date = date.today()

df_nn_1.to_csv("diplomas_extracted_nn_1.csv", index = False)
df_nn_1

Unnamed: 0,filename,title
0,msu/2019_Ginzburg.pdf,СОБЕННОСТИ МИКРОМАГНИТНОЙ СТРУКТУРЫ\nПОСТОЯННЫ...
1,msu/ameridi_a_d.pdf,«Историко-сравнительное исследование правовых ...
2,msu/aristov_g_d.pdf,«Оборотоспособность права на получение патента...
3,msu/bagratuni_d_g.pdf,\n«Правовые аспекты реновации жилищного фонда»
4,msu/îá¼áÑóÅé_»αÑñºáΘ¿Γá_ñ¿»½«¼á.pdf,аркирование выводимых на печать текстовых доку...
...,...,...
100,ruden/lashina_m_v.pdf,у\n« Правовой режим договорных форм недропольз...
101,ruden/levicheva_t_k.pdf,Наследование по завещанию в праве России и не...
102,ruden/bodnar_v.pdf,Финансирование арбитража третьими лица
103,ruden/gorbacheva_a_i.pdf,Особенности правового регулирования качества ...


## Answers

In [56]:
import numpy as np

In [57]:
df_answers = pd.read_csv("diplomas_extracted_answers.csv")
df_regex = pd.read_csv("diplomas_extracted_regex.csv")
df_ner = pd.read_csv("diplomas_extracted_ner.csv")
df_nn_1 = pd.read_csv("diplomas_extracted_nn_1.csv")

df_answers = df_answers.sort_values(by="filename")
df_regex = df_regex.sort_values(by="filename")
df_ner = df_ner.sort_values(by="filename")
df_nn_1 = df_nn_1.sort_values(by="filename")

In [58]:
def ngrams(words, ngram):
    temp=zip(*[words[i:] for i in range(0,ngram)])
    ans=[' '.join(ngram) for ngram in temp]
    return ans

def scores(y, y_hat):
    y = y.to_numpy()
    y_hat = y_hat.to_numpy()
    l = min(len(y), len(y_hat))
    accuracy = np.array([])
    recall = np.array([])
    precision = np.array([])
    f1 = np.array([])
    
    for i in range(l):
        if y[i] is np.nan and y_hat[i] is np.nan:
            tp = 1
            fp = 0
            fn = 0
        elif y_hat[i] is np.nan or y[i] is np.nan:
            tp = 0
            fp = 1
            fn = 1
        else:
            try:
                y_i = y[i].lower().split()         
                y_i += ngrams(y_i, 2)     # adding bigrams
                y_hat_i = y_hat[i].lower().split()
                y_hat_i += ngrams(y_hat_i, 2)     # adding bigrams
            except:
                y_i = [y[i]]
                y_hat_i = [y_hat[i]]
            tp = len(set(y_hat_i).intersection(y_i))     # intersection
            fp = (len(set(y_hat_i)) - len(set(y_hat_i).intersection(y_i)))
            fn = (len(set(y_i)) - len(set(y_hat_i).intersection(y_i))) 

        acc = tp / (tp + fp + fn)
        try:
            pre = tp / (tp + fp)
        except:
            pass
        rec = tp / (tp + fn)
        accuracy = np.append(accuracy, acc)
        precision = np.append(precision, pre)
        recall = np.append(recall, rec)
        if rec == 0 and pre == 0:
            f1 = np.append(f1, 0)
        else:
            f1 = np.append(f1, 2 * pre * rec / (pre + rec))
            
    return np.mean(accuracy), np.mean(precision), np.mean(recall), np.mean(f1)

In [59]:
score = pd.DataFrame(columns=['column', 'type', 'accuracy', 'precision', 'recall', 'f1'])

for col in df_answers.columns[1:]:
    if col in df_regex.columns:
        sc = scores(df_answers[col], df_regex[col])
        dic = {
            'column' : col,
            'type': 're',
            'accuracy': sc[0], 
            'precision': sc[1], 
            'recall': sc[2], 
            'f1': sc[3], 
        }
        score = score.append(dic, ignore_index = True)
    if col in df_nn_1.columns:
        sc = scores(df_answers[col], df_nn_1[col])
        dic = {
            'column' : col,
            'type' : 'nn_1',
            'accuracy': sc[0], 
            'precision': sc[1], 
            'recall': sc[2], 
            'f1': sc[3], 
        }
        score = score.append(dic, ignore_index = True)
    if col in df_ner.columns:
        sc = scores(df_answers[col], df_ner[col])
        dic = {
            'column' : col,
            'type' : 'ner',
            'accuracy': sc[0], 
            'precision': sc[1], 
            'recall': sc[2], 
            'f1': sc[3], 
        }
        score = score.append(dic, ignore_index = True)
# score[['column', 'type', 'f1']]
score

Unnamed: 0,column,type,accuracy,precision,recall,f1
0,year,re,0.961905,0.961905,0.961905,0.961905
1,university,re,0.990476,0.990476,0.990476,0.990476
2,faculty,re,0.98836,0.98836,0.990476,0.989286
3,department,re,0.742857,0.742857,0.742857,0.742857
4,speciality,re,0.771494,0.783987,0.958277,0.819392
5,author,re,0.714014,0.71746,0.731429,0.722381
6,author,ner,0.57551,0.577143,0.577143,0.577143
7,supervisor,re,0.74963,0.754286,0.750476,0.751905
8,supervisor,ner,0.730582,0.735238,0.731429,0.732857
9,title,re,0.352783,0.365288,0.357935,0.360771


In [60]:
score.to_csv("score.csv", index = False)