# Data extraction from the title page of the thesis

### Data

Data is in the same directory, in `diploms` folder.

If `diplomas_pdftotext.csv` file exists, there is no need to extract text from pdf, move on next section

In [1]:
# pdftotext
import pdftotext

# pandas
import pandas as pd

import re

# file system
from os import listdir
from os.path import isfile, join

In [2]:
def get_text_from_main_page_pdftotext(filename: str) -> str:
    with open(filename, 'rb') as f:
        pdf = pdftotext.PDF(f)
        main_page = pdf[0]
        
        # look up in second page for info 
        second_page = re.findall(r"(?i)^([\s\S]*)оглавление|содержание", pdf[1])
        if len(second_page) > 0 :
            main_page += second_page[0]
#             print(second_page)

        # look up in third page for info 
        else:
            third_page = re.findall(r"(?i)^([\s\S]*)оглавление|содержание", pdf[2])
            if len(third_page) > 0 :
                main_page += pdf[1]        # add second too
                main_page += third_page[0]
#                 print(main_page)
            
        return main_page

In [3]:
# # list all files
# path = "diploms"
# files = []
# for folder in listdir(path):
#     files += [join(path, folder, file) for file in listdir(join(path, folder)) if isfile(join(path, folder, file))]

# data = pd.DataFrame(columns = ["filename", "text"])

# for filename in files:
#     try:
#         text = get_text_from_main_page_pdftotext(filename)
#         data = data.append(pd.Series(
#             [filename, text], index = data.columns
#         ), ignore_index = True)
#         print(f'\x1B[32mSuccess \x1B[0m- {filename}')
#     except Exception as e:
#         print(f'\x1B[31mFailed \x1B[0m- {filename}\x1B[31m', e)

In [4]:
# # delete empty documents
# data = data[data["text"] != ""]

# # save to csv
# data.to_csv("diplomas_pdftotext.csv", index = False)

the dataset is ready at `diplomas_pdftotext.csv`

### REGEX

In [5]:
# pandas
import pandas as pd

# regex
import re

# errors
import traceback

In [6]:
df = pd.read_csv("diplomas_pdftotext.csv")
df.head()

Unnamed: 0,filename,text
0,diploms/MFTI/2016MS_Bolotskaya.pdf,ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ ОБРАЗОВАТЕЛЬНОЕ ...
1,diploms/MFTI/2016MS_Shuravin.pdf,Министерство образования и науки Российско...
2,diploms/MFTI/2016BS_Galiullin.pdf,Ìîñêîâñêèé ôèçèêî-òåõíè÷åñêèé èíñòèòóò (ãîñóäà...
3,diploms/MFTI/2015BS_Gagkaeva.pdf,Министерство образования и науки ...
4,diploms/MFTI/2016BS_Noyan.pdf,Московский физико-технический институт\n ...


In [7]:
def modify_title_page(title_page : str) -> str:
    tokenized = [re.split(r'\s', line) for line in re.split(r'\n', title_page)]
    tokenized = [[token for token in line if token] for line in tokenized]
    modified = [' '.join(token for token in line if token) for line in tokenized]
    i = 0
    while i < len(modified):
        if i-1>0 and modified[i] and re.findall(r'[^\W\d_]', modified[i][0]) and modified[i][0].lower() == modified[i][0]:
            new_line = modified[i-1] + ' ' + modified[i]
            modified.pop(i)
            modified.pop(i-1)
            modified.insert(i-1, new_line)
            i =- 1
        i += 1
            
    modified = '\n'.join(line for line in modified)
    return modified

In [8]:
def get_year(title_page : str) -> str:
    year = re.findall(r'(2[0-9]{3})', title_page)
    return year[-1] if year else None

def get_university(title_page : str) -> str:
    if re.search(r"(?i)Российско ?- ?армянский", title_page):
        return 'rau'
    elif re.search(r"(?i)московский физико[\s\n]?-[\s\n\t]*технический институт", title_page):
        return 'mipt'
    elif re.search(r"(?i)Высшая школа экономики", title_page):
        return 'hse'
    elif re.search(r"(?i)дружбы народов", title_page):
        return 'rudn'
    elif re.search(r"(?i)московский государственный", title_page):
        return 'msu'
    university = re.findall('(?i)(?<=).+университет|$', title_page)[0]
    return university.lower() if university else None   

def get_faculty(title_page : str, university : str) -> str:
    if university == 'rau':
        faculty = re.findall(r"(?i)Институт.*", title_page)
    elif university == 'mipt' or university == 'hse':
        faculty = re.findall(r"(?i)Факультет.*", title_page)
    elif university == 'rudn':
        faculty = re.findall(r"(?i)учебный институт: (.*)", title_page)
        if not faculty:
            faculty = re.findall(r"(?i)\s+(.*)\sинститут", title_page)
    elif university == 'msu':
        faculty = re.findall(r"(?i)\s*(.*Факультет.*)", title_page)
    else:
        faculty = re.findall(r"(?i)Факультет.*", title_page)
    faculty = faculty[0].lower() if faculty else None
    return faculty

def get_department(title_page : str) -> str:
    department = re.findall(r"(?i)Кафедра.*", title_page)
    department = department[0].lower() if department else None
    return department

def get_speciality(title_page : str) -> str:
    speciality = re.findall(r"(?i)Специальность:\s*(.*)|Направление:\s*(.*)|([0-9]{2}\.[0-9]{2}\.[0-9]{2}.*)", title_page)
    return ''.join(speciality[0]).lower() if speciality else None

def get_author(title_page : str) -> str:
    title_page = re.sub('\n', ' ', title_page)
    author0 = re.findall(r'(?i)(?:студент|исполнитель|выполнил).*(?-i:([А-Я](?:[а-я]*|\.) ?(?:[А-Я](?:[а-я]*|\.)) ?[А-Я][а-я]+))', title_page)
    author1 = re.findall(r'(?i)(?:студент|исполнитель|выполнил).*(?-i:([А-Я](?:[а-я]*|\.) ?[А-Я][а-я]+))', title_page)
#     print("author:", author0, author1)
    return (author0[0] if author0 else (author1[0] if author1 else None))

def get_supervisor(title_page : str) -> str:
    title_page = re.sub('\n', ' ', title_page)
    supervisor0 = re.findall(r'(?i)(?:руководитель).*(?-i:([А-Я](?:[а-я]*|\.) ?(?:[А-Я](?:[а-я]*|\.)) ?[А-Я][а-я]+))', title_page)
    supervisor1 = re.findall(r'(?i)(?:руководитель).*(?-i:([А-Я](?:[а-я]*|\.) ?[А-Я][а-я]+))', title_page)
#     print("supervisor:", supervisor0, supervisor1)
    return (supervisor0[0] if supervisor0 else (supervisor1[0] if supervisor1 else None))

def get_title(title_page : str) -> str:
    title = re.findall(r'(?i)тема:\s(.*)', title_page)
    return title[0] if title else None

In [9]:
# тут происходит что-то очень-очень странное 

t1 = "руководитель А.А.Марыволаолы \n студент П.Ы.Ровалры"
t2 = "руководитель П.Ы.Ровалры \n студент А.А.Марыволаолы"
t3 = "студент П.Ы.Ровалры \n руководитель А.А.Марыволаолы"
t4 = "студент П.Ы.Ровалры \n руководитель П.Ы.Ровалры"

print(get_author(t1), get_supervisor(t1))
print(get_author(t2), get_supervisor(t2))
print(get_author(t3), get_supervisor(t3))
print(get_author(t4), get_supervisor(t4))

П.Ы.Ровалры П.Ы.Ровалры
А.А.Марыволаолы А.А.Марыволаолы
А.А.Марыволаолы А.А.Марыволаолы
П.Ы.Ровалры П.Ы.Ровалры


In [10]:
def get_metadata(title_page : str) -> dict:
    title_page = modify_title_page(title_page)
    year = get_year(title_page)
    university = get_university(title_page)
    faculty = get_faculty(title_page, university)
    department = get_department(title_page)
    speciality = get_speciality(title_page)
    author = get_author(title_page)
    supervisor = get_supervisor(title_page)
    title = get_title(title_page)
    
    return {
        'year' : year,
        'university' : university,
        'faculty' : faculty,
        'department' : department,
        'speciality' : speciality,
        'author' : author,
        'supervisor' : supervisor,
        'title' : title
    }

In [11]:
df_regex = pd.DataFrame(columns=['filename', 'year', 'university', 'faculty', 'department',
       'speciality', 'author', 'supervisor', 'title'])

for j in range(df.shape[0]):
    metadata = get_metadata(df.iloc[j].text)
    metadata.update({'filename': df.iloc[j].filename})
    df_regex = df_regex.append(metadata, ignore_index=True)
df_regex.head()

Unnamed: 0,filename,year,university,faculty,department,speciality,author,supervisor,title
0,diploms/MFTI/2016MS_Bolotskaya.pdf,2016,mipt,факультет общей и прикладной фикики,кафедра физики и технологии наноструктур,,,Станислав Олегович Москва,
1,diploms/MFTI/2016MS_Shuravin.pdf,2016,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,,Никиты Сергеевича Научный,,
2,diploms/MFTI/2016BS_Galiullin.pdf,2016,,,,,,,
3,diploms/MFTI/2015BS_Gagkaeva.pdf,2015,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,03.03.01 «прикладные математика и физика»,З. В. Гагкаева,З. В. Гагкаева,
4,diploms/MFTI/2016BS_Noyan.pdf,2016,mipt,факультет общей и прикладной физики,,,,Кирилл Сергеевич Долгопрудный,


In [12]:
df_regex.to_csv("diplomas_extracted_regex.csv", index = False)

### NER for getting names

In [13]:
import stanza
# stanza.download('ru') 

import numpy as np

df = pd.read_csv("diplomas_pdftotext.csv")

nlp = stanza.Pipeline('ru', processors='tokenize,ner') 

2022-01-21 17:31:36 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| ner       | wikiner   |

2022-01-21 17:31:36 INFO: Use device: cpu
2022-01-21 17:31:36 INFO: Loading: tokenize
2022-01-21 17:31:36 INFO: Loading: ner
2022-01-21 17:31:37 INFO: Done loading processors!


In [14]:
%%time
stanzed = {}
for j in range(df.shape[0]):
    stanzed.update({df.iloc[j].filename : nlp(df.iloc[j].text)})

CPU times: user 4min 40s, sys: 545 ms, total: 4min 41s
Wall time: 1min 10s


In [15]:
persons = {}
for filename in stanzed:
    persons_doc = []
    doc = stanzed[filename]
    for sent in doc.sentences:
        for ent in sent.ents:
            if ent.type == "PER":
                 persons_doc.append(ent.text)
    persons.update({filename : persons_doc})

In [16]:
df_ner = pd.DataFrame(columns=['filename', 'author', 'supervisor'])

for i in range(df.shape[0]):
    text = df.iloc[i].text
    filename = df.iloc[i].filename
    author, supervisor = "", ""
    for person in persons[filename]:
        p_id = text.index(person)
        if not author and re.findall("(?i)студент|выполнил|исполнитель", text[p_id-250:p_id]):
            author = person
        elif not supervisor and  re.findall("(?i)руководитель", text[p_id-250:p_id]):
            supervisor = person
            
    dic = {'filename' : filename, 'author' : author, 'supervisor' : supervisor}
    
    df_ner = df_ner.append(dic, ignore_index=True)
df_ner

Unnamed: 0,filename,author,supervisor
0,diploms/MFTI/2016MS_Bolotskaya.pdf,,Юрченко Станислав Олегович\n ...
1,diploms/MFTI/2016MS_Shuravin.pdf,Шуравина Никиты Сергеевича,Долганов П.В.
2,diploms/MFTI/2016BS_Galiullin.pdf,,
3,diploms/MFTI/2015BS_Gagkaeva.pdf,З. В. Гагкаева,Е. С. Жукова
4,diploms/MFTI/2016BS_Noyan.pdf,,Цирлина Галина Александровна
...,...,...,...
100,diploms/Rau/Копия Дипломная работа - Мкртчян М...,Мкртчян Метаксия Арсеновна,Арамян Рафик Грачикович
101,diploms/Rau/Копия Гюласарян Андраник Эдуардови...,Гюласарян Андраник Эдуардович\n ...,Арутюнян К.В.
102,diploms/Rau/Копия Саргсян Э. Diplom.pdf,Э. А. Саргсян\n ...,П. А. Петросян
103,diploms/Rau/Копия Саакян Мигран (2).pdf,Саакян Мигран Арамович,Авагумян Арсен


In [17]:
df_ner.to_csv("diplomas_extracted_ner.csv", index = False)

## Answers

In [18]:
import numpy as np

In [19]:
df_answers = pd.read_csv("diplomas_extracted_answers.csv")
df_regex = pd.read_csv("diplomas_extracted_regex.csv")
df_ner = pd.read_csv("diplomas_extracted_ner.csv")

In [20]:
df_answers.head()

Unnamed: 0,filename,year,university,faculty,department,speciality,author,supervisor,title
0,diploms/MFTI/2016MS_Bolotskaya.pdf,2016.0,mipt,факультет общей и прикладной фикики,кафедра физики и технологии наноструктур,физики и технологии наноструктур,Болотская Екатерина Евгеньевна,Юрченко Станислав Олегович,Исследование процесса полимеризации методами и...
1,diploms/MFTI/2016MS_Shuravin.pdf,2016.0,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,,Шуравина Никиты Сергеевича,Долганов П.В.,ДИСЛОКАЦИИ В НАНОПЛЕНКАХ ПОЛЯРНЫХ СМЕКТИЧЕСКИХ...
2,diploms/MFTI/2016BS_Galiullin.pdf,2016.0,,,,,,,
3,diploms/MFTI/2015BS_Gagkaeva.pdf,2015.0,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,03.03.01 «прикладные математика и физика»,З. В. Гагкаева,Е. С. Жукова,ТЕРАГЕРЦОВАЯ-ИНФРАКРАСНАЯ ДИНАМИКА БАКТЕРИАЛЬН...
4,diploms/MFTI/2016BS_Noyan.pdf,2016.0,mipt,факультет общей и прикладной физики,,,Ноян Алексей Аднанович,Цирлина Галина Александровна,Роль двоения пор в матрицах анодного оксида ал...


In [21]:
df_regex.head()

Unnamed: 0,filename,year,university,faculty,department,speciality,author,supervisor,title
0,diploms/MFTI/2016MS_Bolotskaya.pdf,2016.0,mipt,факультет общей и прикладной фикики,кафедра физики и технологии наноструктур,,,Станислав Олегович Москва,
1,diploms/MFTI/2016MS_Shuravin.pdf,2016.0,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,,Никиты Сергеевича Научный,,
2,diploms/MFTI/2016BS_Galiullin.pdf,2016.0,,,,,,,
3,diploms/MFTI/2015BS_Gagkaeva.pdf,2015.0,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,03.03.01 «прикладные математика и физика»,З. В. Гагкаева,З. В. Гагкаева,
4,diploms/MFTI/2016BS_Noyan.pdf,2016.0,mipt,факультет общей и прикладной физики,,,,Кирилл Сергеевич Долгопрудный,


In [22]:
df_ner.head()

Unnamed: 0,filename,author,supervisor
0,diploms/MFTI/2016MS_Bolotskaya.pdf,,Юрченко Станислав Олегович\n ...
1,diploms/MFTI/2016MS_Shuravin.pdf,Шуравина Никиты Сергеевича,Долганов П.В.
2,diploms/MFTI/2016BS_Galiullin.pdf,,
3,diploms/MFTI/2015BS_Gagkaeva.pdf,З. В. Гагкаева,Е. С. Жукова
4,diploms/MFTI/2016BS_Noyan.pdf,,Цирлина Галина Александровна


In [23]:
def ngrams(words, ngram):
    temp=zip(*[words[i:] for i in range(0,ngram)])
    ans=[' '.join(ngram) for ngram in temp]
    return ans

def scores(y, y_hat):
    l = min(len(y), len(y_hat))
    accuracy = np.array([])
    recall = np.array([])
    precision = np.array([])
    f1 = np.array([])
    
#     print(type(y[0]))
    if type(y[0]) != str:
        tp = np.sum(y_hat == y)
        fn = fp = np.sum(y_hat != y)
    else:
        tp = 0
        fp = 0
        fn = 0
        for i in range(l):
            if y[i] is np.nan and y_hat[i] is np.nan:
                tp += 1
                fp += 0
                fn += 0
            elif y_hat[i] is np.nan or y[i] is np.nan:
                tp += 0
                fp += 1
                fn += 1
            else:
                y_i = y[i].lower().split()         
                y_i += ngrams(y_i, 2)     # adding bigrams
                y_hat_i = y_hat[i].lower().split()
                y_hat_i += ngrams(y_hat_i, 2)     # adding bigrams
                
                tp += len(set(y_hat_i).intersection(y_i)) / len(y_i)     # intersection
                fp += (len(y_hat_i) - len(set(y_hat_i).intersection(y_i))) / len(y_i)
                fn += (len(y_i) - len(set(y_hat_i).intersection(y_i))) / len(y_i)
                
    acc = tp / (tp + fp + fn)
    pre = tp / (tp + fp)
    rec = tp / (tp + fn)
    f = 2 * pre * rec / (pre + rec)

    accuracy = np.append(accuracy, acc)
    precision = np.append(precision, pre)
    recall = np.append(accuracy, rec)
    f1 = np.append(accuracy, f)

    return np.mean(accuracy), np.mean(precision), np.mean(recall), np.mean(f1)

In [24]:
score = pd.DataFrame(columns=['column', 'type', 'accuracy', 'precision', 'recall', 'f1'])

for col in df_answers.columns[1:]:
    if col in df_regex.columns:
        sc = scores(df_answers[col], df_regex[col])
        dic = {
            'column' : col,
            'type': 're',
            'accuracy': sc[0], 
            'precision': sc[1], 
            'recall': sc[2], 
            'f1': sc[3], 
        }
        score = score.append(dic, ignore_index = True)
    if col in df_ner.columns:
        sc = scores(df_answers[col], df_ner[col])
        dic = {
            'column' : col,
            'type' : 'ner',
            'accuracy': sc[0], 
            'precision': sc[1], 
            'recall': sc[2], 
            'f1': sc[3], 
        }
        score = score.append(dic, ignore_index = True)
# score[['column', 'type', 'f1']]
score

Unnamed: 0,column,type,accuracy,precision,recall,f1
0,year,re,0.926606,0.961905,0.944255,0.944255
1,university,re,0.981132,0.990476,0.985804,0.985804
2,faculty,re,0.978495,0.987788,0.984485,0.983813
3,department,re,0.585128,0.735944,0.662872,0.6617
4,speciality,re,0.53537,0.548276,0.746626,0.616376
5,author,re,0.110688,0.197396,0.155979,0.155001
6,author,ner,0.384504,0.532606,0.482411,0.469971
7,supervisor,re,0.15902,0.258263,0.225859,0.216713
8,supervisor,ner,0.570878,0.703506,0.661312,0.648852
9,title,re,0.218281,0.359627,0.287674,0.288312
