# Data extraction from the title page of the thesis

## Getting the title page text

In [123]:
# pdftotext
import pdftotext

# pandas to work with csv
import pandas as pd

#os to work with file system
from os import listdir
from os.path import isfile, join

# regex
import re

In [124]:
def get_text_from_main_page_pdftotext(filename: str) -> str:
    """Extract the title-page text of a PDF.

    The result always starts with the text of the first page. If the second
    page contains a table-of-contents heading ("оглавление" or "содержание",
    case-insensitive), the text that precedes that heading is appended.
    Otherwise, if the third page contains such a heading, the whole second
    page and the text preceding the heading on the third page are appended.

    Args:
        filename: path of the PDF file to read.

    Returns:
        The collected text. PDFs with fewer than three pages are handled by
        simply skipping the pages that do not exist.

    Raises:
        IndexError: if the PDF has no pages at all.
    """
    # The alternation is grouped so that BOTH headings end the capture; the
    # ungrouped form captured nothing when the heading was "содержание".
    contents_heading = r"(?i)^([\s\S]*)(?:оглавление|содержание)"

    with open(filename, 'rb') as f:
        pdf = pdftotext.PDF(f)
        # Only the first three pages are ever inspected.
        pages = [pdf[i] for i in range(min(len(pdf), 3))]

    main_page = pages[0]

    # look at second page if there is some info
    if len(pages) > 1:
        on_second = re.search(contents_heading, pages[1])
        if on_second:
            return main_page + on_second.group(1)

        # look at third page if there is some info
        if len(pages) > 2:
            on_third = re.search(contents_heading, pages[2])
            if on_third:
                # add second too
                return main_page + pages[1] + on_third.group(1)

    return main_page

In [172]:
# list all files: every regular file one level below `path`
path = "diploms"
files = []
for folder in listdir(path):
    folder_path = join(path, folder)
    # skip stray regular files directly under `path`; listdir() on them would raise
    if isfile(folder_path):
        continue
    files += [join(folder_path, file) for file in listdir(folder_path) if isfile(join(folder_path, file))]

# Collect rows in a plain list and build the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
records = []

for filename in files:
    try:
        text = get_text_from_main_page_pdftotext(filename)
        records.append({"filename": filename, "text": text})
        print(f'\x1B[32mSuccess \x1B[0m- {filename}')
    except Exception as e:
        # best effort: report the file that could not be processed and carry on
        print(f'\x1B[31mFailed \x1B[0m- {filename}\x1B[31m', e)

data = pd.DataFrame(records, columns = ["filename", "text"])

[32mSuccess [0m- diploms/MFTI/2017MS_Bubis.pdf
[32mSuccess [0m- diploms/MFTI/2016MS_Bolotskaya.pdf
[32mSuccess [0m- diploms/MFTI/2017MS_Gagkaeva.pdf
[32mSuccess [0m- diploms/MFTI/2016MS_Shuravin.pdf
[32mSuccess [0m- diploms/MFTI/2016BS_Galiullin.pdf
[32mSuccess [0m- diploms/MFTI/2015BS_Gagkaeva.pdf
[32mSuccess [0m- diploms/MFTI/2017BS_Kadyrmetov.pdf
[32mSuccess [0m- diploms/MFTI/2016BS_Noyan.pdf
[32mSuccess [0m- diploms/MFTI/2016MS_Belyanchikov.pdf
[32mSuccess [0m- diploms/MFTI/2016BS_Kulesh.pdf
[32mSuccess [0m- diploms/MFTI/2016BS_Gukov.pdf
[32mSuccess [0m- diploms/MFTI/2017BS_Grebenchuk.pdf
[32mSuccess [0m- diploms/MFTI/2016MS_Khudyakova.pdf
[32mSuccess [0m- diploms/MFTI/2016MS_Glushkov.pdf
[32mSuccess [0m- diploms/MFTI/2016BS_Kanin.pdf
[32mSuccess [0m- diploms/MFTI/2017BS_Kulesh.pdf
[32mSuccess [0m- diploms/ruden/dordzhi-goryaeva_a__c.pdf
[32mSuccess [0m- diploms/ruden/bodnar_v.pdf
[32mSuccess [0m- diploms/ruden/gorbacheva_a_i.pdf
[32mSuccess 

In [173]:
# keep only the rows whose extracted text is non-empty
has_text = data["text"].ne("")
data = data.loc[has_text]
# write the remaining rows to disk without the index column
data.to_csv("diplomas_pdftotext.csv", index=False)

The dataset is now saved to `diplomas_pdftotext.csv`.

## Extracting metadata

In [174]:
# reload the title-page texts from the CSV file and display the frame
df = pd.read_csv("diplomas_pdftotext.csv")
df

Unnamed: 0,filename,text
0,diploms/MFTI/2016MS_Bolotskaya.pdf,ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ ОБРАЗОВАТЕЛЬНОЕ ...
1,diploms/MFTI/2016MS_Shuravin.pdf,Министерство образования и науки Российско...
2,diploms/MFTI/2016BS_Galiullin.pdf,Ìîñêîâñêèé ôèçèêî-òåõíè÷åñêèé èíñòèòóò (ãîñóäà...
3,diploms/MFTI/2015BS_Gagkaeva.pdf,Министерство образования и науки ...
4,diploms/MFTI/2016BS_Noyan.pdf,Московский физико-технический институт\n ...
...,...,...
100,diploms/Rau/Копия Дипломная работа - Мкртчян М...,ГОУ ВПО РОССИЙСКО-АРМЯНСКИЙ (СЛАВЯНСКИЙ)\n ...
101,diploms/Rau/Копия Гюласарян Андраник Эдуардови...,РОССИЙСКО - АРМЯНСКИЙ (СЛАВЯНСКИЙ)...
102,diploms/Rau/Копия Саргсян Э. Diplom.pdf,ГОУ ВПО РОССИЙСКО-АРМЯНСКИЙ\n ...
103,diploms/Rau/Копия Саакян Мигран (2).pdf,DocuSign Envelope ID: 3AACB8B0-654A-48D0-B544-...


In [175]:
from my_metadata import get_metadata
from my_metadata import rau_metadata, mipt_metadata, rudn_metadata, hse_metadata, msu_metadata, any_metadata

import traceback

In [244]:
import re

def get_metadata(title_page : str) -> tuple:
    """Extract thesis metadata from the text of a title page.

    The university is recognised by a characteristic phrase in the text and
    the remaining fields are delegated to the matching ``*_metadata``
    function. When no known university is recognised, the text preceding the
    word "университет" on its line is used as the university name and
    ``any_metadata`` extracts the rest.

    Args:
        title_page: text of the title page; anything that is not a non-empty
            string (e.g. NaN read from a CSV) yields a tuple of ``None``.

    Returns:
        ``(year, university, faculty, department, speciality, author,
        supervisor, title)``; each element is ``None`` when not found.
    """
    if not isinstance(title_page, str) or len(title_page) == 0:
        return None, None, None, None, None, None, None, None

    # the last 4-digit group starting with "2" is taken as the year
    year = re.findall(r'(2[0-9]{3})', title_page)
    year = year[-1] if year else None

    if re.search(r"(?i)Российско ?- ?армянский", title_page):
        university = 'rau'
        faculty, department, speciality, author, supervisor, title = rau_metadata(title_page)

    elif re.search(r"(?i)московский физико[\s\n]?-[\s\n\t]*технический институт", title_page):
        university = 'mipt'
        faculty, department, speciality, author, supervisor, title = mipt_metadata(title_page)

    elif re.search(r"(?i)Высшая школа экономики", title_page):
        university = 'hse'
        faculty, department, speciality, author, supervisor, title = hse_metadata(title_page)

    elif re.search(r"(?i)дружбы народов", title_page):
        university = 'rudn'
        faculty, department, speciality, author, supervisor, title = rudn_metadata(title_page)

    elif (re.search(r"(?i)московский государственный", title_page)):
        university = 'msu'
        faculty, department, speciality, author, supervisor, title = msu_metadata(title_page)

    # try to extract something
    else:
        # The pattern must be anchored at the START of a line ("^" with the
        # multiline flag). The previous "$" anchor could never be followed by
        # text on the same line, so the fallback never found anything.
        named = re.search(r"(?im)^(.*)университет", title_page)
        university = (named.group(1).strip() or None) if named else None
        faculty, department, speciality, author, supervisor, title = any_metadata(title_page)
    return year, university, faculty, department, speciality, author, supervisor, title


def _starts_capitalised(token: str) -> bool:
    """Return True for a non-empty token that begins with a letter in the range А-Я."""
    return bool(token) and re.match(r"^[А-Я].*", token) is not None


def _name_after(tokens: list, start: int, current):
    """Return up to three consecutive capitalised tokens found at or after ``start``.

    ``current`` is the value found so far. When it is already set, only the
    token at ``start`` is examined: it replaces ``current`` if capitalised,
    otherwise ``current`` is kept.
    """
    name = current
    for j in range(start, len(tokens)):
        if _starts_capitalised(tokens[j]):
            parts = [tokens[j]]
            # slicing never runs past the end of the list, unlike tokens[j + 1]
            for follower in tokens[j + 1:j + 3]:
                if not _starts_capitalised(follower):
                    break
                parts.append(follower)
            name = ' '.join(parts)
        if name:
            break
    return name


def _title_after(tokens: list, start: int) -> str:
    """Return the title that begins at ``tokens[start]``, or "" when nothing follows.

    A title opening with a quotation mark runs up to the first token ending
    with a closing quote or a full stop (or to the end of the text when it is
    never closed); an unquoted title runs up to the first empty token.
    """
    opening_quotes = ('"', "«", '“')
    closing_marks = ('"', "»", '”', ".")
    total = len(tokens)
    k = start
    words = []

    if k < total and tokens[k] and tokens[k][0] in opening_quotes:
        while k < total and tokens[k][-1] not in closing_marks:
            words.append(tokens[k])
            k += 1
            # empty tokens come from consecutive whitespace; skip them
            while k < total and not tokens[k]:
                k += 1
        if k < total:
            words.append(tokens[k])
    else:
        while k < total and tokens[k]:
            words.append(tokens[k])
            k += 1

    return ' '.join(' '.join(words).split())


def splitted_metadata(title_page : str) -> tuple:
    """Find author, supervisor and title by scanning whitespace-separated tokens.

    A token starting with "студент" or "исполнитель" introduces the author, a
    token starting with "руководитель" introduces the supervisor, and a token
    starting with "тема" introduces the title (all case-insensitive).

    Args:
        title_page: text of the title page.

    Returns:
        ``(author, supervisor, title)``; an element is ``None`` when its
        keyword does not occur. ``title`` may be an empty string when the
        keyword is not followed by any text.
    """
    author = None
    supervisor = None
    title = None

    # split in words
    tokenized = re.split(r'\s', title_page)

    for i, token in enumerate(tokenized):
        if re.match(r"(?i)студент", token) or re.match(r"(?i)исполнитель", token):
            author = _name_after(tokenized, i + 1, author)

        if re.match(r"(?i)руководитель", token):
            supervisor = _name_after(tokenized, i + 1, supervisor)

        if re.match(r"(?i)тема", token):
            title = _title_after(tokenized, i + 1)

    return author, supervisor, title


# RAU
def rau_metadata(title_page : str) -> tuple:
    """Extract metadata from a title page recognised as 'rau'.

    Args:
        title_page: text of the title page.

    Returns:
        ``(faculty, department, speciality, author, supervisor, title)``.
        Faculty is the first line fragment starting with "Институт",
        department the first one starting with "Кафедра", speciality the text
        after "Специальность:" or "Направление:" (all lower-cased);
        the last three come from ``splitted_metadata``. Missing values are ``None``.
    """
    faculty = re.findall(r"(?i)Институт.*", title_page)
    faculty = faculty[0].lower() if faculty else None
    department = re.findall(r"(?i)Кафедра.*", title_page)
    department = department[0].lower() if department else None

    speciality = None
    labelled = re.search(r"(?i)Специальность:\s*(.*)|Направление:\s*(.*)", title_page)
    if labelled:
        # only one of the two groups takes part in a match; a label followed
        # by nothing captures "" and is treated as "not found"
        value = labelled.group(1) or labelled.group(2)
        speciality = value.lower() if value else None

    author, supervisor, title = splitted_metadata(title_page)

    return faculty, department, speciality, author, supervisor, title


def mipt_metadata(title_page : str) -> tuple:
    """Extract metadata from a title page recognised as 'mipt'.

    Args:
        title_page: text of the title page.

    Returns:
        ``(faculty, department, speciality, author, supervisor, title)``.
        Faculty is the first line fragment starting with "Факультет",
        department the first one starting with "Кафедра", speciality the text
        after "Специальность:" or "Направление:" (all lower-cased);
        the last three come from ``splitted_metadata``. Missing values are ``None``.
    """
    faculty = re.findall(r"(?i)Факультет.*", title_page)
    faculty = faculty[0].lower() if faculty else None

    department = re.findall(r"(?i)Кафедра.*", title_page)
    department = department[0].lower() if department else None

    speciality = None
    labelled = re.search(r"(?i)Специальность:\s*(.*)|Направление:\s*(.*)", title_page)
    if labelled:
        # only one of the two groups takes part in a match; a label followed
        # by nothing captures "" and is treated as "not found"
        value = labelled.group(1) or labelled.group(2)
        speciality = value.lower() if value else None

    # The regex-based author/supervisor lookups that used to precede this call
    # were dead code: their results were overwritten right here.
    author, supervisor, title = splitted_metadata(title_page)

    return faculty, department, speciality, author, supervisor, title


def rudn_metadata(title_page : str) -> tuple:
    """Extract metadata from a title page recognised as 'rudn'.

    Args:
        title_page: text of the title page.

    Returns:
        ``(faculty, department, speciality, author, supervisor, title)``.
        Faculty is the lower-cased text after "учебный институт: " or, failing
        that, the text preceding the word "институт"; department is the first
        line fragment starting with "Кафедра"; speciality is the first line
        fragment starting with a ``NN.NN.NN`` code. Author, supervisor and
        title come from ``splitted_metadata``, except that text following the
        stand-alone word "ТЕМА" on the same line takes precedence as the
        title. Missing values are ``None``.
    """
    faculty = re.findall(r"(?i)учебный институт: (.*)", title_page)
    if not faculty:
        faculty = re.findall(r"(?i)\s+(.*)\sинститут", title_page)
    # report a missing faculty as None (not an empty list), like the sibling extractors
    faculty = faculty[0].lower() if faculty else None

    department = re.findall(r"(?i)Кафедра.*", title_page)
    department = department[0] if department else None

    speciality = re.findall(r"[0-9]{2}\.[0-9]{2}\.[0-9]{2}.*", title_page)
    speciality = speciality[0] if speciality else None

    author, supervisor, title = splitted_metadata(title_page)

    # Capture the rest of the line after the word "ТЕМА". The word boundaries
    # keep the pattern from firing inside words such as "математика", and
    # "(.*)" replaces a "(.?)" that could only ever capture one character.
    explicit = re.search(r"(?i)\bТЕМА\b[:.]?[ \t]*(.*)", title_page)
    if explicit and explicit.group(1).strip():
        title = explicit.group(1).strip()

    return faculty, department, speciality, author, supervisor, title


def hse_metadata(title_page : str) -> tuple:
    """Extract metadata from a title page recognised as 'hse'.

    Returns ``(faculty, department, speciality, author, supervisor, title)``:
    faculty is the first line fragment starting with "Факультет", department
    is always ``None``, speciality is the first line fragment starting with a
    ``NN.NN.NN`` code, and the remaining three come from ``splitted_metadata``.
    """
    faculty_hit = re.search(r"(?i)Факультет.*", title_page)
    code_hit = re.search(r"[0-9]{2}\.[0-9]{2}\.[0-9]{2}.*", title_page)

    author, supervisor, title = splitted_metadata(title_page)

    return (
        faculty_hit.group(0) if faculty_hit else None,
        None,   # no department is extracted for this university
        code_hit.group(0) if code_hit else None,
        author,
        supervisor,
        title,
    )


def msu_metadata(title_page : str) -> tuple:
    """Extract metadata from a title page recognised as 'msu'.

    Returns ``(faculty, department, speciality, author, supervisor, title)``:
    faculty is the first line containing "Факультет" and department the first
    line fragment starting with "Кафедра" (both lower-cased), speciality is
    always ``None``. Author, supervisor and title come from
    ``splitted_metadata``, but text following "Тема:" replaces the title
    whenever that label is present.
    """
    faculty_hit = re.search(r"(?i)\s*(.*Факультет.*)", title_page)
    department_hit = re.search(r"(?i)Кафедра.*", title_page)

    author, supervisor, title = splitted_metadata(title_page)

    labelled_title = re.search(r"(?i)Тема:\s*(.*)", title_page)
    if labelled_title:
        title = labelled_title.group(1)

    faculty = None
    if faculty_hit:
        faculty = faculty_hit.group(1).lower()

    department = None
    if department_hit:
        department = department_hit.group(0).lower()

    return faculty, department, None, author, supervisor, title


def any_metadata(title_page : str) -> tuple:
    """Generic extractor used when no known university is recognised.

    Returns ``(faculty, department, speciality, author, supervisor, title)``:
    faculty is the first line containing "Факультет" and department the first
    line fragment starting with "Кафедра" (both lower-cased), speciality is
    the first line fragment starting with a ``NN.NN.NN`` code, and the
    remaining three come from ``splitted_metadata``.
    """
    faculty_hit = re.search(r"(?i)\s*(.*Факультет.*)", title_page)
    department_hit = re.search(r"(?i)Кафедра.*", title_page)
    code_hit = re.search(r"[0-9]{2}\.[0-9]{2}\.[0-9]{2}.*", title_page)

    author, supervisor, title = splitted_metadata(title_page)

    return (
        faculty_hit.group(1).lower() if faculty_hit else None,
        department_hit.group(0).lower() if department_hit else None,
        code_hit.group(0) if code_hit else None,
        author,
        supervisor,
        title,
    )


In [245]:
import traceback
# rows of a single university, selected by a substring of the file path
uni = df[df["filename"].apply(lambda x: "MFTI" in x)]   # Rau MFTI hse ruden msu

# print everything except year and university for each selected row
for j in range(uni.shape[0]):
    try:
        print("--" + str(j) + ": " + uni.iloc[j].filename, get_metadata(uni.iloc[j].text)[2:], sep = "\n")
    except Exception:
        # print_exc() takes no exception argument (its first parameter is
        # `limit`); it reports the exception currently being handled
        traceback.print_exc()
        continue

--0: diploms/MFTI/2016MS_Bolotskaya.pdf
('факультет общей и прикладной фикики', 'кафедра физики и технологии наноструктур', None, None, 'Юрченко Станислав Олегович', None)
--1: diploms/MFTI/2016MS_Shuravin.pdf
('факультет общей и прикладной физики', 'кафедра физики и технологии наноструктур', None, 'Шуравина Никиты Сергеевича', 'Долганов П.В.', None)
--2: diploms/MFTI/2016BS_Galiullin.pdf
(None, None, None, None, None, None)
--3: diploms/MFTI/2015BS_Gagkaeva.pdf
('факультет общей и прикладной физики', 'кафедра физики и технологии наноструктур', None, 'З. В. Гагкаева', 'Е. С. Жукова', None)
--4: diploms/MFTI/2016BS_Noyan.pdf
('факультет общей и прикладной физики', None, None, None, 'Цирлина Галина Александровна', None)
--5: diploms/MFTI/2016MS_Belyanchikov.pdf
('факультет общей и прикладной физики', 'кафедра физики и технологии наноструктур', None, 'Белянчиков Михаил Анатольевич', 'Горшунов Борис Петрович', None)
--6: diploms/MFTI/2016BS_Kulesh.pdf
('факультет общей и прикладной физики'

In [246]:
# one list per metadata field, in the order returned by get_metadata
years = []
universities = []
faculties = []
departments = []
specialities = []
authors = []
supervisors = []
titles = []
for j in range(df.shape[0]):
    try:
        t = get_metadata(df.iloc[j].text)
        year, university, faculty, department, speciality, author, supervisor, title = t
    except Exception:
        # report the failing row by file name (df has no "title" column yet)
        print(j, df.iloc[j].filename)
        traceback.print_exc()
        # keep every list aligned with df: a failed row contributes None
        # values instead of being skipped, otherwise the lists would be
        # shorter than df and could not be assigned as columns
        year = university = faculty = department = None
        speciality = author = supervisor = title = None
    years.append(year)
    universities.append(university)
    faculties.append(faculty)
    departments.append(department)
    specialities.append(speciality)
    authors.append(author)
    supervisors.append(supervisor)
    titles.append(title)

In [247]:
# attach each list of extracted values to df as a new column, in this order
extracted_columns = {
    "year": years,
    "university": universities,
    "faculty": faculties,
    "department": departments,
    "speciality": specialities,
    "author": authors,
    "supervisors": supervisors,
    "title": titles,
}
for column_name, column_values in extracted_columns.items():
    df[column_name] = column_values

In [248]:
# display the frame together with the newly attached metadata columns
df

Unnamed: 0,filename,text,year,university,faculty,department,speciality,author,supervisors,title
0,diploms/MFTI/2016MS_Bolotskaya.pdf,ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ ОБРАЗОВАТЕЛЬНОЕ ...,2016,mipt,факультет общей и прикладной фикики,кафедра физики и технологии наноструктур,,,Юрченко Станислав Олегович,
1,diploms/MFTI/2016MS_Shuravin.pdf,Министерство образования и науки Российско...,2016,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,,Шуравина Никиты Сергеевича,Долганов П.В.,
2,diploms/MFTI/2016BS_Galiullin.pdf,Ìîñêîâñêèé ôèçèêî-òåõíè÷åñêèé èíñòèòóò (ãîñóäà...,2016,,,,,,,
3,diploms/MFTI/2015BS_Gagkaeva.pdf,Министерство образования и науки ...,2015,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,,З. В. Гагкаева,Е. С. Жукова,
4,diploms/MFTI/2016BS_Noyan.pdf,Московский физико-технический институт\n ...,2016,mipt,факультет общей и прикладной физики,,,,Цирлина Галина Александровна,
...,...,...,...,...,...,...,...,...,...,...
100,diploms/Rau/Копия Дипломная работа - Мкртчян М...,ГОУ ВПО РОССИЙСКО-АРМЯНСКИЙ (СЛАВЯНСКИЙ)\n ...,2020,rau,институт математики и информатики,кафедра математики и математической кибернетики,прикладная математика и информатика,Мкртчян Метаксия Арсеновна,Арамян Рафик Грачикович,“Вычисление страховых премий для некоторых групп”
101,diploms/Rau/Копия Гюласарян Андраник Эдуардови...,РОССИЙСКО - АРМЯНСКИЙ (СЛАВЯНСКИЙ)...,2020,rau,институт математики и высоких технологий,кафедра: математики и математического моделиро...,прикладная математика и информатика,ПМИ Гюласарян Андраник,Арутюнян К.В. Исполнитель:,“Факторизация некоторых треугольных матриц-фун...
102,diploms/Rau/Копия Саргсян Э. Diplom.pdf,ГОУ ВПО РОССИЙСКО-АРМЯНСКИЙ\n ...,2020,rau,институт математики и информатики,кафедра математической кибернетики,прикладная математика и информатика,Э. А. Саргсян,П. А. Петросян,“О палитровом индексе некоторых графов”
103,diploms/Rau/Копия Саакян Мигран (2).pdf,DocuSign Envelope ID: 3AACB8B0-654A-48D0-B544-...,2020,rau,институт математики и информатики,кафедра системного программирования,прикладная математика и информатика,Саакян Мигран Арамович,Авагумян Арсен,“АСИНХРОННЫЕ ФОРМЫ”


In [249]:
# write the texts plus extracted metadata to a new CSV file, without the index column
df.to_csv("diplomas_pdftotext_extracted.csv", index = False)

## Statistics

In [250]:
# summary statistics over all rows (every university)
df.describe()

Unnamed: 0,filename,text,year,university,faculty,department,speciality,author,supervisors,title
count,105,105,102,103,100,71,64,68,101,40.0
unique,105,104,8,5,16,19,14,65,88,39.0
top,diploms/hse/vaskina_tv_ispolzovanie-narrativov...,ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ АВТОНОМНОЕ ОБРАЗО...,2020,rau,институт математики и информатики,кафедра системного программирования,прикладная математика и информатика,Студент,П. А. Петросян,
freq,1,2,39,32,26,13,28,2,4,2.0


In [251]:
# number of missing values in each column
df.isnull().sum(axis = 0)

filename        0
text            0
year            3
university      2
faculty         5
department     34
speciality     41
author         37
supervisors     4
title          65
dtype: int64

In [252]:
# msu: summary of the rows whose file path contains "msu"
is_msu = df["filename"].apply(lambda name: "msu" in name)
df[is_msu].describe()

Unnamed: 0,filename,text,year,university,faculty,department,speciality,author,supervisors,title
count,30,30,30,29,28,24,0.0,18,28,6
unique,30,30,7,1,4,9,0.0,17,26,6
top,diploms/msu/azyukina_e_a.pdf,МОСКОВСКИЙ ГОСУДАРСТВЕННЫЙ УНИВЕРСИТ...,2019,msu,юридический факультет,кафедра системного программирования,,Барановой Полины Олеговны,Корухова Людмила Сергеевна,«Правовые основы установления ограничений прав...
freq,1,1,9,29,16,8,,2,2,1


In [253]:
# hse: summary of the rows whose file path contains "hse"
is_hse = df["filename"].apply(lambda name: "hse" in name)
df[is_hse].describe()

Unnamed: 0,filename,text,year,university,faculty,department,speciality,author,supervisors,title
count,26,26,26,26,24,0.0,25,9,26,0.0
unique,26,25,5,1,5,0.0,9,8,24,0.0
top,diploms/hse/vaskina_tv_ispolzovanie-narrativov...,ФЕДЕРАЛЬНОЕ ГОСУДАРСТВЕННОЕ АВТОНОМНОЕ ОБРАЗО...,2021,hse,Факультет социальных наук,,39.03.01 «Социология»,МУВ,Н. В. Большаков,
freq,1,2,9,26,11,,5,2,2,


In [254]:
# rudn: summary of the rows whose file path contains "ruden"
is_rudn = df["filename"].apply(lambda name: "ruden" in name)
df[is_rudn].describe()

Unnamed: 0,filename,text,year,university,faculty,department,speciality,author,supervisors,title
count,6,6,6,6,6,6,6,3,6,2.0
unique,6,6,3,1,1,3,1,3,6,1.0
top,diploms/ruden/gorbacheva_a_i.pdf,РОССИЙСКИЙ УНИВЕРСИТЕТ ДРУЖБЫ НАРОДОВ\n ...,2016,rudn,юридический,Кафедра гражданского и трудового права,40.03.01 «Юриспруденция»,ЮЮ-404,"Русакова Екатерина Петровна,",
freq,1,1,3,6,6,4,6,1,1,2.0


In [255]:
# mipt: summary of the rows whose file path contains "MFTI"
is_mipt = df["filename"].apply(lambda name: "MFTI" in name)
df[is_mipt].describe()

Unnamed: 0,filename,text,year,university,faculty,department,speciality,author,supervisors,title
count,11,11,11,10,10,9,1,7,9,1
unique,11,11,2,1,2,1,1,7,9,1
top,diploms/MFTI/2016BS_Gukov.pdf,Министерство образования и науки Р...,2016,mipt,факультет общей и прикладной физики,кафедра физики и технологии наноструктур,010900 – прикладные математика и физика,З. В. Гагкаева,Долганов П.В.,Разработка однофотонного детектора в
freq,1,1,10,10,9,9,1,1,1,1


In [256]:
# rau: summary of the rows whose file path contains "Rau"
is_rau = df["filename"].apply(lambda name: "Rau" in name)
df[is_rau].describe()

Unnamed: 0,filename,text,year,university,faculty,department,speciality,author,supervisors,title
count,32,32,29,32,32,32,32,31,32,31
unique,32,32,1,1,4,7,3,31,26,31
top,diploms/Rau/Копия Аветисян-Арман-1.pdf,ГОУ ВПО РОССИЙСКО-АРМЯНСКИЙ (СЛАВЯНСКИЙ) УНИВЕ...,2020,rau,институт математики и информатики,кафедра математической кибернетики,прикладная математика и информатика,ПМИ Гюласарян Андраник,П. А. Петросян,ИССЛЕДОВАНИЕ ПОВЕДЕНИЯ ВОЛН В МАГНИТОСТРИКЦИОННЫХ
freq,1,1,29,32,26,13,28,1,4,1
