In [36]:
import os
import re

from collections import defaultdict
from multiprocessing import Pool

# Only version 0.8.4 was working
import fasttext
import pandas as pd

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from gensim.models import word2vec
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet as wn
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

# Часть 1. Эксплоративный анализ

In [2]:
# Prepare data in a convinient way
# Remove punctuation, stop words and tokenize text
stop_words = stopwords.words("english")
directory = "/mnt/hdd1/users/svinkapeppa/hpac/data/raw/"
files = os.listdir(directory)

def prepare_file(file):
    output = []
    text = " ".join([line.lower().strip() for line in open(directory + file)])
    sentences = sent_tokenize(text)

    for sentence in sentences:
        sentence = re.sub("\W+", " ", sentence)
        words = [word for word in sentence.strip().split() if word not in stop_words]
        # Filter short sentences 
        if len(words) > 5:
            output.append(words)
    
    return output

with Pool(32) as p:
    tmp = list(tqdm(p.imap(prepare_file, files, chunksize=64), total=len(files)))

corpus = []
for item in tmp:
    corpus.extend(item)

HBox(children=(FloatProgress(value=0.0, max=36225.0), HTML(value='')))




In [3]:
# Top-1000 frequent words
fd = FreqDist()
for sentence in tqdm(corpus):
    fd.update(sentence)

top1000frequent = [(w[0], w[1]) for w in fd.most_common(1000)]

for w in top1000frequent[:10]:
    print(f"{w[0]:10s}\t{w[1]:10d}")

HBox(children=(FloatProgress(value=0.0, max=17851512.0), HTML(value='')))


harry     	   2773647
would     	   1304198
said      	   1282477
hermione  	   1234058
back      	   1110646
one       	   1094274
could     	   1031157
draco     	    965109
like      	    912531
eyes      	    787413


In [4]:
# Top-10 frequent names
# Tried to use SpaCy -- the wait was too long (53 hours)
# Using brute force approach: download list of HP names from here https://nameberry.com/userlist/view/42943
names = [
    "Abraxas", "Albus", "Alice", "Ambrosius", "Amelia", "Amos", "Andromeda", "Angelina",
    "Arabella", "Argus", "Ariana", "Arthur", "Alastor", "Alecto", "Amycus", "Bellatrix",
    "Bill", "Blaise", "Burke", "Cassandra", "Cedric", "Charlie", "Colin", "Cormac",
    "Cornelius", "Dean", "Draco", "Dudley", "Elphias", "Fleur", "Florean", "Fred",
    "Fawkes", "Filius", "Gabrielle", "George", "Ginevra", "Gryffin", "Gilderoy",
    "Hannah", "Harry", "Hepzibah", "Hermione", "Hestia", "Horace", "Hugo", "James",
    "Katie", "Kingsley", "Lavender", "Lily", "Lucius", "Ludo", "Luna", "Marietta",
    "Millicent", "Minerva", "Molly", "Myrtle", "Merope", "Narcissa", "Neville",
    "Nicolas", "Nymphadora", "Oliver", "Orion", "Padma", "Parvati", "Penelope", "Percy",
    "Petunia", "Phineas", "Pomona", "Poppy", "Perenelle", "Raven", "Remus", "Rita",
    "Romilda", "Ronald", "Ronan", "Rose", "Rowena", "Rufus", "Rosmerta", "Rubeus",
    "Severus", "Sirius", "Sybill", "Teddy", "Trevor", "Viktor", "Wilhelmina", "Zacharias",
]
names = " ".join(names).lower().split()

top10names = []
for w in top1000frequent:
    if len(top10names) == 10:
        break
    if w[0] in names:
        top10names.append(w)

assert len(top10names) == 10
for w in top10names:
    print(f"{w[0]:10s}\t{w[1]:10d}")

harry     	   2773647
hermione  	   1234058
draco     	    965109
severus   	    452058
sirius    	    425183
james     	    314041
remus     	    292968
lily      	    281883
neville   	    153732
lucius    	    149016


In [5]:
# Top-10 frequent (Name, Surname) pairs
# Took (Name, Surname) pairs from https://en.wikipedia.org/wiki/List_of_Harry_Potter_characters
full_names = [
    "Hannah Abbott", "Ludo Bagman", "Bathilda Bagshot", "Katie Bell", "Cuthbert Binns",
    "Sirius Black", "Amelia Bones", "Susan Bones", "Terry Boot", "Lavender Brown",
    "Millicent Bulstrode", "Charity Burbage", "Frank Bryce", "Alecto Carrow",
    "Amycus Carrow", "Reginald Cattermole", "Mary Cattermole", "Cho Chang",
    "Penelope Clearwater", "Michael Corner", "Vincent Crabbe", "Colin Creevey",
    "Dennis Creevey", "Dirk Cresswell", "Barty Crouch", "Roger Davies", "John Dawlish",
    "Fleur Delacour", "Gabrielle Delacour", "Dedalus Diggle", "Amos Diggory", "Cedric Diggory",
    "Elphias Doge", "Antonin Dolohov", "Aberforth Dumbledore", "Albus Dumbledore",
    "Ariana Dumbledore", "Kendra Dumbledore", "Percival Dumbledore", "Dudley Dursley",
    "Marge Dursley", "Petunia Dursley", "Vernon Dursley", "Marietta Edgecombe", "Arabella Figg",
    "Argus Filch", "Justin Finch-Fletchley", "Seamus Finnigan", "Marcus Flint",
    "Mundungus Fletcher", "Filius Flitwick", "Florean Fortescue", "Cornelius Fudge",
    "Marvolo Gaunt", "Merope Gaunt", "Morfin Gaunt", "Anthony Goldstein", "Gregory Goyle",
    "Hermione Granger", "Gregorovitch", "Fenrir Greyback", "Gellert Grindelwald",
    "Wilhelmina Grubbly-Plank", "Rubeus Hagrid", "Rolanda Hooch", "Mafalda Hopkirk",
    "Angelina Johnson", "Lee Jordan", "Igor Karkaroff", "Viktor Krum", "Silvanus Kettleburn",
    "Bellatrix Lestrange", "Rabastan Lestrange", "Rodolphus Lestrange", "Gilderoy Lockhart",
    "Alice Longbottom", "Frank Longbottom", "Augusta Longbottom", "Neville Longbottom",
    "Luna Lovegood", "Xenophilius Lovegood", "Remus Lupin", "Teddy Lupin", "Walden Macnair",
    "Draco Malfoy", "Lucius Malfoy", "Narcissa Malfoy", "Scorpius Malfoy", "Madam Malkin",
    "Griselda Marchbanks", "Olympe Maxime", "Ernie Macmillan", "Minerva McGonagall",
    "Cormac McLaggen", "Graham Montague", "Alastor Moody", "Theodore Nott", "Bob Ogden",
    "Garrick Ollivander", "Pansy Parkinson", "Padma Patil", "Parvati Patil", "Peter Pettigrew",
    "Antioch Peverell", "Cadmus Peverell", "Ignotus Peverell", "Irma Pince", "Sturgis Podmore",
    "Poppy Pomfrey", "Harry Potter", "James Potter", "Lily Potter", "Lily Potter",
    "Quirinus Quirrell", "Helena Ravenclaw", "Mary Riddle", "Thomas Riddle", "Tom Riddle",
    "Demelza Robins", "Augustus Rookwood", "Thorfinn Rowle", "Albert Runcorn", "Newt Scamander",
    "Rufus Scrimgeour", "Kingsley Shacklebolt", "Stan Shunpike", "Aurora Sinistra",
    "Rita Skeeter", "Horace Slughorn", "Salazar Slytherin", "Zacharias Smith", "Severus Snape",
    "Alicia Spinnet", "Pomona Sprout", "Pius Thicknesse", "Dean Thomas", "Andromeda Tonks",
    "Nymphadora Tonks", "Ted Tonks", "Sybill Trelawney", "Wilkie Twycross", "Dolores Umbridge",
    "Emmeline Vance", "Romilda Vane", "Septima Vector", "Lord Voldemort", "Myrtle Warren",
    "Arthur Weasley", "Bill Weasley", "Charlie Weasley", "Fred Weasley", "George Weasley",
    "Ginny Weasley", "Hugo Weasley", "Molly Weasley", "Percy Weasley", "Ron Weasley",
    "Oliver Wood", "Rose Weasley", "Corban Yaxley", "Blaise Zabini",
]
full_names = "\t".join(full_names).lower().split("\t")
full_names = [" ".join(sorted(name.split())) for name in full_names]

fd = FreqDist()
for sentence in tqdm(corpus):
    pairs = [tuple(sorted([sentence[i], sentence[i + 1]])) for i in range(len(sentence) - 1)]
    fd.update(pairs)

top10fullnames = []
for w in fd.most_common(1000):
    if len(top10fullnames) == 10:
        break
    if f"{w[0][0]} {w[0][1]}" in full_names:
        top10fullnames.append(w)

assert len(top10fullnames) == 10
for w in top10fullnames:
    print(f"{w[0][0]:15s} {w[0][1]:15s}\t{w[1]:10d}")

HBox(children=(FloatProgress(value=0.0, max=17851512.0), HTML(value='')))


harry           potter         	    103287
draco           malfoy         	     39798
severus         snape          	     26434
granger         hermione       	     25034
james           potter         	     23404
black           sirius         	     22453
lord            voldemort      	     21943
lucius          malfoy         	     20900
albus           dumbledore     	     18886
riddle          tom            	     15335


In [6]:
# Top-10 frequent (Professor, Name/Surname) pairs
# Took (Name, Surname) pairs from https://en.wikipedia.org/wiki/List_of_Harry_Potter_characters
professor_names = [
    "Hannah Abbott", "Ludo Bagman", "Bathilda Bagshot", "Katie Bell", "Cuthbert Binns",
    "Sirius Black", "Amelia Bones", "Susan Bones", "Terry Boot", "Lavender Brown",
    "Millicent Bulstrode", "Charity Burbage", "Frank Bryce", "Alecto Carrow",
    "Amycus Carrow", "Reginald Cattermole", "Mary Cattermole", "Cho Chang",
    "Penelope Clearwater", "Michael Corner", "Vincent Crabbe", "Colin Creevey",
    "Dennis Creevey", "Dirk Cresswell", "Barty Crouch", "Roger Davies", "John Dawlish",
    "Fleur Delacour", "Gabrielle Delacour", "Dedalus Diggle", "Amos Diggory", "Cedric Diggory",
    "Elphias Doge", "Antonin Dolohov", "Aberforth Dumbledore", "Albus Dumbledore",
    "Ariana Dumbledore", "Kendra Dumbledore", "Percival Dumbledore", "Dudley Dursley",
    "Marge Dursley", "Petunia Dursley", "Vernon Dursley", "Marietta Edgecombe", "Arabella Figg",
    "Argus Filch", "Justin Finch-Fletchley", "Seamus Finnigan", "Marcus Flint",
    "Mundungus Fletcher", "Filius Flitwick", "Florean Fortescue", "Cornelius Fudge",
    "Marvolo Gaunt", "Merope Gaunt", "Morfin Gaunt", "Anthony Goldstein", "Gregory Goyle",
    "Hermione Granger", "Gregorovitch", "Fenrir Greyback", "Gellert Grindelwald",
    "Wilhelmina Grubbly-Plank", "Rubeus Hagrid", "Rolanda Hooch", "Mafalda Hopkirk",
    "Angelina Johnson", "Lee Jordan", "Igor Karkaroff", "Viktor Krum", "Silvanus Kettleburn",
    "Bellatrix Lestrange", "Rabastan Lestrange", "Rodolphus Lestrange", "Gilderoy Lockhart",
    "Alice Longbottom", "Frank Longbottom", "Augusta Longbottom", "Neville Longbottom",
    "Luna Lovegood", "Xenophilius Lovegood", "Remus Lupin", "Teddy Lupin", "Walden Macnair",
    "Draco Malfoy", "Lucius Malfoy", "Narcissa Malfoy", "Scorpius Malfoy", "Madam Malkin",
    "Griselda Marchbanks", "Olympe Maxime", "Ernie Macmillan", "Minerva McGonagall",
    "Cormac McLaggen", "Graham Montague", "Alastor Moody", "Theodore Nott", "Bob Ogden",
    "Garrick Ollivander", "Pansy Parkinson", "Padma Patil", "Parvati Patil", "Peter Pettigrew",
    "Antioch Peverell", "Cadmus Peverell", "Ignotus Peverell", "Irma Pince", "Sturgis Podmore",
    "Poppy Pomfrey", "Harry Potter", "James Potter", "Lily Potter", "Lily Potter",
    "Quirinus Quirrell", "Helena Ravenclaw", "Mary Riddle", "Thomas Riddle", "Tom Riddle",
    "Demelza Robins", "Augustus Rookwood", "Thorfinn Rowle", "Albert Runcorn", "Newt Scamander",
    "Rufus Scrimgeour", "Kingsley Shacklebolt", "Stan Shunpike", "Aurora Sinistra",
    "Rita Skeeter", "Horace Slughorn", "Salazar Slytherin", "Zacharias Smith", "Severus Snape",
    "Alicia Spinnet", "Pomona Sprout", "Pius Thicknesse", "Dean Thomas", "Andromeda Tonks",
    "Nymphadora Tonks", "Ted Tonks", "Sybill Trelawney", "Wilkie Twycross", "Dolores Umbridge",
    "Emmeline Vance", "Romilda Vane", "Septima Vector", "Lord Voldemort", "Myrtle Warren",
    "Arthur Weasley", "Bill Weasley", "Charlie Weasley", "Fred Weasley", "George Weasley",
    "Ginny Weasley", "Hugo Weasley", "Molly Weasley", "Percy Weasley", "Ron Weasley",
    "Oliver Wood", "Rose Weasley", "Corban Yaxley", "Blaise Zabini",
]
professor_names = set(" ".join(professor_names).lower().split())

fd = FreqDist()
for sentence in tqdm(corpus):
    pairs = [
        tuple(sorted([sentence[i], sentence[i + 1]]))
        for i in range(len(sentence) - 1)
        if "professor" in [sentence[i], sentence[i + 1]]
    ]
    fd.update(pairs)

top10professor = []
for w in fd.most_common(1000):
    if len(top10professor) == 10:
        break
        
    if w[0][0] in names or w[0][1] in professor_names:
        top10professor.append(w)

assert len(top10professor) == 10
for w in top10professor:
    print(f"{w[0][0]:15s} {w[0][1]:15s}\t{w[1]:10d}")

HBox(children=(FloatProgress(value=0.0, max=17851512.0), HTML(value='')))


professor       snape          	     35161
professor       sprout         	      5016
professor       slughorn       	      4960
professor       trelawney      	      2679
harry           professor      	      2582
professor       umbridge       	      2302
professor       vector         	      1552
professor       quirrell       	      1470
professor       severus        	      1127
hermione        professor      	      1115


# Часть 2. Модели представления слов

In [7]:
# Initilize model, build vocab and train it
model = word2vec.Word2Vec(workers=32, size=100, min_count=10, window=5, sample=1e-3)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=7)

(1195468474, 1249682154)

In [8]:
# Synonyms
print(model.wv.most_similar("beer", topn=3))
print(model.wv.most_similar("hate", topn=3))

[('lager', 0.8671025037765503), ('butterbeer', 0.8425200581550598), ('coke', 0.8336340188980103)]
[('despise', 0.781937301158905), ('hates', 0.7341607809066772), ('detest', 0.7032870650291443)]


In [9]:
# Associations
print(model.wv.most_similar(positive=["man", "professor"], negative=["professor"], topn=3))
print(model.wv.most_similar(positive=["wizard", "woman", "girl"], negative=["boy", "man"], topn=3))

[('woman', 0.879643976688385), ('wizard', 0.7745729088783264), ('boy', 0.7383607625961304)]
[('witch', 0.799569845199585), ('noblewoman', 0.5840001106262207), ('stunningly', 0.5819244980812073)]


In [14]:
# Odd words
print(model.wv.doesnt_match(["wand", "broom", "stick", "computer"]))
print(model.wv.doesnt_match(["harry", "granger", "malfoy", "table"]))

computer
table


In [11]:
# Calculating TSNE projections
tsne = TSNE(n_components=2, random_state=42)

words = [w[0] for w in top1000frequent]
vectors = model[words]
projections = tsne.fit_transform(vectors)

  """


In [12]:
# Plotting
output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save", toolbar_location="above",
           title="Projections of top-1000 most frequent words")
source = ColumnDataSource(data=dict(x1=projections[:,0], x2=projections[:,1], names=words))
p.scatter(x="x1", y="x2", size=8, source=source)
labels = LabelSet(x="x1", y="x2", text="names", y_offset=6, text_font_size="8pt",
                  text_color="#555555", source=source, text_align="center")
p.add_layout(labels)

show(p)

# Часть 3. Классификация текстов

In [58]:
columns = ["id", "class", "text"]

train = pd.read_csv("/mnt/hdd1/users/svinkapeppa/hpac/data/splits/hpac_training_128.tsv", names=columns, sep="\t")
dev = pd.read_csv("/mnt/hdd1/users/svinkapeppa/hpac/data/splits/hpac_dev_128.tsv", names=columns, sep="\t")
test = pd.read_csv("/mnt/hdd1/users/svinkapeppa/hpac/data/splits/hpac_test_128.tsv", names=columns, sep="\t")

train = train.drop("id", axis=1)
dev = dev.drop("id", axis=1)
test = test.drop("id", axis=1)

In [59]:
def preprocess(dataframe):
    stop_words = stopwords.words("english")
    dataframe["text"] = dataframe["text"].apply(lambda x: x.lower())
    dataframe["text"] = dataframe["text"].apply(lambda x: re.sub("\W+", " ", x))
    dataframe["class"] = "__label__" + dataframe["class"]
    
    return dataframe

In [60]:
train = preprocess(train)
dev = preprocess(dev)
test = preprocess(test)

In [64]:
# FastText baseline
train.to_csv("/mnt/hdd1/users/svinkapeppa/hpac/data/splits/train.txt", sep=" ", index=False, header=False)

best_model = None
max_score = 0
for epoch in tqdm([10, 20, 30, 40, 50]):
    model = fasttext.supervised("/mnt/hdd1/users/svinkapeppa/hpac/data/splits/train.txt",
                                "model", dim=300, min_count=5, epoch=epoch)

    pred_class = model.predict(list(dev["text"]))
    true_class = dev["class"].apply(lambda x: x[9:])

    score = f1_score(true_class, pred_class, average="macro")
    if score > max_score:
        best_model = model
        max_score = score

pred_class = model.predict(list(test["text"]))
true_class = test["class"].apply(lambda x: x[9:])

print(f1_score(true_class, pred_class, average="macro"))

scores = f1_score(true_class, pred_class, average=None)
for i in range(len(scores)):
    label = model.labels[i]
    count = train[train["class"] == f"__label__{label}"].shape[0]
    score = scores[i]
    print(f"{label:20s}\t{count:5d}\t{score:.3f}")

0.10760991986106269
AVADA_KEDAVRA       	 7491	0.296
CRUCIO              	 7378	0.163
ACCIO               	 4168	0.349
LUMOS               	 3894	0.000
STUPEFY             	 3326	0.000
OBLIVIATE           	 2891	0.430
EXPELLIARMUS        	 2728	0.000
LEGILIMENS          	 1725	0.000
EXPECTO_PATRONUM    	 1657	0.085
SECTUMSEMPRA        	 1458	0.000
PROTEGO             	 1396	0.000
ALOHOMORA           	 1263	0.426
SCOURGIFY           	 1244	0.000
INCENDIO            	 1238	0.250
IMPERIO             	 1180	0.000
WINGARDIUM_LEVIOSA  	 1178	0.000
REDUCTO             	 1159	0.000
PETRIFICUS_TOTALUS  	 1134	0.062
SILENCIO            	 1056	0.000
REPARO              	 1036	0.075
MUFFLIATO           	  923	0.242
AGUAMENTI           	  746	0.000
FINITE_INCANTATEM   	  644	0.128
NOX                 	  638	0.509
RIDDIKULUS          	  616	0.174
INCARCEROUS         	  606	0.000
DIFFINDO            	  491	0.000
IMPEDIMENTA         	  491	0.078
LEVICORPUS          	  488	0.000
EVANESCO            	  

In [65]:
# Augmentation
def augment(dataframe):
    augmented = []

    for idx, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0]):
        if idx % 2 == 0:
            label = row["class"]
            text = row["text"]
            for i in range(len(text)):
                try:
                    text[i] = wn.synsets(word)[1].lemmas()[-1].name()
                except Exception as e:
                    continue
            augmented.append([label, text])
    
    augmented = pd.DataFrame(augmented, columns=columns[1:])
    return pd.concat([dataframe, augmented])

In [66]:
train = augment(train)
train.to_csv("/mnt/hdd1/users/svinkapeppa/hpac/data/splits/train.txt", sep=" ", index=False, header=False)

HBox(children=(FloatProgress(value=0.0, max=60980.0), HTML(value='')))




In [67]:
best_model = None
max_score = 0
for epoch in tqdm([10, 20, 30, 40, 50]):
    model = fasttext.supervised("/mnt/hdd1/users/svinkapeppa/hpac/data/splits/train.txt",
                                "model", dim=300, min_count=5, epoch=epoch)

    pred_class = model.predict(list(dev["text"]))
    true_class = dev["class"].apply(lambda x: x[9:])

    score = f1_score(true_class, pred_class, average="macro")
    if score > max_score:
        best_model = model
        max_score = score

pred_class = model.predict(list(test["text"]))
true_class = test["class"].apply(lambda x: x[9:])

print(f1_score(true_class, pred_class, average="macro"))

scores = f1_score(true_class, pred_class, average=None)
for i in range(len(scores)):
    label = model.labels[i]
    count = train[train["class"] == f"__label__{label}"].shape[0]
    score = scores[i]
    print(f"{label:20s}\t{count:5d}\t{score:.3f}")

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


0.11254685376824754
AVADA_KEDAVRA       	11275	0.290
CRUCIO              	11019	0.157
ACCIO               	 6183	0.373
LUMOS               	 5826	0.000
STUPEFY             	 5022	0.000
OBLIVIATE           	 4327	0.411
EXPELLIARMUS        	 4093	0.150
LEGILIMENS          	 2582	0.000
EXPECTO_PATRONUM    	 2519	0.078
SECTUMSEMPRA        	 2204	0.000
PROTEGO             	 2111	0.095
ALOHOMORA           	 1887	0.417
SCOURGIFY           	 1863	0.000
INCENDIO            	 1833	0.250
IMPERIO             	 1789	0.000
REDUCTO             	 1737	0.000
WINGARDIUM_LEVIOSA  	 1731	0.000
PETRIFICUS_TOTALUS  	 1694	0.126
SILENCIO            	 1600	0.000
REPARO              	 1565	0.118
MUFFLIATO           	 1391	0.198
AGUAMENTI           	 1134	0.000
FINITE_INCANTATEM   	 1008	0.086
NOX                 	  943	0.497
RIDDIKULUS          	  933	0.156
INCARCEROUS         	  909	0.000
IMPEDIMENTA         	  753	0.000
DIFFINDO            	  749	0.070
LEVICORPUS          	  733	0.000
EVANESCO            	 

# Часть 4. Итоги

## Эксплоративный анализ

1. Я работал с сырыми текстами. В самом начале сделал простейшую обработку: убрал пунктуацию, привел слова к нижнему регистру и токенизировал предложения.
2. Подсчет топ-1000 слов был тривиален. Топ-10 из списка получился осмысленный: имена популярных героев + `said` (наверное, фанфики пишут люди, которые не очень сильно думают о том, чтобы использовать другие слова кроме `said`, чтобы вводить прямую речь).
3. Подсчет имен я хотел делать по-серьезному -- использовать NER-теггер. Я попробовал `spacy` и вышло очень плохо: имена почти что не распознавались, а обработка всего датасета занимала примерно 55 часов. Вместо этого я взял список имен из Интернета, прошелся по самым популярным словам, посчитаным ранее, и взял те, которые точно были именами. Результаты нормальные: Гарри и Гермиона впереди с отрывом.
4. Подсчет пар (Имя, Фамилия) решил делать примерно так же, только в этот раз брал уже биграмы. Результаты получились нормальные: в топе только популярные персонажи.
5. Подсчет пар (Профессор, Имя/Фамилия) сделал как (Имя, Фамилия), только чуть-чуть упростил: сразу искал в тех предложениях, где было слово "профессор". Результаты меня удивили: я думал, что в топе будет профессор МакГонагалл.

## Модели представления слов

1. Решил обучать `word2vec`, потому что с `fasttext` у меня были проблемы с установкой (см. ниже).
2. Обучение довольно долгое (корпус ~22кк). Подождал 7 эпох и решил, что хватит. По примерам синонимов, аналогий, лишних слов и визуализации видим, что 7 эпох в целом не плохо, но можно и лучше.
3. В визуализации заметно, что появились кластеры: "цифры", "факультеты", "персонажи" и др.

## Классификация текста

1. `fasttext` 0.9.1 поставить не получилось, поэтому взял 0.8.4. В нем другой интерфейс, но в целом работает.
2. Если просто обучить `fasttext`, то получим небольшой `macro F1-score` -- всего 10%.
3. Если добавить аугментацию синонимами, то качество увеличивается, но не сильно -- 11%.
4. Выделить какую-то закономерность в том, как хорошо происходит классификация и количеством классов не получается: бывают популярные заклинания с качеством 0, а бывают совсем непопулярные с высоким качеством.