In [None]:
import pickle
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.neural_network import MLPRegressor as mlp
from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install rouge
from rouge import Rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
# Mount Google Drive at /content/drive; train() later writes the fitted model
# to a path under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# change to path to dataset
file_name = "/content/cnn_dataset_1000_labelled.pkl"
# NOTE: unpickling can execute arbitrary code stored in the file, so only
# point file_name at a dataset you trust.
# The context manager closes the file handle once the dataset is loaded.
with open(file_name, 'rb') as dataset_file:
    stories = pickle.load(dataset_file)

In [None]:
# Peek at the first record to see which fields each story carries.
stories[0]

{'story': ["potomac, maryland (cnn) -- to combat the depression and despair during her 105-day stint in iran's notorious evin prison, haleh esfandiari welcomed all distractions and blocked thoughts of her beloved home and family.",
  'haleh esfandiari talks to iranian media in front of evin prison after her august 21 release.',
  'the iranian-american scholar, who was charged with espionage and endangering iranian national security during a december visit to her family, wrote a book in her mind, read newspapers, watched television and exercised voraciously.',
  '"i decided either i am going to succumb to despair or i am going to try to make the best of this condition, and the best of this condition was to have a disciplined day," she said. "so i would exercise for many hours, i would read, i would walk a lot, some three to four hours a day -- even in the room, you know, i would pace up and down timing myself."',
  'the 67-year-old grandmother of two said dwelling on her incarceration, 

In [None]:
!python -m spacy download en_core_web_lg

2023-12-29 04:28:47.019319: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-29 04:28:47.019406: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-29 04:28:47.022145: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully install

In [None]:
# Load the spaCy pipeline fetched by the download command above; get_embedding()
# reads it through this module-level name.
import spacy
embedder = spacy.load('en_core_web_lg')

In [None]:
# basic embeddings using averaged glove vectors
# using Spacy's large language model
def get_embedding(text):
    """Return the average of the token vectors of ``text``.

    The text is tokenised with the module-level spaCy pipeline ``embedder``
    and the per-token vectors are summed into a 300-element float array and
    divided by the number of tokens.

    Parameters
    ----------
    text : str
        Text to embed (a sentence or a whole document).

    Returns
    -------
    numpy.ndarray
        Array of shape (300,). For text that yields no tokens the result is
        the all-zero vector.
    """
    extract = embedder(text)
    total_sum = np.zeros(300)
    count = 0
    for token in extract:
        count += 1
        total_sum += np.asarray(token.vector)

    if count == 0:
        # No tokens (e.g. empty string): dividing would give 0/0 = NaN, and a
        # single NaN feature row breaks model fitting further down. Return
        # the zero vector instead.
        return total_sum

    return total_sum / count

In [None]:
# creating the inputs and expected outputs
# number of stories assigned to the train / validation / test splits
train_size, val_size, test_size = 900, 50, 50

def make_set(start_index, size):
    """Build sentence-level features and labels for a range of stories.

    For each of the ``size`` stories starting at ``stories[start_index]``,
    every sentence in ``data['story']`` becomes one example: its feature row
    is the sentence embedding concatenated with the embedding of the whole
    document (``data['story_text']``), and its label is the matching entry of
    ``data['scores']``.

    Parameters
    ----------
    start_index : int
        Index in the module-level ``stories`` of the first story to use.
    size : int
        Number of consecutive stories to use.

    Returns
    -------
    (numpy.matrix, numpy.ndarray)
        Feature matrix with one row per sentence, and the label vector.
    """
    X_set = []
    y_set = []

    for offset in tqdm(range(size)):
        data = stories[start_index + offset]

        # The document embedding is shared by every sentence of the story.
        doc_emb = get_embedding(data['story_text'])

        for index, sentence in enumerate(data['story']):
            sent_emb = get_embedding(sentence)

            x = np.concatenate((sent_emb, doc_emb))
            try:
                y = data['scores'][index]
            except (IndexError, KeyError):
                # Sentence without a score: keep the example and label it 0.0.
                # Only "missing score" is handled here; other errors
                # (including KeyboardInterrupt) now propagate.
                y = 0.0

            X_set.append(x)
            y_set.append(y)

    return np.asmatrix(X_set), np.asarray(y_set)

# Build the three splits from consecutive, non-overlapping ranges of `stories`:
# train starts at 0, validation right after it, test right after validation.
X_train, y_train = make_set(0, train_size)
X_val, y_val = make_set(train_size, val_size)
X_test, y_test = make_set(train_size + val_size, test_size)

100%|██████████| 900/900 [08:17<00:00,  1.81it/s]
100%|██████████| 50/50 [00:24<00:00,  2.02it/s]
100%|██████████| 50/50 [00:39<00:00,  1.28it/s]


In [None]:
def get_values(X, model):
    """Return ``model.predict`` applied to ``X`` converted to a plain ndarray."""
    return model.predict(np.asarray(X))

def get_loss(pred, y):
    """Euclidean norm of ``pred - y`` divided by the length of ``y``."""
    residual_norm = np.linalg.norm(pred - y)
    n_samples = np.shape(y)[0]
    return residual_norm / n_samples

# Prefix of the checkpoint file that train() writes (model_name + '_best_model').
model_name = "extractive_summarizer"

def make_parameters(train_size):
    """Pick the mini-batch size and the number of training steps.

    The batch size is fixed at 256; the number of batches is four times
    ``train_size / batch_size``, truncated to an integer. Both values are
    printed together with ``train_size``.

    Returns
    -------
    (int, int)
        ``(batch_size, n_batches)``.
    """
    batch_size = 256
    batches_per_pass = train_size / batch_size
    n_batches = int(4 * batches_per_pass)

    print(f"Total Number of Training Examples: {train_size}")
    print(f"Batch Size: {batch_size}")
    print(f"Number of Batches: {n_batches}")

    return batch_size, n_batches

def train(X_train, y_train, batch_size, n_batches, X_valid=None, y_valid=None):
    """Fit the MLP regressor on random mini-batches and keep the best snapshot.

    Each step draws ``batch_size`` row indices at random (with replacement),
    calls ``partial_fit`` on them and scores the model on the validation data
    with ``get_loss``. Whenever the validation loss improves, the model is
    pickled to ``model_name + '_best_model'``. After the last step the best
    snapshot is restored, written to Google Drive and returned.

    Parameters
    ----------
    X_train : numpy.ndarray
        Feature rows, indexed as ``X_train[idx, :]``.
    y_train : numpy.ndarray
        Targets aligned with the rows of ``X_train``.
    batch_size : int
        Number of examples drawn per step.
    n_batches : int
        Number of training steps.
    X_valid, y_valid : optional
        Validation features / targets. When omitted, the module-level
        ``X_val`` / ``y_val`` are used, as before. ``y_valid`` should be on
        the same scale as ``y_train``.
        NOTE(review): the notebook passes ``1000 * y_train`` while the default
        ``y_val`` is unscaled - confirm which scale is intended.

    Returns
    -------
    The model snapshot with the lowest validation loss.

    Raises
    ------
    RuntimeError
        If no step improved on the initial loss (for example
        ``n_batches == 0`` or a NaN loss), so no snapshot exists. Previously
        this case either failed on a missing file or silently loaded a stale
        checkpoint left by an earlier run.
    """
    if X_valid is None:
        X_valid = X_val
    if y_valid is None:
        y_valid = y_val

    model = mlp(hidden_layer_sizes = (1024, 2048, 1024, 512, 256, 256, 128, 64), max_iter = 1000)

    train_size = np.shape(X_train)[0]

    min_loss = 1e20
    checkpoint_path = model_name + '_best_model'
    # Best snapshot is kept in memory so the result never depends on whatever
    # file happens to be on disk.
    best_model_bytes = None

    for _ in tqdm(range(n_batches)):
        idx = np.random.randint(0, train_size, size = batch_size)

        X_select = X_train[idx,:]
        y_select = y_train[idx]

        model.partial_fit(X_select, y_select)

        sentence_predicted_scores = get_values(X_valid, model)

        loss = get_loss(sentence_predicted_scores, y_valid)

        # saving best model seen so far
        if loss < min_loss:
            min_loss = loss
            best_model_bytes = pickle.dumps(model)
            with open(checkpoint_path, 'wb') as file:
                file.write(best_model_bytes)

    if best_model_bytes is None:
        raise RuntimeError(
            "train() produced no checkpoint: no step improved the validation "
            "loss (n_batches=" + str(n_batches) + ")"
        )

    final_model = pickle.loads(best_model_bytes)
    drive_path = '/content/drive/MyDrive/extractive_summarizer.pkl'
    with open(drive_path, 'wb') as file:
        pickle.dump(final_model, file)
    return final_model

In [None]:
# make_set() emits one label per sentence, so the number of labels is the
# number of training examples.
train_count = y_train.shape[0]
batch_size, n_batches = make_parameters(train_count)

Total Number of Training Examples: 19223
Batch Size: 256
Number of Batches: 300


In [None]:
# Targets are multiplied by 1000 before fitting, so the model's predictions are
# on that scale as well.
# NOTE(review): train() scores each step against the global y_val, which is not
# multiplied by 1000 - confirm whether the validation targets should be scaled too.
model = train(np.array(X_train), 1000 * y_train, batch_size, n_batches)

100%|██████████| 300/300 [07:11<00:00,  1.44s/it]


In [None]:
# Hyperparameter for similarity threshold
theta = 0.95

def similarity(A, B):
    """Cosine similarity of two row vectors.

    Computes ``A @ B.T`` divided by the product of the two norms. For 1-D
    arrays the result is a scalar; for 1xN ``numpy.matrix`` rows it is a 1x1
    matrix. If either vector has zero norm the result is NaN.
    """
    return (A @ B.T) / (np.linalg.norm(A) * np.linalg.norm(B))

def get_top_k(X_doc, y, k, threshold=None):
    """Greedily pick up to ``k`` high-scoring, mutually dissimilar rows.

    Rows are visited in order of decreasing ``y``. A row is selected unless
    its similarity to an already selected row exceeds the threshold.

    Parameters
    ----------
    X_doc : 2-D array or numpy.matrix
        One feature row per sentence, indexed as ``X_doc[i, :]``. The
        similarity is computed on the full rows exactly as passed in.
    y : array-like
        Score of each row; higher scores are considered first.
    k : int
        Maximum number of rows to return.
    threshold : float, optional
        Similarity above which a candidate counts as redundant. Defaults to
        the module-level ``theta``.

    Returns
    -------
    list
        Selected row indices, best first; may be shorter than ``k``.
    """
    if threshold is None:
        threshold = theta

    order = np.flip(np.argsort(y))
    sentence_set = []
    for sent_id in order:
        consider = X_doc[sent_id, :]
        redundant = any(
            similarity(X_doc[chosen_id, :], consider) > threshold
            for chosen_id in sentence_set
        )
        if redundant:
            continue

        sentence_set.append(sent_id)
        # Later candidates can never change the first k picks, so stop as soon
        # as k rows are selected instead of scanning the whole document.
        if 0 < k <= len(sentence_set):
            break
    return sentence_set[0: min(k, len(sentence_set))]

In [None]:
# ROUGE scorer shared by the evaluation loop, where it is called as
# rouge.get_scores(generated_summary, gold_summary).
rouge = Rouge()

In [None]:
# evaluation
# testing out each document iteratively
# test set: document 'train_size + val_size' onwards

def join(lst):
    """Concatenate the strings in ``lst``, appending " . " after each one."""
    return "".join(elem + " . " for elem in lst)

def extract_rouge(rouge_dict):
    """Flatten a ROUGE score dict into a 9-element array of percentages.

    The order is rouge-1, rouge-2, rouge-l, each contributing its 'f', 'p'
    and 'r' values (in that order) multiplied by 100.
    """
    metric_names = ("rouge-1", "rouge-2", "rouge-l")
    field_names = ('f', 'p', 'r')
    scores = [
        100 * rouge_dict[metric][field]
        for metric in metric_names
        for field in field_names
    ]
    return np.asarray(scores)

start_doc_id = train_size + val_size
doc_count = len(stories)

generated_summary, gold_summary = 0, 0

# set the number of documents for testing
limit = test_size

# The model was fitted on targets multiplied by 1000 (see the train(...) call),
# so its predictions are divided by the same factor before being compared with
# the raw labels. Keep this in sync with the factor used at training time.
score_scale = 1000

summary_lengths = [3, 4, 5]

# averaging the ROUGE Metrics
# for different summary lengths
result = {str(k): np.zeros(9) for k in summary_lengths}

count = 0
all_summaries = []
sample_doc_text = ""

# Only the stories from start_doc_id onwards are test documents, so the bound
# has to account for that offset; otherwise the loop can index past the end of
# `stories`.
n_eval_docs = max(0, min(doc_count - start_doc_id, limit))

while count < n_eval_docs:
    X_doc = []
    y_doc = []
    doc_record = stories[start_doc_id + count]
    doc_emb = get_embedding(doc_record['story_text'])

    for index, sentence in enumerate(doc_record['story']):
        sent_emb = get_embedding(sentence)

        x = np.concatenate((sent_emb, doc_emb))
        try:
            y = doc_record['scores'][index]
        except (IndexError, KeyError):
            # sentence without a score: label it 0.0 (same rule as make_set)
            y = 0.0

        X_doc.append(x)
        y_doc.append(y)

    X_doc = np.asmatrix(X_doc)
    y_doc = np.asarray(y_doc)

    sentence_predicted_scores = get_values(X_doc, model)

    # per-document error, measured on the scale of the raw labels
    loss = np.linalg.norm(sentence_predicted_scores / score_scale - y_doc)

    print(loss)

    gold_summary = join(doc_record['highlights'])

    for k in summary_lengths:
        # ranking only depends on the order of the predictions, not their scale
        summary_sent_id = get_top_k(X_doc, sentence_predicted_scores, k)

        generated_summary = join([doc_record['story'][idx] for idx in summary_sent_id])

        scores = rouge.get_scores(generated_summary, gold_summary)[0]
        result[str(k)] += extract_rouge(scores)

    # generated_summary is the summary for the last (largest) k at this point
    summary_eval = {'doc': doc_record['story_text'], 'gen_summ': generated_summary, 'true_summ': gold_summary}
    all_summaries.append(summary_eval)
    sample_doc_text = doc_record['story_text']

    count += 1

# average over the documents that were actually evaluated
if count > 0:
    for k in summary_lengths:
        result[str(k)] = result[str(k)] / count

predicted = get_values(X_test, model)
test_loss = get_loss(predicted / score_scale, y_test)

# The sample shown is the last evaluated document, i.e. the one that
# generated_summary and gold_summary belong to.
print("Sample Output:")
print("Document:\n", sample_doc_text)
print("Generated Summary:\n", generated_summary)
print("Gold Summary:\n", gold_summary)

print("\nAll Metrics:\n")

data = []
for k in summary_lengths:
    lst = np.ndarray.tolist(result[str(k)])
    lst.append(test_loss)
    data.append(lst)

df = pd.DataFrame(data, columns = ['R1-f', 'R1-p', 'R1-r',
                                    'R2-f', 'R2-p', 'R2-r',
                                    'Rl-f', 'Rl-p', 'Rl-r',
                                    'Loss'], dtype = float)

df.index = ['glove top-3', 'glove top-4', 'glove top-5']
display(df)

# save results into a dataframe file
# df.to_csv(model_name + '_results.csv')

3.097774754062668
3.375584823216594
3.1262355444243584
4.2112417264576525
2.283424486765786
2.481217421137601
1.833497198126616
3.856162822917234
1.9256955350652396
1.9541386672276198
2.9774919755620313
1.692843118200946
1.7203849897004655
4.993715811102012
3.067279556557946
2.4520579801859843
2.8467154027430146
2.5651704386178755
3.211432967830303
2.3813479910990134
3.237939536766949
3.1839254145149667
5.307474274982902
2.811395233803725
2.221701780375956
3.1519597292489157
3.713573460475412
3.0763492313582397
3.4682471975128277
2.891407630077076
2.9195640958425026
5.1567066241146815
3.572749656817841
2.543769322202955
5.11975857177629
2.906958366781263
2.6965679444143102
4.037941896874088
2.001659453318919
2.97994376965974
3.054287737049449
3.6770749075014284
2.459209341254176
2.493887556422647
3.747481188716583
3.038710813228341
3.243248651958497
2.6464417598554038
3.6702100637558854
2.1990987077359
Sample Output:
Document:
 -- in the wake of the earthquake in haiti, george clooney 

Unnamed: 0,R1-f,R1-p,R1-r,R2-f,R2-p,R2-r,Rl-f,Rl-p,Rl-r,Loss
glove top-3,20.585508,17.175541,28.386594,3.572463,2.834532,5.311748,19.32487,16.171924,26.582053,0.021697
glove top-4,20.575442,15.719621,33.385626,4.020607,2.924329,7.799534,19.27261,14.749915,31.202568,0.021697
glove top-5,20.861253,15.091167,37.972931,4.497306,3.088513,9.974654,19.607727,14.198037,35.709369,0.021697


In [None]:
# Inspect the first stored evaluation record (document, generated summary, gold summary).
all_summaries[0]

{'doc': 'philadelphia, pennsylvania (cnn) -- i wore a path between washington and philly for interviews and shoots for our recent "cheating death" special with dr. sanjay gupta..in the field: jeremy harlan.my job as a photojournalist is to shoot and edit stories for cnn and make sure my news team eats well on the road..on assignment: philadelphia, pennsylvania.now, i love cheese steaks as much as the next guy. but my arteries can only take so much steak, cheese and peppers. so here are a few places that are great alternatives in the city of brotherly love..reading terminal market.hours: 8 a.m.-6 p.m. monday through saturday; 9 a.m.-5 p.m. sunday.cuisine: you name it, the market has it.how do i describe the reading terminal market? it\'s like the las vegas of food. everywhere you turn, there\'s something interesting to see, smell and taste. it is food sensory overload..the hardest part of going to the market is not eating the very first thing you see. give yourself 15 minutes to walk th

In [None]:
# my_text = """All right, so we had quite a bit of reading today. Chapter one of, and we're gonna start with Baker, Introduction to Old English, and move on to the little bit of reading that you had in Mitchell and Robinson. So chapter one on the Anglo-Saxons and their language, this mostly just went into quite a bit more depth of my, a deeper version of my very quick snapshot micro history lesson from the first day of class. Any questions about chapter one about introduction to Old English Baker, chapter one? Yeah, go ahead. Oh, that's not in chapter one. Chapter one is just the history. We're going through in order. So everybody, please take out their Baker textbook because this is, we need this, whether you have the opening files from on Canvas or soon the book itself. All right, so chapter one was just the history. Chapter two So I'm gonna go through the kind of greatest hits of this of this chapter and we'll and chapters three and four and then we will we will I'll take questions as they as they arise So one of the things one of the most important things on page 12 is that length, vowel length, is important in pronunciation. So when you see this long mark over the E of hey, you really actually hold it hey as in, as longer than if it did not have that long mark. So listen to yourself, they give a good example in the parenthetical on page 12. Listen to yourself as you pronounce beat and bead. We naturally hold the E longer when we say bead versus beat. And that's the kind of difference in length that you're going to want to observe when you say a long vowel versus a short vowel in Old English. Vowels in Old English, Old English has seven vowels functionally. It has the five that we're of, plus ash and why. And they are pronounced, I'm going down the list here on page 13. So the only hard or the only non-modern English sound of those is the last one, the Y, which is pronounced, as if you're saying, tu, or dure. So, kining, not kining, kining, kying, brid, bride. 
The other most important thing about these vowels is the ash, that A ligature, which when I write it on the board, I'll write like that. This vowel is always pronounced A, as in cat. Make sure to pronounce it that way, even when it's hard to do so. So, quatt, that first word of that first email I sent you at the beginning of the semester, that becomes modern English. What? Quatt is not that hard to say. Quatt, however, the old English word for where, is hard to say. But still stay with me in Duckland, quack, quack, quack, quack, quack, even when it's hard, quack, all right? You don't need to worry about i.e. that last of those vowels on page 13. One last thing at the bottom of page 13, and it's hard to do because our modern English inclination to not pronounce final vowels precisely, final unack sounded vowels precisely is so strong. We tend to reduce everything to a schwa, that sort of neutral-a sound. But it is important because in Old English, that date of singular cune-ing-gay ends in an E versus genitive plural cune-ing-gah, right? That last vowel, that is the only way to distinguish those two different versions of that word. So just when you're reading out loud, try to distinguish those vowels at the ends of words. Diff thongs on page 14. The most important thing to remember here is to really say these as one syllable, not two syllables. So, Bae-wolf, that E-O of Bae-wolf, that should be it. The name is two syllables. Bae-wolf, right? The E-A represents a diphthong that started with A and glided to A. This is hard to do in modern English. There's a reason that we don't pronounce words like this anymore, but that word, phylon, rad. I was taught to pronounce it sort of ea rather than ash a. ea is a lot easier to say frankly, but try'sh A and see if you can get it into your voice. So file on rad. Yeah, Tara. Top of 14. Any questions about vowels or diphthongs? All right, consonants, I'm not gonna go over all of these in detail. 
The most important thing to remember is just that there are no silent consonants. So if it's there on the page, say the letter. Let's see. I think the most challenging ones, and we'll get to these in a little bit, the most challenging ones are the C and the G, just because they have such a range of pronunciation. C, this is number six on page 15. C is pronounced k when it has no little dot over it and chah if it does. But it's never pronounced s. G is trickier. So G has several options. Dotless G is pronounced g, as you would expect, when it comes at the beginning of a word or a syllable. Between voiceless sounds, dotless G is pronounced G. So like, dog-ass. You just kind of, this G, you just sort of sort of swallow it. So instead of dog-goss it's like dog-goss Everyone try that dog-goss? Dog-goss? Yeah, okay. We'll get practice later on CG is pronounced J so bridge bridge edge number eight there um and H is always H at the beginning, but it all at the end it has that h sound. So nicht, naach, ther, dwerf, for dwarf. And SC is usually pronounced, shh. So what looks like skip is actually ship. What looks like ask is actually ash. Rushan, there are a few exceptions to this, but that's pretty safe. Any questions on these? We'll get practices we all read aloud together. Oh, by the way, I did put up a video of me reading the Old English Roon poem on Canvas. So if you haven't checked that out, it's a good way to just get the sound of Old English in your ears as you're working at home. All right, so more about vowels now, 2.2 at the bottom of page 16, leading into 17. I'm mutation on page 17. I'm mutation is a shift in the quality of a vowel, so it's pronounced with the tongue higher and farther forward than usual. And there's actually a useful diagram on pages in Mitchell and Robinson. On page 22 of Mitchell and Robinson, it has a kind of a diagram of where in the mouth the tongue is located when these vowels are pronounced. You might find it useful. 
What's nice about table 2.1 is that it gives you the kind of key for how unmutated vowels become mutated vowels. You do not need to memorize this table. It will never show up on a quiz. but I do encourage you to sort of practice making those sounds, those shifts from A to A, A to E, O to A, O to E. The reason for doing that is because it will help you recognize a lot of irregular, English nouns take I'mutation in the plural. This is how we go from mouse to mice, goose to geese. That's the process. And you can see goose that O becomes E, anyway. Don't need to worry about it too much right now though. One little point I will note on page 18 though is that some modern English words have the way that I mutation works often leads to ash going turning into EA. So for example, you see halon becomes there. That will become heel in modern English. Similarly, rad will become read in modern English. Again, not something you need to memorize, but just something to know. Like if you're guessing about the meaning of a word and you see an ash, try turning it into an EA and see if it makes sense. This process, by the way, whereby the vowel changes, but the consonants don't, that's very typical of the linguistic evolution of English. Vowels are much, much more inclined to change than consonants, which is not to say that consonants don't change, they do, but it's just something to remember. Any questions so far? All right, page 19, take a look at the fuller middle of the page, the fuller explanation, well, top-ish in middle of the page, these fuller explanations of C and especially G. I've already gone over the difference in G word initial as goad, glad versus the kind of swallow G of doggos, sorgorgha, seagahn, seagahn. Very importantly, that dotted G is almost always pronounced Y. So Yistren die for yesterday. Slayin, sl. May. May. Sale. Meaning sale. Salehudah. 
The only difference really is certain, when you have N and then dotted G right after it, it's pronounced just like in modern English angel, Sangia. And then I think that's about it that, oh, accentuation, this is also important, bottom of page 20. This makes life so much easier. All Old English words are accented on the first syllable. Except, it's a pretty limited exception. Words with that prefix GE, or the YI, the this prefix. Here, we pronounce the, we accent the second syllable. But other than that, everything is pronounced on the first syllable. Only other exception to that is verbs with prefixes are accented after the prefix. So the examples that they give down there for waer than accented on the second syllable, but the noun derived from it for weird is still accented on the prefix. So when in doubt, accent the first syllable. And then you have a little summary of the pronunciation rules on the bottom of in 2.7 on the bottom of page 21. All right. Any questions there? Okay, basic grammar, a review. I'm not gonna go into, I not going to go through all of this, but I am going to emphasize the importance of understanding the difference between a clause and a phrase, okay? So a phrase is a cohesive group of words that lack a subject and a verb, whereas, or rather a subject in a finite verb, a clause is a group of words that has a subject in a finite verb, okay? And a finite verb just means like a conjugated verb that's inflected as distinct from an imperative or an infinitive or something like that. Okay? So, in the example that I put up on the board here, I put it on the table that I bought in Greece. Let's find some phrases and clauses. The reason it's important to be able to find phrases and clauses in any sentence is that every time you translate an old English sentence, you are looking for the main for the subject and the main verb. So what is the main verb of this sentence? Yeah. Yeah, exactly. So here's the subject and here's the verb. 
And yet we have other verbs, right? What is the other verb? Bought exactly. So we know that there are how many clauses in this sentence. Two, exactly. There will always be as many clauses as there are subjects and verbs. But this one is governing this one. How is it doing so? What are the phrases that we can see and what kinds of phrases are they? Yeah, go ahead. Okay. All right, good. Very good. So here we have a noun clause. Here we have a long prepositional phrase with an embedded clause and then a further embedded phrase right there. Good. You don't need to worry too much about the different kinds. I'm never going to test you on, is this a noun clause? Is it an adverb clause? Whatever. That I do not care about. What I do care about is your ability. And you don't have to diagram sentences like these on exams. But you will need effectively to be able to diagram the sentences in order to be able to translate correctly. Okay? So I'll highlight the bottom of page 25. This is in 3.13. The finite clause must contain a finite verb. In general, finding and understanding the finite verb is the key to decoding complex clauses and sentences in Old English. And so it is essential that you get familiar with the finite verb paradigms. You will memorize the finite verb paradigms and do course. I say this as a preview of coming attractions. The reason that the verb is more important than the subject in Old English is that the verb is always there, whereas the subject can be implied as it is in some modern languages as well. All right. Questions? I'll just say very quickly on page 26 when it says the past participle is also used to form a paraphrastic passive. paraphrastic just means using multiple words to carry the meaning. So using the words, the words most lovely as opposed to loveliest, that's an inherently paraphrastic way of speaking. So the king was slain as opposed to just the king died, et cetera. Let's see. Any questions leading up to 3.4 on page 30? All right. 
Subjects, the elements of the sentence or the clause. So the subject names what the sentence or clause is about. It may be a noun, a pronoun, a noun, phrase or a list. I mean, namely a compound subject. Top of page 31, pay close attention here. In Old English, as in modern English, subjects can be simpler complex. So what's different though is that in Old English, a compound subject can be split. If I say in modern English, my friends and I are going to the store, I would never split up my friends and I in the sentence. That would be very, very strange. I would not say my friends are going to the store and I. In Old English, they do that all the time. So, for example, my shield protects me and my sword. In modern English, that sentence is unambiguous, if a little weird. The me and my sword are the compound direct object of protects. In old English, however, my shield protects me and my sword. The only way you're going to be able to determine whether my sword is, as it is in modern English, an object of the verb protect, or part of the compound subject, i.e. my shield and my sword protect me, the only way you're going to be able to distinguish is based on the case, the grammatical case of the word sword, all right? Which is one of the reasons that knowing the paradigms is so absolutely crucial to being able to decode these sentences. Old English differs from modern English in that it often omits the subject when the context makes it obvious what it is. It says there. I would actually say that's still at the top of page 31. I would actually say that Old English often omits the subject even when context does not make it particularly clear what it is, which especially in poetry, which is one way that poets, Old English poets could get kind of nice literary mileage out of the inherent ambiguity of the language, but it also makes things harder for us as non-native speakers and readers of the language. So bear these things in mind as you're moving forward. Yeah. 
In the case of only one shut up here, you have like me on my shield, I see my sword. Presumably this would all be reflected in the verb and that like you would expect the verb to reflect for plural. Is it going to come from conflict? Excellent. Yes. And to ask if Arabic does weird things with this were like it might not reflect clearly just for kicks. No, you're absolutely right. It's a totally great point. You're right. We would be able to distinguish based on in this case, but not necessarily in others. Excellent point. Because if it were say my companions protect me and my right, then it would still be ambiguous. But very good right. Next thing I want to highlight is the top of page 33. Because the direct object is usually defined as the noun, this is from the previous page, usually defined as the pronoun, or noun phrase that receives the action of the verb. They explain some sort of subtleties of that. Top of page 33. In modern English, the direct object usually follows the verb and never has a preposition in front of it. In old English, the direct object may fall over the verb, but it may also precede the verb, especially when the object is a pronoun. Generally, it's in the accusative case, but not always. And then an indirect object is a thing that has some indirect relationship to the action of the verb. Basically, the indirect object is anything that is not a direct object. So if you, and you really don't need to worry about, again, I'm not going to say what is the direct object. I'll just expect you to be able to translate. But the variable word order that is alluded to at the top of page 33, the direct object may follow the verb, it may also precede it. This is something that we're going to hear a lot. Word order is extremely flexible in Old English compared to modern English. And the reason for that is that it's a case-based language, right? We learn a lot of the features, the functions of the word in a given sentence, based on the case endings. 
Modern English has lost almost all its case endings. Who and whom is one sort of increasingly archaic exception to that, where who is the subject, whom is the object? But apart from and he, him, right? There are, she, her, there are, we do have cases, but increasingly they are obsolete and not, they're only in pronouns, not in nouns. But because Old English communicates so much more information through the endings of the words themselves, you can have word order that is what we would think of as unnatural. And you have to kind of rearrange the pieces of the sentence, the pieces of the puzzle as it were in order to decode it. We'll come back to this again and again, but it's worth telling yourself from the beginning that this is true. Any questions about chapter three and this very basic, very quick grammar review? All right, let's move on to chapter four and cases. So page 34, case is the inflection of nouns, pronouns and adjectives to signal their function in sentences and clauses. This is exactly what we were talking about just before. Modern English case has almost disappeared. Again, except in pronouns. The cases in Old English are nominative, accusative, genitive, date-ive, and the increasingly archaic as we move. It's actually archaic, even a little archaic, even from the beginning of our written record, the instrumental. So the nominative case is quite easy because it has, as Baker says, had has few functions and there are few complications. Basically, it's the subject. It's also the complement. So the complement is the word on the other side of a linking verb such as to be. So sayosunna is swi-da-broad. The sun is very broad. Broad there is in the nominative because it is the complement of su-na. And then since Old English does not have a vocative case like some languages. The nominative is used for direct address as well. But those are really the only things that it gets used for. 
The accusative case is also pretty straightforward because direct objects of transitive verbs are typically in the accusative case. So thus, in the sentence at the top of page 37, his aga and swastore the burriida, his leech, his own sister buried his corpse, leech is in the accusative. Some prepositions always take the accusative, no matter what. And sometimes you have an accusative used adverbially in expressions of time, but you don't need to worry too much about that. All right, the genitive case modifies or limits a word, this is page 37, by associating it with something. So for example, in the phrase, fast-kinningus sweared, the king's sword, the sense of sword is modified by our saying that it belongs to the king. We're not speaking of any just sword, or just any sword. Most genitives will fall into one of three categories. The possessive genitive is the one that we're most familiar with. The King's sword, Saint Edmund's feast day. But the partitive genitive is an extremely important because extremely common use of the genitive in Old English. So for example, Alch, Tha-Ramanna, each of the men, Elra, Kuningah, Best of all kings. As the translations with Av suggest, we have a roughly similar construction made with the preposition of, but Old English doesn't need the preposition because it has the genitive case to sort of embed that OV indirectly into the word itself. So make sure that you're on the lookout for that. And notice already, just in those two examples, how in one case the partitive genitive comes after the adjective, ouch, ail ra mam na. And in the other one, ail ra kuning best, the partitive genitive comes before. Again, typical. You've just got. So one piece of advice I like to give is to kind of like, we have very rigid word order in modern English. In order to read old English, well, you kind of have to almost like, I think of it as like, relaxing. Our sense of what word order can do. 
And you have to develop a kind of almost like flexibility of mind in saying, OK, well, what if this word goes with that word, even though they're separated by like an entire line of poetry and so on and so forth? Finally, the third element of the, of the, or third aspect of the genitive is the descriptive. So that lump shall bound, weatus, hewis. The lamb must be of a white color. And I mean, they say it's more idiomatic to say is white in color. Sure, I guess. But like, that's still, I don't know, that still sounds basically right to me. The lamb is of a white color. Maybe that is a little weird. In any event, people in Old English did it all the time. So those are the three. And then the last little bit, the last little point at the top of page 38. A few prepositions take the genitive case. It's rare, but it can happen. And a few verbs have genitive direct objects. And that's just something you'll have to learn as you learn the verbs. Some verbs take the, most verbs take the accusative. A very few take the genitive. Quite a few actually, as we'll see, take the date of. So any questions on nominative genitive, nominative, accusative or genitive before we go on to dative, which is the sort of catch all? All right. So 4.2.4, page 38. In all of the Germanic languages, the dative case is in a malgum of several older cases that have fallen together. So date of, lockative, ablative, and instrumental. Pre-written versions of Germanic languages, we think had all of these cases, but for the most part, they've all sort of like collapsed into the data by the time that we're working. This is good, and as much as it means you have fewer paradigms to memorize, it's bad in the sense that the data can do almost anything. We will discover, all right? Or it's not bad. It's confusing potentially. So I'm going to go through some of the most important uses of the date of case, and then we'll take questions. 
So the date of interest signifies that in some way that one is in some way interested in the outcome of an action, this category includes the indirect object. So Yifim has sweared. But the date of of interest also covers situations in which something has been taken away and is therefore sometimes called the date of disinterest. Benam heihim, his bishop, Shira, he took his bishop, bishop Rick away from him. One other aspect, one other function of the date of case that actually, oh yeah, there it is at the bottom of the page, I'll get back to that in one sec. The direct object, so some verbs have their direct objects in the date of case, I already, I mentioned that briefly before. It's not that uncommon actually. So just, yeah, and you'll just have to look up the word in order to know. Fortunately, our glossaries indicate which case the verb will take as a direct object. So for example, here on, which is the example that they give right there, oh I guess they gloss it under. It takes a little while to get the hang of using old English glossaries because there's a lot of words can be spelled a lot of different ways. There's no such thing as standardized spelling quite yet. But if you look up Huron on page 333 of the glossary to hear, to listen to, and then to obey, and then it includes in parentheses with the data. So the glossary will guide you."""



In [None]:
# Input article for the summarization cells below: a multi-paragraph text whose
# paragraphs are separated by newlines. Assigning here replaces whatever
# `my_text` held before this cell ran.
my_text = """ In India, 2023 will be remembered as the year we went to the Moon. On 23 August, massive celebrations broke out across the country when Chandrayaan-3 touched down in the lunar south pole region - an area on the Moon's surface that no-one had reached before. With this, India also joined an elite club of countries to achieve a soft landing on the Moon, after the US, the former Soviet Union and China. In the following months, India continued its stride into space - by sending an observation mission to the Sun and then by carrying out a key test flight ahead of its planned mission to take astronauts into space in 2025. We look back at an eventful year when India's strides into space made global headlines. It was "20 minutes of terror" for scientists at the Indian Space Research Organisation (Isro) as the Vikram lander, carrying the Pragyaan rover in its belly, began its descent to the Moon's surface.
The lander's speed was gradually reduced from 1.68km per second to almost zero, enabling it to make a soft landing in the south pole region where the surface is "very uneven" and "full of craters and boulders".
"India is on the Moon," a triumphant Isro chief S Somanath announced - and with that the country entered the history books.
Over the next 10 days, space scientists - and the rest of the country - followed every move made by the lander and the rover as they gathered data and images and relayed them back to Earth for analysis. So we saw images of the six-wheeled rover sliding down from the lander's belly and taking its first steps on the lunar soil. Moving at a speed of 1cm per second, it "traversed over 100m [328 feet]" and at times re-routed to avoid falling into craters.
Some of their findings that show a sharp difference in temperatures just above and below the lunar surface and confirmed presence of a host of chemicals, especially sulphur, in the soil have enthused space scientists and the scientific community at large. One of the highlights, Isro said, was Vikram's "hop experiment". The agency said that when the lander was "commanded to fire its engines, it rose up by about 40cm [16 inches] and landed at a distance of 30-40cm". This "successful experiment" means the spacecraft could be used in future to bring samples back to the Earth or for human missions, it added.
And earlier this month Isro said it had successfully brought back into Earth's orbit a part of the rocket that carried Chandrayaan-3 to Moon.
The "propulsion module", which had detached from the Vikram lander after ferrying it close to the Moon, had re-entered Earth's orbit after a series of complex manoeuvres.
Together, the hop experiment and the return of the propulsion module to Earth's orbit are crucial for Isro's future plans to bring back samples or return astronauts from Space.
"""

In [None]:
from spacy.pipeline.sentencizer import Sentencizer
from spacy.lang.en import English

# Build an English pipeline whose only added component is the rule-based
# sentence splitter; it is used below purely to segment `my_text`.
# NOTE(review): this rebinds the global `nlp`. If an earlier cell bound `nlp`
# to a different pipeline that other helpers read, they will now see this
# one instead - confirm that is intended.
nlp = English()

# `add_pipe` creates the component from its registered name and returns it
# (the cell previously displayed that returned Sentencizer object). Keep a
# handle on the instance that is actually in the pipeline rather than
# constructing a separate, unattached `Sentencizer()`; assigning the result
# also avoids echoing a memory-address repr as cell output.
sentencizer = nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7dce98deb700>

In [None]:
# Remove the line breaks (text on either side of a break is joined with nothing
# in between), run the pipeline, and collect every detected sentence with its
# surrounding whitespace trimmed.
single_line_text = my_text.replace("\n", "")
doc = nlp(single_line_text)
my_text_sentences = [span.text.strip() for span in doc.sents]

In [None]:
# Keep only sentences that are at least 20 characters long; shorter fragments
# are dropped. Both names end up referring to the same filtered list.
my_new_text_sentences = [
    candidate for candidate in my_text_sentences if len(candidate) >= 20
]
my_text_sentences = my_new_text_sentences

In [None]:
# Show how many sentences `my_text_sentences` currently holds.
len(my_text_sentences)

18

In [None]:
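# Optional (currently disabled): load a previously pickled model from Drive.
# Note: the loader below binds `loaded_model`, whereas the scoring cell that
# follows reads `model`.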

# pickle_file_path = '/content/drive/MyDrive/extractive_summarizer.pkl'
# with open(pickle_file_path, 'rb') as file:
#     loaded_model = pickle.load(file)

In [None]:
# Embedding of the whole text; it is appended to every sentence embedding below.
text_emb = get_embedding(my_text)

# One feature row per sentence: the sentence embedding followed by the
# full-text embedding, stacked into a matrix.
my_x = np.asmatrix([
    np.concatenate((get_embedding(sentence), text_emb))
    for sentence in my_text_sentences
])

# Score the rows with `model`, then pick `k` of them.
# NOTE(review): `get_values` and `get_top_k` are not defined in this cell;
# `get_top_k` presumably returns the indices of the k best-scoring rows
# (its result is used to index `my_text_sentences` below) - confirm.
predicted_scores = get_values(my_x, model)
k = 5  # number of sentences to keep (10 was tried earlier)
summary_sent_id = get_top_k(my_x, predicted_scores, k)

# Sort the selected indices in place so sentences are emitted in ascending
# index order, then hand them to `join` (a helper not defined in this cell,
# not `str.join`) to build the summary string.
summary_sent_id.sort()
generated_summary = join([my_text_sentences[pos] for pos in summary_sent_id])

In [None]:
# Display the summary string built in the previous cell.
generated_summary

'In India, 2023 will be remembered as the year we went to the Moon. . The lander\'s speed was gradually reduced from 1.68km per second to almost zero, enabling it to make a soft landing in the south pole region where the surface is "very uneven" and "full of craters and boulders". . Over the next 10 days, space scientists - and the rest of the country - followed every move made by the lander and the rover as they gathered data and images and relayed them back to Earth for analysis. . Moving at a speed of 1cm per second, it "traversed over 100m [328 feet]" and at times re-routed to avoid falling into craters. . This "successful experiment" means the spacecraft could be used in future to bring samples back to the Earth or for human missions, it added. . '