In [35]:
import csv
import string
from collections import Counter
from collections import OrderedDict

import spacy
from spacy.matcher import Matcher

In [10]:
# Load the Spacy model
nlp = spacy.load("en_core_web_lg")
verbs_counter = Counter()

# Process the text file
with open('unread_messages.txt', 'r', encoding='utf-8') as file:
    for line in file:
        doc = nlp(line.strip())

        # Iterate over sentences in the document
        for sent in doc.sents:
            # Find the root verb in the sentence if it's in its base form
            for token in sent:
                if token.dep_ == "ROOT" and token.pos_ == "VERB" and token.text.lower() == token.lemma_.lower():
                    # Increment the count for the verb
                    verbs_counter[token.lemma_.lower()] += 1

# Sort the verbs first alphabetically and then by frequency
sorted_verbs = sorted(verbs_counter.items(), key=lambda x: (x[0], -x[1]))

# Output the verbs and their counts
for verb, freq in sorted_verbs:
    print(f"{verb}: {freq}")

# If you need them in a list:
sorted_verbs_list = [(verb, freq) for verb, freq in sorted_verbs]

#respect: 1
%: 1
+1: 2
-: 6
-----: 2
-->join: 5
-->sign: 1
-just: 1
-live: 1
0jjqndcdinc/0ydqvtc00ldqstgg: 1
0jlrgnc10lzrjydqtdgj0lug0lxrgdgc0ywhinci0ysg0yprgdc/0lxqstcw0lxringminc/0ydq: 1
0kjqunga0l7qutc+0lpqvidqv9ga0ljqvnc10l3qtdc90ljrjydqmncyincyingb0ytqtdga0lug: 1
0l0k0jvqsnc80lhqtdga0yi8l2zvbnq+pc9hpgo8l3rkpgo8l3rypgo8dhi+cjx0zcbhbglnbj0i: 1
0l3qtsdqv9gd0lhqu9c40lrqvtcy0ldqstgi0ljqtdgb0y8k0l3qvtcy0lxqu9c70ysicmhyzwy9: 1
0l3quncz0lggpc9kaxy+idwvdgq+idwvdhi+idwvdgjvzhk+idwvdgfibgu+idwvdgq+idx0zcbh: 2
0l3qunga0l7qstcw0ylrjcdquczuynnwo9c/0l7rgdgh0ljrgtcw0ylrjcdqt9cwinc+0ltqunc9: 1
0l3qvtc1inc/0ljrgdgm0lzqvidqvdc1jm5ic3a70ylrgnc10lhrg9c10yig0l7rgtcy0lxrgtcw: 1
0l3rgtc10ydqsnc60ylquncy0ldrhs4gnczuynnwo9c00l3rjywgmjcmbmjzcds0ndumbmjzcdvr: 1
0l3ri9gfidwvyt4gpc90zd4gpc90cj4gphrypia8dgqgywxpz249imnlbnrlcii+idxhighyzwy9: 2
0l3ri9gfinc00ldqvdc90yvrhsa8l2e+idwvdgq+idwvdhi+idx0cj4gphrkigfsawdupsjjzw50: 1
0l7qs9c00lag0l3qtszuynnwo9cx0ypqtnc10yig0ydqttcw0llqtdgc0ywuincf0l7rjdgc0l7q: 1
0l7qs9c00la

In [24]:
sorted_verbs_list = [[verb, verb[0], freq] for verb, freq in sorted_verbs if len(verb) < 40 and freq > 7]
sorted_final = sorted(sorted_verbs_list, key=lambda x: (x[1], -x[2]))


In [74]:
filename = "verbs_raw_second_version.csv"

with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(sorted_final)

In [75]:
def format_sentence(sentence: str, verb: str):
    return sentence.replace(verb, f"**{verb.upper()}**")

def extract_sequence(sent: str, verb: str, window=10):
    words = sent.split()
    for i, word in enumerate(words):
        if word.lower() == verb.lower():
            # Calculate the start and end indices for slicing the words list
            start = max(i - window, 0)
            end = min(i + window + 1, len(words))
            # Return the sequence as a string
            return " ".join(words[start:end])
    return None

def process_chunk(text_chunk, verbs, sequences_for_verbs: dict):
    doc = nlp(text_chunk)
    for sent in doc.sents:
        sentence_text = sent.text.strip()
        for verb in verbs:
            if verb in sentence_text.lower():
                sequence = extract_sequence(sentence_text, verb)
                if sequence:
                    sequences_for_verbs[verb][sequence] = None

In [77]:
verbs = [el[0] for el in sorted_final]
print(verbs)
sequences_for_verbs = {verb: OrderedDict() for verb in verbs}

chunk_size = 100000
with open('unread_messages.txt', 'r', encoding='utf-8') as file:
    text_chunk = file.read(chunk_size)
    while text_chunk:
        process_chunk(text_chunk, verbs, sequences_for_verbs)
        text_chunk = file.read(chunk_size)

['answer', 'add', 'apply', 'ask', 'appear', 'attend', 'allow', 'avoid', 'appreciate', 'agree', 'accept', 'assume', 'actualize', 'achieve', 'affect', 'act', 'approach', 'activate', 'analyze', 'access', 'adjust', 'afford', 'automate', 'arrive', 'aim', 'align', 'argue', 'accomplish', 'attack', 'await', 'become', 'buy', 'build', 'believe', 'browse', 'bring', 'begin', 'break', 'breathe', 'benefit', 'boost', 'bet', 'beat', 'book', 'click', 'check', 'control', 'create', 'change', 'come', 'choose', 'call', 'contact', 'consider', 'continue', 'cover', 'cancel', 'contain', 'connect', 'celebrate', 'complete', 'copy', 'claim', 'care', 'catch', 'commit', 'code', 'cut', 'cause', 'chat', 'consult', 'close', 'confirm', 'count', 'contribute', 'carry', 'co', 'compare', 'cost', 'consist', 'capture', 'challenge', 'cherish', 'combine', 'collaborate', 'do', 'discover', 'download', 'discuss', 'donate', 'describe', 'draw', 'develop', 'decide', 'dive', 'design', 'define', "don't", 'determine', 'drop', 'deploy',

In [78]:
new_seq = dict()
for verb, sent in sequences_for_verbs.items():
    new_sent = list(sent.keys())
    new_sent = [el.lower() for el in new_sent]
    new_seq[verb] = new_sent

In [81]:
with open('final_raw_unread.txt', 'w', encoding='utf-8') as file:
    # Iterate through each word and its list of sentences
    for word_idx, (word, sentences) in enumerate(new_seq.items(), start=1):
        file.write(f"{word_idx}. {word.upper()}, {len(sentences)}, \n")
    file.write('='*10)
    file.write('\n')
    file.write('='*10)
    file.write('\n')
    for word, sentences in new_seq.items():
        file.write(word.upper() + '\n')
        file.write('-'*25)
        file.write('\n')
        for i, sentence in enumerate(sentences, start=1):
            if 'answer from' in sentence:
                continue
            file.write(f"{i}. {sentence}\n")

        file.write('\n')
        file.write('-'*25)
        file.write('\n')




In [80]:
for v_idx, (verbs, _, freq) in enumerate(sorted_final, start=1):
    print(f'{v_idx}. {verbs}, {freq}')

1. answer, 1616
2. add, 402
3. apply, 324
4. ask, 267
5. appear, 143
6. attend, 106
7. allow, 87
8. avoid, 77
9. appreciate, 70
10. agree, 58
11. accept, 45
12. assume, 42
13. actualize, 38
14. achieve, 33
15. affect, 29
16. act, 24
17. approach, 24
18. activate, 20
19. analyze, 20
20. access, 19
21. adjust, 19
22. afford, 18
23. automate, 18
24. arrive, 17
25. aim, 14
26. align, 13
27. argue, 13
28. accomplish, 12
29. attack, 11
30. await, 11
31. become, 495
32. buy, 298
33. build, 257
34. believe, 198
35. browse, 151
36. bring, 145
37. begin, 137
38. break, 37
39. breathe, 29
40. benefit, 21
41. boost, 20
42. bet, 19
43. beat, 18
44. book, 16
45. click, 1605
46. check, 1011
47. control, 848
48. create, 335
49. change, 291
50. come, 225
51. choose, 164
52. call, 142
53. contact, 124
54. consider, 123
55. continue, 115
56. cover, 92
57. cancel, 86
58. contain, 79
59. connect, 62
60. celebrate, 60
61. complete, 57
62. copy, 57
63. claim, 56
64. care, 40
65. catch, 37
66. commit, 37
67. 