In [2]:
import os

import nltk
import spacy
from nltk import Tree
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer

from SynSetFactory import SynSetFactory



# Stemmer used when building the per-token "text-stem" strings.
ps = PorterStemmer()

# Module-level accumulators filled by the feature-extraction cell.
# Each name is bound to its own, initially empty, list.
sentences, tokens, lemmas, stems = [], [], [], []
pos_tag, dependency_parser, relations, parse_tree = [], [], [], []

def wordnet_features(word):
    """Collect WordNet lemma names related to ``word``.

    Every synset returned by ``wordnet.synsets(word)`` contributes its own
    lemma names (synonyms) plus the lemma names of its hypernyms, hyponyms,
    part-meronyms and part-holonyms.

    Returns a dict with the keys ``synonym``, ``hypernym``, ``hyponym``,
    ``meronym`` and ``holonym``. Each value is a single string in which every
    collected name is followed by a comma (so a non-empty value ends with
    ``','``); a relation with no names yields ``''``.
    """
    # Result key -> name of the synset method that yields the related synsets.
    relation_accessors = (
        ('hypernym', 'hypernyms'),
        ('hyponym', 'hyponyms'),
        ('meronym', 'part_meronyms'),
        ('holonym', 'part_holonyms'),
    )

    collected = {'synonym': [], 'hypernym': [], 'hyponym': [], 'meronym': [], 'holonym': []}

    for sense in wordnet.synsets(word):
        collected['synonym'].extend(entry.name() for entry in sense.lemmas())
        for key, accessor in relation_accessors:
            for related in getattr(sense, accessor)():
                collected[key].extend(related.lemma_names())

    # Keep the trailing comma after every name, including the last one.
    return {key: ''.join(name + ',' for name in names) for key, names in collected.items()}

# Loaded spaCy pipelines keyed by model name, so each model is loaded once.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name``, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def dependency_parse_tree(sentence):
    """Build an NLTK tree from the dependency parse of ``sentence``.

    The text is parsed with the ``en_core_web_sm`` spaCy pipeline. The pipeline
    is loaded once and reused; loading it on every call made per-sentence use
    of this function extremely slow.

    Args:
        sentence: Text to parse. If spaCy splits it into several sentences,
            only the first one is converted.

    Returns:
        An ``nltk.Tree`` rooted at the first sentence's root token, or the
        root token's text (a ``str``) when that token has no children.

    Raises:
        IndexError: If the parsed text contains no sentences.
    """
    doc = _get_spacy_pipeline()(sentence)

    def to_nltk_tree(node):
        # Tokens with dependents become subtrees; leaf tokens become strings.
        if node.n_lefts + node.n_rights > 0:
            return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
        return node.orth_

    # Only the first sentence is returned, so only its tree is built.
    roots = [sent_.root for sent_ in doc.sents]
    return to_nltk_tree(roots[0])

# Shared spaCy English pipeline; the feature-extraction cell calls it on the whole document text.
sp = spacy.load('en_core_web_sm')

In [3]:
# Fetch the WordNet corpus data for NLTK; NLTK reports "already up-to-date" when it is present.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\BillShuts\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# from google.colab import files
# uploaded = files.upload()

In [5]:
# Folder Path
# NOTE: machine-specific absolute path; adjust it when running elsewhere.
path = "C:/Users/BillShuts/PycharmProjects/Task1/Input/"
file_path = path + "abraham-lincoln.txt"

# for file in os.listdir():
#     if file.endswith(".txt"):
#         file_path = path+"/"+file
#         print(file_path)

# for name, data in uploaded.items():
#   file_path = name

# Read and normalise the text first so the file handle is released before the
# slow NLP pass starts. Every line is stripped and re-terminated with "\n".
with open(file_path, 'r', encoding='utf8') as f:
    lines = f.readlines()
data = "".join(line.strip() + "\n" for line in lines)

# Start from empty accumulators so re-running this cell (for example after an
# interrupted run) does not append duplicate entries to the previous results.
for accumulator in (sentences, tokens, lemmas, stems, pos_tag,
                    dependency_parser, relations, parse_tree):
    accumulator.clear()

document = sp(data)

# Single pass over the sentences: store the sentence itself and its features.
for sent in document.sents:
    sentences.append(sent)

    token = []
    lemma = []
    stem = []
    pos = []
    # NOTE: both dicts are keyed by token text, so a word that occurs more
    # than once in a sentence keeps only the entry of its last occurrence.
    parser = {}
    synset = {}
    for words in sent:
        token.append(words.text)
        lemma.append(words.text + "-" + words.lemma_)
        stem.append(words.text + "-" + ps.stem(words.text))
        pos.append(words.text + "-" + words.pos_)
        parser[words.text] = {
            "dependency": words.dep_,
            "parent_pos": words.head.text + "-" + words.head.pos_,
            "children": [[child.text + "-" + child.pos_, child.dep_] for child in words.children],
        }
        synset[words.text] = SynSetFactory.create_syn_set(words.text)

    tokens.append(token)
    lemmas.append(lemma)
    stems.append(stem)
    pos_tag.append(pos)
    dependency_parser.append(parser)
    relations.append(synset)
    # Parse trees are built only for sentences with at least two
    # whitespace-separated words.
    if not sent.text.isspace() and len(sent.text.split()) > 1:
        parse_tree.append(dependency_parse_tree(sent.text))

KeyboardInterrupt: 

In [12]:
# mkdir output

In [13]:
# cd output

In [14]:
print("Features are extracted as List. Now storing as text files")

# All files go into the relative 'Output' folder. Create it up front so the
# writes do not fail with FileNotFoundError when the folder is missing.
os.makedirs('Output', exist_ok=True)

# (output file, rows) pairs; each row is written on its own line.
feature_files = [
    ('Output/sentences.txt', sentences),
    ('Output/tokenized.txt', tokens),
    ('Output/lemmarization.txt', lemmas),
    ('Output/stemming.txt', stems),
    ('Output/pos_tagged.txt', pos_tag),
    ('Output/dependency_parsed.txt', dependency_parser),
    ('Output/parse_tree.txt', parse_tree),
    ('Output/wordnet_relations.txt', relations),
]

for out_path, rows in feature_files:
    with open(out_path, 'w', encoding='utf-8') as f:
        f.writelines("%s\n" % row for row in rows)

Features are extracted as List. Now storing as text files
