Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 94 lines (83 sloc) 3.59 KB
#!/usr/bin/python3
# makengram.py
# script to make ngram models from the raw text
# by Anup pokhrel
# http://virtualanup.com/nepali-ngram-models/
from collections import defaultdict
import sys
class ngrammodel:
    '''Count word n-grams in raw text.

    Text is cut into sentences, sentences into words, and every run of
    n consecutive valid words is counted in self.words (a mapping from
    the space-joined n-gram to the number of times it was seen).

    NOTE(review): in the copy of this script under review every non-ASCII
    entry of the symbol and letter lists had been reduced to an empty
    string.  Empty strings are destructive here: str.replace('', x) inserts
    x between every character, and '' in word is always True.  The
    non-ASCII values below (danda, typographic quotes, Devanagari letter
    ranges) are a best-effort reconstruction for Nepali text -- confirm
    them against the upstream script.
    '''

    # Characters that terminate a sentence; each is rewritten to '.'.
    # '\u0964' is the Devanagari danda.  The former '\r\n' entry is not
    # needed: '\r' and '\n' are already handled individually.
    _SENTENCE_TABLE = str.maketrans(dict.fromkeys('?!\u0964;\n\r', '.'))

    # Punctuation that separates words inside a sentence; each is
    # rewritten to a space.  The curly quotes are reconstructed (see the
    # class note).
    _WORD_TABLE = str.maketrans(
        dict.fromkeys('-,\'"\t()<>\u2018\u2019\u201c\u201d', ' '))

    # Inclusive code point ranges of Devanagari letters (independent
    # vowels and consonants).  Vowel signs, digits and punctuation are
    # deliberately left out: a token needs at least one real letter.
    _LETTER_RANGES = (
        ('\u0904', '\u0939'),
        ('\u0950', '\u0950'),
        ('\u0958', '\u0961'),
        ('\u0972', '\u097f'),
    )

    def __init__(self,n):
        '''Constructor for the ngram model generator
        n is the model number like 1 for unigram,
        2 for bigram and so on'''
        self.n = n
        self.words = defaultdict(int) # n-gram string -> occurrence count

    def processarticle(self,article):
        '''
        process a single article.
        the function will split the article into sentences and
        process every sentence longer than 10 characters
        '''
        # There is no simple way of representing a word in regex in the
        # nepali language, so the text is assumed to be made of sentences
        # separated by a handful of symbols.  All of them are normalised
        # to '.' in a single pass before splitting.
        for sentence in article.translate(self._SENTENCE_TABLE).split('.'):
            # sentence must be of enough length
            if len(sentence) > 10:
                self.processsentence(sentence)

    def processsentence(self,sentence):
        '''
        process the sentence. It splits the sentence into words and
        counts every n-gram of valid words
        '''
        tokens = sentence.translate(self._WORD_TABLE).split(' ')
        if self.n > 1:
            # record the start and end of sentences by #
            tokens = ['#'] + tokens + ['#']
        window = [] # the most recent valid words, at most n of them
        for token in tokens:
            if not self._isword(token):
                continue
            window.append(token)
            if len(window) == self.n:
                self.words[' '.join(window)] += 1
                window = window[1:] # slide the window by one word

    @classmethod
    def _isword(cls,token):
        '''
        returns True when the token contains the sentence marker '#'
        or at least one letter from _LETTER_RANGES
        '''
        if '#' in token:
            return True
        return any(low <= char <= high
                   for char in token
                   for low, high in cls._LETTER_RANGES)

    def readfile(self,file):
        '''
        reads the content of the file line by line and adds it to the
        ngram model. file can be any iterable of strings
        '''
        for line in file:
            self.processarticle(line)

    def saveoutput(self,file):
        '''
        saves the output in the given file, one "<ngram> <count>" line
        per n-gram, most frequent first
        '''
        ranked = sorted(self.words.items(), key=lambda item: item[1], reverse=True)
        file.writelines(wordseq+' '+str(count)+"\n" for wordseq, count in ranked)
if __name__ == '__main__':
# get the model number from command line
# like ./makengram.py 2 <outputfilename> for bigram model
if len(sys.argv) != 4:
print("Syntax : "+sys.argv[0]+"<model_number> <input_file> <output_file>",len(sys.argv))
exit()
mn = int(sys.argv[1])
if mn<1 or mn> 5:
print("Model number not supported")
exit()
model = ngrammodel(mn)
#model.readfile(open('test'))
model.readfile(open(sys.argv[2]))
model.saveoutput(open(sys.argv[3],'w+'))
You can’t perform that action at this time.