In [17]:
from sklearn.feature_extraction.text import CountVectorizer
import re
import pymorphy2

In [10]:

class Porter:
    """Regex-driven suffix stripper for Russian words.

    The rule names and the order in which they are tried look like the
    Snowball (Porter) stemmer for Russian -- NOTE(review): confirm against the
    reference algorithm. Every suffix pattern is anchored at the end of the
    string and is applied only to the part of the word that follows its first
    vowel; the leading part is always kept as is.
    """

    # Suffix classes. Alternatives behind ``(?<=[ая])`` are removed only when
    # the preceding character is 'а' or 'я'; that character itself is kept.
    PERFECTIVEGROUND = re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    # Group 1: shortest prefix ending in a vowel; group 2: the remainder.
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    # Guard for removing 'ост'/'ость': a consonant run followed by a vowel
    # must occur somewhere before that ending.
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    @staticmethod
    def stem(word):
        """Return the stem of ``word``.

        The word is first lower-cased and 'ё' is replaced by 'е'. When it
        contains none of the vowels listed in ``RVRE`` it is returned in that
        normalised form without any suffix removal.
        """
        word = word.lower().replace(u'ё', u'е')
        split = Porter.RVRE.match(word)
        if not split:
            return word
        head, rv = split.groups()

        # Main ending: a perfective-gerund suffix wins outright; otherwise
        # drop a reflexive suffix and then try adjective (plus participle),
        # verb and noun endings, in that order.
        stripped = Porter.PERFECTIVEGROUND.sub('', rv, 1)
        if stripped != rv:
            rv = stripped
        else:
            rv = Porter.REFLEXIVE.sub('', rv, 1)
            stripped = Porter.ADJECTIVE.sub('', rv, 1)
            if stripped != rv:
                rv = Porter.PARTICIPLE.sub('', stripped, 1)
            else:
                stripped = Porter.VERB.sub('', rv, 1)
                # The noun endings are tried only if no verb ending matched.
                rv = stripped if stripped != rv else Porter.NOUN.sub('', rv, 1)

        # A trailing 'и' is removed unconditionally.
        rv = Porter.I.sub('', rv, 1)

        # 'ост'/'ость' is removed only when the guard pattern matches.
        if Porter.DERIVATIONAL.match(rv):
            rv = Porter.DER.sub('', rv, 1)

        # Either drop a trailing soft sign, or (when there is none) drop a
        # superlative ending and collapse a final 'нн' to 'н'.
        without_soft_sign = Porter.P.sub('', rv, 1)
        if without_soft_sign != rv:
            rv = without_soft_sign
        else:
            rv = Porter.NN.sub(u'н', Porter.SUPERLATIVE.sub('', rv, 1), 1)

        return head + rv

In [8]:
# Toy corpus of four short Russian sentences; later cells iterate over it.
documents = [
    'Это первый документ.',
    'Это второй документ среди всех документов.',
    'А вот и третий.',
    'Вот и кончились документы.'
]
# Fit a bag-of-words model (CountVectorizer with its default settings) on the
# raw, unpreprocessed sentences. The last expression is left bare so that the
# notebook displays the resulting document-term matrix.
bow = CountVectorizer()
bow.fit_transform(documents)

<4x11 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [28]:
def stem_sentence(sentence):
    """Stem every whitespace-separated token of ``sentence`` with ``Porter.stem``.

    Returns the stems joined by single spaces, so the original spacing is
    not preserved.
    """
    return ' '.join(Porter.stem(token) for token in sentence.split())

def remove_punctuation(sentence):
    """Lower-case ``sentence`` and delete all punctuation-like characters.

    Every character that is neither a word character (``\\w``, which includes
    digits and the underscore) nor whitespace is removed without leaving a
    space behind, so hyphenated words are glued together.
    """
    # Lower-casing happens before the substitution, as in the pipeline's
    # original order of operations.
    return re.sub(r'[^\w\s]', '', sentence.lower())

# One shared analyzer instance, created once and reused for every call.
lemmatizer = pymorphy2.MorphAnalyzer()
def lemmatize_sentence(sentence):
    """Replace each whitespace-separated token with its lemma.

    For every token the first parse returned by ``lemmatizer.parse`` is used
    and its ``normal_form`` is taken. The lemmas are joined by single spaces.
    """
    return ' '.join(
        lemmatizer.parse(token)[0].normal_form for token in sentence.split()
    )

def preprocess_sentence_lemmatize(sentence):
    """Lower-case and strip punctuation from ``sentence``, then lemmatize it."""
    cleaned = remove_punctuation(sentence)
    return lemmatize_sentence(cleaned)

def preprocess_sentence_stem(sentence):
    """Lower-case and strip punctuation from ``sentence``, then stem it."""
    cleaned = remove_punctuation(sentence)
    return stem_sentence(cleaned)

In [30]:
# Print the lemmatized form of every sentence in the toy corpus, one per line.
for text in documents:
    print(preprocess_sentence_lemmatize(text))

это первый документ
это второй документ среди весь документ
а вот и третий
вот и кончиться документ


In [53]:
import nltk
# Fetch the NLTK "stopwords" corpus; the stop-word list is loaded from it in
# the following cell.
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\koval\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [54]:
from nltk.corpus import stopwords
# Russian stop words from NLTK, stored as a set for fast membership tests.
russian_stopwords = set(stopwords.words("russian"))

In [60]:
def remove_stopwords(sentence, stop_words=None):
    """Drop stop words from a whitespace-tokenised sentence.

    Args:
        sentence: Text to filter. It is split on whitespace and each token is
            compared to the stop list exactly as written (no case folding or
            normalisation is applied here).
        stop_words: Optional container of tokens to drop. When omitted, the
            module-level ``russian_stopwords`` collection is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # ``is None`` rather than a truthiness check, so that an explicitly empty
    # stop list means "remove nothing" instead of falling back to the default.
    if stop_words is None:
        stop_words = russian_stopwords
    return ' '.join(
        token for token in sentence.split() if token not in stop_words
    )

In [63]:
# Quick sanity check of the stop-word filter on a hand-written sentence.
remove_stopwords('документы мои любимые документы и вы задокументились')

'документы мои любимые документы задокументились'

In [64]:
def preprocess_sentence_lemmatize(sentence):
    """Clean ``sentence``, drop stop words, then lemmatize what is left.

    This redefines the earlier function of the same name and adds the
    stop-word step. Stop words are matched against the surface forms, i.e.
    before lemmatization.
    """
    cleaned = remove_punctuation(sentence)
    content_words = remove_stopwords(cleaned)
    return lemmatize_sentence(content_words)

def preprocess_sentence_stem(sentence):
    """Clean ``sentence``, drop stop words, then stem what is left.

    This redefines the earlier function of the same name and adds the
    stop-word step. Stop words are matched against the surface forms, i.e.
    before stemming.
    """
    cleaned = remove_punctuation(sentence)
    content_words = remove_stopwords(cleaned)
    return stem_sentence(content_words)

In [65]:
# Re-run the corpus through the lemmatizing pipeline, which at this point also
# removes stop words, and print one processed sentence per line.
for text in documents:
    print(preprocess_sentence_lemmatize(text))

это первый документ
это второй документ среди документ
третий
кончиться документ
