In [None]:
import numpy as np
import pandas as pd

from nltk.corpus import names
import nltk; nltk.download('stopwords')
# NLTK Stop words
from nltk.corpus import stopwords

import re

from pymorphy2 import MorphAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt


# Vocabulary cap: passed as `max_features` to the TF-IDF vectorizer further down.
n_features = 1000
# Number of topics: passed as `n_components` to LatentDirichletAllocation.
n_components = 16
# How many highest-weighted words `print_top_words` shows per topic.
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight
        arrays (each must support ``argsort``).
    feature_names : sequence
        Term for each column index of ``components_``.
    n_top_words : int
        Number of terms to list per topic.

    One line is printed per topic (``Topic #<idx>: term term ...``, terms in
    descending weight order), followed by a single blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries.
        best_first = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in best_first)
        print("Topic #%d: " % topic_idx + top_terms)
    print()


%matplotlib inline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
import dill

In [None]:
# Widen pandas' display so long chat messages are readable in cell output.
# The fully qualified option name is used: the bare 'max_colwidth' shorthand
# only works through pandas' pattern matching and breaks if another option
# with a similar name is ever added.
pd.set_option('display.max_colwidth', 120)
pd.set_option('display.width', 500)

In [None]:
# Load the raw data: a ';'-separated CSV encoded as UTF-16 (the file name
# translates to "Requests to the chatbot").
# NOTE(review): the path is relative, so the file must be in the working directory.
data = pd.read_csv("Заявки в чатбот.csv", encoding = 'utf-16', sep=";")
# Preview the last 20 rows.
data.tail(20)

In [None]:
class TextImputer(BaseEstimator, TransformerMixin):
    """Clean and normalise the free-text column ``key`` of a DataFrame.

    ``transform`` applies, in order: em-dash normalisation, punctuation
    removal, emoji removal, removal of stand-alone digit tokens, lower-casing,
    lemmatisation and stop-word removal.

    Parameters
    ----------
    key : str
        Name of the text column to process.
    value : object
        Stored on the instance; no method of this class reads it.
    """

    def __init__(self, key, value):
        self.key = key
        self.value = value

    def get_stopwords(self):
        """Return NLTK's Russian stop words extended with a local list.

        The extra words are read from the ``stopword`` column of
        ``stopwords.csv`` (UTF-8, ';'-separated) in the working directory.

        Returns
        -------
        list
            NLTK words followed by the CSV words, in file order.
        """
        russian_stopwords = stopwords.words("russian")
        df_sw = pd.read_csv('stopwords.csv', encoding = 'utf-8', sep=";")
        russian_stopwords.extend(df_sw['stopword'].tolist())
        return russian_stopwords

    def to_lemmatize2(self, df, key):
        """Replace every word of ``df[key]`` with its first pymorphy2 normal form.

        Each distinct whitespace-separated word is analysed only once and the
        result is reused for all of its occurrences. ``df`` is modified in
        place; every value of ``df[key]`` must already be a string.

        Returns
        -------
        tuple
            ``(df, all_unique_word)`` where ``all_unique_word`` holds the
            distinct words found before lemmatisation.
        """
        all_word_list = " ".join(df[key]).split()
        # dtype=object keeps an empty word list from producing a float Series.
        all_unique_word = pd.Series(all_word_list, dtype=object).unique()
        lemmatizer = MorphAnalyzer()
        lemmatized_word_dict = {
            word: lemmatizer.normal_forms(word)[0] for word in all_unique_word
        }
        df[key] = df[key].apply(
            lambda text: ' '.join(lemmatized_word_dict[word] for word in text.split())
        )
        return df, all_unique_word

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn, returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``key`` column holds cleaned text.

        The input frame is left untouched. Every cell is converted with
        ``str`` first, so missing values become the token ``'nan'``, which is
        dropped again in the last step.
        """
        # Work on a copy so the caller's DataFrame is never modified.
        X = X.copy()
        column = X[self.key]

        # 0. Normalise em dashes. This must be a substring replacement:
        #    Series.replace('—', '-') only matches cells that are exactly '—'.
        column = column.apply(lambda text: str(text).replace('—', '-'))

        # 1. Remove punctuation (the ',-.' range also covers the hyphen).
        deleted_symbols = re.compile(r'[\\\\\'[\]!"$%&()*+,-./:;<=>?№@^_`{|}~«»\n]')
        column = column.apply(lambda text: deleted_symbols.sub(' ', text))

        # 2. Remove emoji.
        emoji_pattern = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
        column = column.apply(lambda text: emoji_pattern.sub(' ', text))

        # 3. Remove stand-alone digit tokens.
        column = column.apply(
            lambda text: ' '.join(elem for elem in text.split(' ') if not elem.isdigit())
        )

        # 4. Lower-case.
        X[self.key] = column.apply(lambda text: text.lower())

        # 5. Lemmatise (reduce each word to its dictionary form).
        X, _ = self.to_lemmatize2(X, self.key)

        # 6. Remove stop words and the 'nan' token left by missing values.
        #    A set makes each membership test O(1) instead of a list scan.
        excluded = set(self.get_stopwords())
        excluded.add('nan')
        X[self.key] = X[self.key].apply(
            lambda text: ' '.join(elem for elem in text.split(' ') if elem not in excluded)
        )

        return X
    
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that picks one text column from a data frame and returns it
    as a list of token lists (one inner list per row).

    Parameters
    ----------
    key : str
        Name of the column to extract; its values must be strings.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn, returns ``self``."""
        return self

    def transform(self, X):
        """Tokenise ``X[key]`` on whitespace.

        The resulting list of lists is what the downstream vectorizer / LDA
        steps consume. ``split()`` without an argument is used so that an
        empty or whitespace-only document yields ``[]`` rather than ``['']``,
        which would otherwise be counted as a vocabulary term.
        """
        return [text.split() for text in X[self.key].to_list()]

In [None]:
# Text preprocessing sub-pipeline for the 'mesTExt' column:
# clean the text, then hand it over as a list of token lists.
description = Pipeline(steps=[
    ('imputer', TextImputer(key='mesTExt', value='')),
    ('selector', ColumnSelector(key='mesTExt')),
])

In [None]:
# Full topic-modelling pipeline: preprocessing -> TF-IDF -> LDA.
pipeline = Pipeline([
    ('description', description),
    # Documents arrive already tokenised, so the analyzer is the identity.
    # `stop_words` is deliberately not passed: scikit-learn ignores it (with a
    # warning) whenever `analyzer` is a callable.
    ('tfidf_vectorizer', TfidfVectorizer(max_df=0.95, min_df=2,
                                         analyzer=lambda x: x,
                                         max_features=n_features)),
    # random_state is fixed so the fitted topics are reproducible.
    ('lda', LatentDirichletAllocation(n_components=n_components, max_iter=50,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=0)),
])

# Fit on an explicit copy so preprocessing can never touch the raw `data` frame.
# `fit` returns the pipeline itself, so `model` and `pipeline` are the same object.
model = pipeline.fit(data.copy())

In [None]:
# Show the names of all parameters reported by the pipeline's get_params().
pipeline.get_params().keys()

In [None]:
# Preview the first 10 rows of `data` (the same slice is transformed in the next cell).
data.iloc[:10]

In [None]:
# Run the first 10 rows through the fitted pipeline and wrap the result in a
# DataFrame for display.
# NOTE(review): presumably one row per message and one column per LDA topic — confirm.
test_preds = pd.DataFrame(pipeline.transform(data.iloc[:10]))
test_preds

In [None]:
# Check the raw return type of pipeline.transform (before wrapping in a DataFrame).
# Note: this runs the whole transform a second time just to inspect the type.
type(pipeline.transform(data.iloc[:10]))

In [None]:
# Save the sample predictions to test_preds.csv in the working directory.
test_preds.to_csv("test_preds.csv")

In [None]:
print("\nTopics in LDA model:")
# Look the steps up by name rather than by position so that reordering the
# pipeline cannot silently pick the wrong step.
vectorizer = pipeline.named_steps['tfidf_vectorizer']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on versions that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = vectorizer.get_feature_names_out()
else:
    tf_feature_names = vectorizer.get_feature_names()
print_top_words(pipeline.named_steps['lda'], tf_feature_names, n_top_words)

In [None]:
# Inspect a single record: the row at position 1.
data.iloc[1]

In [None]:
# Show the first rows of `data`.
data.head()

In [None]:
# Number of terms in the fitted vectorizer's vocabulary.
len(tf_feature_names)

In [None]:
# Persist the whole fitted pipeline to lda_pipeline.dill.
# NOTE(review): dill rather than pickle is presumably required because the
# vectorizer's analyzer is a lambda — confirm.
with open("lda_pipeline.dill", "wb") as f:
    dill.dump(pipeline, f)