### Modul import

In [1]:
# Common
import pandas as pd
import numpy as np
import pyodbc

# Text processing
import re
from bs4 import BeautifulSoup
import nltk

# Visualization modules
import matplotlib.pyplot as plt

# Model modules
from sklearn.feature_extraction.text import TfidfVectorizer#, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus
from gensim.test.utils import get_tmpfile
from gensim.test.utils import datapath
from gensim.models.ldamulticore import LdaMulticore

from gensim.models import phrases, word2vec

# Remove unnecessary warnings
pd.options.mode.chained_assignment = None




### Load Functions

In [2]:
from lipht_lda import df_lda_preprocessing, lda_preprocess_string, df_lda_features, get_lda_topics, lda_predict_df, lda_predict_string, document_to_bow, df_lda_features

# Import Data

In [3]:
import pyodbc
server = "LIPHT-VM-01"#"LI-PH-01"
db = "Akademikernes_MSCRM_Addition"
con = pyodbc.connect('DRIVER={SQL Server};SERVER=' + server + ';DATABASE=' + db)

query="""
SELECT *
  FROM [Akademikernes_MSCRM_Addition].[out].[LDA_Messages_persisted]
  """
df_raw = pd.read_sql(query, con)
df_raw.head()

Unnamed: 0,ThreadID,ThreadSubject,FirstMessage,FirstMemberMessage,ThreadInitiatedBy,ThreadClass,InDiagnosticScope,ThreadMessageID,ThreadResponsibleDepartment,ThreadResponsibleDepartmentTeam


In [4]:
df_raw = pd.to_pickle('data/LDA_Messages_persisted.pkl')
df_import = df_raw.copy(deep=True)

TypeError: to_pickle() missing 1 required positional argument: 'path'

In [None]:
# df_import = pd.read_pickle('data/AKA_rawdata_df_lda_preprocessed.pkl')
# df_import = pd.read_pickle('data/AKA_rawdata_with_language.pkl')

#### Setup Analysis Parameters

In [None]:
dataset = 'df_B'
n_gram = 2
sample_size= 10000
no_words= 5000
no_below= 20 # filter out tokens that appear in less than 15 documents
random_state=1
research_scope = 'Udbetaling'
num_topics = 20

In [None]:
# Create dataset for method B
# Process 1st MemberMessage, then
# Concatenate with Subject
df_B = df_import.copy(deep=True)
df_B.head()

In [None]:
df_B.shape

In [None]:
%%time
# df_B = df_B[(df_B['ThreadStatus']=='Fuldført') & (df_B['ThreadMessageDirection']=='Indgående') & (df_B['ThreadMessageIsFirstMemberMessage']==1) & (df_B['ThreadTotalMessageCount']>1) & (df_B['ThreadHasInteraction']>=1) & (df_B['ThreadResponsibleDepartmentTeam']=='Udbetalingsteam') & (df_B['pred_label']=='Danish')]
df_B = df_B[(df_B['ThreadStatus']=='Fuldført') & (df_B['ThreadMessageDirection']=='Indgående') & (df_B['ThreadMessageIsFirstMemberMessage']==1) & (df_B['ThreadResponsibleDepartmentTeam'].str.contains('Udbetalingsteam')==True) & (df_B['pred_label']=='Danish')]
print(df_B.shape)

In [None]:
df_B.ThreadResponsibleDepartmentTeam.unique()

In [None]:
%%time
df_lda_preprocessing(df_B,'ThreadMessageText',n_gram)

In [None]:
df_B.isnull().sum()

In [None]:
print(df_B.shape)

In [None]:
df_B.to_pickle('data/df_B.pkl')

## Analyze Requests
Messages are either
- Incoming (from members) or
- Outgoing (from aka)
All messeages have
- subject_field and
- message_field

In the following we will analyze the different splits of data, with regards to the above:
- Incoming_subject
- Incoming_message
- Outgoing_subject
- Outgoing_message

### Request Analysis: DepartmentTeam

#### Load the data

In [None]:
%%time
# Load the data, and print rows, columns
df_scope = pd.read_pickle('data/{}.pkl'.format(dataset))

In [None]:
df_scope.shape

In [None]:
data_scope_name = research_scope +'_topics-'+ str(num_topics) +'_Sample-'+str(sample_size) +'_WordCount-'+str(no_words) +'_RandomState-'+str(random_state)+'_dataset-'+ dataset
print(data_scope_name)

### Vectorize words

#### Create dictionary and top words

##### Parameters:	
- **no_below** (int, optional) – Keep tokens which are contained in at least no_below documents.
- **no_above** (float, optional) – Keep tokens which are contained in no more than no_above documents (fraction of total corpus size, not an absolute number).
- **keep_n** (int, optional) – Keep only the first keep_n most frequent tokens.
- **keep_tokens** (iterable of str) – Iterable of tokens that must stay in dictionary after filtering.

In [None]:
%%time
# Create dictionary with words from df_scope (the total dataset)
dictionary = Dictionary(documents=df_scope.stemmed_text.values)
print("Found {} words.".format(len(dictionary.values())))

In [None]:
%%time
# dictionary.filter_extremes(no_above=0.8, no_below=3)
dictionary.filter_extremes(no_below=no_below, keep_n=no_words)

dictionary.compactify()  # Reindexes the remaining words after filtering
print("Left with {} words.".format(len(dictionary.values())))

In [None]:
%%time
#Make a BoW for every Besked
document_to_bow(df_scope, dictionary)

#### Create Sample of Scope

In [None]:
scope_lda_sample = df_scope.sample(sample_size, random_state=random_state)
scope_lda_sample.shape

In [None]:
scope_lda_sample.columns.values

In [None]:
scope_lda_sample[['ThreadID','ThreadMessageID','ThreadSubject','ThreadMessageText','text','tokenized_text','stopwords_removed','lemmatized_text','stemmed_text']].head()

In [None]:
scope_lda_sample['text'][592345]

### Find optimal number of topics for LDA
#### K-means Elbow method

#### Create Topwords
Create a list of topwords from the entire dataframe

#### Tf-idf and document similarity

I define term frequency-inverse document frequency (tf-idf) vectorizer parameters and then convert the clean_content list into a tf-idf matrix.

To get a Tf-idf matrix, first count word occurrences by request. This is transformed into a request-term matrix (dtm). This is also just called a term frequency matrix.

Then apply the term frequency-inverse document frequency weighting: words that occur frequently within a request but not frequently within the corpus receive a higher weighting as these words are assumed to contain more meaning in relation to the request.

A couple things to note about the parameters I define below:

max_df: this is the maximum frequency within the request a given feature can have to be used in the tfi-idf matrix. If the term is in greater than 80% of the request it probably cares little meanining - rule of thumb (verify this)

min_idf: this could be an integer (e.g. 5) and the term would have to be in at least 5 of the request to be considered. Here I pass 0.2; the term must be in at least 20% of the request. 

TEST THIS
I found that if I allowed a lower min_df I ended up basing clustering on names--for example "Michael" or "Tom" are names found in several of the movies and the synopses use these names frequently, but the names carry no real meaning.

ngram_range: this just means I'll look at unigrams, bigrams and trigrams. See n-grams

In [None]:
%%time
top_words = [v for v in dictionary.values()]
top_words = list(set(top_words))
df_scope['OnlyTopWords'] = list(map(lambda doc: [word for word in doc if word in top_words], df_scope['stemmed_text']))

In [None]:
print("No of top words: {} ".format(len(top_words)))

In [None]:
# %%time
top_words, _ = remove_not_topwords(scope_lda_sample, df_scope)

In [None]:
# # Create dictionary with words from df_scope (the total dataset) or scope_lda_sample (the sample size)
# dictionary = Dictionary(documents=df_scope.stemmed_text.values)
# #Make a BoW for every Besked
# document_to_bow(df_scope)

In [None]:
# LDA preprocessing
print("Found {} words.".format(len(dictionary.values())))

In [None]:
%%time
scope_lda_sample['clean_content'] = scope_lda_sample['OnlyTopWords'].apply(ListToString)

In [None]:
%%time
tfidf_wordvector = TfidfVectorizer(
                analyzer='word', 
                max_df=0.8, 
                min_df=5, 
#                 stop_words=stopwords.words('danish'),
#                 ngram_range=(1,3)
                ) 

#fit the tfidf_wordvector to clean_content
tfidf_wordvector_maxtrix = tfidf_wordvector.fit_transform(scope_lda_sample.clean_content)
print(tfidf_wordvector_maxtrix.shape)

dist is defined as 1 - the cosine similarity of each request. Cosine similarity is measured against the tf-idf matrix and can be used to generate a measure of similarity between each request and the other request in the corpus (each clean_content among the total clean_content). Subtracting it from 1 provides cosine distance which I will use for plotting on a euclidean (2-dimensional) plane.

Note that with dist it is possible to evaluate the similarity of any two or more clean_content.

In [None]:
tfidf_wordvector_2d = tfidf_wordvector_maxtrix.todense()

In [None]:
top_range = 151
increments = 5

In [None]:
%%time
distortions = []
K = range(1,top_range,increments)
for k in K:
    kmeanModel = KMeans(n_clusters=k, n_jobs=-1, random_state=0).fit(tfidf_wordvector_2d)
    kmeanModel.fit(tfidf_wordvector_2d)
    distortions.append(sum(np.min(cdist(tfidf_wordvector_2d, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / tfidf_wordvector_2d.shape[0])

In [None]:
# Plot the elbow
plt.figure(figsize=(16, 10))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method: {4}. Showing the optimal k\nSample Size: {0}, Top {1} Words, with increments of {2} from 0 to {3}'.format(sample_size, len(top_words), increments, top_range-1, data_scope_name))
plt.show()

## LDA Model Training
Latent Dirichlet Allocation (LDA) is generative approach in classifying texts. It is a three level hierarchical Bayesian model where it creates probabilities on word level, on document level and on corpus level (corpus means all documents)

In [None]:
# We want to maximize the probability of the corpus in the training set.
corpus = scope_lda_sample.bow

In [None]:
print(('LDA Model based on {3} dataset.\n\tSample Size: {0},\n\tTop {1} Words,\n\tNo of Topics {2}'.format(sample_size, len(dictionary.values()), num_topics, data_scope_name)))

##### Input num_topics from the analysis above

In [None]:
%%time
#A multicore approach to decrease training time
# https://radimrehurek.com/gensim/corpora/mmcorpus.html
# ram_corpus = get_tmpfile("corpus_scope.mm")
# MmCorpus.serialize(ram_corpus, corpus)
# mm = MmCorpus(ram_corpus)
LDAmodel_scope = LdaMulticore(corpus=corpus,#mm,
                        id2word=dictionary,
                        num_topics=num_topics,
                        workers=4,
                        chunksize=5000,
                        passes=50,
                        alpha='asymmetric',
                        random_state=random_state)

In [None]:
dictionary.save('data/model/{0}_LDAmodel_dictionary.pkl'.format(data_scope_name))

In [None]:
LDAmodel_scope.save('data/model/{0}_LDAmodel'.format(data_scope_name))

In [None]:
LDAmodel_scope = LdaMulticore.load('data/model/{0}_LDAmodel'.format(data_scope_name))

#### Feature vector

In [None]:
%%time
df_lda_features(LDAmodel_scope, scope_lda_sample)

#### Topic distributions and let's see some words that come with the topics

In [None]:
RequestTopicDistribution = scope_lda_sample['lda_features'].mean()

In [None]:
fig, ax1 = plt.subplots(1,1,figsize=(20,6))
nr_top_bars = 3
ax1.set_title("Request Topic distributions", fontsize=16)

for ax, distribution, color in zip([ax1], [RequestTopicDistribution], ['r']):
    # Individual distribution barplots
    ax.bar(range(len(distribution)), distribution, alpha=0.7)
    rects = ax.patches
    for i in np.argsort(distribution)[-nr_top_bars:]:
        rects[i].set_color(color)
        rects[i].set_alpha(1)

fig.tight_layout(h_pad=3.)

#### Inspect topics and words

In [None]:
from lipht_lda import get_topics_and_probability, get_lda_topics

In [None]:
get_topics_and_probability(scope_lda_sample, LDAmodel_scope, num_topics, 5)

In [None]:
get_lda_topics(scope_lda_sample, LDAmodel_scope, num_topics,20)

#### Name the topics

In [None]:
lda_topic_names = {
    0:'Ferie og feriepenge',
    1:'Sendt oplysninger til AKA',
    2:'Ansættelseskontrakt eller frigørelse',
    3:'Spørgsmål om dagpenge',
    4:'Ansøgning om befordring',
    5:'Ansættelse',
    6:'Ledighed',
    7:'Adgang',
    8:'Noget med tid*',
    9:'Dagpenge mellem jul og nytår',
    11:'Fejl ved dagpenge',
    12:'Spørgsmål til blanket',
    14:'Ydelseskort',
    15:'Pension og Efterløn',
    16:'Dagpenge/Supplerende',
    17:'Spørgsmål til udfyldelse',
    19:'Spørgsmål om beskæftigelse'
}

### Test the model

In [None]:
pd.options.display.max_colwidth = 200

In [None]:
# document = scope_lda_sample.sample(1) # From sample
document = df_scope.sample(1) # From population
doc_id = document['ThreadMessageID']
unseen_document = document['ThreadMessageText']
print(doc_id, unseen_document)

In [None]:
# Test function and prediction
print(lda_predict_string(unseen_document, LDAmodel_scope, dictionary,lda_topic_names))

In [None]:
bow_vector = dictionary.doc2bow(lda_preprocess_string(unseen_document))
for index, score in sorted(LDAmodel_scope[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, LDAmodel_scope.print_topic(index, 5)))

# index, score = sorted(LDAmodel_scope[bow_vector], key=lambda tup: -1*tup[1])[0]
# print("Score: {}\t Topic: {}".format(score, LDAmodel_scope.print_topic(index, 5)))

### Predict topics on data
Per every row in the dataset

In [None]:
%%time
lda_predict_df(df_scope, LDAmodel_scope, dictionary, lda_topic_names)

#### Save the data with prediction

In [None]:
df_scope.to_pickle('data/AKA_{0}_with_prediction.pkl'.format(data_scope))

### Load to MS SQL server

In [None]:
%%time
df_raw = pd.read_pickle('data/AKA_rawdata.pkl')

In [None]:
# from lipht_lda import df_lda_preprocessing

In [None]:
%%time
df_lda_preprocessing(df_raw, 'ThreadMessageText')

In [None]:
df_raw.columns

In [None]:
df_raw.to_pickle('data/AKA_rawdata_df_lda_preprocessed.pkl')

In [None]:
# df_raw[df_raw['pred_label']=='English']['text']

In [None]:
df_raw_with_language = df_raw[['ThreadID','ThreadMessageID','ThreadMessageText','text','pred_label','pred_probability']]

In [None]:
df_raw_with_language.head()

In [None]:
df_raw_with_language.to_csv('lang_pred.csv')

In [None]:
con = "mssql+pyodbc:///?odbc_connect={}".format(urllib.parse.quote_plus("DRIVER=ODBC Driver 13 for SQL Server;SERVER={0};PORT=1433;DATABASE={1};UID={2};PWD={3};TDS_Version=8.0;".format(server, db, user, password)))

In [None]:
test = pd.DataFrame({'test':[1,2,3]}) #'te','te','te'

In [None]:
import pyodbc
from sqlalchemy import create_engine
import urllib

params = urllib.parse.quote_plus(r'DRIVER={SQL Server};SERVER=LIPHT-VM-01;DATABASE=Akademikernes_MSCRM_addition;Trusted_Connection=yes')
conn_str = 'mssql+pyodbc:///?odbc_connect={}'.format(params)
engine = create_engine(conn_str)


test.to_sql(name='Test',con=engine , schema='input', if_exists='replace', index=False)

In [None]:
df_raw.to_pickle('data/AKA_rawdata_with_language.pkl')

In [None]:
df_raw.columns