In [None]:
import pandas as pd
from utils.lipht_visualization import word_frequency_barplot, topic_distribution_barplot
from utils.lipht_lda_utils import PrepareDictionary, df_lda_features, get_topics_and_probability, get_lda_topics, df_lda_preprocessing, TrainLDAModel, lda_predict_df
from utils.lipht_data import getEngine

In [None]:
# Open a connection to the CRM addition database.
engine =  getEngine('LIPHT-VM-01','Akademikernes_MSCRM_Addition')
# Load the persisted LDA message threads together with their language
# predictions. Rows are kept only when the prediction index is neither 5 nor 7
# and the prediction probability exceeds 0.9.
# The exclusion list must be parenthesised: in T-SQL square brackets delimit
# identifiers, so `not in [5,7]` is a syntax error.
# Because the WHERE clause filters on columns of `l`, threads without a
# matching prediction are dropped, i.e. the LEFT JOIN behaves like an INNER JOIN.
query="""
SELECT lmp.[ThreadID]
      ,lmp.[ThreadStartDate]
      ,lmp.[ThreadEndDate]
      ,lmp.[ThreadSubject]
      ,lmp.[FirstMessageID]
      ,lmp.[FirstMessageBody]
      ,lmp.[FirstMemberMessageID]
      ,lmp.[FirstMemberMessageBody]
      ,lmp.[FirstResponseFromAKAToMemberMessageID]
      ,lmp.[FirstResponseFromAKAToMemberMessageBody]
      ,lmp.[FirstResponseFromMemberToAKAMessageID]
      ,lmp.[FirstResponseFromMemberToAKAMessageBody]
      ,lmp.[LastMessageID]
      ,lmp.[LastMessageBody]
      ,lmp.[ThreadInitiatedBy]
      ,lmp.[ThreadClass]
      ,lmp.[HasMemberMessage]
      ,lmp.[HasResponseFromAKAToMember]
      ,lmp.[HasResponseFromMemberToAKA]
      ,lmp.[ThreadResponsibleDepartment]
      ,lmp.[ThreadResponsibleDepartmentTeam]
      ,lmp.[ThreadResponsibleDepartmentTeamGroup]
  FROM [Akademikernes_MSCRM_Addition].[out].[LDA_Messages_persisted] lmp
  LEFT JOIN input.[language_predictions] l on l.FirstMemberMessageID = lmp.FirstMemberMessageID
  WHERE l.pred_index not in (5,7)
  AND l.pred_probability > 0.9
  """
df_scope = pd.read_sql(query, engine)

In [None]:
# df_scope.to_pickle('data/LDA_Messages_persisted_with_language_prediction.pkl')
# df_scope = pd.read_pickle('data/LDA_Messages_persisted_with_language_prediction.pkl')

In [None]:
# Run configuration: every value a reader might tune lives in this cell.
departmentteam = 'Udbetalingsteam'  # team name used to select the threads to model
initiatedby = ''
n_gram = 3
sample_size = 10000  # maximum number of rows kept for modelling
no_above = 0.00008  # previously tried: 0.00003145
# NOTE(review): presumably tokens appearing in fewer than `no_below` documents
# are dropped; the earlier comment said 15, which did not match the value 2 —
# confirm against PrepareDictionary.
no_below = 2
random_state = 1
lda_num_topics = 4  # the number of topics
lda_chunksize = 1000
lda_passes = 50

# Record of the settings for this run. `sample_size` is included so the log
# captures every tunable defined above.
log = {
    'departmentteam': departmentteam,
    'initiatedby': initiatedby,
    'n_gram': n_gram,
    'no_above': no_above,
    'no_below': no_below,
    'random_state': random_state,
    'lda_num_topics': lda_num_topics,
    'lda_chunksize': lda_chunksize,
    'lda_passes': lda_passes,
    'sample_size': sample_size,
}

In [None]:
# Keep only threads that have a first member message and whose
# ThreadResponsibleDepartmentTeamGroup equals the configured team.
# Each condition is built separately: `&` binds tighter than `==`, so the
# unparenthesised form compared (not-null mask & column) against the team name
# instead of combining two boolean masks.
has_member_body = df_scope['FirstMemberMessageBody'].notnull()
in_team = df_scope['ThreadResponsibleDepartmentTeamGroup'] == departmentteam
df_team = df_scope[has_member_body & in_team].copy(deep=True)
# Log the number of rows selected for this team.
log['rows'] = df_team.shape[0]

In [None]:
# Cap the working set at the configured `sample_size` (instead of repeating the
# literal 10000). Note this keeps the first rows, not a random sample.
df_team = df_team.head(sample_size)

In [None]:
# Number of rows in the working set before preprocessing.
len(df_team)

In [None]:
%%time
# Run the project's LDA text preprocessing on the first member message body.
# NOTE(review): the return value is not assigned and later cells read
# df_team['text'] and df_team['stopwords_removed'], so this presumably adds
# those columns to df_team in place — confirm against the helper.
df_lda_preprocessing(df_team, 'FirstMemberMessageBody')

In [None]:
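# NOTE(review): the preprocessing above ran on df_team, not df_scope — confirm which frame should be cached here.
# df_scope.to_pickle('data/LDA_Messages_persisted_with_language_prediction_processed.pkl')
# df_scope = pd.read_pickle('data/LDA_Messages_persisted_with_language_prediction_processed.pkl')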

In [None]:
# Number of rows in the working set after preprocessing.
len(df_team)

In [None]:
# Draw the index label of one random row and show its raw message body.
line_sample = df_team.sample(n=1).index[0]
df_team.loc[line_sample, 'FirstMemberMessageBody']

In [None]:
# Same sampled row: value of the 'text' column.
df_team.loc[line_sample, 'text']

In [None]:
# Same sampled row: value of the 'stopwords_removed' column.
df_team.loc[line_sample, 'stopwords_removed']

In [None]:
# List the columns currently present on the working frame.
df_team.columns

In [None]:
%%time
# Build the dictionary and corpus from the 'stopwords_removed' column.
# NOTE(review): the literals 1 and 10 are passed positionally, while the
# configured no_below / no_above values are only stored in `log` — confirm
# which thresholds PrepareDictionary actually applies.
dictionary, corpus = PrepareDictionary(df_team, 'stopwords_removed', 1, 10, log)

In [None]:
# Number of entries in the dictionary built above.
len(dictionary.values())

In [None]:
# Show the current wall-clock time; this cell sits immediately before model training.
from datetime import datetime
datetime.now()

In [None]:
%%time
# Train the LDA model on the corpus and dictionary built earlier.
# NOTE(review): 50 and 4 are positional literals; the config cell defines
# lda_num_topics = 4 but it is not referenced here — confirm which argument is
# the topic count and wire it to the config value.
LDAmodel_scope = TrainLDAModel(corpus, dictionary, 50, 4, lda_chunksize, lda_passes, random_state, log)

In [None]:
# Plot the topic distribution for the trained model over the working frame.
topic_distribution_barplot(LDAmodel_scope, df_team)

In [None]:
# Collect topics and their probabilities from the trained model.
# NOTE(review): the meaning of the positional literals 50 and 40 is not visible here — confirm against the helper.
topics = get_topics_and_probability(df_team, LDAmodel_scope, 50, 40)

In [None]:
# Display the topics result computed in the previous cell.
topics

In [None]:
# Show the LDA topics for the trained model.
# NOTE(review): the meaning of the positional literals 50 and 20 is not visible here — confirm against the helper.
get_lda_topics(df_team, LDAmodel_scope, 50,20)

In [None]:
# Turn the run log into a one-row DataFrame (row label '0') and display it.
df_log = pd.DataFrame(log, index=['0'])
df_log

In [None]:
# Write the topics to the SQL table input.topics_firstmemberbody.
# Assuming `topics` is a pandas DataFrame, if_exists='replace' drops any
# existing table of that name before inserting, so earlier results are overwritten.
topics.to_sql(name='topics_firstmemberbody',con=engine , schema='input', if_exists='replace', index=False)

# TEST of stopwords

In [1]:
from utils.lipht_lda_utils import getStopWords
from utils.lipht_data import getEngine

In [2]:
# Display the stop words returned by getStopWords() for manual inspection.
getStopWords()

['burde',
 'år',
 'fremgår',
 'april',
 'for',
 'forskellige',
 'høre',
 'nogen',
 'fordi',
 'hos',
 'andre',
 'lørdag',
 'tre',
 'vor',
 'ny',
 'ham',
 'har',
 'skriv',
 'lave',
 'hvordan',
 'tirsdag',
 'fik',
 'ej',
 'via',
 'tak',
 'ønsker',
 'stadig',
 'skriver',
 'vedkommende',
 'dage og',
 'norge',
 'gennem',
 'lav',
 'august',
 'with',
 'derefter',
 'hej',
 'idag',
 'november',
 'oktober',
 'an',
 'med',
 'foran',
 'mindre',
 'af et',
 'skulle',
 'god dag',
 'mod',
 'selvom',
 'gang',
 'kun',
 'pt',
 'godt',
 'i',
 'komme',
 'lad',
 'omkring',
 'sammen',
 'tilbage',
 'øvrigt',
 'tage',
 'du',
 'flest',
 'thi',
 'imens',
 'ser',
 'dette',
 'som',
 'bare',
 'derfra',
 'mvh',
 'jeg',
 'herpå',
 'mine',
 'ud',
 'mand',
 'blandt',
 'mange',
 'seks',
 'juli',
 'ni',
 'om',
 'skal',
 'lidt',
 'ned',
 'tidligere',
 'nyt',
 'men',
 'nok',
 'skrevet',
 'får',
 'juni',
 'selv',
 'aldrig',
 'syv',
 'dage',
 'esmtps',
 'uden',
 'jan',
 'hvor',
 'derfor',
 'maj',
 'hilsner',
 'om på',
 'febru