# Load a pretrained model

#### Load modules

In [None]:
import pandas as pd

from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary

#### Load custom modules

In [None]:
from lipht_lda import lda_predict_string, lda_predict_df

In [None]:
def load_topic_names(departmentteam):
    """Load the topic-name mapping stored for one department team.

    Reads ``data/<departmentteam>_lda_topic_names.txt`` and returns the Python
    literal found on its first non-blank line.

    Args:
        departmentteam: Name inserted into the file name.

    Returns:
        The parsed literal from the first non-blank line (presumably a dict
        of topic index -> topic name; confirm against the code that writes
        the file).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file holds no non-blank line, or the line is not a
            plain Python literal.
        SyntaxError: If the line is not valid Python syntax.
    """
    # Local import keeps this notebook cell self-contained.
    import ast

    file_name = 'data/{}_lda_topic_names.txt'.format(departmentteam)
    # NOTE(review): the file is opened with the platform default encoding;
    # confirm that matches how the file was written.
    with open(file_name, 'r') as inf:
        for line in inf:
            stripped = line.strip()
            if stripped:
                # literal_eval only accepts literals, so file content can
                # never execute arbitrary code the way eval() could.
                return ast.literal_eval(stripped)
    raise ValueError('no topic names found in {}'.format(file_name))

In [None]:
import pyodbc
from sqlalchemy import create_engine
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` has been loaded.
import urllib.parse

# Raw ODBC connection string (no credentials in it; Trusted_Connection=yes),
# URL-quoted so it can be embedded in the SQLAlchemy URL below.
params = urllib.parse.quote_plus(r'DRIVER={SQL Server};SERVER=LIPHT-VM-01;DATABASE=Akademikernes_MSCRM_addition;Trusted_Connection=yes')
conn_str = 'mssql+pyodbc:///?odbc_connect={}'.format(params)
# Engine reused by the to_sql() calls further down in the notebook.
engine = create_engine(conn_str)

# Test language model

#### Input model and data variables

In [None]:
# Data set (pickle base name) and model scope used in this section.
data, scope = 'LDA_Messages_persisted', 'language'
path = 'data/model/'
# The model file is named after the scope; its dictionary adds '.id2word'.
model = str(scope)
dictionary = '{}.id2word'.format(model)

#### Load model and data

In [None]:
%%time
# Read the pickled data frame, then the LDA model and its dictionary from the
# model folder; print the frame's shape as a quick check.
df_scope = pd.read_pickle('data/' + data + '.pkl')
LDAmodel_scope = LdaMulticore.load('{}{}'.format(path, model))
LDAmodel_dictionary = Dictionary.load('{}{}'.format(path, dictionary))
print(df_scope.shape)

In [None]:
# Topic index -> label handed to the prediction helpers below.
lda_topic_names = dict(enumerate(['Danish', 'English']))

#### Test model on data

In [None]:
# Widen the column display limit so long message texts are easier to read.
pd.set_option('display.max_colwidth', 200)

In [None]:
# Draw one random row and pull out its message id and first member message.
# sample(n=1) yields a one-row selection, so (assuming df_scope is a
# DataFrame) both values are one-element Series rather than scalars.
document = df_scope.sample(n=1)
doc_id, unseen_document = document['ThreadMessageID'], document['FirstMemberMessage']
print(doc_id, unseen_document)

In [None]:
%%time
# Score the sampled message with the language model.
result = lda_predict_string(
    unseen_document,
    LDAmodel_scope,
    LDAmodel_dictionary,
    lda_topic_names,
)

In [None]:
print(result)

### Predict language topics on data
For every row in the dataset

In [None]:
%%time
# Run the language model over the whole frame. The return value is discarded;
# presumably lda_predict_df adds its prediction columns to df_scope in place
# (the cells below read them from df_scope) -- confirm in lipht_lda.
lda_predict_df(
    df_scope,
    LDAmodel_scope,
    LDAmodel_dictionary,
    lda_topic_names,
)

In [None]:
df_scope.head(1)

In [None]:
# Give the generic prediction columns language-specific names.
# index=str is kept as in the original call: it maps every index label
# through str().
df_scope = df_scope.rename(
    index=str,
    columns=dict(
        prediction='language_prediction',
        pred_probability='language_probability',
        pred_index='language_id',
        pred_label='language',
    ),
)

#### Save the data

In [None]:
%%time
# Persist the frame under data/<data>_with_<scope>.pkl.
df_scope.to_pickle('data/{0}_with_{1}.pkl'.format(data, scope))

In [None]:
%%time
# Select the id, text-feature and language columns, then replace the
# input.<scope> table in SQL Server with them.
df_scope_sql = df_scope[[
    'ThreadID',
    'ThreadMessageID',
    'text',
    'text_CharCount',
    'text_LessThan5000',
    'text_WordCount',
    'text_Questionmarks',
    'text_1Question',
    'text_Exclamationmarks',
    'language_probability',
    'language_id',
    'language',
]]
df_scope_sql.to_sql(
    name=str(scope),
    con=engine,
    schema='input',
    if_exists='replace',
    index=False,
)
# df_scope.to_csv('{}_with_{}.csv'.format(data, scope))

The data is written directly to the `input` schema in SQL Server (the CSV export above is commented out), where it can be used in the data model.

# Test DepartmentTeam: Udbetalingsteam model A

#### Input model and data variables

In [None]:
# Data set (pickle base name) and model scope used in this section.
data, scope = 'A_Udbetalingsteam', 'Udbetalingteam_A'
path = 'data/model/'
# The model file is named after the scope; its dictionary adds '.id2word'.
model = str(scope)
dictionary = '{}.id2word'.format(model)

#### Load model and data

In [None]:
%%time
# Read the pickled data frame, then the LDA model and its dictionary from the
# model folder.
df_scope = pd.read_pickle('data/' + data + '.pkl')
LDAmodel_scope = LdaMulticore.load('{}{}'.format(path, model))
LDAmodel_dictionary = Dictionary.load('{}{}'.format(path, dictionary))

In [None]:
# Topic index -> label; the list position is the topic index.
# Indices 10, 13 and 18 have an empty label.
lda_topic_names = dict(enumerate([
    'Ferie og feriepenge',
    'Sendt oplysninger til AKA',
    'Ansættelseskontrakt eller frigørelse',
    'Spørgsmål om dagpenge',
    'Ansøgning om befordring',
    'Ansættelse',
    'Ledighed',
    'Adgang',
    'Noget med tid*',
    'Dagpenge mellem jul og nytår',
    '',
    'Fejl ved dagpenge',
    'Spørgsmål til blanket',
    '',
    'Ydelseskort',
    'Pension og Efterløn',
    'Dagpenge/Supplerende',
    'Spørgsmål til udfyldelse',
    '',
    'Spørgsmål om beskæftigelse',
]))

#### Test model on data

In [None]:
# Widen the column display limit so long message texts are easier to read.
pd.set_option('display.max_colwidth', 200)

In [None]:
# Draw one random row and pull out its message id and first member message.
# sample(n=1) yields a one-row selection, so (assuming df_scope is a
# DataFrame) both values are one-element Series rather than scalars.
document = df_scope.sample(n=1)
doc_id, unseen_document = document['ThreadMessageID'], document['FirstMemberMessage']
print(doc_id, unseen_document)

In [None]:
%%time
# Pass the topic names defined for this model, as the language-model call
# does; presumably they are used to label the prediction -- confirm in
# lipht_lda.
result = lda_predict_string(unseen_document, LDAmodel_scope, LDAmodel_dictionary, lda_topic_names)

In [None]:
print(result)

### Predict model topics on data
For every row in the dataset

In [None]:
%%time
# Pass the topic names defined for this model, as the other sections do. The
# save cell below selects 'pred_label', which presumably needs them --
# confirm in lipht_lda.
lda_predict_df(df_scope, LDAmodel_scope, LDAmodel_dictionary, lda_topic_names)

In [None]:
# df_scope = df_scope.rename(index=str, columns={
#     'prediction': 'top_prediction', 
#     'pred_probability': 'topic_udbetaling_probability',
#     'pred_index': 'topic_udbetaling_id',
#     'pred_label': 'topic_udbetaling'
# })

In [None]:
df_scope.head(1)

#### Save the data

In [None]:
%%time
# Export only the id, text and prediction columns.
df_scope_sql = df_scope[['ThreadID','ThreadMessageID','text', 'pred_probability', 'pred_index', 'pred_label']]
# Write the column subset built above; the original wrote the full df_scope
# and left df_scope_sql unused.
df_scope_sql.to_sql(name='{}'.format(scope), con=engine, schema='input', if_exists='replace', index=False)
# df_scope.to_csv('{}_with_{}.csv'.format(data, scope))

The data is written directly to the `input` schema in SQL Server (the CSV export above is commented out), where it can be used in the data model.

# Test DepartmentTeam: X model

#### Input model and data variables

In [None]:
# Data set (pickle base name) and model scope used in this section.
data, scope = 'A_Udbetalingsteam', 'Udbetaling_LDAmodel'
path = 'data/model/'
# The model file is named after the scope; its dictionary adds '.id2word'.
model = str(scope)
dictionary = '{}.id2word'.format(model)

#### Load model and data

In [None]:
%%time
# Read the pickled data frame, then the LDA model and its dictionary from the
# model folder.
df_scope = pd.read_pickle('data/' + data + '.pkl')
LDAmodel_scope = LdaMulticore.load('{}{}'.format(path, model))
LDAmodel_dictionary = Dictionary.load('{}{}'.format(path, dictionary))

In [None]:
# Topic index -> label; the list position is the topic index.
# Indices 10, 13 and 18 have an empty label.
lda_topic_names = dict(enumerate([
    'Ferie og feriepenge',
    'Sendt oplysninger til AKA',
    'Ansættelseskontrakt eller frigørelse',
    'Spørgsmål om dagpenge',
    'Ansøgning om befordring',
    'Ansættelse',
    'Ledighed',
    'Adgang',
    'Noget med tid*',
    'Dagpenge mellem jul og nytår',
    '',
    'Fejl ved dagpenge',
    'Spørgsmål til blanket',
    '',
    'Ydelseskort',
    'Pension og Efterløn',
    'Dagpenge/Supplerende',
    'Spørgsmål til udfyldelse',
    '',
    'Spørgsmål om beskæftigelse',
]))

#### Test model on data

In [None]:
# Widen the column display limit so long message texts are easier to read.
pd.set_option('display.max_colwidth', 200)

In [None]:
# Draw one random row and pull out its message id and first member message.
# sample(n=1) yields a one-row selection, so (assuming df_scope is a
# DataFrame) both values are one-element Series rather than scalars.
document = df_scope.sample(n=1)
doc_id, unseen_document = document['ThreadMessageID'], document['FirstMemberMessage']
print(doc_id, unseen_document)

In [None]:
%%time
# Pass the topic names defined for this model, as the lda_predict_df call in
# this section does; presumably they are used to label the prediction --
# confirm in lipht_lda.
result = lda_predict_string(unseen_document, LDAmodel_scope, LDAmodel_dictionary, lda_topic_names)

In [None]:
print(result)

### Predict model topics on data
For every row in the dataset

In [None]:
%%time
# Run the model over the whole frame. The return value is discarded;
# presumably lda_predict_df adds its prediction columns to df_scope in place
# -- confirm in lipht_lda.
lda_predict_df(
    df_scope,
    LDAmodel_scope,
    LDAmodel_dictionary,
    lda_topic_names,
)

In [None]:
df_scope.head()

#### Save the data

In [None]:
%%time
# Replace the input.<scope> table in SQL Server with the full frame.
df_scope.to_sql(
    name=str(scope),
    con=engine,
    schema='input',
    if_exists='replace',
    index=False,
)
# df_scope.to_csv('{}_with_{}.csv'.format(data, scope))

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Scratch frame: one column 'a' holding ten standard-normal draws.
df = pd.DataFrame(np.random.randn(10), columns=['a'])
df

Unnamed: 0,a
0,-0.76014
1,-0.366198
2,-0.767414
3,-0.906483
4,1.384986
5,-1.042394
6,-0.68079
7,1.29329
8,2.128693
9,-1.105843


In [4]:
# Draw five random rows from the scratch frame.
df_sample = df.sample(n=5)
df_sample

Unnamed: 0,a
8,2.128693
7,1.29329
6,-0.68079
3,-0.906483
9,-1.105843


In [5]:
# Keep only the rows whose index label was not drawn into df_sample.
df_pop = df[~df.index.isin(df_sample.index)]
df_pop

Unnamed: 0,a
0,-0.76014
1,-0.366198
2,-0.767414
4,1.384986
5,-1.042394


The saved csv file can now be imported to the data model in SQL Server

In [6]:
# Column names of df_pop followed by two extra names.
t = list(df_pop.columns) + ['test', 'ngrams']

In [7]:
t

['a', 'test', 'ngrams']

In [8]:
# Drop the two excluded names from t, keeping the original order.
a = [name for name in t if name not in ('ngrams', 'tokenized_text')]
a

['a', 'test']

In [9]:
# Add a 'list' column from a one-element Series labelled 0. pandas aligns the
# assignment on index labels, so presumably only row 0 receives "test" and the
# remaining rows are left missing -- confirm by displaying df.
df['list'] = pd.Series(["test"], index=[0])

In [16]:
def list_to_stringlist(test):
    """Return the strings in *test* joined into one string, separated by ', '."""
    separator = ', '
    return separator.join(test)
test = list_to_stringlist(a)
type(test)

str

In [15]:
def stringlist_to_list(test):
    """Split a comma-separated string back into a list of items.

    Whitespace around each item is stripped, so a string built by joining
    items with ', ' round-trips: 'a, test' gives ['a', 'test'] rather than
    ['a', ' test'].

    Args:
        test: Comma-separated string.

    Returns:
        List of the stripped items; an empty or whitespace-only string gives
        an empty list.
    """
    if not test.strip():
        # ''.split(',') would yield [''], which is not a meaningful item list.
        return []
    return [item.strip() for item in test.split(',')]
te = stringlist_to_list(test)
type(te)

list