# Create datasets

In [1]:
import pandas as pd
import pyodbc

from lipht_lda import df_lda_preprocessing

In [23]:
def SubSetDataFrame(df, n_gram=None, list_of_subset=None):
    """Write one AKA-initiated and one Member-initiated dataset per department team.

    The frame is split on ``ThreadInitiatedBy``:

    * AKA-initiated threads ("A") are preprocessed on ``FirstMemberMessage`` alone.
    * Member-initiated threads ("B") are preprocessed on ``ThreadSubject``
      followed by ``FirstMemberMessage``.

    Each split is then filtered per team and pickled to
    ``data/Initiatedby_AKA/<team>.pkl`` and ``data/Initiatedby_Member/<team>.pkl``.
    A progress line is printed for every team.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``ThreadInitiatedBy``, ``ThreadSubject``,
        ``FirstMemberMessage`` and ``ThreadResponsibleDepartmentTeam``.
        The function works on copies of the filtered rows.
    n_gram : optional
        Passed through unchanged as the third argument of ``df_lda_preprocessing``.
    list_of_subset : sized iterable, optional
        Team names to export. Defaults to every non-null value found in
        ``ThreadResponsibleDepartmentTeam``.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell does not provide os

    team_col = 'ThreadResponsibleDepartmentTeam'
    text_col = 'FirstMemberMessage'

    if list_of_subset is None:
        # A null team can never match the equality filter below, so skip it
        # instead of writing an always-empty "None.pkl".
        list_of_subset = df[team_col].dropna().unique()

    # Output sub-directories under data/.
    dir_AKA = 'Initiatedby_AKA'
    dir_Member = 'Initiatedby_Member'
    for sub_dir in (dir_AKA, dir_Member):
        out_dir = 'data/{}'.format(sub_dir)
        # to_pickle does not create missing directories.
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    # One independent copy per initiator, so the assignments below never touch df.
    df_aka = df[df['ThreadInitiatedBy'] == 'AKA'].copy(deep=True)
    df_member = df[df['ThreadInitiatedBy'] == 'Member'].copy(deep=True)

    # The helper's return value is ignored, as in every other call in this
    # notebook; presumably it transforms the frame in place - TODO confirm.
    df_lda_preprocessing(df_aka, text_col, n_gram)

    # Member threads: subject + message. A missing subject is treated as empty
    # so the message is not lost to NaN, and a space keeps the last subject
    # word from fusing with the first message word.
    subject = df_member['ThreadSubject'].fillna('').astype(str)
    df_member[text_col] = subject + ' ' + df_member[text_col]
    df_lda_preprocessing(df_member, text_col, n_gram)

    print('Creating datasets for all {} items in {}'.format(len(list_of_subset), team_col))

    # NOTE(review): team names are used verbatim as file names; a name that
    # contains a path separator would need sanitising before it can be exported.
    for s in list_of_subset:
        # Boolean-mask selection already yields new frames; no extra copy needed.
        A = df_aka[df_aka[team_col] == s]
        M = df_member[df_member[team_col] == s]

        A.to_pickle('data/{}/{}.pkl'.format(dir_AKA, s))
        M.to_pickle('data/{}/{}.pkl'.format(dir_Member, s))

        print('Created dataset {} for A with {} rows and B with {} rows.'.format(s, A.shape[0], M.shape[0]))

### Get the raw data
Query the raw messages from the SQL Server database and store them as a pickle file.

In [3]:
# SQL Server host and database that hold the persisted LDA message table.
server = "LIPHT-VM-01"  # other host noted in the original comment: "LI-PH-01"
db = "Akademikernes_MSCRM_Addition"

# Assemble the ODBC connection string from its three parts and connect.
con = pyodbc.connect(';'.join([
    'DRIVER={SQL Server}',
    'SERVER=' + server,
    'DATABASE=' + db,
]))

# Select every column of the persisted message table.
query="""
SELECT *
  FROM [Akademikernes_MSCRM_Addition].[out].[LDA_Messages_persisted]
  """

In [4]:
# Run the query and always release the ODBC connection afterwards, even if
# the read fails; the cells shown after this one work from pickle files, not
# from the database. Re-run the connection cell before re-running this one.
try:
    df_raw = pd.read_sql(query, con)
finally:
    con.close()
df_raw.head()

Unnamed: 0,ThreadID,ThreadSubject,FirstMessage,FirstMemberMessage,ThreadInitiatedBy,ThreadClass,InDiagnosticScope,ThreadMessageID,ThreadResponsibleDepartment,ThreadResponsibleDepartmentTeam
0,15FA2A2B-B6FA-E611-AC9F-005056AD2D14,Nyt brev: Is part time insurance right for you?,"Hej,\n\n\nWell, yes I did. Basically because I...","Hej,\n\n\nWell, yes I did. Basically because I...",Member,Member to AKA to Member,1,1EFA2A2B-B6FA-E611-AC9F-005056AD2D14,Forsikring,Medlemskabsteam
1,36B7B25E-A349-E711-BE8E-005056AD2D14,Virksomhedspraktik samt samtale med A-kasse og...,"Hej,\nSom aftalt med Karen Henningsen ved møde...","Hej,\nSom aftalt med Karen Henningsen ved møde...",Member,Member to AKA to Member,1,3DB7B25E-A349-E711-BE8E-005056AD2D14,Jobmatch,Team Rådighed
2,45E1BEF8-EB30-E711-AC9F-005056AD2D14,Dit kontingent til Akademikernes A-kasse,Du har desværre endnu ikke betalt kontingent t...,Jeg har allerede tilmeldt mig betalingsservice...,AKA,AKA to Member to AKA,1,50E1BEF8-EB30-E711-AC9F-005056AD2D14,Forsikring,Medlemskabsteam
3,C6F41AEE-7532-E711-AC9F-005056AD2D14,Dit kontingent til Akademikernes A-kasse,Du har desværre endnu ikke betalt kontingent t...,"Hej,\nJeg har modtaget jeres rykker, men forst...",AKA,AKA to Member to AKA,1,D0F41AEE-7532-E711-AC9F-005056AD2D14,Forsikring,Medlemskabsteam
4,D65C779B-0541-E711-BE8E-005056AD2D14,Ny adresse,Jeg har forsøgt at ændre min adresse. Jeg flyt...,Jeg har forsøgt at ændre min adresse. Jeg flyt...,Member,Member to AKA to Member,1,DD5C779B-0541-E711-BE8E-005056AD2D14,Forsikring,Medlemskabsteam


In [5]:
# Sanity check: dimensions of the freshly loaded extract as (rows, columns).
df_raw.shape

(15738, 10)

In [6]:
%%time
# Run the project's LDA preprocessing helper on the 'FirstMemberMessage'
# column; 2 is passed as its third argument (the position SubSetDataFrame
# uses for n_gram). The return value is discarded, so the helper presumably
# modifies df_raw in place - TODO confirm against lipht_lda.
df_lda_preprocessing(df_raw,'FirstMemberMessage',2)

  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


Wall time: 59.3 s


In [7]:
%%time
# Persist the extract so later sessions can skip the database query.
# The relative 'data/' directory must already exist.
df_raw.to_pickle('data/LDA_Messages_persisted.pkl')

Wall time: 2.64 s


### Load raw data
Load the raw data from a pickle file instead of querying the database again.

In [16]:
# Load the version of the raw extract that carries a language prediction per
# row. Note that this is a different file from the pickle written earlier in
# this notebook; how it was produced is not shown here.
# The language information lets us restrict the data to a single language, so
# that the language itself does not get picked up as a topic.
df_raw = pd.read_pickle('data/LDA_Messages_persisted_with_language.pkl')

In [17]:
# Give the generic prediction columns language-specific names.
# index=str is kept as-is: it additionally maps every index label through str().
df_raw = df_raw.rename(
    index=str,
    columns=dict(
        prediction='language_prediction',
        pred_probability='language_probability',
        pred_index='language_id',
        pred_label='language',
    ),
)

In [18]:
# Strip the " 1 - CPR 01 til 10" / " 2 - CPR 11 til 20" suffix so that both
# variants of a team share one name. The pattern relies on regex alternation
# ("|"), so regex=True has to be explicit: pandas >= 2.0 defaults to literal
# matching, which would silently leave every team name unchanged.
df_raw['ThreadResponsibleDepartmentTeam'] = df_raw['ThreadResponsibleDepartmentTeam'].str.replace(
    ' 2 - CPR 11 til 20| 1 - CPR 01 til 10', '', regex=True
)

In [19]:
# Collect the distinct department teams: one data subset is created per team.
# Teams that turn out to be split unnecessarily have to be grouped together
# before the subsets are written.
ThreadResponsibleDepartmentTeam = df_raw['ThreadResponsibleDepartmentTeam'].unique()
ThreadResponsibleDepartmentTeam

array(['Medlemskabsteam', 'Team Rådighed', 'Job', 'Udbetalingsteam',
       'Logistik', 'Ikke Fordelt Team', 'Virksomhedsteam', 'Anden aktør',
       'Digitaliseringsafdeling', 'Kvalitetsteam', None, 'Kommunikation',
       'Startup', 'Økonomi', 'HR/Strategi (MÅ IKKE ANVENDES)',
       'Juridisk team'], dtype=object)

In [20]:
# Keep Danish messages only; also drop threads without a responsible team and
# the team whose name is explicitly marked as not to be used.
df_raw = df_raw[
    (df_raw['language'] == 'Danish')
    & df_raw['ThreadResponsibleDepartmentTeam'].notnull()
    & (df_raw['ThreadResponsibleDepartmentTeam'] != 'HR/Strategi (MÅ IKKE ANVENDES)')
]

In [21]:
# Sanity check: (rows, columns) remaining after the language/team filter.
df_raw.shape

(13606, 26)

Create an A and a B subset of data per Responsible Department Team
    - A (threads initiated by AKA) has FirstMemberMessage
    - B (threads initiated by Member) has ThreadSubject concatenated with FirstMemberMessage

In [24]:
%%time
# Export the per-team A/B datasets for every team left in df_raw, using n_gram=2.
SubSetDataFrame(df_raw, n_gram=2)

Creating datasets for all 14 items in ThreadResponsibleDepartmentTeam
Created dataset Team Rådighed for A with 1095 rows and B with 124 rows.
Created dataset Medlemskabsteam for A with 1180 rows and B with 1405 rows.
Created dataset Job for A with 1042 rows and B with 439 rows.
Created dataset Udbetalingsteam for A with 661 rows and B with 5251 rows.
Created dataset Logistik for A with 1916 rows and B with 225 rows.
Created dataset Ikke Fordelt Team for A with 14 rows and B with 64 rows.
Created dataset Virksomhedsteam for A with 8 rows and B with 13 rows.
Created dataset Digitaliseringsafdeling for A with 13 rows and B with 35 rows.
Created dataset Anden aktør for A with 49 rows and B with 14 rows.
Created dataset Kvalitetsteam for A with 0 rows and B with 11 rows.
Created dataset Kommunikation for A with 9 rows and B with 5 rows.
Created dataset Startup for A with 27 rows and B with 4 rows.
Created dataset Økonomi for A with 1 rows and B with 0 rows.
Created dataset Juridisk team for