# Notebook 2 : Topic analysis with Nucleus API

Started on 6 April 2020 by Amaury de Barbuat (ECL)

Updated by William Riou from ENSTA PARIS

## 0-Intro
This notebook has two goals:

- to reproduce Nucleus platform results

- to get more than 8 main topics (10 to 20)

In [1]:
# Classic useful libraries

import csv
import datetime
import json
import os
import time
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd

In [2]:
# Particular library from Nucleus for this notebook

import nucleus_api
from nucleus_api.rest import ApiException
import nucleus_api.api.nucleus_api as nucleus_helper

## 1- Connect to the API

### Initialization, configure API host and key, and create new API instance

In [3]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook itself (a key saved in a shared notebook must be
# considered leaked and should be rotated).
#   NUCLEUS_API_HOST : optional; defaults to the host used so far.
#   NUCLEUS_API_KEY  : required.
configuration = nucleus_api.Configuration()
configuration.host = os.environ.get('NUCLEUS_API_HOST', 'nucleus.sumup.ai:5000')

api_key = os.environ.get('NUCLEUS_API_KEY')
if not api_key:
    raise RuntimeError(
        "Environment variable NUCLEUS_API_KEY is not set; export your Nucleus "
        "API key before starting the notebook kernel.")
configuration.api_key['x-api-key'] = api_key

# Create API instance
api_instance = nucleus_api.NucleusApi(nucleus_api.ApiClient(configuration))

## 2- Append my .csv file

Other methods exist, but I chose this one from the PDF file 'Guidelines for Calibrating Nucleus APIs' in the nucleus-sdk-master folder.

### 21- Append a local .csv file to dataset

In [8]:
# Read the local CSV export and push its content to the Nucleus dataset.
csv_file = 'ASRS1.csv'
dataset = 'ASRS1'

doc_cnt = 0  # number of CSV rows read so far
with open(csv_file, encoding='utf-8-sig') as csvfile:
    reader = csv.DictReader(csvfile)
    for row_index, row in enumerate(reader):
        # NOTE(review): only the very first row is uploaded, while every row is
        # counted -- this looks like a leftover test limiter; confirm it is intended.
        if row_index == 0:
            document = {field: row[field] for field in ('time', 'content', 'title')}
            payload = nucleus_api.Appendjsonparams(dataset=dataset, language='english', document=document)
            api_response = api_instance.post_append_json_to_dataset(payload)
        doc_cnt = row_index + 1
    print(doc_cnt)



MaxRetryError: HTTPConnectionPool(host='nucleus.sumup.ai', port=5000): Max retries exceeded with url: /datasets/append_json_to_dataset (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f8a786e98d0>: Failed to establish a new connection: [Errno -2] Name or service not known'))

### 22- List available datasets

In [4]:
# Ask the API for the datasets visible to this key and print their names.
print('---------------- List available datasets ---------------------')
try:
    api_response = api_instance.get_list_datasets()
except ApiException as e:
    print("Exception when calling DatasetsApi->get_list_datasets: %s\n" % e)
    # Do not fall through to api_response.result: after a failed call the name
    # is either undefined or still holds the response of an earlier cell.
    list_datasets = []
else:
    list_datasets = api_response.result

    print(len(list_datasets), 'datasets in the database:')
    for ds in list_datasets:
        print('    ', ds.name)

print('-------------------------------------------------------------')

---------------- List available datasets ---------------------
6 datasets in the database:
     GeoCOVID
     CODA19
     COVID19_geolocation
     Drouet_letters_cleaned
     ASRS3
     ASRS1
-------------------------------------------------------------


## 3- Analysis

### 31- Topic modeling : Get list of topics from dataset

#### a. All time

In [6]:
# Extract the topics of the whole dataset (1987-2019) and export their
# keywords, keyword weights and strengths to three ';'-separated CSV files.
dataset = 'ASRS1'
print('------------- Get list of topics from dataset {}--------------'.format(dataset))
print('topics from 1987 to 2019')

query = ''
# Words the topic model must ignore (report boilerplate and generic aviation vocabulary).
custom_stop_words = ['CALLBACK', 'CONVERSATION', 'REPORTER', 'RPTR', 'REVEALED', 'INFO', 'SUPPLEMENTAL'] # str | List of stop words. (optional)
custom_stop_words += ['callback', 'conversation', 'reporter', 'rptr', 'revealed', 'info', 'supplemental']
custom_stop_words += ['contributing', 'factors', 'mins', 'minutes', 'clock', 'carrier', 'xxxx', 'air']
custom_stop_words += ['pilot', 'flying', 'officer', 'captain', 'attendant', 'attendants', 'flt', 'filed']
custom_stop_words += ['crew', 'chief', 'approx', 'approximately', 'called', 'told', 'contacted', 'txwy']
# The comma between 'attendants' and 'rpted' was missing, which silently
# produced the single bogus stop word 'attendantsrpted'.
custom_stop_words += ['flt', 'flight', 'attendant', 'attendants', 'rpted', 'aircraft', 'acft']
num_topics = 1 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
metadata_selection = "" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)

start = str(1987)
print("start:", start)
period_start = start + "-01-01"
end = str(2019)
print("end:", end)
period_end = end + "-12-31"

# Each list starts with the period label (used as CSV column header),
# followed by one entry per topic.
period_label = start + '-' + end
S = [period_label]  # topic strengths
K = [period_label]  # topic keywords
W = [period_label]  # topic keyword weights, ';'-joined

payload = nucleus_api.Topics(dataset=dataset,
                             query=query,
                             custom_stop_words=custom_stop_words,
                             num_topics=num_topics,
                             metadata_selection=metadata_selection,
                             period_start=period_start,
                             period_end=period_end)
api_response = api_instance.post_topic_api(payload)
doc_ids = api_response.result.doc_ids
topics = api_response.result.topics
for i, res in enumerate(topics):
    print('Topic', i, 'keywords:')
    print('    Keywords:', res.keywords)
    keywords_weight_str = ";".join(str(x) for x in res.keywords_weight)
    print('    Keyword weights:', keywords_weight_str)
    print('    Strength:', res.strength)

    # Documents with a non-zero exposure to this topic; kept available for
    # interactive inspection (not printed by default).
    exposed = [(doc_id, float(exposure))
               for doc_id, exposure in zip(doc_ids, res.doc_topic_exposures)
               if float(exposure) != 0]
    doc_id_sel = [doc_id for doc_id, _ in exposed]
    doc_topic_exposure_sel = [exposure for _, exposure in exposed]
    print('---------------')

    S.append(float(res.strength))
    K.append(res.keywords)
    W.append(keywords_weight_str)

# One single-column table per quantity: rows are topics (numbered from 1),
# the column header is the period label.
topic_index = list(range(1, len(K)))
df_keywords = pd.DataFrame({K[0]: K[1:]}, index=topic_index)
df_keywords.to_csv('Keywords.csv', sep=';', header=True, index=True)

df_weights = pd.DataFrame({W[0]: W[1:]}, index=topic_index)
df_weights.to_csv('Weights.csv', sep=';', header=True, index=True)

df_strength = pd.DataFrame({S[0]: S[1:]}, index=topic_index)
df_strength.to_csv('Strength.csv', sep=';', header=True, index=True)

print('-------------------------------------------------------------')

------------- Get list of topics from dataset ASRS1--------------
topics from 1987 to 2019
start: 1987
end: 2019
INFO: Start polling job status of 2786564




INFO: Job 2786564 completed.
Topic 0 keywords:
    Keywords: hold short;short rwy;rwy hold;taxi rwy;gnd ctl;short runway;short lines;cross rwy
    Keyword weights: 0.0352;0.0328;0.5452;0.0416;0.0713;0.2182;0.0272;0.0286
    Strength: 1.0
---------------
-------------------------------------------------------------


#### b. Time selection

In [9]:
# Extract topics over rolling 5-year windows that start every quarter from
# 1988 to 2018, then export keywords and strengths as ';'-separated CSV files
# with one column per window.
dataset = 'ASRS1'
print('------------- Get list of topics from dataset {}--------------'.format(dataset))

# Window start months (one window per quarter).
months = ['01', '04', '07', '10']

query = ''
# Words the topic model must ignore (report boilerplate and generic aviation vocabulary).
custom_stop_words = ['CALLBACK', 'CONVERSATION', 'REPORTER', 'RPTR', 'REVEALED', 'INFO', 'SUPPLEMENTAL'] # str | List of stop words. (optional)
custom_stop_words += ['callback', 'conversation', 'reporter', 'rptr', 'revealed', 'info', 'supplemental']
custom_stop_words += ['contributing', 'factors', 'mins', 'minutes', 'clock', 'carrier', 'xxxx', 'air']
custom_stop_words += ['pilot', 'flying', 'officer', 'captain', 'attendant', 'attendants', 'flt', 'filed']
custom_stop_words += ['crew', 'chief', 'approx', 'approximately', 'called', 'told', 'contacted', 'txwy']
# The comma between 'attendants' and 'rpted' was missing, which silently
# produced the single bogus stop word 'attendantsrpted'.
custom_stop_words += ['flt', 'flight', 'attendant', 'attendants', 'rpted', 'aircraft', 'acft']
custom_stop_words += ['declared', 'advised', 'decided', 'noticed', 'declare', 'informed']
num_topics = 4 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
metadata_selection = "" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)

# One inner list per window: [window label, value for topic 0, value for topic 1, ...]
S = []  # topic strengths
K = []  # topic keywords

for year in range(1988, 2019, 1):
    for m in months:
        period_start = str(year) + "-" + m + "-01"
        period_end = str(year + 5) + "-" + m + "-01"
        payload = nucleus_api.Topics(dataset=dataset,
                                     query=query,
                                     custom_stop_words=custom_stop_words,
                                     num_topics=num_topics,
                                     metadata_selection=metadata_selection,
                                     period_start=period_start,
                                     period_end=period_end)
        api_response = api_instance.post_topic_api(payload)
        doc_ids = api_response.result.doc_ids
        topics = api_response.result.topics

        window_label = period_start + ' to ' + period_end
        s = [window_label]
        k = [window_label]

        for i, res in enumerate(topics):
            print('Topic', i, 'keywords:')
            keywords_weight_str = ";".join(str(x) for x in res.keywords_weight)

            # Documents with a non-zero exposure to this topic; kept available
            # for interactive inspection (not printed by default).
            exposed = [(doc_id, float(exposure))
                       for doc_id, exposure in zip(doc_ids, res.doc_topic_exposures)
                       if float(exposure) != 0]
            doc_id_sel = [doc_id for doc_id, _ in exposed]
            doc_topic_exposure_sel = [exposure for _, exposure in exposed]
            print('---------------')

            s.append(res.strength)
            k.append(res.keywords)

        S.append(s)
        K.append(k)

print('-------------------------------------------------------------')

# Build each table in one go from per-window Series indexed by topic number
# (starting at 1). Compared with adding columns one by one, this tolerates a
# window that returned fewer topics than the others (missing cells are left
# empty) and avoids pandas' fragmentation warning on 100+ column inserts.
df_keywords = pd.DataFrame(
    {col[0]: pd.Series(col[1:], index=range(1, len(col))) for col in K})
df_keywords.to_csv('Keywords.csv', sep=';', header=True, index=True)

df_strength = pd.DataFrame(
    {col[0]: pd.Series(col[1:], index=range(1, len(col))) for col in S})
df_strength.to_csv('Strength.csv', sep=';', header=True, index=True)

------------- Get list of topics from dataset ASRS1--------------
INFO: Start polling job status of 2786582
INFO: Job 2786582 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786583
INFO: Job 2786583 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786584
INFO: Job 2786584 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786585
INFO: Job 2786585 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786587
INFO: Job 2786587 completed.
Topic 0 keywords:
---------------
T



INFO: Job 2786592 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786594




INFO: Job 2786594 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786595
INFO: Job 2786595 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786596
INFO: Job 2786596 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786597
INFO: Job 2786597 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786598
INFO: Job 2786598 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S



INFO: Job 2786610 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786612
INFO: Job 2786612 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786613
INFO: Job 2786613 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786614
INFO: Job 2786614 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786615
INFO: Job 2786615 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S



INFO: Job 2786627 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786629
INFO: Job 2786629 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786630
INFO: Job 2786630 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786631
INFO: Job 2786631 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786632
INFO: Job 2786632 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S



INFO: Job 2786635 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786636
INFO: Job 2786636 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786637
INFO: Job 2786637 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786638
INFO: Job 2786638 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786639
INFO: Job 2786639 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S



INFO: Job 2786650 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786651




INFO: Job 2786651 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786652




INFO: Job 2786652 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786653




INFO: Job 2786653 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786654
INFO: Job 2786654 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786655




INFO: Job 2786655 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786656
INFO: Job 2786656 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786657
INFO: Job 2786657 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786658
INFO: Job 2786658 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786659
INFO: Job 2786659 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S



INFO: Job 2786663 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786664




INFO: Job 2786664 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786665
INFO: Job 2786665 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786666
INFO: Job 2786666 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786667




INFO: Job 2786667 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786669
INFO: Job 2786669 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786670
INFO: Job 2786670 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786671
INFO: Job 2786671 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786672
INFO: Job 2786672 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S



INFO: Job 2786675 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786676
INFO: Job 2786676 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786677




INFO: Job 2786677 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786678
INFO: Job 2786678 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786679
INFO: Job 2786679 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786680
INFO: Job 2786680 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786681
INFO: Job 2786681 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S



INFO: Job 2786688 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786689
INFO: Job 2786689 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786691
INFO: Job 2786691 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786692
INFO: Job 2786692 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786693
INFO: Job 2786693 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S



INFO: Job 2786705 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786706
INFO: Job 2786706 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786707
INFO: Job 2786707 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786708
INFO: Job 2786708 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: Start polling job status of 2786709
INFO: Job 2786709 completed.
Topic 0 keywords:
---------------
Topic 1 keywords:
---------------
Topic 2 keywords:
---------------
Topic 3 keywords:
---------------
INFO: S

In [10]:
def store_list(L, filename='Keywords.txt'):
    """Write every element of a list of lists to a text file, one per line.

    Parameters
    ----------
    L : iterable of iterables
        Rows to store; the elements of each row are written in order.
        Elements that are not strings (e.g. float strengths) are converted
        with ``str``.
    filename : str, optional
        Path of the output file, overwritten if it exists. Defaults to
        'Keywords.txt', the path this function has always written to.
    """
    # The context manager guarantees the file is closed even if a write fails.
    with open(filename, 'w') as out_file:
        for row in L:
            for item in row:
                out_file.write(str(item) + '\n')

In [11]:
# Dump every element of S (window label followed by topic strengths), one per
# line, to 'Keywords.txt'.
# NOTE(review): the following cell calls store_list(K), which reopens the same
# file in write mode, so this output is overwritten -- confirm whether a
# separate file was intended for the strengths.
store_list(S)

In [12]:
# Dump every element of K (window label followed by topic keywords), one per
# line, to 'Keywords.txt', replacing any previous content of that file.
store_list(K)

In [13]:
# Rebuild the keywords table from K: one column per time window (the first
# element of each inner list is the column header), rows numbered from 1.
n = len(K)
df_keywords = pd.DataFrame({}, index=list(range(1, len(K[0]))))

for window_label, *window_keywords in K:
    df_keywords[window_label] = window_keywords


In [14]:
# Rebuild the strengths table from S: one column per time window (the first
# element of each inner list is the column header), rows numbered from 1.
n = len(S)
df_strength = pd.DataFrame({}, index=list(range(1, len(S[0]))))

for window_label, *window_strengths in S:
    df_strength[window_label] = window_strengths


In [15]:
# Export both tables as ';'-separated CSV files, keeping header and row index.
for frame, csv_path in ((df_keywords, 'Keywords.csv'), (df_strength, 'Strength.csv')):
    frame.to_csv(csv_path, sep=';', header=True, index=True)

### 32- Contrasted topic modeling : Extract a contrasted topic

In [None]:
# Extract the topic that best separates the two document categories given in
# metadata_selection_contrast, and keep its classifier configuration.
dataset = 'ASRS1' # str | Dataset name.
metadata_selection = {} # dict | Specifies metadata-based queries on the dataset, of type {"metadata_field": "selected_values"}. (optional)
print('------------------ Get contrasted topic for content about {} in {}  --------------------'.format(list(metadata_selection.values()), dataset))

# NOTE(review): the settings from `query` down to `remove_redundancies` are
# defined here but are NOT passed to TopicContrastModel below, so they
# currently have no effect on the request -- confirm whether they should be.
query = '' # str | Dataset-language-specific fulltext query, using mysql MATCH boolean query format (optional)
time_period = "1M" # str | Alternative 1: time period counting back from today over which the analysis is conducted (optional)
period_start = '1989-01-01' # str | Alternative 2: start of period over which the analysis is conducted (optional)
period_end = '2019-12-31' # str | Alternative 2: end of period over which the analysis is conducted (optional)
excluded_docs = '' # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
syntax_variables = True # bool | Specifies whether to take into account syntax aspects of each category of documents to help with contrasting them (optional) (default to False)
num_keywords = 20 # integer | Number of keywords for the contrasted topic that is extracted from the dataset. (optional) (default to 50)
remove_redundancies = False # bool | If True, this option removes quasi-duplicates from the analysis and retain only one copy of it. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default False)
metadata_selection_contrast = {"aircraft": "runway"} # dict | Specifies the two categories of documents to contrast against each other

# Extends the stop-word list defined by the topic-modeling cells above (they
# must have been run first). Only missing entries are added, so re-running
# this cell no longer appends duplicates.
# NOTE(review): these bigrams do not look related to the ASRS reports and may
# have been copied from a notebook about another dataset -- confirm.
extra_stop_words = ['mss naf', 'bnf mss', 'naf transcription', 'florence naugrette',
                    'assistee florence', 'gondolle assistee', 'sophie gondolle', 'transcription sophie',
                    'naugrette guernesey', 'transcription gerard', 'gerard pouchain', 'transcription florence']
custom_stop_words += [word for word in extra_stop_words if word not in custom_stop_words]

try:
    payload = nucleus_api.TopicContrastModel(dataset=dataset,
                                             metadata_selection_contrast=metadata_selection_contrast)
    api_response = api_instance.post_topic_contrast_api(payload)
except ApiException as e:
    api_error = json.loads(e.body)
    print('ERROR:', api_error['message'])
else:
    # Only report results when the call succeeded; otherwise api_response is
    # undefined or still holds the response of an earlier cell.
    print('Contrasted Topic')
    print('    Keywords:', api_response.result.keywords)
    print('    Keywords weight:', api_response.result.keywords_weight)
    print('    Performance metrics:', api_response.result.perf_metrics)
    topic_contrast_classifier_config = api_response.result.classifier_config
    topic_contrast_fixed_topics = {'weights': api_response.result.keywords_weight,
                                   'keywords': api_response.result.keywords}
    print('    Classifier config saved to topic_contrast_classifier_config to be used in post_doc_classify_api')
    print('    Fixed topics saved to topic_contrast_fixed_topics to be used in post_doc_classify_api')

    print(topic_contrast_classifier_config)

print('-------------------------------------------------------------')

------------------ Get contrasted topic for content about [] in ASRS1  --------------------
INFO: Start polling job status of 2786715




### 33- Topic historical analysis

In [None]:
# Run the topic historical analysis decade by decade and, for a hand-picked
# set of topics per decade, print the timestamp at which the topic was strongest.
# Relies on `custom_stop_words` built by the earlier topic-modeling cells.
dataset = 'ASRS1'   # str | Dataset name.
print('------------ Get topic historical analysis for {} ----------------'.format(dataset))

query = '' # str | Fulltext query, using mysql MATCH boolean query format. Example, (\"word1\" OR \"word2\") AND (\"word3\" OR \"word4\") (optional)
num_topics = 14 # int | Number of topics to be extracted from the dataset. (optional) (default to 8)
num_keywords = 8 # int | Number of keywords per topic that is extracted from the dataset. (optional) (default to 8)
metadata_selection ="" # dict | JSON object specifying metadata-based queries on the dataset, of type {"metadata_field": "selected_values"} (optional)
time_period = "3Y"     # str | Time period selection. Choices: ["1M","3M","6M","12M","3Y","5Y",""] (optional)
n_steps = 10 #Number of steps in the historical analysis over the requested period. Each step is such that they contain an equal number of documents.
excluded_docs = [''] # str | List of document IDs that should be excluded from the analysis. Example, ["docid1", "docid2", ..., "docidN"]  (optional)
custom_dict_file = {} # file | Custom sentiment dictionary JSON file. Example, {"field1": value1, ..., "fieldN": valueN} (optional)
recompute_topics = False # If True, this option will trigger a recomputation of the topics at each past point in time. Especially helpful if conducting historical analysis of a query.
remove_redundancies = True # bool | If True, this option removes quasi-duplicates from the analysis. A quasi-duplicate would have the same NLP representation, but not necessarily the exact same text. (optional) (default True)

# Topic indices to report, keyed by the first year of each decade.
# The '2019' entry is never reached with the current loop range (which yields
# 1989, 1999 and 2009); it is kept so that extending the range keeps working.
topics_of_interest = {
    '1989': {1, 3, 4, 5, 12},
    '1999': {7, 12},
    '2009': {11},
    '2019': {1, 3, 5, 7, 9, 11, 12, 13},
}

for year in range(1989, 2019, 10):
    # period_start / period_end (format "YYYY-MM-DD") delimit the analysed decade.
    start = str(year)
    print("start:", start)
    period_start = start + "-01-01"
    end = str(year + 9)
    print("end:", end)
    period_end = end + "-12-31"

    payload = nucleus_api.TopicHistoryModel(
        dataset=dataset,
        query=query,
        custom_stop_words=custom_stop_words,
        num_topics=num_topics,
        num_keywords=num_keywords,
        metadata_selection=metadata_selection,
        time_period=time_period,
        period_start=period_start,
        period_end=period_end,
        n_steps=n_steps,
        excluded_docs=excluded_docs,
        custom_dict_file=custom_dict_file,
        recompute_topics=recompute_topics,
        remove_redundancies=remove_redundancies)
    try:
        api_response = api_instance.post_topic_historical_analysis_api(payload)
    except ApiException as e:
        # Report the failure and move on to the next decade.
        print("Exception when calling post_topic_historical_analysis_api: %s\n" % e)
        continue

    print('Printing historical metrics data...')
    selected_topics = topics_of_interest.get(start, set())
    for i, res in enumerate(api_response.result):
        if i not in selected_topics:
            continue
        print('Topic', i, res.keywords)
        print('    Timestamps:', res.time_stamps)

        # Locate the first step with the highest strength.
        best_index = None
        best = float('-inf')
        for index, strength in enumerate(res.strengths):
            curr = float(strength)
            if curr > best:
                best = curr
                best_index = index

        if best_index is None:
            print('    No strength values returned for this topic')
        else:
            # The value printed is the timestamp of the strongest step,
            # hence the label (it previously read "Document with ...").
            highest_strength = res.time_stamps[best_index]
            print('Timestamp with highest strength', highest_strength, 'Strength:', best)
        print('----------------')

print('-------------------------------------------------------------')