In [1]:
import json
import pandas as pd
import random
import numpy as np
from tqdm import tqdm

# Raise pandas' display limits so long tables and long dialogue texts are not truncated.
pd.set_option('display.max_rows', 250)
pd.set_option('display.max_colwidth', 250)

In [2]:
# Actor lookup table, indexed by its first CSV column.
act = pd.read_csv('data_tables/actor.csv', index_col=0)

# Dialogue lines, indexed by the first two CSV columns
# (presumably conversation id and line id -- TODO confirm against the CSV).
convs = (
    pd.read_csv('data_tables/convs.csv', index_col=[0, 1])
    # canvasRect_x / canvasRect_y are deliberately kept
    .drop(columns=['isGroup', 'canvasRect_width', 'canvasRect_height'])
    # lines without an actor get the placeholder id -1.0
    .assign(Actor=lambda frame: frame['Actor'].fillna(-1.0))
)

# The link column is stored as text that uses parentheses; swap them for
# brackets so the text parses as JSON, then turn every parsed entry into a tuple.
convs['outgoingLinks'] = convs['outgoingLinks'].apply(
    lambda raw: [tuple(pair) for pair in json.loads(raw.replace('(', '[').replace(')', ']'))]
)

In [3]:
# Actor names to keep; the next cell matches them against act['Name'] to
# collect the corresponding actor ids.
act_list = [
    'You',
    'Tricentennial Electrics',
    'Spinal Cord',
    'Limbic System',
    'Ancient Reptilian Brain',
    'Fysique',
    'Intellect',
    'Motorics',
    'Psyche',
    'Conceptualization',
    'Logic',
    'Encyclopedia',
    'Rhetoric',
    'Drama',
    'Visual Calculus',
    'Empathy',
    'Inland Empire',
    'Volition',
    'Authority',
    'Suggestion',
    'Esprit de Corps',
    'Endurance',
    'Physical Instrument',
    'Shivers',
    'Pain Threshold',
    'Electrochemistry',
    'Half Light',
    'Hand/Eye Coordination',
    'Reaction Speed',
    'Savoir Faire',
    'Interfacing',
    'Composure',
    'Perception',
    'Perception (Smell)',
    'Perception (Hearing)',
    'Perception (Taste)',
    'Perception (Sight)'
]

In [4]:
# Actor id of the 'You' row.
self_idx = act.index[act['Name'] == 'You'].tolist()[0]

# Ids of every actor named in act_list, plus the two placeholder ids 0 and -1
# (-1 is the value used to fill missing actors when the table is loaded).
act_list_idx = act.index[act['Name'].isin(act_list)].tolist()
act_list_idx.extend([0, -1])

# First-level index values of all rows spoken by one of those actors,
# and the sub-table holding every row that shares such a first-level value.
spoken_by_selected = convs['Actor'].isin(act_list_idx)
conv_idx_act = list(set(convs[spoken_by_selected].index.get_level_values(level=0)))
cut_convs = convs.loc[conv_idx_act]

In [5]:
# Keep only the rows whose actor id is in act_list_idx; print the row count
# before and after.  The explicit .copy() makes the result an independent
# frame, so the cell-by-cell writes performed in later cells are not treated
# as writes to a slice of the original table (SettingWithCopyWarning on
# pandas versions without copy-on-write).
print(len(convs))
convs = convs.loc[convs['Actor'].isin(act_list_idx)].copy()
print(len(convs))

112962
61750


## clear outgoing links

In [7]:
# Drop every outgoing link whose target row is no longer present in convs.
# link_removed counts the dropped links, rem_links the surviving ones.
index = convs.index
link_removed = 0
rem_links = 0

# Iterate over a snapshot of the column because it is rewritten row by row.
for idx, links in list(convs['outgoingLinks'].items()):
    kept = [target for target in links if target in index]
    link_removed += len(links) - len(kept)
    rem_links += len(kept)
    convs.at[idx, 'outgoingLinks'] = kept
print(link_removed)

27378


## add incoming links

In [8]:
# Start every row with its own empty list.  `[[]] * n` would store the *same*
# list object n times, so any in-place append would show up in every row.
convs['ingoingLinks'] = [[] for _ in range(len(convs))]

In [9]:
# Invert the outgoing links: for every target row, collect the set of rows
# that link to it.  Gathering in a plain dict first avoids one DataFrame
# read and one set rebuild per individual link.
incoming = {}
for idx_from, links in convs['outgoingLinks'].items():
    for idx_to in links:
        incoming.setdefault(idx_to, set()).add(idx_from)

# Write each collected set back exactly once.
for idx_to, sources in incoming.items():
    if idx_to not in convs.index:
        # A dangling target would otherwise make `.at` append a new row.
        raise KeyError(idx_to)
    convs.at[idx_to, 'ingoingLinks'] = list(sources)
            

## clear solo lines

In [10]:
# Drop isolated rows (no outgoing and no incoming links) and print the row
# count before and after.  .copy() keeps the following cell's in-place writes
# from being treated as writes to a slice of the previous frame.
print(len(convs))
has_outgoing = convs['outgoingLinks'].map(len) > 0
has_incoming = convs['ingoingLinks'].map(len) > 0
convs = convs[has_outgoing | has_incoming].copy()
print(len(convs))

61750
51348


## remove tmp actors

#### fixing links before removing these actors

In [11]:
# Rows spoken by the placeholder actors (ids 0.0 and -1.0) are about to be
# removed.  Before that, connect each such row's predecessors directly to
# its successors so the dialogue graph stays connected.
tmp_act_idx = list(convs.index[convs['Actor'].isin([0.0, -1.0])])
for idx in tmp_act_idx:
    outg, ing = convs.loc[idx, ['outgoingLinks', 'ingoingLinks']]
    # A self-link on the removed row must not be forwarded: it would leave
    # the neighbours pointing at a row that no longer exists.
    successors = set(outg) - {idx}
    predecessors = set(ing) - {idx}

    for i in predecessors:
        rewired = set(convs.loc[i, 'outgoingLinks'])
        # remove() (not discard()) on purpose: a KeyError here means the
        # outgoing and ingoing link columns disagree with each other.
        rewired.remove(idx)
        convs.at[i, 'outgoingLinks'] = list(rewired | successors)

    for i in successors:
        rewired = set(convs.loc[i, 'ingoingLinks'])
        rewired.remove(idx)
        convs.at[i, 'ingoingLinks'] = list(rewired | predecessors)

#### removing them

In [12]:
# Remove the rows of the placeholder actors (ids 0.0 and -1.0); print the row
# count before and after.
print(len(convs))
is_placeholder = convs['Actor'].isin([0.0, -1.0])
convs = convs[~is_placeholder]
print(len(convs))

51348
30073


#### checking whether any links point outside the current data

In [13]:
# Sanity check: print 'error' plus the row index once for every outgoing or
# incoming link whose target is not a row of convs.  No output means the
# link columns are consistent with the index.
index = convs.index
for idx, out_links, in_links in zip(index, convs['outgoingLinks'], convs['ingoingLinks']):
    for target in [*out_links, *in_links]:
        if target not in index:
            print('error', idx)

#### remove solo lines once again

In [14]:
# Drop rows that ended up with neither outgoing nor incoming links; print the
# row count before and after.  .copy() keeps the in-place link rewiring of
# the next cell from being treated as a write to a slice of this frame.
print(len(convs))
has_outgoing = convs['outgoingLinks'].map(len) > 0
has_incoming = convs['ingoingLinks'].map(len) > 0
convs = convs[has_outgoing | has_incoming].copy()
print(len(convs))

30073
26486


### removing NaN text

In [15]:
# Remove rows without dialogue text.  Each such row is first bypassed: its
# predecessors are linked directly to its successors.  Row counts are printed
# before and after.
print(len(convs))
nan_text_idx = convs[convs['Dialogue Text'].isna()].index
for idx in nan_text_idx:
    outg, ing = convs.loc[idx, ['outgoingLinks', 'ingoingLinks']]
    # A self-link on the removed row must not be forwarded: it would leave
    # the neighbours pointing at a row that no longer exists.
    successors = set(outg) - {idx}
    predecessors = set(ing) - {idx}

    for i in predecessors:
        rewired = set(convs.loc[i, 'outgoingLinks'])
        # remove() (not discard()) on purpose: a KeyError here means the
        # outgoing and ingoing link columns disagree with each other.
        rewired.remove(idx)
        convs.at[i, 'outgoingLinks'] = list(rewired | successors)

    for i in successors:
        rewired = set(convs.loc[i, 'ingoingLinks'])
        rewired.remove(idx)
        convs.at[i, 'ingoingLinks'] = list(rewired | predecessors)
convs = convs[convs['Dialogue Text'].notna()]
print(len(convs))

26486
25422


In [16]:
# Drop rows that ended up with neither outgoing nor incoming links; print the
# row count before and after.  .copy() makes the result an independent frame
# so the column added in a later cell is not written to a slice.
print(len(convs))
has_outgoing = convs['outgoingLinks'].map(len) > 0
has_incoming = convs['ingoingLinks'].map(len) > 0
convs = convs[has_outgoing | has_incoming].copy()
print(len(convs))

25422
25391


In [17]:
# Repeat the consistency check: print 'error' plus the row index for every
# outgoing or incoming link that points at a row missing from convs.
index = convs.index
for idx, out_links, in_links in zip(index, convs['outgoingLinks'], convs['ingoingLinks']):
    for target in [*out_links, *in_links]:
        if target not in index:
            print('error', idx)

In [18]:
# Translate each actor id into its name through the actor table, then show
# how many rows every actor has.
convs = convs.assign(ActorName=convs['Actor'].map(act['Name']))
convs['ActorName'].value_counts()

You                        13249
Logic                        876
Rhetoric                     869
Inland Empire                801
Empathy                      796
Conceptualization            602
Electrochemistry             554
Encyclopedia                 545
Shivers                      543
Volition                     497
Half Light                   469
Authority                    442
Esprit de Corps              430
Savoir Faire                 430
Visual Calculus              415
Suggestion                   410
Composure                    408
Drama                        403
Endurance                    401
Reaction Speed               372
Interfacing                  363
Physical Instrument          343
Perception (Sight)           237
Pain Threshold               237
Hand/Eye Coordination        179
Perception (Hearing)         175
Ancient Reptilian Brain      107
Perception (Smell)            89
Limbic System                 77
Tricentennial Electrics       30
Perception

In [19]:
# Adjacency map {row index: list of outgoing link targets}; the traversal
# helpers recursive_count and recursive_path read it as a global.
convs_dict = dict(zip(convs.index, convs['outgoingLinks']))
def recursive_count(node, prev_nodes, max_depth=5):
    """Count the distinct walks that start at ``node`` in ``convs_dict``.

    A walk ends (and counts as 1) when it reaches a node without outgoing
    links or when more than ``max_depth`` nodes already precede the current
    one.  Links back to a node already on the walk -- including the current
    node itself -- are not followed; if every link of a node is such a back
    link, that branch contributes 0.

    Args:
        node: key of the current node in the global ``convs_dict``.
        prev_nodes: list of the nodes visited before ``node`` on this walk.
        max_depth: a walk is cut once ``len(prev_nodes)`` exceeds this value.

    Returns:
        The number of walks as an int.

    Note:
        Every branch is explored separately, so the cost grows exponentially
        with the branching factor.
    """
    if len(prev_nodes) > max_depth:
        return 1
    links = convs_dict[node]
    if len(links) == 0:
        return 1

    # Include the current node in the visited list so a self-link is skipped.
    visited = prev_nodes + [node]
    return sum(
        recursive_count(link, visited, max_depth)
        for link in links
        if link not in visited
    )


def recursive_path(node, prev_nodes, max_len=7, min_len=3):
    """Enumerate the walks through ``convs_dict`` that start at ``node``.

    Args:
        node: key of the current node in the global ``convs_dict``.
        prev_nodes: list of the nodes visited before ``node`` on this walk.
        max_len: once ``prev_nodes`` holds this many nodes the walk is cut
            and ``prev_nodes`` (without ``node``) is returned as a path.
        min_len: a walk that ends at a node without outgoing links is kept
            only when at least ``min_len`` nodes precede that node.

    Returns:
        A list of paths, each a list of node keys.  Links back to a node
        already on the walk (the current node included) are not followed; a
        branch whose links are all such back links yields no path.
    """
    if len(prev_nodes) >= max_len:
        return [prev_nodes]

    links = convs_dict[node]
    path = prev_nodes + [node]
    if len(links) == 0:
        return [path] if len(prev_nodes) >= min_len else []

    paths = []
    for link in links:
        # Checking against `path` (not only `prev_nodes`) also skips self-links.
        if link not in path:
            # Pass the limits on; otherwise the recursion falls back to the defaults.
            paths += recursive_path(link, path, max_len, min_len)
    return paths

def filter_paths(paths):
    """Keep the paths in which 'You' speaks somewhere before the last node.

    Args:
        paths: iterable of paths, each a sequence of row indices of ``convs``.

    Returns:
        A list with the qualifying paths, in their original order.
    """
    def you_before_last(path):
        # The final node is ignored on purpose: 'You' only in last position does not count.
        return any(convs.loc[node]['ActorName'] == 'You' for node in path[:-1])

    return [path for path in paths if you_before_last(path)]


def populate_dial(path):
    """Cut one path into several shorter dialogues.

    For every 'You' node that is not the first node, the part of the path
    *before* that node is emitted.  If the last node is not spoken by 'You',
    the complete path is emitted as well.

    Args:
        path: list of row indices of ``convs``.

    Returns:
        A list of lists of row indices; an empty list for an empty ``path``.
    """
    if not path:
        # Nothing to cut; the last-node lookup below would fail on path[-1].
        return []

    cuts = []
    spoken_by_you = False
    for position, node in enumerate(path):
        spoken_by_you = convs.loc[node]['ActorName'] == 'You'
        if spoken_by_you and position > 0:
            cuts.append(list(path[:position]))

    # After the loop `spoken_by_you` describes the last node.
    if not spoken_by_you:
        cuts.append(list(path))
    return cuts
                
        

In [20]:
def path_to_example(path):
    """Render a path of row indices as a list of dialogue turns.

    Consecutive lines of actors other than 'You' are merged into one turn,
    each formatted as ``[Actor]: text`` and joined with newlines.  A line
    spoken by 'You' becomes its own turn holding just the text.

    Args:
        path: iterable of row indices of ``convs``.

    Returns:
        A list of strings, one per turn.
    """
    turns = []
    pending = []

    def flush_pending():
        # Emit the collected non-'You' lines as a single turn.
        if pending:
            turns.append('\n'.join(pending))
            pending.clear()

    for node in path:
        row = convs.loc[node]
        speaker = row['ActorName']
        utterance = row['Dialogue Text']
        if speaker != 'You':
            pending.append(f'[{speaker}]: {utterance}')
            continue
        flush_pending()
        turns.append(utterance)

    flush_pending()
    return turns

In [21]:
# Starting points for the traversal: rows that no other row links to.
no_incoming = convs['ingoingLinks'].map(len) == 0
all_starts = list(convs.index[no_incoming])

In [22]:
# Dedicated, seeded generator: the sub-sampling below then selects the same
# paths on every run, which makes the resulting dataset reproducible.
sample_rng = random.Random(42)

samples = []
for i in all_starts:
    # enumerate the paths from this start and keep those where 'You' speaks before the end
    paths = filter_paths(recursive_path(i, []))
    # cap the number of paths taken from a single start at 10
    if len(paths) > 10:
        paths = sample_rng.sample(paths, 10)
    # cut every path into its shorter dialogues
    new_paths = []
    for p in paths:
        new_paths.extend(populate_dial(p))
    paths = new_paths
    if paths:
        samples.append((i, paths))

In [23]:
# Flatten the (start, paths) pairs and render every path as a list of turns.
dataset = [
    path_to_example(path)
    for _, start_paths in samples
    for path in start_paths
]

In [24]:
import plotly.express as px
# Number of dialogues obtained per start; print their total and mean, then
# plot the distribution.
x = [len(start_paths) for _, start_paths in samples]
total = sum(x)
print(total, total/len(x))
fig = px.histogram(x=x)
fig.show()

13909 14.503649635036496


In [35]:
# Remove duplicate examples while keeping first-seen order.  list(set(...))
# would give a different order on every run (string hashing is randomised
# per process), so the saved file would change between runs.
dataset = list(dict.fromkeys(tuple(i) for i in dataset))

In [38]:
# Persist the de-duplicated examples as pretty-printed JSON.
with open('dataset/data_v1.json', 'w') as out_file:
    json.dump(dataset, out_file, indent=4)

In [263]:
# Count how often each turn occurs across all examples; any example that
# contains an empty turn is printed so it can be inspected.
freq_dict = {}
for sample in dataset:
    for line in sample:
        if line == '':
            print(sample)
        seen_so_far = freq_dict.setdefault(line, 0)
        freq_dict[line] = seen_so_far + 1


In [266]:
# List the turns that occur more than 100 times, together with their counts.
for line, count in freq_dict.items():
    if count <= 100:
        continue
    print(line, count)

Look at the person standing on the side. 126
[Visual Calculus]: The commandant -- the one who gives the order. Machine gun fire crackling through the air, the lights of the muzzle flashes dancing on his face... 126
What *is* this... 108
[Visual Calculus]: The abundance of bullet holes leads to two options: either an inordinate amount of executions were performed here, or they did not use a *conscience round* -- where only one soldier has the loaded rifle. Looks like this was a mass execution with everyone fully armed. 108
What? 121
Inspect the ghostly figures. 105
[Visual Calculus]: The man does not know the bullet has entered his brain. He never will. Death comes faster than the realization. 105
I have the map -- extrapolate the radius to include all of Martinaise. 107
[Visual Calculus]: According to your map of the district, this shot could have come from a wide angle of locations -- starting with the northern edge of the abandoned boardwalk, ending with an islet in the bay. Let's ca

In [238]:
# Highest occurrence count of any single turn recorded in freq_dict.
max(freq_dict.values())

1606

In [99]:
import plotly.graph_objects as go

def plot_conv(dia_table, inp_conv_id):
    """Draw one conversation as a node-link diagram.

    Every row of the conversation is a marker placed at its
    ``canvasRect_x`` / ``canvasRect_y`` position.  Each link that stays
    inside the conversation is drawn as a line whose first half and second
    half use different colours, which shows the direction of the link.

    Args:
        dia_table: table with a two-level index whose first level selects the
            conversation; needs the columns ``canvasRect_x``,
            ``canvasRect_y``, ``outgoingLinks`` and ``Dialogue Text``.
        inp_conv_id: first-level index value of the conversation to draw.

    Returns:
        The plotly figure.
    """
    canva = dia_table.loc[inp_conv_id].loc[:, ['canvasRect_x', 'canvasRect_y', 'outgoingLinks', 'Dialogue Text']]
    colors = ['#872341', '#2d98b5']
    dot_color = '#BE3144'
    bg_color = '#22092C'

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=canva['canvasRect_x'],
        y=canva['canvasRect_y'],
        text=canva.index,
        mode="markers",
        marker=dict(color=dot_color),
    ))

    # Collect all link halves first and draw them as two traces in total.
    # A None entry breaks the line, so the segments stay separate.  One
    # trace per half-link (the previous approach) makes large conversations
    # very slow to build and render.
    first_half = {'x': [], 'y': []}
    second_half = {'x': [], 'y': []}
    for _, other in canva.iterrows():
        x0 = other['canvasRect_x']
        y0 = other['canvasRect_y']
        for conv_id, dia_id in other['outgoingLinks']:
            # Only links that stay inside this conversation, and only those
            # whose target row is actually present in the table.
            if conv_id != inp_conv_id or dia_id not in canva.index:
                continue

            x2, y2 = canva.loc[dia_id][['canvasRect_x', 'canvasRect_y']]
            x1, y1 = (x0 + x2)/2, (y0 + y2)/2

            first_half['x'] += [x0, x1, None]
            first_half['y'] += [y0, y1, None]
            second_half['x'] += [x1, x2, None]
            second_half['y'] += [y1, y2, None]

    for half, color in zip((first_half, second_half), colors):
        if half['x']:
            fig.add_trace(go.Scatter(
                x=half['x'], y=half['y'], showlegend=False, hoverinfo='none',
                mode="lines", line=dict(width=0.5, color=color)))

    fig.update_layout(
        autosize=False,
        width=1500, height=1000,
        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        plot_bgcolor=bg_color, paper_bgcolor=bg_color,
    )

    # Kept from the original; this function itself adds no shapes.
    fig.update_shapes(dict(xref='x', yref='y'))
    return fig

plot_conv(convs, 998)

In [34]:
# Count the walks for every start from position 700 onwards.  The position is
# printed before the (potentially very slow) count so a hanging start can be
# identified; positions listed in `skipped` are left out.
skipped = [630, 631, ]
for idx, start in enumerate(all_starts[700:], start=700):
    if idx in skipped:
        continue
    print(idx)
    r = recursive_count(start, [])
    print(r, idx, start)


700
3 700 (71, 302)
701
1 701 (71, 314)
702
1 702 (71, 359)
703
6 703 (71, 368)
704
1 704 (71, 395)
705
1 705 (71, 406)
706
2 706 (71, 427)
707
6 707 (71, 455)
708
19 708 (71, 464)
709
4 709 (71, 487)
710
3 710 (71, 497)
711
8 711 (71, 517)
712
1 712 (71, 536)
713
4 713 (71, 537)
714
4 714 (71, 558)
715
1 715 (71, 560)
716
2 716 (71, 585)
717
1 717 (71, 598)
718
3 718 (71, 622)
719
2 719 (71, 663)
720
4 720 (71, 669)
721
4 721 (71, 681)
722
6 722 (71, 696)
723
2 723 (71, 708)
724
2 724 (71, 735)
725
3 725 (71, 748)
726
1 726 (71, 752)
727
1 727 (71, 759)
728
4 728 (71, 776)
729
4 729 (71, 777)
730
1 730 (71, 798)
731
8 731 (71, 836)
732
3 732 (71, 839)
733
8 733 (71, 879)
734
4 734 (71, 911)
735
19 735 (71, 918)
736
1 736 (71, 927)
737
8 737 (71, 945)
738
3 738 (71, 969)
739
4 739 (71, 1036)
740
19 740 (71, 1060)
741
6 741 (71, 1063)
742
4 742 (71, 1068)
743
1 743 (71, 1074)
744
4 744 (71, 1088)
745
8 745 (119, 6)
746
9 746 (119, 11)
747
3 747 (121, 2)
748
4 748 (121, 7)
749
4 749 (121

KeyboardInterrupt: 