In [None]:
#| echo: false
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp core

# main

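> Cluster attestations of the phrase *in here* by their annotated features (word forms, word classes, semantic and pragmatic categories) and summarise the resulting clusters.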

In [None]:
from inhere.src import *
from inhere.gptanalysis import *

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial import distance
from sklearn.manifold import TSNE
import altair as alt
from IPython.display import display, Markdown

## variables

In [None]:
# Columns that are not part of the embedding feature set (`cols_emb`):
# the hit number and the raw concordance text.
cols_emb_irrel = [
	'Number of hit',
	'left_context',
	'node',
	'right_context',
]

# Feature columns, grouped by kind. The positional groups cover the slots
# L2, L1 (left of the node) and R1 (right of the node).
cols_emb_forms = [f'{slot}_word' for slot in ('L2', 'L1', 'R1')]
cols_emb_wc = [f'{slot}_wc' for slot in ('L2', 'L1', 'R1')]
cols_emb_sem_prag = 'foc fig mot con van ill'.split()

In [None]:
# Full list of columns to embed: word classes first, then the
# semantic/pragmatic annotations, then the word forms.
cols_emb = [*cols_emb_wc, *cols_emb_sem_prag, *cols_emb_forms]

len(cols_emb)

12

## load data

In [None]:
# Load the annotated attestations. The frame displayed in the next cell has one
# row per hit, with its left/right context, neighbouring word forms, word
# classes and the annotation columns listed in `cols_emb_sem_prag`.
df = load_data()

In [None]:
df

Unnamed: 0,Number of hit,left_context,node,right_context,L2_word,L1_word,R1_word,R2_word,R3_word,L2_wc,L1_wc,R1_wc,foc,fig,mot,con,van,ill
0,1,swim swam and swum mm there 's not enough pepper,in here,well there 's a whole new thing of pepper we,enough,pepper,well,there,s,L2PRON,L1N,R1ADV,content,ingredient,in,dish,external,representative
1,2,in Suffolk in Suffolk yeah what another helico...,in here,? no it 's did n't see it --UNCLEARWORD was,went,down,?,no,it,L2V,L1ADV,R1PUNCT,motion,object,into,region,internal,question
2,3,in here no it was lucky you had one come,in here,did you ? yeah the dogs chased the chickens oh,one,come,did,you,?,L2PRON,L1V,R1V,motion,animal,into,building,internal,expressive
3,4,so you but if the ground is nice and moist,in here,and you 've got a moist atmosphere yeah yeah that,and,moist,and,you,ve,L2CONJ,L1ADJ,R1CONJ,quality,setting,in,garden,internal,suggestion
4,5,re going to have some permanent music set up a...,in here,and something probably er do n't need to plug it,up,absolutely,and,something,probably,L2ADV,L1ADV,R1CONJ,action,object,into,socket,external,suggestion
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,196,newspaper yeah that 'll be in the newspaper 'l...,in here,yeah we 've got loads of paper in there okay,ll,be,yeah,we,ve,L2V,L1V,R1INTERJ,location,abstract,in,document,external,prediction
196,197,cigarette in your house no no we 've been smoking,in here,we s- we just we were just sat smoking here,been,smoking,we,s-,we,L2V,L1V,R1PRON,action,speaker_pl,in,apartment,internal,representative
197,198,yeah they 're the rules --UNCLEARWORD --UNCLEA...,in here,do you want me to open that window again ?,is,hot,do,you,want,L2V,L1ADJ,R1V,quality,setting,in,room,internal,expressive
198,199,actually I open the fridge and think there 's ...,in here,I wan na eat cos I never get to go,s,nothing,I,wan,na,L2V,L1PRON,R1PRON,content,there,in,container,external,representative


## one-hot encoding

In [None]:
# One-hot encode the columns listed in `cols_emb`. The resulting frame (shown
# in the next cell) holds boolean indicator columns named `<column>_<value>`.
df_onehot = get_one_hot_encoding(df, cols_emb)

In [None]:
df_onehot

Unnamed: 0,L2_wc_L2ADJ,L2_wc_L2ADV,L2_wc_L2CONJ,L2_wc_L2DET,L2_wc_L2INTERJ,L2_wc_L2N,L2_wc_L2NEG,L2_wc_L2NUM,L2_wc_L2PREP,L2_wc_L2PRON,...,R1_word_where,R1_word_which,R1_word_while,R1_word_who,R1_word_whoops,R1_word_with,R1_word_would,R1_word_yeah,R1_word_yes,R1_word_you
0,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
196,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
198,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## dimensionality reduction with PCA

In [None]:
# Reduce the one-hot matrix to a lower-dimensional representation (PCA, per the
# section heading); the recorded result has shape (200, 50).
dimred = reduce_dimensions(df_onehot)

In [None]:
dimred.shape

(200, 50)

## clustering

### determine number of clusters

#### automatic method using Silhouette Score

In [None]:
# Automatically suggested number of clusters (silhouette score, per the heading);
# evaluates to 9 in the recorded output.
clusters_n = get_number_of_clusters(dimred)

In [None]:
clusters_n

9

#### manual method using Elbow Method

In [None]:
n_max = 20

In [None]:
make_elbow_chart(dimred, n_max)

In [None]:
# Manual choice that replaces the automatically determined value (9) from the
# silhouette step; presumably read off the elbow chart above — confirm.
clusters_n = 5

### run clustering

In [None]:
#| output: false
# Cluster the reduced vectors into `clusters_n` groups. `clusters` is stored as
# the per-row 'cluster' column below and `centers` is passed on to
# `get_cluster_distances`.
# NOTE(review): the stderr line recorded under this cell looks like
# scikit-learn's KMeans `n_init` FutureWarning; passing `n_init` explicitly
# inside `run_clustering` would silence it — confirm.
clusters, centers = run_clustering(dimred, clusters_n)

  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
# Attach each attestation's cluster label as a 'cluster' column (modifies `df` in place).
df['cluster'] = clusters

### determine distances to clusters

In [None]:
# Adds one `dist_cluster_<k>` column per cluster (see the frame displayed in the
# next cell); presumably the distance between each row's reduced vector and
# that cluster's centre — confirm against `get_cluster_distances`.
df = get_cluster_distances(df, dimred, centers)

In [None]:
df

Unnamed: 0,Number of hit,left_context,node,right_context,L2_word,L1_word,R1_word,R2_word,R3_word,L2_wc,...,mot,con,van,ill,cluster,dist_cluster_0,dist_cluster_1,dist_cluster_2,dist_cluster_3,dist_cluster_4
0,1,swim swam and swum mm there 's not enough pepper,in here,well there 's a whole new thing of pepper we,enough,pepper,well,there,s,L2PRON,...,in,dish,external,representative,0,2.574126,3.518907,3.565559,3.189650,3.082958
1,2,in Suffolk in Suffolk yeah what another helico...,in here,? no it 's did n't see it --UNCLEARWORD was,went,down,?,no,it,L2V,...,into,region,internal,question,3,3.213378,2.876518,3.561182,2.753914,3.061164
2,3,in here no it was lucky you had one come,in here,did you ? yeah the dogs chased the chickens oh,one,come,did,you,?,L2PRON,...,into,building,internal,expressive,1,3.430617,2.184031,3.517899,3.594468,3.144925
3,4,so you but if the ground is nice and moist,in here,and you 've got a moist atmosphere yeah yeah that,and,moist,and,you,ve,L2CONJ,...,in,garden,internal,suggestion,2,3.285447,3.488172,2.479260,3.765549,2.948098
4,5,re going to have some permanent music set up a...,in here,and something probably er do n't need to plug it,up,absolutely,and,something,probably,L2ADV,...,into,socket,external,suggestion,0,2.710894,3.355253,3.884228,3.467482,3.295035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,196,newspaper yeah that 'll be in the newspaper 'l...,in here,yeah we 've got loads of paper in there okay,ll,be,yeah,we,ve,L2V,...,in,document,external,prediction,0,2.830600,3.556885,3.717338,3.259911,3.098179
196,197,cigarette in your house no no we 've been smoking,in here,we s- we just we were just sat smoking here,been,smoking,we,s-,we,L2V,...,in,apartment,internal,representative,4,2.894592,2.822629,3.231568,3.452605,2.202666
197,198,yeah they 're the rules --UNCLEARWORD --UNCLEA...,in here,do you want me to open that window again ?,is,hot,do,you,want,L2V,...,in,room,internal,expressive,2,3.421285,3.538195,1.710039,3.861995,2.848420
198,199,actually I open the fridge and think there 's ...,in here,I wan na eat cos I never get to go,s,nothing,I,wan,na,L2V,...,in,container,external,representative,0,2.401980,3.652510,3.582584,3.187979,3.061696


In [None]:
# df.to_excel('../out/attestations_clustered.xlsx', index=False)

## visualise clustered attestations

In [None]:
# Run t-SNE on the reduced vectors and build a chart from the projection and
# the cluster labels (rendered in the next cell).
# NOTE(review): no random seed is set in this notebook; unless `run_tsne` and
# `run_clustering` fix one internally, the layout and the cluster ids may
# differ between runs — confirm.
df_tsne = run_tsne(dimred)
chart = plot_tsne(df_tsne, clusters)

In [None]:
chart

In [None]:
# chart.save('../out/vector_representations_clustered.png', scale_factor=2.0)

## cluster analysis

### typical examples

In [None]:
examples_n = 5

In [None]:
# Build one text section per cluster: a "cluster <id>" line, a blank line,
# the formatted examples for that cluster, and a trailing blank line.
prompt_examples = ''.join(
	f'cluster {cluster_id}\n\n'
	+ format_examples(get_examples(df, cluster_id, examples_n))
	+ '\n\n'
	for cluster_id in range(clusters_n)
)

display(Markdown(prompt_examples))

cluster 0

- in here it wo n't allow me to store anything in here right well that 's annoying so I can only put
- ? yeah just get a pair just to eh keep in here well I 've got two pairs anyway so I might
- s like four or five drawers yeah there 's four in here and they 're nice it 's a nice size one
- --ANONnameM not have one ? I 've got bigger bowls in here hang on a minute there might be something in here
- did it go ? ah users --ANONnameM my pictures probably in here what is it you 're looking for ? er that

cluster 1

- I 'm the alpaca Pandora of of the --ANONplace came in here totally see you as like a horror movie lead actually
- Helen live together and er they when I first moved in here and I was n't sure about it and I was
- see it ? oh yeah yeah --ANONnameM --ANONnameM --ANONnameM come in here where I can see you hello hello hello hello juice
- when I lived with --ANONnameM and --ANONnameF before I moved in here mm hm and um she came and everything and it
- How long ago ? last year just before I moved in here oh right nah I were n't there then in the

cluster 2

- last night than yeah the night before it is cooler in here like like I think it was really hot last night
- it 's too hot in here it 's getting hot in here so hot so let 's eat Christmas dinner erm I
- some for the hat have another one getting quite noisy in here now is n't it let 's go for a walk
- of an inkling of of apprehensiveness but it 's busy in here look at this why ? cos it 's the holidays
- just gone in the kitchen my goodness it is dark in here is n't it without the lights ? wow let there

cluster 3

- was erm why wo n't this why will nothing fit in here ? Where is the knife ? --UNCLEARWORD to do that
- ca n't think for the life of me it was in here ? yeah I looked in all the cupboards just in
- here look Fontana and Vrisar nothing about about what 's in here ? well Porec centre did you see that ? no
- shopping is n't it ? yeah yeah oh what 's in here ? ah or in Tescos he does n't --ANONnameM does
- slippers come from oh yeah what else have we got in here ? this is all the little baby turtles that we

cluster 4

- I bet we 've got hundreds ca n't see any in here although well you know at the risk of sounding racist
- in the other room cos we t- we 're talking in here oh no well we 'll talk in here well then
- n't you so yeah but erm no point leaving it in here there 'll be some money I 'm sure stuff and
- bit there and um last week when --ANONnameM was sleeping in here they started playing for some reason started playing really loud
- the table in here cos we 're gon na eat in here yes I think so is that alright ? yeah --UNCLEARWORD



### analysis by GPT

#### prompt

In [None]:
#| echo: false
# Read the system prompt used for the GPT cluster analysis and render it as
# Markdown. The encoding is given explicitly so the text is decoded the same
# way on every platform instead of depending on the locale default.
with open('../data/gpt_prompt_system.txt', 'r', encoding='utf-8') as f:
	prompt_system = f.read()

display(Markdown(prompt_system))

You work as a corpuslinguistic annotator for a research project in linguistics.
The data are from the BNC 2014 spoken corpus and contain utterances that feature the phrase 'in here'.
These utterances were classified into several clusters
based on several features of the attested utterances: word forms, word classes, and several semantic and pragmatic features.
The assumption is that these clusters represent different types of uses of the phrase 'in here'
and that they show differences with regard to linguistic regularities underlying the use of 'in here'.
Your job is to summarise the regularities of each cluster,
and to summarise how these clusters cover the space of usages of the phrase 'in here'.

#### analysis

In [None]:
#| echo: false
# Stored GPT analysis to display.
# NOTE(review): the "gpt-4_5" suffix presumably encodes the model and the
# number of clusters (clusters_n = 5) — confirm, and keep it in sync if
# clusters_n changes.
analysis_f = '../out/gpt_cluster_analysis_gpt-4_5.txt'

# read text file; the explicit encoding avoids depending on the platform's
# locale default (assumes the file was written as UTF-8 — confirm against the
# code that produced it)
with open(analysis_f, 'r', encoding='utf-8') as file:
	analysis = file.read()

display(Markdown(analysis))

Cluster 0: This cluster seems to represent uses of 'in here' where the phrase refers to a specific location or container where items can be stored or found. The phrase is often used in the context of searching for something or discussing the placement of objects.

Cluster 1: This cluster represents uses of 'in here' in the context of moving or coming into a place. The phrase is often used to describe someone's arrival or relocation to a new place, such as moving into a new house or entering a room.

Cluster 2: This cluster represents uses of 'in here' where the phrase is used to describe the conditions or atmosphere of a place, such as its temperature or noise level. The phrase is often used in the context of commenting on these conditions.

Cluster 3: This cluster represents uses of 'in here' where the phrase is used in questions or exclamations about what is inside a particular place or container. The phrase is often used in the context of discovering or identifying what is inside.

Cluster 4: This cluster represents uses of 'in here' where the phrase is used to refer to a specific location where activities are taking place or are planned to take place. The phrase is often used in the context of discussing these activities.

These clusters cover a range of usages of 'in here', from referring to a specific location or container, to describing the conditions of a place, to discussing activities taking place in a location. They show that 'in here' can be used in a variety of contexts and with different meanings depending on the situation.