# Loading text pre-processing model
* https://github.com/UKPLab/sentence-transformers
* https://arxiv.org/abs/1908.10084

In [None]:
!pip install git+https://github.com/rmarcacini/sentence-transformers
!pip install gdown
!gdown https://drive.google.com/uc?id=1NV5t1YhyyOzMF5zAovfbSLdZZLvqrfZ_
!unzip distiluse-base-multilingual-cased.zip -d language_model
from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging

np.set_printoptions(threshold=100)
logging.basicConfig(format='%(asctime)s - %(message)s',datefmt='%Y-%m-%d %H:%M:%S',level=logging.INFO,handlers=[LoggingHandler()])

language_model = SentenceTransformer('./language_model')

Collecting git+https://github.com/rmarcacini/sentence-transformers
  Cloning https://github.com/rmarcacini/sentence-transformers to /tmp/pip-req-build-pe30i3uy
  Running command git clone -q https://github.com/rmarcacini/sentence-transformers /tmp/pip-req-build-pe30i3uy
Collecting transformers<3.2.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 7.1MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 33.4MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/s

# Loading requirement candidates

In [None]:
!gdown --id 1O-R3Zz9li8SYDcFumPsDN9txA3L2QixG

Downloading...
From: https://drive.google.com/uc?id=1O-R3Zz9li8SYDcFumPsDN9txA3L2QixG
To: /content/requirement.csv
0.00B [00:00, ?B/s]2.57MB [00:00, 81.7MB/s]


In [None]:
import pandas as pd

# Read the downloaded requirement candidates and preview the resulting frame.
df_data = pd.read_csv(filepath_or_buffer='requirement.csv')
df_data

Unnamed: 0,requirement_text
0,bought
1,support group orders with individual payments
2,cook more
3,process credit cards payments
4,cash
...,...
149630,intolerance
149631,Pooorer
149632,contraversial
149633,Worst.spoiledfood


# Generating a feature vector for each requirement candidate

In [None]:
# Encode every requirement text with the language model and keep one vector
# per row in a new 'embedding' column.
requirement_texts = df_data['requirement_text'].tolist()
df_data['embedding'] = list(language_model.encode(requirement_texts))

HBox(children=(FloatProgress(value=0.0, description='Batches', max=4677.0, style=ProgressStyle(description_wid…




In [None]:
# Preview the frame, which now carries one 'embedding' vector per requirement.
df_data

Unnamed: 0,requirement_text,embedding
0,bought,"[0.037175164, 0.02746418, -0.027728138, 0.0300..."
1,support group orders with individual payments,"[-0.06067936, -0.04864118, -0.0040709833, -0.0..."
2,cook more,"[0.011025423, -0.014061921, 0.009416151, -0.01..."
3,process credit cards payments,"[-0.068453215, 0.0036510304, 0.0064596976, 0.0..."
4,cash,"[0.044530906, -0.011020444, -0.049469605, -0.0..."
...,...,...
149630,intolerance,"[-0.018483961, -0.049156286, -0.00084916595, -..."
149631,Pooorer,"[0.039778505, -0.035537004, -0.036012214, -0.0..."
149632,contraversial,"[-0.025207547, -0.04103427, -0.056060143, -0.0..."
149633,Worst.spoiledfood,"[-0.0016031228, 0.010851361, 0.05258863, 0.016..."


# Grouping candidates based on feature vector

* https://dl.acm.org/doi/book/10.5555/3235282 (book chapter)
* https://www.researchgate.net/publication/268289418_A_Survey_of_Text_Clustering_Algorithms/link/55c5667a08aea2d9bdc39b6b/download 

In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_samples
import numpy as np

# Stack the per-row embedding vectors into a single 2-D matrix.
X = np.array(df_data['embedding'].tolist())

# Mini-batch k-means with 300 clusters, random initialisation and a fixed seed.
kmeans = MiniBatchKMeans(
    n_clusters=300,
    init='random',
    batch_size=1000,
    max_no_improvement=1000,
    random_state=0,
    verbose=1,
)
kmeans.fit(X)

# Record the cluster assigned to each requirement candidate.
df_data['cluster'] = kmeans.labels_

Init 1/3 with method: random
Inertia for init 1/3: 1126.972290
Init 2/3 with method: random
Inertia for init 2/3: 1110.588623
Init 3/3 with method: random
Inertia for init 3/3: 1122.899902
Minibatch iteration 1/15000: mean batch inertia: 0.432621, ewa inertia: 0.432621 
Minibatch iteration 2/15000: mean batch inertia: 0.418971, ewa inertia: 0.432439 
Minibatch iteration 3/15000: mean batch inertia: 0.404777, ewa inertia: 0.432069 
Minibatch iteration 4/15000: mean batch inertia: 0.400175, ewa inertia: 0.431643 
Minibatch iteration 5/15000: mean batch inertia: 0.399089, ewa inertia: 0.431208 
Minibatch iteration 6/15000: mean batch inertia: 0.402014, ewa inertia: 0.430817 
Minibatch iteration 7/15000: mean batch inertia: 0.388100, ewa inertia: 0.430246 
Minibatch iteration 8/15000: mean batch inertia: 0.391809, ewa inertia: 0.429733 
Minibatch iteration 9/15000: mean batch inertia: 0.394927, ewa inertia: 0.429267 
[MiniBatchKMeans] Reassigning 32 cluster centers.
Minibatch iteration 10/

In [None]:
# Preview the frame together with the 'cluster' label assigned to each row.
df_data

Unnamed: 0,requirement_text,embedding,cluster
0,bought,"[0.037175164, 0.02746418, -0.027728138, 0.0300...",60
1,support group orders with individual payments,"[-0.06067936, -0.04864118, -0.0040709833, -0.0...",2
2,cook more,"[0.011025423, -0.014061921, 0.009416151, -0.01...",31
3,process credit cards payments,"[-0.068453215, 0.0036510304, 0.0064596976, 0.0...",47
4,cash,"[0.044530906, -0.011020444, -0.049469605, -0.0...",178
...,...,...,...
149630,intolerance,"[-0.018483961, -0.049156286, -0.00084916595, -...",89
149631,Pooorer,"[0.039778505, -0.035537004, -0.036012214, -0.0...",67
149632,contraversial,"[-0.025207547, -0.04103427, -0.056060143, -0.0...",77
149633,Worst.spoiledfood,"[-0.0016031228, 0.010851361, 0.05258863, 0.016...",220


In [None]:
# Show the requirement candidates that were assigned to cluster 47.
df_data.loc[df_data['cluster'] == 47]

2021-06-18 17:36:34 - NumExpr defaulting to 2 threads.


Unnamed: 0,requirement_text,embedding,cluster
3,process credit cards payments,"[-0.068453215, 0.0036510304, 0.0064596976, 0.0...",47
118,credit card processing,"[-0.05089237, 0.009707626, -0.010439861, 0.026...",47
181,card,"[-0.03323663, -0.0034145599, -0.03726258, 0.00...",47
188,debit card,"[-0.026913177, -0.02715102, -0.023486247, 0.02...",47
233,credit card,"[-0.041877277, -0.00074380304, -0.024592008, 0...",47
...,...,...,...
148122,hack your card,"[-0.026397306, 0.006570547, -0.032916613, 0.02...",47
148680,show card option,"[0.005101489, 0.038082905, 0.0007375727, 0.074...",47
148681,shows card options,"[0.011716591, 0.027970335, 0.0015033563, 0.080...",47
148926,payments through food cards,"[-0.09067322, -0.02396723, -0.0015926573, -0.0...",47


In [None]:
# Show the requirement candidates that were assigned to cluster 0.
df_data.loc[df_data['cluster'] == 0]

Unnamed: 0,requirement_text,embedding,cluster
683,update my contact info,"[-0.08187661, 0.042463683, 0.0079845, 0.074037...",0
735,update the details,"[-0.09368116, 0.04675122, -0.024632866, 0.0907...",0
953,update my profile,"[-0.06370778, 0.053309686, -0.02890365, 0.0967...",0
1046,update contact info,"[-0.0848206, 0.042936467, 0.013918944, 0.08375...",0
1073,update contact details,"[-0.10833922, 0.05706967, 0.0029367208, 0.0754...",0
...,...,...,...
143324,update its information,"[-0.054805808, 0.0077626314, -0.0113326665, 0....",0
145100,updated on restaurant information,"[-0.09538414, 0.0037518688, 0.022917407, 0.064...",0
145348,update correct contact details,"[-0.10763693, 0.03326906, 0.013415819, 0.03858...",0
146139,updating all my bookmarks,"[0.018984891, 0.038653534, -0.059202563, 0.030...",0


In [None]:
# Show the requirement candidates that were assigned to cluster 220.
df_data.loc[df_data['cluster'] == 220]

Unnamed: 0,requirement_text,embedding,cluster
1120,saying rude words,"[0.008787694, -0.011050601, -0.008847884, 0.02...",220
3447,anger boosting,"[-0.015631177, 0.018261738, -0.043654673, 0.00...",220
3806,SickMessagingFeature,"[-0.030806001, 0.015994329, 0.018026613, 0.073...",220
3832,Commit suicide,"[0.0023929467, -0.03302312, -0.022682564, -0.0...",220
3958,translate malay to English,"[-0.030080566, -0.051196765, 0.015247984, 0.04...",220
...,...,...,...
149445,Disturbance Aap,"[0.030520817, 0.015248947, -0.021817314, 0.037...",220
149471,Animal abuse,"[0.0142560145, -0.018439716, 0.092928335, -0.0...",220
149472,BoycottChina,"[-0.0003806208, -0.012572662, -0.005591418, 0....",220
149633,Worst.spoiledfood,"[-0.0016031344, 0.010851323, 0.052588638, 0.01...",220


## Identifying well-placed candidates in their cluster

* https://en.wikipedia.org/wiki/Silhouette_(clustering)



In [None]:
import numpy as np
from sklearn.metrics import silhouette_samples

# Per-sample silhouette coefficient of each candidate with respect to its
# k-means cluster, stored alongside the cluster label.
df_data['silhouette'] = silhouette_samples(X, labels=kmeans.labels_)

In [None]:
# Preview the frame together with the per-row 'silhouette' score.
df_data

Unnamed: 0,requirement_text,embedding,cluster,silhouette
0,bought,"[0.037175164, 0.02746418, -0.027728138, 0.0300...",60,0.168153
1,support group orders with individual payments,"[-0.06067936, -0.04864118, -0.0040709833, -0.0...",2,-0.000717
2,cook more,"[0.011025423, -0.014061921, 0.009416151, -0.01...",31,0.115923
3,process credit cards payments,"[-0.068453215, 0.0036510304, 0.0064596976, 0.0...",47,0.072541
4,cash,"[0.044530906, -0.011020444, -0.049469605, -0.0...",178,0.103650
...,...,...,...,...
149630,intolerance,"[-0.018483961, -0.049156286, -0.00084916595, -...",89,-0.089331
149631,Pooorer,"[0.039778505, -0.035537004, -0.036012214, -0.0...",67,0.070369
149632,contraversial,"[-0.025207547, -0.04103427, -0.056060143, -0.0...",77,-0.016629
149633,Worst.spoiledfood,"[-0.0016031228, 0.010851361, 0.05258863, 0.016...",220,-0.119437


In [None]:
# Rank the members of cluster 0 by silhouette, best first
# (the top row is a candidate for naming the cluster).
df_data.loc[df_data['cluster'] == 0].sort_values('silhouette', ascending=False)

Unnamed: 0,requirement_text,embedding,cluster,silhouette
21021,me update my profile info,"[-0.059015255, 0.039536573, -0.024006462, 0.09...",0,0.139901
51741,update my account information,"[-0.071642414, 0.029500071, 0.015441682, 0.076...",0,0.135037
109585,telling me to update my billing information,"[-0.05916071, 0.027630934, 0.0034455154, 0.067...",0,0.133773
51221,update me billing info,"[-0.06451203, 9.2335395e-06, -0.004199724, 0.0...",0,0.133121
57937,update my billing settings,"[-0.07343577, 0.032138404, 0.011183555, 0.0634...",0,0.132948
...,...,...,...,...
56696,update my own credit card details,"[-0.110442355, 0.02671429, -0.016706113, 0.056...",0,-0.060873
1471,update contact number,"[-0.09658992, 0.030962607, -0.036351435, 0.049...",0,-0.061498
52990,update my credit card details,"[-0.09660519, 0.031507492, -0.022618154, 0.068...",0,-0.067565
117503,update phone number in profile,"[-0.104245074, 0.03133098, -0.054412168, 0.056...",0,-0.068673


In [None]:
# Rank the members of cluster 47 by silhouette, best first
# (the top row is a candidate for naming the cluster).
df_data.loc[df_data['cluster'] == 47].sort_values('silhouette', ascending=False)

Unnamed: 0,requirement_text,embedding,cluster,silhouette
66260,Payment cards,"[-0.039152864, -0.03024029, -0.052215483, 0.04...",47,0.145860
1912,credit cards,"[-0.027322367, -0.011047636, -0.035997145, 0.0...",47,0.144391
106801,Credit Cards,"[-0.02143452, 0.0074433074, -0.037838705, 0.04...",47,0.142757
23059,Credit cards,"[-0.019102467, 0.002080985, -0.04395124, 0.031...",47,0.142380
39648,creditcards,"[-0.024997879, -0.006144399, -0.04340581, 0.00...",47,0.142368
...,...,...,...,...
72752,deduct from debit card,"[-0.049438298, -0.04727059, -0.022030044, 0.03...",47,-0.057574
33574,Order through card,"[-0.024310656, -0.0153507, 0.029571172, 0.0688...",47,-0.057681
42675,auto goes off your card,"[0.02292111, -0.032058626, -0.056788508, -0.00...",47,-0.062883
106809,clears card,"[-0.04509657, 0.010905642, -0.015270534, 0.063...",47,-0.075889


In [None]:
# Persist text, cluster label and silhouette score (the row index is written too).
df_data.to_csv('requirements_clusters.csv', columns=['requirement_text', 'cluster', 'silhouette'])

## Extracting embeddings

In [None]:
!gdown --id 1uuKUhOp68tH5yeUq5J4rjcIvjGq6PgCR

Downloading...
From: https://drive.google.com/uc?id=1uuKUhOp68tH5yeUq5J4rjcIvjGq6PgCR
To: /content/requirements_clusters-2.csv
5.79MB [00:00, 26.4MB/s]


In [None]:
import pandas as pd

# The file was exported together with its row index, so its first column is
# that saved index. Reading it with index_col=0 restores it as the index
# instead of leaving a stray 'Unnamed: 0' data column in the frame.
df_data = pd.read_csv('requirements_clusters-2.csv', index_col=0)
df_data

Unnamed: 0.1,Unnamed: 0,requirement_text,cluster,silhouette
0,0,bought,60,0.168153
1,1,support group orders with individual payments,2,-0.000717
2,2,cook more,31,0.115923
3,3,process credit cards payments,47,0.072541
4,4,cash,178,0.103650
...,...,...,...,...
149630,149630,intolerance,89,-0.089331
149631,149631,Pooorer,67,0.070369
149632,149632,contraversial,77,-0.016629
149633,149633,Worst.spoiledfood,220,-0.119437


In [None]:
# Re-compute the embedding of every requirement text for the reloaded frame,
# then preview the result.
requirement_texts = df_data['requirement_text'].tolist()
df_data['embedding'] = list(language_model.encode(requirement_texts))
df_data

HBox(children=(FloatProgress(value=0.0, description='Batches', max=4677.0, style=ProgressStyle(description_wid…




Unnamed: 0.1,Unnamed: 0,requirement_text,cluster,silhouette,embedding
0,0,bought,60,0.168153,"[0.037175164, 0.02746418, -0.027728138, 0.0300..."
1,1,support group orders with individual payments,2,-0.000717,"[-0.06067936, -0.04864118, -0.0040709833, -0.0..."
2,2,cook more,31,0.115923,"[0.011025423, -0.014061921, 0.009416151, -0.01..."
3,3,process credit cards payments,47,0.072541,"[-0.068453215, 0.0036510304, 0.0064596976, 0.0..."
4,4,cash,178,0.103650,"[0.044530906, -0.011020444, -0.049469605, -0.0..."
...,...,...,...,...,...
149630,149630,intolerance,89,-0.089331,"[-0.018483961, -0.049156286, -0.00084916595, -..."
149631,149631,Pooorer,67,0.070369,"[0.039778505, -0.035537004, -0.036012214, -0.0..."
149632,149632,contraversial,77,-0.016629,"[-0.025207547, -0.04103427, -0.056060143, -0.0..."
149633,149633,Worst.spoiledfood,220,-0.119437,"[-0.0016031228, 0.010851361, 0.05258863, 0.016..."


In [None]:
# Expand the 'embedding' column into a table with one column per vector
# component (rows stay in the same order as df_data).
df_emb = pd.DataFrame(data=np.array(df_data['embedding'].tolist()))
df_emb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,0.037175,0.027464,-0.027728,0.030051,-0.047393,-0.070054,-0.048226,-0.001437,-0.070056,0.002270,-0.001362,0.036649,-0.072419,-0.007883,0.068679,-0.002946,0.012186,0.038906,-0.021418,-0.023533,-0.009805,0.013207,0.007916,0.020905,0.006889,-0.027638,0.100307,-0.021532,-0.022688,-0.013657,-0.038405,0.022515,0.036805,-0.072783,0.041203,-0.096467,-0.034794,-0.027598,-0.009310,-0.018149,...,-0.069107,-0.002569,0.032430,0.048271,0.018485,0.032876,0.019514,0.040756,-0.003705,-0.008199,0.086558,0.004860,-0.072653,0.001134,-0.056511,0.021771,0.050127,-0.023038,-0.031581,0.003559,0.075101,0.016607,-0.007554,-0.005609,-0.006435,-0.028097,0.089207,-0.042285,-0.007975,-0.018953,0.114981,0.002194,0.026171,-0.014171,-0.065693,-0.061732,0.005795,-0.047110,0.029597,-0.036436
1,-0.060679,-0.048641,-0.004071,-0.040379,0.098232,-0.038001,-0.094894,0.035263,-0.023258,0.002307,0.002671,0.054337,0.024883,-0.025773,0.040958,0.043604,-0.010058,0.090123,-0.021185,-0.021610,0.044164,0.025283,0.002482,-0.031927,-0.065900,0.044052,0.003338,0.035657,-0.017699,-0.024205,0.025365,-0.045748,-0.013884,0.055764,0.048973,0.010396,0.011026,0.040924,-0.085098,-0.062968,...,-0.000794,-0.066317,0.060900,0.039245,-0.076814,-0.049168,-0.084025,0.002961,0.055662,-0.020807,-0.022711,-0.072616,-0.013577,-0.057921,0.015732,0.061590,-0.009314,0.058921,0.048101,0.009812,0.004249,-0.005354,0.037686,-0.091458,-0.090559,-0.039749,-0.022394,-0.017809,0.008167,-0.031706,0.002936,0.030972,-0.063914,-0.004490,0.022693,-0.013712,-0.009632,-0.001394,0.015019,0.023095
2,0.011025,-0.014062,0.009416,-0.019354,0.022201,0.047040,-0.028377,-0.052779,0.035758,-0.006486,-0.012642,0.019980,0.085202,-0.026478,0.064787,-0.022867,-0.093834,0.007117,-0.000794,-0.044362,-0.054917,-0.082330,-0.126892,0.061318,0.065384,-0.022290,-0.008654,-0.050452,0.027036,0.037346,-0.028986,0.056518,0.025427,-0.096483,0.023784,-0.051210,0.033010,0.012809,-0.004020,-0.004697,...,-0.019233,0.069105,0.039556,0.096254,-0.078774,0.039580,-0.001324,0.013485,-0.030752,-0.027498,0.041125,-0.028859,0.004331,0.020720,0.024748,-0.026894,0.062968,0.041143,0.013779,-0.000033,0.081710,0.007116,0.043038,0.031978,-0.005041,0.037950,0.032439,-0.071458,0.010836,0.040630,0.028821,-0.059694,0.008614,0.039775,0.050134,-0.067006,-0.151899,-0.036825,0.057874,0.057099
3,-0.068453,0.003651,0.006460,0.017188,0.028796,-0.048799,-0.051305,-0.001819,0.000649,0.033372,0.020469,0.067580,0.103719,-0.006749,-0.014027,0.060040,0.024211,-0.004119,-0.042480,-0.014482,0.063062,0.005483,-0.048422,-0.026761,-0.007421,0.032666,0.084156,0.002658,-0.056527,-0.053676,0.004661,-0.024201,0.048757,-0.040491,0.020864,0.037376,0.031301,0.003328,-0.079259,-0.014873,...,-0.054736,-0.070433,0.001755,-0.045277,-0.058274,0.015089,-0.016516,0.003814,-0.036066,-0.002048,0.029648,-0.073664,0.004622,0.019941,-0.067286,0.037086,-0.006820,-0.041382,-0.021345,0.053043,0.054233,0.038433,-0.017690,-0.039641,-0.012975,0.009827,0.038023,-0.035459,0.030627,-0.049873,0.033656,-0.027018,-0.003397,0.003600,0.028135,-0.012573,-0.053834,0.036301,0.007382,0.016326
4,0.044531,-0.011020,-0.049470,-0.040506,-0.046790,0.021136,0.013163,-0.032344,0.004080,-0.026158,-0.010939,0.022662,-0.002003,0.035570,0.061825,-0.019050,-0.022739,-0.020602,-0.032855,-0.037949,0.058830,0.046838,0.041907,-0.027975,0.029107,0.000487,0.056961,0.019094,0.038082,-0.037195,-0.038054,0.035368,-0.010014,-0.081853,0.019739,-0.097666,0.000508,0.000760,-0.006496,-0.011442,...,-0.036261,0.026595,-0.054722,0.015490,-0.062024,-0.027291,-0.009883,0.019979,0.001749,-0.040412,0.059668,-0.025110,0.033579,-0.038713,-0.076423,0.013667,-0.023482,0.007390,0.004022,0.007923,0.063995,0.026617,-0.002904,-0.022977,-0.018080,0.008585,0.044995,-0.062632,0.001849,-0.069008,0.046255,-0.023535,0.042486,0.001668,-0.031994,-0.054052,0.018224,0.013564,0.055970,-0.034986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149630,-0.018484,-0.049156,-0.000849,-0.054817,-0.048387,-0.013722,0.023008,-0.029675,-0.050128,-0.052191,-0.009717,-0.038449,-0.027749,0.029062,0.074778,-0.021396,0.025763,-0.013225,0.039303,0.012193,0.000634,0.068853,-0.033149,0.011424,0.034955,0.042319,0.003295,0.006819,-0.026094,0.034176,-0.056342,-0.015630,0.106434,-0.045502,-0.049552,-0.051986,-0.085817,0.012364,-0.041681,0.007661,...,-0.032851,0.037852,-0.041441,0.020422,0.071789,0.019192,0.038211,0.022719,0.012158,-0.034733,0.058715,-0.016720,0.062901,-0.010707,-0.007039,0.005519,0.048040,0.031017,0.043300,-0.016154,0.101325,-0.011819,-0.000565,0.030971,0.112263,-0.007325,-0.004161,-0.066374,0.005172,-0.027777,0.024856,-0.060365,0.016707,-0.053605,-0.058744,-0.095968,-0.038758,-0.029216,0.005998,-0.023093
149631,0.039779,-0.035537,-0.036012,-0.033651,-0.047782,0.025303,-0.002836,0.007823,-0.003405,-0.036009,-0.069414,0.042002,-0.029474,0.006188,0.018274,-0.031485,0.016871,-0.012742,-0.051170,-0.012062,-0.013091,0.032760,-0.002508,-0.021013,0.007952,0.041338,-0.041053,-0.021885,-0.029067,-0.003986,0.009336,0.046160,-0.019685,-0.047850,0.042604,-0.031931,-0.056986,-0.038344,-0.026619,-0.051486,...,-0.028152,-0.052940,-0.024679,0.047330,0.010449,0.007216,-0.011872,0.035838,-0.028513,-0.030567,0.072511,-0.022158,0.005312,0.039146,-0.005835,0.030435,-0.027914,0.019715,0.067171,-0.029798,0.020346,0.028298,0.004345,-0.026494,-0.023175,0.003068,-0.035819,-0.072873,0.011877,-0.049448,0.008576,-0.026451,0.013218,0.056798,-0.060023,-0.054391,-0.017503,-0.009183,0.019589,0.017529
149632,-0.025208,-0.041034,-0.056060,-0.023150,-0.015840,-0.014752,0.009115,-0.030182,-0.074770,-0.030316,-0.022036,0.010807,-0.060686,0.017356,0.059653,-0.015520,0.050742,-0.020034,0.052818,0.023773,-0.042634,0.052303,0.007742,0.016913,-0.017197,0.045094,0.023365,0.013420,-0.060261,-0.007663,-0.033955,-0.003240,0.076460,-0.019632,0.001992,-0.055191,-0.002186,0.022240,-0.052899,0.067163,...,-0.045233,0.034206,0.015131,0.015748,0.000423,-0.009115,0.005732,0.025697,-0.023642,0.034977,0.069191,-0.054909,-0.007919,-0.031399,-0.023926,-0.012079,0.028864,-0.022491,0.005206,0.023530,0.086827,0.020746,0.045602,-0.018643,0.011255,0.039650,0.002889,-0.054751,-0.055191,0.015846,0.001878,-0.036434,-0.046930,-0.036119,-0.046080,-0.075927,0.006719,0.015441,-0.013442,-0.025400
149633,-0.001603,0.010851,0.052589,0.016712,-0.017911,0.050226,0.033304,-0.061428,-0.012530,-0.045173,0.014461,-0.005862,-0.050276,0.018407,0.010774,-0.057747,0.009534,0.020150,-0.014071,-0.011371,0.024168,-0.011919,-0.102264,0.102306,0.057068,-0.025451,-0.054290,-0.006554,-0.024336,-0.005590,-0.015202,0.013591,-0.018319,-0.078041,-0.030222,-0.060492,0.025174,-0.031989,-0.008778,0.031137,...,0.057659,-0.039231,-0.031609,0.038796,-0.039224,0.020805,0.061053,-0.010594,-0.040798,-0.041567,0.069025,-0.005238,-0.027715,0.012435,0.034885,0.004727,0.054808,0.004202,0.005023,0.029684,0.062595,-0.004952,0.009947,0.004474,0.027087,0.123462,0.002589,-0.057539,-0.037152,-0.015662,0.024820,0.018721,-0.010388,0.007939,0.013123,-0.029385,-0.120131,-0.049186,0.019976,0.041061


In [None]:
# Render every embedding component as a fixed 3-decimal string.
# Casting to float first keeps this cell re-runnable: after one run the values
# are already strings, and applying the '.3f' format to a str raises ValueError.
# Series.map is applied column by column instead of DataFrame.applymap, which
# newer pandas releases deprecate.
cols = df_emb.columns
df_emb[cols] = df_emb[cols].astype(float).apply(lambda col: col.map('{0:.3f}'.format))

In [None]:
# Preview the embedding table after its values were formatted as 3-decimal strings.
df_emb

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,0.037,0.027,-0.028,0.030,-0.047,-0.070,-0.048,-0.001,-0.070,0.002,-0.001,0.037,-0.072,-0.008,0.069,-0.003,0.012,0.039,-0.021,-0.024,-0.010,0.013,0.008,0.021,0.007,-0.028,0.100,-0.022,-0.023,-0.014,-0.038,0.023,0.037,-0.073,0.041,-0.096,-0.035,-0.028,-0.009,-0.018,...,-0.069,-0.003,0.032,0.048,0.018,0.033,0.020,0.041,-0.004,-0.008,0.087,0.005,-0.073,0.001,-0.057,0.022,0.050,-0.023,-0.032,0.004,0.075,0.017,-0.008,-0.006,-0.006,-0.028,0.089,-0.042,-0.008,-0.019,0.115,0.002,0.026,-0.014,-0.066,-0.062,0.006,-0.047,0.030,-0.036
1,-0.061,-0.049,-0.004,-0.040,0.098,-0.038,-0.095,0.035,-0.023,0.002,0.003,0.054,0.025,-0.026,0.041,0.044,-0.010,0.090,-0.021,-0.022,0.044,0.025,0.002,-0.032,-0.066,0.044,0.003,0.036,-0.018,-0.024,0.025,-0.046,-0.014,0.056,0.049,0.010,0.011,0.041,-0.085,-0.063,...,-0.001,-0.066,0.061,0.039,-0.077,-0.049,-0.084,0.003,0.056,-0.021,-0.023,-0.073,-0.014,-0.058,0.016,0.062,-0.009,0.059,0.048,0.010,0.004,-0.005,0.038,-0.091,-0.091,-0.040,-0.022,-0.018,0.008,-0.032,0.003,0.031,-0.064,-0.004,0.023,-0.014,-0.010,-0.001,0.015,0.023
2,0.011,-0.014,0.009,-0.019,0.022,0.047,-0.028,-0.053,0.036,-0.006,-0.013,0.020,0.085,-0.026,0.065,-0.023,-0.094,0.007,-0.001,-0.044,-0.055,-0.082,-0.127,0.061,0.065,-0.022,-0.009,-0.050,0.027,0.037,-0.029,0.057,0.025,-0.096,0.024,-0.051,0.033,0.013,-0.004,-0.005,...,-0.019,0.069,0.040,0.096,-0.079,0.040,-0.001,0.013,-0.031,-0.027,0.041,-0.029,0.004,0.021,0.025,-0.027,0.063,0.041,0.014,-0.000,0.082,0.007,0.043,0.032,-0.005,0.038,0.032,-0.071,0.011,0.041,0.029,-0.060,0.009,0.040,0.050,-0.067,-0.152,-0.037,0.058,0.057
3,-0.068,0.004,0.006,0.017,0.029,-0.049,-0.051,-0.002,0.001,0.033,0.020,0.068,0.104,-0.007,-0.014,0.060,0.024,-0.004,-0.042,-0.014,0.063,0.005,-0.048,-0.027,-0.007,0.033,0.084,0.003,-0.057,-0.054,0.005,-0.024,0.049,-0.040,0.021,0.037,0.031,0.003,-0.079,-0.015,...,-0.055,-0.070,0.002,-0.045,-0.058,0.015,-0.017,0.004,-0.036,-0.002,0.030,-0.074,0.005,0.020,-0.067,0.037,-0.007,-0.041,-0.021,0.053,0.054,0.038,-0.018,-0.040,-0.013,0.010,0.038,-0.035,0.031,-0.050,0.034,-0.027,-0.003,0.004,0.028,-0.013,-0.054,0.036,0.007,0.016
4,0.045,-0.011,-0.049,-0.041,-0.047,0.021,0.013,-0.032,0.004,-0.026,-0.011,0.023,-0.002,0.036,0.062,-0.019,-0.023,-0.021,-0.033,-0.038,0.059,0.047,0.042,-0.028,0.029,0.000,0.057,0.019,0.038,-0.037,-0.038,0.035,-0.010,-0.082,0.020,-0.098,0.001,0.001,-0.006,-0.011,...,-0.036,0.027,-0.055,0.015,-0.062,-0.027,-0.010,0.020,0.002,-0.040,0.060,-0.025,0.034,-0.039,-0.076,0.014,-0.023,0.007,0.004,0.008,0.064,0.027,-0.003,-0.023,-0.018,0.009,0.045,-0.063,0.002,-0.069,0.046,-0.024,0.042,0.002,-0.032,-0.054,0.018,0.014,0.056,-0.035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149630,-0.018,-0.049,-0.001,-0.055,-0.048,-0.014,0.023,-0.030,-0.050,-0.052,-0.010,-0.038,-0.028,0.029,0.075,-0.021,0.026,-0.013,0.039,0.012,0.001,0.069,-0.033,0.011,0.035,0.042,0.003,0.007,-0.026,0.034,-0.056,-0.016,0.106,-0.046,-0.050,-0.052,-0.086,0.012,-0.042,0.008,...,-0.033,0.038,-0.041,0.020,0.072,0.019,0.038,0.023,0.012,-0.035,0.059,-0.017,0.063,-0.011,-0.007,0.006,0.048,0.031,0.043,-0.016,0.101,-0.012,-0.001,0.031,0.112,-0.007,-0.004,-0.066,0.005,-0.028,0.025,-0.060,0.017,-0.054,-0.059,-0.096,-0.039,-0.029,0.006,-0.023
149631,0.040,-0.036,-0.036,-0.034,-0.048,0.025,-0.003,0.008,-0.003,-0.036,-0.069,0.042,-0.029,0.006,0.018,-0.031,0.017,-0.013,-0.051,-0.012,-0.013,0.033,-0.003,-0.021,0.008,0.041,-0.041,-0.022,-0.029,-0.004,0.009,0.046,-0.020,-0.048,0.043,-0.032,-0.057,-0.038,-0.027,-0.051,...,-0.028,-0.053,-0.025,0.047,0.010,0.007,-0.012,0.036,-0.029,-0.031,0.073,-0.022,0.005,0.039,-0.006,0.030,-0.028,0.020,0.067,-0.030,0.020,0.028,0.004,-0.026,-0.023,0.003,-0.036,-0.073,0.012,-0.049,0.009,-0.026,0.013,0.057,-0.060,-0.054,-0.018,-0.009,0.020,0.018
149632,-0.025,-0.041,-0.056,-0.023,-0.016,-0.015,0.009,-0.030,-0.075,-0.030,-0.022,0.011,-0.061,0.017,0.060,-0.016,0.051,-0.020,0.053,0.024,-0.043,0.052,0.008,0.017,-0.017,0.045,0.023,0.013,-0.060,-0.008,-0.034,-0.003,0.076,-0.020,0.002,-0.055,-0.002,0.022,-0.053,0.067,...,-0.045,0.034,0.015,0.016,0.000,-0.009,0.006,0.026,-0.024,0.035,0.069,-0.055,-0.008,-0.031,-0.024,-0.012,0.029,-0.022,0.005,0.024,0.087,0.021,0.046,-0.019,0.011,0.040,0.003,-0.055,-0.055,0.016,0.002,-0.036,-0.047,-0.036,-0.046,-0.076,0.007,0.015,-0.013,-0.025
149633,-0.002,0.011,0.053,0.017,-0.018,0.050,0.033,-0.061,-0.013,-0.045,0.014,-0.006,-0.050,0.018,0.011,-0.058,0.010,0.020,-0.014,-0.011,0.024,-0.012,-0.102,0.102,0.057,-0.025,-0.054,-0.007,-0.024,-0.006,-0.015,0.014,-0.018,-0.078,-0.030,-0.060,0.025,-0.032,-0.009,0.031,...,0.058,-0.039,-0.032,0.039,-0.039,0.021,0.061,-0.011,-0.041,-0.042,0.069,-0.005,-0.028,0.012,0.035,0.005,0.055,0.004,0.005,0.030,0.063,-0.005,0.010,0.004,0.027,0.123,0.003,-0.058,-0.037,-0.016,0.025,0.019,-0.010,0.008,0.013,-0.029,-0.120,-0.049,0.020,0.041


In [None]:
# Write the first 8000 embedding rows as a header-less, index-less TSV file.
df_emb.iloc[:8000].to_csv(path_or_buf='embeddings.tsv', sep='\t', header=None, index=False)

In [None]:
df_data[['embedding']].sample(8000).to_csv('embeddings.tsv',sep="\t",header=None,index=False)

In [None]:
df_data[['cluster','requirement_text']].sample(8000).to_csv('metadata.tsv',sep="\t",index=False)