# Machine Learning for String Field Theory

H. Erbin, R. Finotello, M. Kudrna, M. Schnabl

---
---

## Abstract

In the framework of bosonic Open String Field Theory (OSFT), we consider several observables characterised by conformal weight and type, and the position of vacua in the potential for various values of truncated mass level. We focus on the prediction of the extrapolated value for the level-$\infty$ truncation using Machine Learning (ML) techniques.

## Synopsis

In this notebook we prepare the model-independent analysis. In particular, we study a way to reduce the input data to a common set of features, so that they can be used independently of the model they come from.

In [None]:
# enable the IPython autoreload extension; mode 2 reloads imported modules before each cell is executed
%load_ext autoreload
%autoreload 2

## Load the Dataset

In [2]:
import pandas as pd

# read the lump solutions and the WZW model data from the local CSV files
lumps, wzw = (
    pd.read_csv(path)
    for path in ('./data/lumps.csv', './data/wzw.csv')
)

In [3]:
import re


def _prefix_level(column):
    """Prepend `level_` to a column name that starts with digits; other names are returned unchanged."""
    return re.sub('^([0-9]+)', r'level_\1', column)


# the double-lump data are split over two JSON files
dlumps = pd.read_json('./data/dlumps.json')
dexp   = pd.read_json('./data/dlumps_exp.json')

# align the two files on their index, then give the numeric column names the `level_<n>` form
dlumps = (
    dlumps
    .merge(dexp, left_index=True, right_index=True)
    .rename(columns=_prefix_level)
)

In [4]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of the lump solutions
lumps.describe()

Unnamed: 0,solutions,init,weight,type,exp,level_2,level_3,level_4,level_5,level_6,...,level_9,level_10,level_11,level_12,level_13,level_14,level_15,level_16,level_17,level_18
count,732.0,732.0,732.0,732.0,732.0,732.0,732.0,732.0,732.0,732.0,...,732.0,732.0,732.0,732.0,732.0,732.0,732.0,732.0,732.0,732.0
mean,23.849727,0.832923,1.983402,3.748634,0.538251,-1.605744,-1.77012,7.840907,8.2967,-33.868139,...,176.059381,-737.888108,-761.65948,3002.674618,3087.526701,-11341.3086,-11628.506465,39852.334515,40771.690931,-130931.3
std,13.322038,0.99706,2.338717,0.663437,0.707522,4.515971,4.969205,21.374749,22.621096,110.076428,...,630.770974,2826.575772,2915.937396,11805.371641,12130.574351,44982.958696,46093.166999,158645.302316,162223.144544,523083.3
min,0.0,0.0,0.0,2.0,-1.0,-19.74404,-21.893983,-0.754568,-0.782633,-514.984097,...,-1.781985,-13321.170445,-13781.246472,-8.850113,-12.265769,-211473.396816,-216475.644423,-44.356923,-66.596211,-2489024.0
25%,12.75,0.0,0.158447,4.0,0.0,-0.776018,-1.031007,0.0,0.0,-0.919728,...,0.0,-1.106747,-1.780883,0.001452,0.001633,-2.76424,-5.863,0.041082,0.040815,-18.50164
50%,25.0,0.0,1.0,4.0,1.0,0.0,0.0,0.924598,0.935917,0.0,...,0.975852,0.001638,0.001847,0.987451,0.987109,0.00168,0.018444,1.001152,1.0,0.1859456
75%,35.25,1.65,3.226174,4.0,1.0,0.817407,0.917998,1.387335,1.524975,0.965868,...,2.129576,0.993199,0.998879,4.057912,6.077472,0.999664,1.000738,14.953725,13.923463,1.00088
max,45.0,3.0,9.0,4.0,1.0,1.239384,1.358098,122.931347,131.67549,2.275741,...,2890.129013,5.243298,6.283092,56115.100219,57592.69886,16.106978,23.077325,731718.33209,748286.961169,103.3588


In [5]:
# summary statistics of the double-lump solutions (after merging in the `exp` column)
dlumps.describe()

Unnamed: 0,init,weight,type,level_2,level_3,level_4,level_5,level_6,level_7,level_8,...,level_10,level_11,level_12,level_13,level_14,level_15,level_16,level_17,level_18,exp
count,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,...,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0
mean,1.05,1.715278,3.8,-2.608983,-3.087235,12.360494,13.485368,-53.858729,-57.641226,258.016894,...,-1172.908731,-1232.022585,5292.641517,5529.478551,-22317.746351,-23199.704152,85662.85,88697.07,-300601.1,-723920.6
std,1.468081,2.189245,0.615587,8.118042,9.802525,44.393799,48.702023,222.478514,238.231213,1116.816835,...,5174.917644,5439.189243,23531.62981,24590.746496,99564.849566,103507.763796,382673.0,396237.8,1343627.0,3237079.0
min,0.0,0.0,2.0,-33.620593,-40.373713,-1.734322,-1.825242,-997.122136,-1067.67844,-5.550577,...,-23157.212916,-24338.986279,-7.42388,-17.395405,-445320.847252,-462954.335124,-67.06599,-67.0486,-6009038.0,-14476750.0
25%,0.0,0.090278,4.0,-1.304863,-1.425327,0.0,0.0,-1.437197,-1.412107,-0.06877,...,-1.958771,-1.390445,-0.195941,-0.652932,-1.970593,-1.375239,-0.6939012,-1.802527,-8.913318,-1.995897
50%,0.0,1.0,4.0,0.0,0.0,0.01128,0.001635,0.0,0.0,0.010817,...,0.0,0.000272,1.219497,0.424945,9.9e-05,0.201399,1.182381,0.4360167,0.2550822,0.3300225
75%,3.0,2.381944,4.0,0.006879,0.341444,2.107497,2.043995,0.623679,0.639251,2.023673,...,1.222807,1.71139,2.185291,2.014058,1.580704,2.010701,6.847626,2.034738,2.008174,1.998589
max,3.0,9.0,4.0,2.171332,2.499696,198.953725,218.118294,2.036695,2.03243,5001.157827,...,3.761426,9.229395,105266.161459,110002.605965,23.210856,23.159297,1711459.0,1772124.0,146.9126,117.2284


In [6]:
# summary statistics of the WZW dataset (levels come as separate `_re` / `_im` columns)
wzw.describe()

Unnamed: 0,k,exp_re,exp_im,weight,j,m,type,level_2_re,level_2_im,level_3_re,...,level_6_re,level_6_im,level_7_re,level_7_im,level_8_re,level_8_im,level_9_re,level_9_im,level_10_re,level_10_im
count,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,...,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0,1647.0
mean,6.784457,0.039473,-0.00189,0.90466,2.002429,0.0,3.888282,-0.039372,-0.005783,-0.03892,...,0.00213,-0.002027,0.010572,-0.015218,0.073137,-0.011159,0.065939,0.009917,0.000521,0.006889
std,1.309949,0.596876,0.301625,0.594117,1.252577,1.587753,0.459439,2.536976,0.237263,2.571754,...,1.391057,0.372175,1.443925,0.448329,1.531316,0.476229,1.591242,0.629464,1.738222,0.66384
min,2.0,-1.519671,-0.930605,0.0,0.0,-4.0,2.0,-26.284377,-1.867961,-26.284377,...,-10.978029,-2.287981,-11.445648,-3.646223,-13.721069,-4.023181,-14.249796,-6.650984,-24.994666,-7.174097
25%,6.0,-0.441788,0.0,0.375,1.0,-1.0,4.0,-0.558114,0.0,-0.579659,...,-0.521596,0.0,-0.530032,0.0,-0.498717,0.0,-0.486161,0.0,-0.478383,0.0
50%,7.0,0.0,0.0,0.972222,2.0,0.0,4.0,0.000476,0.0,0.004674,...,0.0,0.0,0.0,0.0,0.009156,0.0,0.006125,0.0,0.0,0.0
75%,8.0,0.506705,0.0,1.333333,3.0,1.0,4.0,0.600366,0.0,0.633898,...,0.545093,0.0,0.564691,0.0,0.59688,0.0,0.589068,0.0,0.521422,0.0
max,8.0,1.414214,0.930605,2.0,4.0,4.0,4.0,35.385221,1.867961,35.385221,...,10.978029,2.287981,11.445648,3.646223,18.673134,4.023181,20.051284,6.683011,15.315423,7.187127


The main difference between the lump solutions and the WZW model is the fact that truncation levels are complex numbers in the latter.
Moreover, solutions in the WZW model are labelled by an integer `k`, which is related to the `weight` variable by `weight` $= \frac{j(j+1)}{k+2}$, where $j$ is one of the quantum numbers associated to the $\mathrm{SU}(2)$ representation, together with $m$.

## Preparing the Input

In [7]:
# column names, non-null counts and dtypes of the WZW dataset
wzw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1647 entries, 0 to 1646
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   k            1647 non-null   float64
 1   exp_re       1647 non-null   float64
 2   exp_im       1647 non-null   float64
 3   weight       1647 non-null   float64
 4   j            1647 non-null   float64
 5   m            1647 non-null   float64
 6   type         1647 non-null   float64
 7   level_2_re   1647 non-null   float64
 8   level_2_im   1647 non-null   float64
 9   level_3_re   1647 non-null   float64
 10  level_3_im   1647 non-null   float64
 11  level_4_re   1647 non-null   float64
 12  level_4_im   1647 non-null   float64
 13  level_5_re   1647 non-null   float64
 14  level_5_im   1647 non-null   float64
 15  level_6_re   1647 non-null   float64
 16  level_6_im   1647 non-null   float64
 17  level_7_re   1647 non-null   float64
 18  level_7_im   1647 non-null   float64
 19  level_

We also have to redefine the columns of the lump and double lump solutions to have complex entries:

In [8]:
import numpy as np
import re

# keep only the target, the labels and the truncation levels
KEEP_REGEX = '^weight|^type|^exp|^level'


def add_complex_parts(frame):
    """
    Split the `exp` and `level*` columns of `frame` into real and imaginary parts.

    Every column whose name starts with `level` or `exp` and that does not already
    end in `_re` or `_im` is renamed by appending `_re`. For each `_re` column an
    integer column of zeros with the matching `_im` name is then added.

    Columns that already carry a suffix are not renamed again, so running the cell
    twice does not produce names such as `exp_re_re`.

    Parameters
    ----------
    frame : pandas.DataFrame
        The dataframe to convert (it is not modified).

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the renamed `_re` columns and the added `_im` columns.
    """
    def needs_suffix(column):
        return (re.match('^level.*|^exp', column) is not None
                and re.search('_(re|im)$', column) is None)

    # rename columns (the mapping is built from the frame itself, not from a sibling dataset)
    frame = frame.rename(columns={c: c + '_re' for c in frame.columns if needs_suffix(c)})

    # add columns: only the trailing `_re` is replaced, hence the anchored pattern
    real_cols = [c for c in frame.columns if re.search('_re$', c)]
    for c in real_cols:
        frame[re.sub('_re$', '_im', c)] = np.zeros(frame.shape[0], dtype=int)

    return frame


lumps  = add_complex_parts(lumps.filter(regex=KEEP_REGEX))
dlumps = add_complex_parts(dlumps.filter(regex=KEEP_REGEX))

In [9]:
# check the column layout of the lump solutions after the `_re` / `_im` split
lumps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 38 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   weight       732 non-null    float64
 1   type         732 non-null    int64  
 2   exp_re       732 non-null    int64  
 3   level_2_re   732 non-null    float64
 4   level_3_re   732 non-null    float64
 5   level_4_re   732 non-null    float64
 6   level_5_re   732 non-null    float64
 7   level_6_re   732 non-null    float64
 8   level_7_re   732 non-null    float64
 9   level_8_re   732 non-null    float64
 10  level_9_re   732 non-null    float64
 11  level_10_re  732 non-null    float64
 12  level_11_re  732 non-null    float64
 13  level_12_re  732 non-null    float64
 14  level_13_re  732 non-null    float64
 15  level_14_re  732 non-null    float64
 16  level_15_re  732 non-null    float64
 17  level_16_re  732 non-null    float64
 18  level_17_re  732 non-null    float64
 19  level_18

In [10]:
# check the column layout of the double-lump solutions after the `_re` / `_im` split
dlumps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 38 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   weight       20 non-null     float64
 1   type         20 non-null     int64  
 2   level_2_re   20 non-null     float64
 3   level_3_re   20 non-null     float64
 4   level_4_re   20 non-null     float64
 5   level_5_re   20 non-null     float64
 6   level_6_re   20 non-null     float64
 7   level_7_re   20 non-null     float64
 8   level_8_re   20 non-null     float64
 9   level_9_re   20 non-null     float64
 10  level_10_re  20 non-null     float64
 11  level_11_re  20 non-null     float64
 12  level_12_re  20 non-null     float64
 13  level_13_re  20 non-null     float64
 14  level_14_re  20 non-null     float64
 15  level_15_re  20 non-null     float64
 16  level_16_re  20 non-null     float64
 17  level_17_re  20 non-null     float64
 18  level_18_re  20 non-null     float64
 19  exp_re    

## Principal Components

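We then compute the principal components of the truncation levels: the lump datasets reach level 18 while the WZW dataset stops at level 10, so projecting each of them onto the same number of principal components gives inputs with the same shape.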

In [11]:
from sklearn.decomposition import PCA

# fix random state
RAND = 123
COMP = 10


def _principal_components(frame):
    """Fit a PCA with COMP components on the `level*` columns of `frame` and return the projected values."""
    levels = frame.filter(regex='^level')
    reducer = PCA(n_components=COMP, random_state=RAND)
    return reducer.fit_transform(levels)


# compute PCA (a separate decomposition is fitted on each dataset)
lumps_levels  = _principal_components(lumps)
dlumps_levels = _principal_components(dlumps)
wzw_levels    = _principal_components(wzw)

In [12]:
import pandas as pd

# recreate a dataframe: one column per principal component, named pca_1 ... pca_<COMP>
cols = [f'pca_{n}' for n in range(1, COMP + 1)]
lumps_levels, dlumps_levels, wzw_levels = (
    pd.DataFrame(components, columns=cols)
    for components in (lumps_levels, dlumps_levels, wzw_levels)
)

In [13]:
# join the old dataframes
lumps  = pd.merge(lumps[['exp_re', 'exp_im', 'weight', 'type']], lumps_levels, left_index=True, right_index=True)
dlumps = pd.merge(dlumps[['exp_re', 'exp_im', 'weight', 'type']], dlumps_levels, left_index=True, right_index=True)
wzw    = pd.merge(wzw[['exp_re', 'exp_im', 'weight', 'type']], wzw_levels, left_index=True, right_index=True)

We finally concatenate the datasets row-wise (an _outer_ join on the columns):

In [14]:
# convert all integer to floats, so that every column of every dataset is float64
lumps, dlumps, wzw = (frame.astype(float) for frame in (lumps, dlumps, wzw))

# merge the datasets (keep the double lumps apart)
# NOTE(review): the row labels of both frames are kept, so the index of `df` contains duplicates
df = pd.concat([lumps, wzw], axis='index')

In [15]:
# check the combined (lumps + WZW) dataset: target, labels and principal components
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2379 entries, 0 to 1646
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   exp_re  2379 non-null   float64
 1   exp_im  2379 non-null   float64
 2   weight  2379 non-null   float64
 3   type    2379 non-null   float64
 4   pca_1   2379 non-null   float64
 5   pca_2   2379 non-null   float64
 6   pca_3   2379 non-null   float64
 7   pca_4   2379 non-null   float64
 8   pca_5   2379 non-null   float64
 9   pca_6   2379 non-null   float64
 10  pca_7   2379 non-null   float64
 11  pca_8   2379 non-null   float64
 12  pca_9   2379 non-null   float64
 13  pca_10  2379 non-null   float64
dtypes: float64(14)
memory usage: 278.8 KB


In [16]:
# check that the double lumps have the same columns and dtypes as the combined dataset
dlumps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   exp_re  20 non-null     float64
 1   exp_im  20 non-null     float64
 2   weight  20 non-null     float64
 3   type    20 non-null     float64
 4   pca_1   20 non-null     float64
 5   pca_2   20 non-null     float64
 6   pca_3   20 non-null     float64
 7   pca_4   20 non-null     float64
 8   pca_5   20 non-null     float64
 9   pca_6   20 non-null     float64
 10  pca_7   20 non-null     float64
 11  pca_8   20 non-null     float64
 12  pca_9   20 non-null     float64
 13  pca_10  20 non-null     float64
dtypes: float64(14)
memory usage: 2.3 KB


In [17]:
# save the dataset (row labels are not written to disk)
outputs = (
    (df, './data/full.csv'),
    (dlumps, './data/dlumps.csv'),
)
for frame, path in outputs:
    frame.to_csv(path, index=False)