# [How is a csv with columns of irregularly formatted lists imported into a pandas DataFrame?](https://stackoverflow.com/questions/52582499/read-in-a-csv-file-as-a-dataframe/52583505#52583505)

In [1]:
import pandas as pd
import json
from pandas.io.json import json_normalize
from pprint import pprint as pp
import matplotlib.pyplot as plt
import re

## Read the file

In [2]:
# Load the raw CSV, using its first column as the DataFrame index.
df = pd.read_csv(
    'data/2018-09-30_columns_mixed_format_lists.csv',
    index_col=0,
)

In [3]:
df

Unnamed: 0,position_c,position_r_theta_phi,position_start,position_end
0,"[14.533842862081656, 0.03208616222764249, 56.9...","[58.751477765706575, 0.2499741778590546, 0.002...",[ 19271.0805564 17808.46240249 18693.4114863 ],[ 19270.18114729 18215.86140969 18693.4114863 ]
1,"[15.565867354362126, 2.2766677669751516, 82.66...","[84.149865052813752, 0.1880523977235686, 0.145...",[ 19301.1424449 17812.85092455 18719.15239368],[ 19242.18330777 18215.96205085 18719.15239368]
2,"[-5.4087970793661952, 8.7687766840863333, 82.9...","[83.541242779224319, 0.12364001055133951, 2.12...",[ 19424.05948468 18127.83826079 18719.38958887],[ 19077.31693912 17913.95893244 18719.38958887]
3,"[5.4876143325782323, -5.6088702278284472, 22.3...","[23.704501909037916, 0.3373934168579211, 5.486...",[ 19115.9817056 17864.06576253 18658.85413363],[ 19407.18754103 18148.97613687 18658.85413363]


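## Fix the lists

Every column is read in as a string, and the lists come in two formats: comma-separated (`[1.0, 2.0, 3.0]`) and whitespace-separated (`[ 1.0 2.0 3.0]`). Extract the numbers from each cell so both formats become a real list.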

In [4]:
def list_fix(row):
    """Extract the numeric tokens from every string in ``row``.

    Each element is scanned for numbers, so comma-separated cells
    (``"[1.0, 2.0]"``) and whitespace-separated cells (``"[ 1.0  2.0 ]"``)
    both become a list of numeric strings.

    Signed integers (``-3``), trailing-dot floats (``2.``) and scientific
    notation (``1.5e-05``, ``2.e+05``) are each kept as a single token, so a
    later ``float()`` / ``astype('float64')`` conversion sees the full value.

    Parameters
    ----------
    row : iterable of str
        The strings to parse, e.g. one column of the raw DataFrame.

    Returns
    -------
    list of list of str
        One list of numeric strings per input element, in input order.
        The tokens are not converted to float here.
    """
    # Optional sign, then either "digits[.digits]" or ".digits", then an
    # optional exponent. The sign applies to integers as well as decimals.
    number = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")
    return [number.findall(x) for x in row]

In [5]:
# DataFrame.apply works along axis=0 by default, so list_fix receives one
# column at a time and returns that column's parsed lists.
df_processed = df.apply(list_fix)

In [6]:
df_processed

Unnamed: 0,position_c,position_r_theta_phi,position_start,position_end
0,"[14.533842862081656, 0.03208616222764249, 56.9...","[58.751477765706575, 0.2499741778590546, 0.002...","[19271.0805564, 17808.46240249, 18693.4114863]","[19270.18114729, 18215.86140969, 18693.4114863]"
1,"[15.565867354362126, 2.2766677669751516, 82.66...","[84.149865052813752, 0.1880523977235686, 0.145...","[19301.1424449, 17812.85092455, 18719.15239368]","[19242.18330777, 18215.96205085, 18719.15239368]"
2,"[-5.4087970793661952, 8.7687766840863333, 82.9...","[83.541242779224319, 0.12364001055133951, 2.12...","[19424.05948468, 18127.83826079, 18719.38958887]","[19077.31693912, 17913.95893244, 18719.38958887]"
3,"[5.4876143325782323, -5.6088702278284472, 22.3...","[23.704501909037916, 0.3373934168579211, 5.486...","[19115.9817056, 17864.06576253, 18658.85413363]","[19407.18754103, 18148.97613687, 18658.85413363]"


## Unpack each column

In [7]:
# Expand each list in the column into its own set of columns (0, 1, 2, ...).
pos_c = df_processed['position_c'].apply(pd.Series)

In [8]:
# Prefix every column label with the source column name.
pos_c = pos_c.add_prefix('position_c_')

In [9]:
pos_c

Unnamed: 0,position_c_0,position_c_1,position_c_2
0,14.533842862081656,0.0320861622276424,56.92541191588316
1,15.565867354362126,2.276667766975152,82.66631929505456
2,-5.408797079366195,8.768776684086333,82.9035144895315
3,5.487614332578232,-5.608870227828447,22.368059246608027


In [10]:
# Expand each list in the column into its own set of columns (0, 1, 2, ...).
pos_rtp = df_processed['position_r_theta_phi'].apply(pd.Series)

In [11]:
# Prefix every column label with the source column name.
pos_rtp = pos_rtp.add_prefix('position_r_theta_phi_')

In [12]:
pos_rtp

Unnamed: 0,position_r_theta_phi_0,position_r_theta_phi_1,position_r_theta_phi_2
0,58.751477765706575,0.2499741778590546,0.0022076824693347
1,84.14986505281374,0.1880523977235686,0.1452305063941714
2,83.54124277922432,0.1236400105513395,2.12349508288374
3,23.70450190903792,0.3373934168579211,5.486860163384876


In [13]:
# Expand each list in the column into its own set of columns (0, 1, 2, ...).
pos_s = df_processed['position_start'].apply(pd.Series)

In [14]:
# Prefix every column label with the source column name.
pos_s = pos_s.add_prefix('position_start_')

In [15]:
pos_s

Unnamed: 0,position_start_0,position_start_1,position_start_2
0,19271.0805564,17808.46240249,18693.4114863
1,19301.1424449,17812.85092455,18719.15239368
2,19424.05948468,18127.83826079,18719.38958887
3,19115.9817056,17864.06576253,18658.85413363


In [16]:
# Expand each list in the column into its own set of columns (0, 1, 2, ...).
pos_e = df_processed['position_end'].apply(pd.Series)

In [17]:
# Prefix every column label with the source column name.
pos_e = pos_e.add_prefix('position_end_')

In [18]:
pos_e

Unnamed: 0,position_end_0,position_end_1,position_end_2
0,19270.18114729,18215.86140969,18693.4114863
1,19242.18330777,18215.96205085,18719.15239368
2,19077.31693912,17913.95893244,18719.38958887
3,19407.18754103,18148.97613687,18658.85413363


## Final DataFrame

In [19]:
# Join the four expanded frames side by side, aligned on the shared index.
df_final = pd.concat(
    [
        pos_c,
        pos_rtp,
        pos_s,
        pos_e,
    ],
    axis='columns',
)

In [20]:
# The extracted tokens are still strings; cast every column to a float dtype.
df_final = df_final.astype(dtype='float64')

In [21]:
df_final.dtypes

position_c_0              float64
position_c_1              float64
position_c_2              float64
position_r_theta_phi_0    float64
position_r_theta_phi_1    float64
position_r_theta_phi_2    float64
position_start_0          float64
position_start_1          float64
position_start_2          float64
position_end_0            float64
position_end_1            float64
position_end_2            float64
dtype: object

In [22]:
df_final

Unnamed: 0,position_c_0,position_c_1,position_c_2,position_r_theta_phi_0,position_r_theta_phi_1,position_r_theta_phi_2,position_start_0,position_start_1,position_start_2,position_end_0,position_end_1,position_end_2
0,14.533843,0.032086,56.925412,58.751478,0.249974,0.002208,19271.080556,17808.462402,18693.411486,19270.181147,18215.86141,18693.411486
1,15.565867,2.276668,82.666319,84.149865,0.188052,0.145231,19301.142445,17812.850925,18719.152394,19242.183308,18215.962051,18719.152394
2,-5.408797,8.768777,82.903514,83.541243,0.12364,2.123495,19424.059485,18127.838261,18719.389589,19077.316939,17913.958932,18719.389589
3,5.487614,-5.60887,22.368059,23.704502,0.337393,5.48686,19115.981706,17864.065763,18658.854134,19407.187541,18148.976137,18658.854134
