In [1]:
import numpy as np
import pandas as pd
import random

In [2]:
# Number of rows in each synthetic group; the total is kept for convenience.
groupsize = [100, 70, 50]
n_rows = sum(groupsize)

# Seed both generators used by this notebook (NumPy for the numeric columns,
# the stdlib `random` module for the compound names) so that
# Restart & Run All regenerates exactly the same dataset.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [3]:
# Three synthetic groups, each with its own feature range and its own linear
# relationship y = wa*a + wb*b + wc*c + intercept.

# Group 1: features uniform on [0, 1).
a1 = np.random.random(groupsize[0])
b1 = np.random.random(groupsize[0])
c1 = np.random.random(groupsize[0])
y1 = a1 * 3 + b1 * 2 + c1 * 0.5 + 1

# Group 2: features uniform on [0, 10).
a2 = 10*np.random.random(groupsize[1])
b2 = 10*np.random.random(groupsize[1])
c2 = 10*np.random.random(groupsize[1])
y2 = a2 * 5 + b2 * 1 + c2 * 0.3 + 2

# Group 3: features uniform on (-15, -5].
a3 = -10*np.random.random(groupsize[2]) - 5
b3 = -10*np.random.random(groupsize[2]) - 5
# Scale (not shift) the draw, matching a3 and b3; `-10 + rand - 5` squeezed
# c3 into the narrow band [-15, -14).
c3 = -10*np.random.random(groupsize[2]) - 5
# Each weight applies to its own feature; previously a3 was used three times,
# so b3 and c3 had no influence on y3.
y3 = a3 * 7 + b3 * 2 + c3 * 0.2 + 3

In [4]:
base_elements = ('Fe', 'O', 'C', 'Co', 'N', 'H', 'Pb')


def random_compound():
    """Return a random formula-like string built from ``base_elements``.

    The element symbols are shuffled and appended one at a time, each
    followed by a count drawn from 1 to 4 (a count of 1 is left implicit).
    After every symbol the formula is finished unless a uniform draw is
    at most 0.3, in which case the next symbol is appended as well.
    """
    pool = list(base_elements)  # shuffle a copy so the tuple stays untouched
    random.shuffle(pool)
    parts = []
    for symbol in pool:
        count = random.randint(1, 4)
        parts.append(symbol if count <= 1 else symbol + str(count))
        if random.random() > 0.3:
            break
    return "".join(parts)


# Show a few samples of the generator's output.
[random_compound() for _ in range(10)]

['Co4O3N2', 'Co3N4', 'C', 'Pb2', 'C4', 'Pb3', 'Pb2', 'Pb2', 'C2', 'O2Fe']

In [6]:
# Join the three per-group arrays into one column each, rounded to 3 decimals.
a = np.concatenate((a1, a2, a3)).round(3)
b = np.concatenate((b1, b2, b3)).round(3)
c = np.concatenate((c1, c2, c3)).round(3)
y = np.concatenate((y1, y2, y3)).round(3)
# One label per row: each group name repeated as many times as its group size.
groups = np.repeat(('foo', 'bar', 'rosco'), groupsize)
# Every row gets a random compound; rows outside group 'foo' also get 'Au' appended.
composition = np.array([random_compound() + ('' if label == 'foo' else 'Au') for label in groups])

In [9]:
# Build the frame from a column mapping so every column keeps its own dtype.
# Stacking the float arrays together with the string arrays via
# np.column_stack coerces all six columns to strings, which left a, b, c and
# y stored as text rather than numbers.
df = pd.DataFrame({
    'a': a,
    'b': b,
    'c': c,
    'grouping': groups,
    'composition': composition,
    'y': y,
})
df.head()

Unnamed: 0,a,b,c,grouping,composition,y
0,0.743,0.071,0.308,foo,O3Pb2,3.524
1,0.707,0.236,0.758,foo,N,3.973
2,0.644,0.288,0.78,foo,Pb3,3.896
3,0.696,0.346,0.03,foo,Fe3N2,3.795
4,0.477,0.802,0.296,foo,N2,4.184


In [10]:
# Write the dataset without the row index; `index` is documented as a bool,
# so pass False explicitly instead of relying on None being falsy.
df.to_csv('grouped.csv', index=False)

In [12]:
# Keep only the three feature columns for the polynomial expansion.
ef = df.loc[:, ['a', 'b', 'c']]

In [13]:
# Preview the first rows only instead of rendering the whole frame.
ef.head(10)

Unnamed: 0,a,b,c
0,0.743,0.071,0.308
1,0.707,0.236,0.758
2,0.644,0.288,0.78
3,0.696,0.346,0.03
4,0.477,0.802,0.296
5,0.56,0.254,0.656
6,0.87,0.515,0.804
7,0.552,0.725,0.792
8,0.73,0.052,0.105
9,0.04,0.962,0.956


In [14]:
from sklearn.preprocessing import PolynomialFeatures

In [15]:
# Degree 2 is the library default; spelled out so the expansion order is visible.
pf = PolynomialFeatures(degree=2)

In [16]:
# Fit the expansion on the feature frame and display the transformed values
# as a raw array (the columns are unlabelled at this point).
pf.fit_transform(ef)

array([[  1.00000000e+00,   7.43000000e-01,   7.10000000e-02, ...,
          5.04100000e-03,   2.18680000e-02,   9.48640000e-02],
       [  1.00000000e+00,   7.07000000e-01,   2.36000000e-01, ...,
          5.56960000e-02,   1.78888000e-01,   5.74564000e-01],
       [  1.00000000e+00,   6.44000000e-01,   2.88000000e-01, ...,
          8.29440000e-02,   2.24640000e-01,   6.08400000e-01],
       ..., 
       [  1.00000000e+00,  -1.08370000e+01,  -5.25400000e+00, ...,
          2.76045160e+01,   7.80849480e+01,   2.20879044e+02],
       [  1.00000000e+00,  -7.29200000e+00,  -1.03690000e+01, ...,
          1.07516161e+02,   1.49821681e+02,   2.08773601e+02],
       [  1.00000000e+00,  -7.29400000e+00,  -1.10520000e+01, ...,
          1.22146704e+02,   1.60265052e+02,   2.10279001e+02]])

In [22]:
# Expand the features, then label the resulting columns with the generated
# term names. `get_feature_names` was removed in scikit-learn 1.2, so use its
# replacement `get_feature_names_out` when the installed version has it and
# fall back to the old method otherwise.
poly_values = pf.fit_transform(ef)
if hasattr(pf, 'get_feature_names_out'):
    poly_columns = pf.get_feature_names_out(ef.columns)
else:
    poly_columns = pf.get_feature_names(ef.columns)
gf = pd.DataFrame(poly_values, columns=poly_columns)

In [23]:
# Preview the first rows only instead of rendering the whole frame.
gf.head(10)

Unnamed: 0,1,a,b,c,a^2,a b,a c,b^2,b c,c^2
0,1.0,0.743,0.071,0.308,0.552049,0.052753,0.228844,0.005041,0.021868,0.094864
1,1.0,0.707,0.236,0.758,0.499849,0.166852,0.535906,0.055696,0.178888,0.574564
2,1.0,0.644,0.288,0.780,0.414736,0.185472,0.502320,0.082944,0.224640,0.608400
3,1.0,0.696,0.346,0.030,0.484416,0.240816,0.020880,0.119716,0.010380,0.000900
4,1.0,0.477,0.802,0.296,0.227529,0.382554,0.141192,0.643204,0.237392,0.087616
5,1.0,0.560,0.254,0.656,0.313600,0.142240,0.367360,0.064516,0.166624,0.430336
6,1.0,0.870,0.515,0.804,0.756900,0.448050,0.699480,0.265225,0.414060,0.646416
7,1.0,0.552,0.725,0.792,0.304704,0.400200,0.437184,0.525625,0.574200,0.627264
8,1.0,0.730,0.052,0.105,0.532900,0.037960,0.076650,0.002704,0.005460,0.011025
9,1.0,0.040,0.962,0.956,0.001600,0.038480,0.038240,0.925444,0.919672,0.913936
