In [2]:
#Here we're going to polynomial expand the 4 most populated cols (the ones with)
#The fewest NaN entries, then recombine them with the 4 less populated cols.
#This captures their interaction terms.
#Would the result be similar if we had done fi / fj instead of fi*fj?

import numpy as np
import pandas as pd

#Create a dataframe and chop a triangular hole out of it.
X = pd.DataFrame(np.arange(80.).reshape(10,8))

for i in range(5,0,-1):
    for j in range(i, 6):
        X.set_value(i,j, np.nan)
X

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,1.0,2.0,3.0,4.0,5.0,6,7
1,8,,,,,,14,15
2,16,17.0,,,,,22,23
3,24,25.0,26.0,,,,30,31
4,32,33.0,34.0,35.0,,,38,39
5,40,41.0,42.0,43.0,44.0,,46,47
6,48,49.0,50.0,51.0,52.0,53.0,54,55
7,56,57.0,58.0,59.0,60.0,61.0,62,63
8,64,65.0,66.0,67.0,68.0,69.0,70,71
9,72,73.0,74.0,75.0,76.0,77.0,78,79


In [4]:
#Reorder cols by number of non null entries
#How to do this pandorably? 
cols_counts = list((zip(X.columns.tolist(), X.count().tolist())))
most_pop_cols = [ elem[0] for elem in sorted(cols_counts, key=lambda cols_counts: cols_counts[1], reverse=True) ]
most_pop_cols

[0, 6, 7, 1, 2, 3, 4, 5]

In [5]:
#Mean impute missing data, need all float entries.
from sklearn.preprocessing import Imputer
imp = Imputer()
X = pd.DataFrame(imp.fit_transform(X))

In [6]:
#Slice out 4 cols with most entries
num_cols = 4
most, rest = most_pop_cols[0:num_cols], most_pop_cols[num_cols:]

#Polynomial expand most populated cols to get all interaction terms.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2, interaction_only=True)
poly_most = pd.DataFrame(poly.fit_transform(X[most]))
poly_most

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1,0,6,7,1.0,0,0,0.0,42,6.0,7.0
1,1,8,14,15,40.111111,112,120,320.888889,210,561.555556,601.666667
2,1,16,22,23,17.0,352,368,272.0,506,374.0,391.0
3,1,24,30,31,25.0,720,744,600.0,930,750.0,775.0
4,1,32,38,39,33.0,1216,1248,1056.0,1482,1254.0,1287.0
5,1,40,46,47,41.0,1840,1880,1640.0,2162,1886.0,1927.0
6,1,48,54,55,49.0,2592,2640,2352.0,2970,2646.0,2695.0
7,1,56,62,63,57.0,3472,3528,3192.0,3906,3534.0,3591.0
8,1,64,70,71,65.0,4480,4544,4160.0,4970,4550.0,4615.0
9,1,72,78,79,73.0,5616,5688,5256.0,6162,5694.0,5767.0


In [8]:
#Concat poly expansion to unexpanded data
expanded_data = pd.concat([poly_most, X[rest]], axis=1) #join_axes=[df1.index])
expanded_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,2.1,3.1,4.1,5.1
0,1,0,6,7,1.0,0,0,0.0,42,6.0,7.0,2,3.0,4.0,5
1,1,8,14,15,40.111111,112,120,320.888889,210,561.555556,601.666667,44,47.571429,50.666667,53
2,1,16,22,23,17.0,352,368,272.0,506,374.0,391.0,44,47.571429,50.666667,53
3,1,24,30,31,25.0,720,744,600.0,930,750.0,775.0,26,47.571429,50.666667,53
4,1,32,38,39,33.0,1216,1248,1056.0,1482,1254.0,1287.0,34,35.0,50.666667,53
5,1,40,46,47,41.0,1840,1880,1640.0,2162,1886.0,1927.0,42,43.0,44.0,53
6,1,48,54,55,49.0,2592,2640,2352.0,2970,2646.0,2695.0,50,51.0,52.0,53
7,1,56,62,63,57.0,3472,3528,3192.0,3906,3534.0,3591.0,58,59.0,60.0,61
8,1,64,70,71,65.0,4480,4544,4160.0,4970,4550.0,4615.0,66,67.0,68.0,69
9,1,72,78,79,73.0,5616,5688,5256.0,6162,5694.0,5767.0,74,75.0,76.0,77


In [9]:
expanded_data.to_csv("poly_expanded.csv", index=False)