In [1]:
import json
import pandas as pd
import glob
import os
from itertools import chain
import numpy as np

In [2]:
exp_dir = "/home/noah/ESSL/exps/iteration4/exp6_1"
coeffs = []
exps = glob.glob(os.path.join(exp_dir, "*/"))
chromos = []
for e in exps:
    seed = e.split("/")[-2]
    with open(os.path.join(e, "outcomes.json"), "r") as f:
        results = json.load(f)
        for fitness, chromo in zip(results["pop_vals"], results["chromos"]):
            c = list(chain.from_iterable(chromo[1]))
            c.append(seed)
            c.append(fitness[1])
            chromos.append(c)
columns = list(chain.from_iterable([[f"aug{i}", f"op{i}"] for i in range(1, 4)]))
columns.append("seed")
columns.append("test acc")
columns

df = pd.DataFrame(chromos, columns=columns)
ops = set(list(df["aug1"]) + list(df["aug2"]) +  list(df["aug3"]))
indexes = {op:i for i, op in enumerate(ops)}
chromos_long = np.zeros([len(chromos), len(ops)+2])

for i, c in enumerate(chromos):
    for aug, intensity in zip(c[:-2][::2], c[:-2][1::2]):
        chromos_long[i][indexes[aug]] = intensity
        chromos_long[i][-2] = int(c[-2])
        chromos_long[i][-1] = c[-1]

columns_long = list(ops) + ["seed", "fitness"]
df_long = pd.DataFrame(chromos_long, columns = columns_long)
df_long
ops
df_long

Unnamed: 0,ShearY,VerticalFlip,Sharpness,HorizontalFlip,Solarize,TranslateX,Brightness,Rotate,Contrast,ShearX,TranslateY,Color,seed,fitness
0,0.000000,0.000000,0.000000,0.000000,0.000000,7.0,0.000000,22.0,0.0,0.000000,2.0,0.000000,7.0,82.84
1,0.126119,0.000000,0.000000,0.603191,0.000000,0.0,1.314937,0.0,0.0,0.000000,0.0,0.000000,7.0,82.58
2,0.000000,0.981939,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.041533,0.0,0.469010,7.0,82.70
3,0.000000,0.000000,0.668222,0.000000,0.428321,0.0,0.000000,0.0,0.0,0.000000,0.0,0.953488,7.0,82.68
4,0.000000,0.000000,0.000000,0.000000,0.000000,10.0,0.000000,0.0,0.0,0.136720,0.0,0.443667,7.0,82.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0.000000,0.850476,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.104604,0.0,0.594466,8.0,81.44
446,0.000000,0.000000,0.000000,0.000000,0.813661,0.0,0.918928,0.0,0.0,0.126433,0.0,0.000000,8.0,83.67
447,0.000000,0.850476,0.000000,0.000000,0.000000,8.0,0.000000,0.0,0.0,0.000000,3.0,0.000000,8.0,83.22
448,0.126119,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,-1.0,0.0,0.000000,2.0,0.000000,8.0,84.24


# exp 1: fit linear regression and DT regressor to dataset

In [3]:
# fit linear regression  model to wide format data

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import export_text, plot_tree

X = df_long[ops]
y = df_long["fitness"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=10)
reg = LinearRegression().fit(X_train, y_train)
print("LR score: ", reg.score(X_test, y_test))
coeffs.append(reg.coef_)
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
# print(cross_val_score(regressor, X, y, cv=10))
print("DT score: ", regressor.score(X_test, y_test))
regressor.get_depth()
tree = export_text(regressor, feature_names=list(ops))
print(tree)

LR score:  -0.16203887534086814
DT score:  0.16537893087107924
|--- Brightness <= 0.28
|   |--- Contrast <= 0.75
|   |   |--- ShearY <= 0.12
|   |   |   |--- Color <= 1.08
|   |   |   |   |--- ShearX <= 0.06
|   |   |   |   |   |--- ShearY <= 0.02
|   |   |   |   |   |   |--- HorizontalFlip <= 0.01
|   |   |   |   |   |   |   |--- Rotate <= 25.00
|   |   |   |   |   |   |   |   |--- Solarize <= 0.51
|   |   |   |   |   |   |   |   |   |--- Solarize <= 0.33
|   |   |   |   |   |   |   |   |   |   |--- Sharpness <= 0.61
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 10
|   |   |   |   |   |   |   |   |   |   |--- Sharpness >  0.61
|   |   |   |   |   |   |   |   |   |   |   |--- value: [82.37]
|   |   |   |   |   |   |   |   |   |--- Solarize >  0.33
|   |   |   |   |   |   |   |   |   |   |--- TranslateX <= 3.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |   |--- TranslateX >  3.50
|   |   

R-squared value for DT is 0.27 and -0.16 for LR.... Terrible

# exp 2: fit linear regression and DT regressor to dataset with L2 normalization

In [4]:
# normalize the data
from sklearn.preprocessing import normalize
X_train = normalize(X_train)
X_test = normalize(X_test)

reg = LinearRegression().fit(X_train, y_train)
print("LR score: ", reg.score(X_test, y_test))
coeffs.append(reg.coef_)
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
# print(cross_val_score(regressor, X, y, cv=10))
print("DT score: ", regressor.score(X_test, y_test))
regressor.get_depth()
tree = export_text(regressor, feature_names=list(ops))
print(tree)

LR score:  -0.061633052508687
DT score:  0.13911920524795485
|--- Brightness <= 0.55
|   |--- Contrast <= 0.55
|   |   |--- Color <= 0.97
|   |   |   |--- HorizontalFlip <= 0.01
|   |   |   |   |--- Rotate <= -0.29
|   |   |   |   |   |--- Rotate <= -0.53
|   |   |   |   |   |   |--- VerticalFlip <= 0.33
|   |   |   |   |   |   |   |--- value: [82.71]
|   |   |   |   |   |   |--- VerticalFlip >  0.33
|   |   |   |   |   |   |   |--- value: [82.90]
|   |   |   |   |   |--- Rotate >  -0.53
|   |   |   |   |   |   |--- value: [84.24]
|   |   |   |   |--- Rotate >  -0.29
|   |   |   |   |   |--- ShearY <= 0.01
|   |   |   |   |   |   |--- TranslateY <= 0.14
|   |   |   |   |   |   |   |--- ShearX <= 0.10
|   |   |   |   |   |   |   |   |--- TranslateX <= 0.15
|   |   |   |   |   |   |   |   |   |--- VerticalFlip <= 0.88
|   |   |   |   |   |   |   |   |   |   |--- ShearX <= 0.00
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 10
|   |   |   |   |   |   |   |   | 

Performance decreases when using l2 normalization

# exp 3: fit linear regression and DT regressor to dataset with categorical variables

In [5]:
# train with the categorical dataset
from sklearn.preprocessing import LabelEncoder
cols = list(chain.from_iterable([[f"aug{i}", f"op{i}"] for i in range(1, 4)]))
X = df[cols]
y = df["test acc"]
for col in [f"aug{i}" for i in range(1, 4)]:
    le = LabelEncoder()
    le.fit(X[col])
    X[col] = le.transform(X[col])
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=10)

X_train = normalize(X_train)
X_test = normalize(X_test)

reg = LinearRegression().fit(X_train, y_train)
print("LR score: ", reg.score(X_test, y_test))
coeffs.append(reg.coef_)
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
# print(cross_val_score(regressor, X, y, cv=10))
print("DT score: ", regressor.score(X_test, y_test))
regressor.get_depth()
tree = export_text(regressor, feature_names=cols)
print(tree)
df

LR score:  0.08612154148451179
DT score:  0.02720782036457059
|--- aug1 <= 0.99
|   |--- op1 <= 0.09
|   |   |--- op3 <= -0.08
|   |   |   |--- op3 <= -0.08
|   |   |   |   |--- op1 <= 0.04
|   |   |   |   |   |--- value: [82.13]
|   |   |   |   |--- op1 >  0.04
|   |   |   |   |   |--- value: [82.71]
|   |   |   |--- op3 >  -0.08
|   |   |   |   |--- value: [84.24]
|   |   |--- op3 >  -0.08
|   |   |   |--- op1 <= 0.02
|   |   |   |   |--- aug3 <= 0.03
|   |   |   |   |   |--- aug2 <= 0.52
|   |   |   |   |   |   |--- op2 <= 0.51
|   |   |   |   |   |   |   |--- value: [84.15]
|   |   |   |   |   |   |--- op2 >  0.51
|   |   |   |   |   |   |   |--- op2 <= 0.80
|   |   |   |   |   |   |   |   |--- value: [83.35]
|   |   |   |   |   |   |   |--- op2 >  0.80
|   |   |   |   |   |   |   |   |--- value: [83.39]
|   |   |   |   |   |--- aug2 >  0.52
|   |   |   |   |   |   |--- op2 <= 0.18
|   |   |   |   |   |   |   |--- op3 <= 0.10
|   |   |   |   |   |   |   |   |--- value: [82.89]
|   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":


Unnamed: 0,aug1,op1,aug2,op2,aug3,op3,seed,test acc
0,Rotate,22.000000,TranslateY,2.000000,TranslateX,7.000000,7,82.84
1,ShearY,0.126119,Brightness,1.314937,HorizontalFlip,0.603191,7,82.58
2,VerticalFlip,0.981939,Color,0.469010,ShearX,0.041533,7,82.70
3,Sharpness,0.668222,Solarize,0.428321,Color,0.953488,7,82.68
4,ShearX,0.136720,Color,0.443667,TranslateX,10.000000,7,82.62
...,...,...,...,...,...,...,...,...
445,VerticalFlip,0.850476,Color,0.594466,ShearX,0.104604,8,81.44
446,Solarize,0.813661,Brightness,0.918928,ShearX,0.126433,8,83.67
447,VerticalFlip,0.850476,TranslateY,3.000000,TranslateX,8.000000,8,83.22
448,ShearY,0.126119,TranslateY,2.000000,Rotate,-1.000000,8,84.24


Considerably worse performance when using as categorical

# exp 4: fit linear regression and DT regressor to dataset with neutral values

In [6]:
# apply nuetral values to missing values, everything but solarize has value of 0.0 when not applied. 
neutral = {'Brightness':0.0,
 'Color':0.0,
 'Contrast':0.0,
 'HorizontalFlip':0.0,
 'Rotate':0.0,
 'Sharpness':0.0,
 'ShearX':0.0,
 'ShearY':0.0,
 'Solarize':1.0,
 'TranslateX':0.0,
 'TranslateY':0.0,
 'VerticalFlip':0.0
          }

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import export_text, plot_tree

df_long['Solarize'] = df_long['Solarize'].apply(lambda x: 1.0 if x == 0.0 else x)
X = df_long[ops]
y = df_long["fitness"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=10)
reg = LinearRegression().fit(X_train, y_train)
print("LR score: ", reg.score(X_test, y_test))
coeffs.append(reg.coef_)
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
# print(cross_val_score(regressor, X, y, cv=10))
print("DT score: ", regressor.score(X_test, y_test))
regressor.get_depth()
tree = export_text(regressor, feature_names=list(ops))
print(tree)

LR score:  -0.10152499866110687
DT score:  0.15126819637791444
|--- Solarize <= 0.61
|   |--- ShearY <= 0.01
|   |   |--- Rotate <= 14.00
|   |   |   |--- Solarize <= 0.56
|   |   |   |   |--- Solarize <= 0.41
|   |   |   |   |   |--- VerticalFlip <= 0.49
|   |   |   |   |   |   |--- ShearX <= 0.10
|   |   |   |   |   |   |   |--- Contrast <= 1.43
|   |   |   |   |   |   |   |   |--- Color <= 0.22
|   |   |   |   |   |   |   |   |   |--- Solarize <= 0.15
|   |   |   |   |   |   |   |   |   |   |--- value: [82.37]
|   |   |   |   |   |   |   |   |   |--- Solarize >  0.15
|   |   |   |   |   |   |   |   |   |   |--- Contrast <= 0.48
|   |   |   |   |   |   |   |   |   |   |   |--- value: [83.23]
|   |   |   |   |   |   |   |   |   |   |--- Contrast >  0.48
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |--- Color >  0.22
|   |   |   |   |   |   |   |   |   |--- Sharpness <= 0.44
|   |   |   |   |   |   |   |   |   |   |--- Tra

We do not see any improvement chanigng solarize to 1 instead of zero... this is expected. 

In [7]:
df = pd.DataFrame(coeffs, columns = ops)
df

In [8]:
exps = ["/home/noah/ESSL/exps/iteration4/exp6_0", "/home/noah/ESSL/exps/iteration4/exp6_1"]
chromos = []
for e_i in exps:
    exps_i = glob.glob(os.path.join(exp_dir, "*/"))
    for e in exps_i:
        seed = e.split("/")[-2]
        with open(os.path.join(e, "outcomes.json"), "r") as f:
            results = json.load(f)
            for fitness, chromo in zip(results["pop_vals"], results["chromos"]):
                c = list(chain.from_iterable(chromo[1]))
                c.append(e_i.split("/")[-1])
                c.append(seed)
                c.append(fitness[1])
                chromos.append(c)
columns = list(chain.from_iterable([[f"aug{i}", f"op{i}"] for i in range(1, 4)]))
columns.append("experiment")
columns.append("seed")
columns.append("test acc")
columns

df = pd.DataFrame(chromos, columns=columns)
df

Unnamed: 0,aug1,op1,aug2,op2,aug3,op3,experiment,seed,test acc
0,Rotate,22.000000,TranslateY,2.000000,TranslateX,7.000000,exp6_0,7,82.84
1,ShearY,0.126119,Brightness,1.314937,HorizontalFlip,0.603191,exp6_0,7,82.58
2,VerticalFlip,0.981939,Color,0.469010,ShearX,0.041533,exp6_0,7,82.70
3,Sharpness,0.668222,Solarize,0.428321,Color,0.953488,exp6_0,7,82.68
4,ShearX,0.136720,Color,0.443667,TranslateX,10.000000,exp6_0,7,82.62
...,...,...,...,...,...,...,...,...,...
895,VerticalFlip,0.850476,Color,0.594466,ShearX,0.104604,exp6_1,8,81.44
896,Solarize,0.813661,Brightness,0.918928,ShearX,0.126433,exp6_1,8,83.67
897,VerticalFlip,0.850476,TranslateY,3.000000,TranslateX,8.000000,exp6_1,8,83.22
898,ShearY,0.126119,TranslateY,2.000000,Rotate,-1.000000,exp6_1,8,84.24
