In [198]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import linear_model
from sklearn.isotonic import IsotonicRegression
from ideas import intersection_over_union
from ideas import bb_intersection_over_union
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import cross_val_score
from ideas import al
from ideas import aw
import math
from sklearn.ensemble import BaggingRegressor

In [195]:
# Corner-format box columns. `box_columns` is an alias of the very same list.
columns = ['Xmin', 'Ymin', 'Xmax', 'Ymax']
box_columns = columns
box_columns_gd = [name + '_gd' for name in box_columns]

# Centre-format box columns: 'L', 'W', 'Xcenter', 'Ycenter'.
center_columns = ['L', 'W', 'X', 'Y']
# Positions of L, W, X and Y inside `center_columns`.
l, w, xc, yc = range(4)
center_columns_gd = [name + '_gd' for name in center_columns]

# Extra (non-coordinate) feature columns.
feature_columns = ['area']
feature_ratio_columns = []

# Full model-input column sets for the two box parameterisations.
box_columns_all = box_columns + box_columns_gd + feature_columns
center_columns_all = center_columns + center_columns_gd + feature_columns
center_columns_true = [name + '_true' for name in center_columns]
# Every distinct input column that gets rescaled (set => order not guaranteed).
scaler_columns = list(set(box_columns_all + center_columns_all))

# Target / id / prediction column names derived from the corner columns.
columns_true = [name + '_true' for name in columns]
columns_id = columns + ['itemId']
test_columns = [name + '_test' for name in columns]
test_columns_id = test_columns + ['itemId']
# Score threshold: training rows are kept only when score > theta.
theta = .5

In [106]:
def weighted_averaging(iid, df, np_data, avg, columns, back_up = None):
    """Store the weighted-average box of item ``iid`` in ``avg`` and return that row.

    Each row of the item is weighted by ``1 / (al(rows) * aw(rows)) ** 2``.
    If the weight sum is infinite, the plain column mean of the rows is used.
    If the item has no rows at all, the coordinates are copied from ``back_up``.

    Parameters
    ----------
    iid : item id; also the row label of ``avg`` that is written.
    df : DataFrame with an ``itemId`` column describing the rows of ``np_data``.
    np_data : 2-D array holding the box values (presumably ``df.values`` --
        TODO confirm for other callers).
    avg : DataFrame indexed by item id; its first ``len(columns)`` columns
        receive the coordinates and the following column receives ``iid``.
    columns : coordinate column names; only their count is used.
    back_up : DataFrame with ``itemId`` and the global ``box_columns``; required
        when the item has no rows in ``df``.

    Returns
    -------
    The updated ``avg.loc[iid]`` row (in every branch).
    """
    n_coords = len(columns)
    coord_labels = list(avg.columns[:n_coords])
    # Single-step .loc writes: the chained form `avg.loc[iid][k] = v` may write
    # to a temporary Series and leave `avg` untouched.
    avg.loc[iid, avg.columns[n_coords]] = iid

    is_item = (df['itemId'] == iid).values
    if len(np_data) == len(df):
        # Row-aligned data: select by position so that repeated index labels
        # (e.g. after pd.concat) cannot pick the wrong rows.
        vi = np_data[is_item]
    else:
        # Otherwise keep treating the index labels of `df` as positions in np_data.
        vi = np_data[df.index.values[is_item]]

    if vi.shape[0] == 0:
        fallback = back_up.loc[back_up['itemId'] == iid, box_columns]
        for pos, label in enumerate(coord_labels):
            # Assumes one back-up row per item; the first match is used -- TODO confirm.
            avg.loc[iid, label] = fallback[box_columns[pos]].iloc[0]
        return avg.loc[iid]

    # al/aw come from `ideas`; presumably per-row box length and width -- TODO confirm.
    lengths = al(vi)
    widths = aw(vi)
    sabx = (lengths * widths) ** 2
    sb = sum(1 / sabx)
    if math.isinf(sb):
        # A zero-sized box makes its weight infinite: fall back to the plain mean.
        values = vi.mean(axis=0)
    else:
        values = [sum(vi[:, pos] / sabx) / sb for pos in range(n_coords)]
    for pos, label in enumerate(coord_labels):
        avg.loc[iid, label] = values[pos]

    return avg.loc[iid]

def add_center_columns(df):
    """Add centre-format ``*_true`` columns to ``df`` in place and return it.

    From the corner columns ``Xmin_true``, ``Ymin_true``, ``Xmax_true`` and
    ``Ymax_true`` this derives:

    * ``L_true`` / ``W_true`` -- extent along X / Y (max minus min),
    * ``X_true`` / ``Y_true`` -- midpoint along X / Y.
    """
    x_lo, x_hi = df['Xmin_true'], df['Xmax_true']
    y_lo, y_hi = df['Ymin_true'], df['Ymax_true']
    df['L_true'] = x_hi - x_lo
    df['W_true'] = y_hi - y_lo
    df['X_true'] = (x_hi + x_lo) / 2
    df['Y_true'] = (y_hi + y_lo) / 2
    return df

In [157]:
# Read the raw tables; the training answers also get the derived
# centre-format *_true columns.
te_d = pd.read_csv('test_data_full.csv')
t_d = pd.read_csv('train_data_full.csv')
t_a = add_center_columns(pd.read_csv('train_answers.csv'))

# Attach the answer to every training row, then drop black-listed items
# (the black-list is currently empty).
removeIds = []
t_d_a = t_d.merge(t_a, on="itemId")
t_d_a = t_d_a.drop(t_d_a[t_d_a['itemId'].isin(removeIds)].index)
ids = list(set(te_d['itemId']))

# One min-max scaler fitted on the stacked train + test inputs, so both
# tables are mapped with the same per-column range.
scaler_fit_d = np.concatenate(
    [t_d_a[scaler_columns].values, te_d[scaler_columns].values], axis=0)
scaler = MinMaxScaler().fit(scaler_fit_d)
t_d_a[scaler_columns] = scaler.transform(t_d_a[scaler_columns])
te_d[scaler_columns] = scaler.transform(te_d[scaler_columns])

# Keep only the training rows whose score exceeds the threshold.
t_d_a = t_d_a[t_d_a['score'] > theta]

In [213]:
def my_custom_loss_func(y_true, y_pred):
    """Return the mean ``bb_intersection_over_union`` over paired rows.

    Despite the name this is a score (higher is better), which matches the
    default direction of ``make_scorer``.

    Parameters
    ----------
    y_true : DataFrame or 2-D array of reference boxes, one box per row.
    y_pred : 2-D array of predicted boxes, row-aligned with ``y_true``.

    Returns
    -------
    float : mean of the per-row IoU values.
    """
    # Normalising both inputs to arrays lets one code path serve DataFrames and
    # ndarrays. The former ndarray branch passed the whole `y_true` array to
    # the IoU helper instead of row i.
    true_boxes = np.asarray(y_true)
    pred_boxes = np.asarray(y_pred)
    iou = np.zeros(len(true_boxes))
    for i in range(len(pred_boxes)):
        iou[i] = bb_intersection_over_union(true_boxes[i], pred_boxes[i])
    return iou.mean()

In [174]:
# Fit a multi-task Lasso on the corner-format features and report the mean
# IoU on a 20 % hold-out split.
random = 71
# KFold without shuffle=True never used random_state, and current scikit-learn
# raises on that combination, so the seed is not passed; the folds stay the
# same contiguous, unshuffled splits.
kf = KFold(n_splits=5)
clf = linear_model.MultiTaskLassoCV(alphas=np.logspace(-4, 0, 15), cv=kf,
                                    max_iter=10000)
# `columns_ratio` is not defined anywhere in the notebook; `feature_ratio_columns`
# is the name the configuration cell defines -- presumably a rename, TODO confirm.
X = t_d_a[box_columns_all + feature_ratio_columns]
y = t_d_a
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=random)
clf.fit(X_train, y_train[columns_true])
y_test_pred = clf.predict(X_test)

# `columns_id` / `columns_true` replace the undefined `box_columns_id` /
# `box_columns_true`.
y_test_pred_pd = pd.DataFrame(data=np.c_[y_test_pred, y_test['itemId']], columns=columns_id)
y_test_pred_pd['itemId'] = pd.to_numeric(y_test_pred_pd['itemId'], downcast='integer')
# NOTE(review): itemId appears to repeat across rows, so this merge pairs every
# prediction with every answer row of the same item -- confirm that weighting
# is intended.
data = y_test[columns_true + ['itemId']].merge(y_test_pred_pd, on=['itemId'])
data["iou"] = data[columns + columns_true].apply(intersection_over_union, axis=1)
print(data["iou"].mean())

0.5744559512262829


In [175]:
# Distinct item ids in the evaluation frame, then the ids that have at least
# one row with IoU below 0.2.
ids = list(set(data['itemId']))
print(set(data.loc[data['iou'] < .2, 'itemId']))
# print(data[data['itemId']  == 25135])

{33665, 4322, 24450, 31749, 2757, 17191, 32551, 31033, 3019, 23372, 5389, 23342, 8398, 12016, 22483, 23416, 32281, 11258, 16891, 13855}


In [224]:
# Fit a multi-task ElasticNet on the centre-format features (L, W, X, Y),
# convert its predictions back to corners and report the mean IoU on a 20 %
# hold-out split.
random = 71
# KFold without shuffle=True never used random_state, and current scikit-learn
# raises on that combination, so the seed is not passed.
kf = KFold(n_splits=5)
# `seed` is only assigned in a later cell, so this cell's own `random` is used.
# With the default cyclic coordinate selection the value does not influence the fit.
clf_center = linear_model.MultiTaskElasticNetCV(l1_ratio=0.9, eps=0.0001, cv=kf,
                                                max_iter=10000, random_state=random)
# `columns_ratio` is not defined anywhere in the notebook; `feature_ratio_columns`
# is the name the configuration cell defines -- presumably a rename, TODO confirm.
X = t_d_a[center_columns_all + feature_ratio_columns]
y = t_d_a
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=random)
clf_center.fit(X_train, y_train[center_columns_true])
center_pred = clf_center.predict(X_test)
# Centre format -> Xmin, Ymin, Xmax, Ymax
y_test_pred = np.c_[center_pred[:, xc] - center_pred[:, l] / 2,
                    center_pred[:, yc] - center_pred[:, w] / 2,
                    center_pred[:, xc] + center_pred[:, l] / 2,
                    center_pred[:, yc] + center_pred[:, w] / 2]
# `test_columns_id`, `test_columns` and `columns_true` replace the undefined
# `box_test_columns_id`, `box_test_columns` and `box_columns_true`.
y_test_pred_pd = pd.DataFrame(data=np.c_[y_test_pred, y_test['itemId']], columns=test_columns_id)
y_test_pred_pd['itemId'] = pd.to_numeric(y_test_pred_pd['itemId'], downcast='integer')
# NOTE(review): itemId appears to repeat across rows, so this merge pairs every
# prediction with every answer row of the same item -- confirm that weighting
# is intended.
data = y_test[columns_true + ['itemId']].merge(y_test_pred_pd, on=['itemId'])
data["iou"] = data[test_columns + columns_true].apply(intersection_over_union, axis=1)
print(data["iou"].mean())

0.5668102127693141


In [226]:
# Compare three linear models, plain and bagged, by cross-validated mean IoU.
seed = 31
# KFold without shuffle=True never used random_state, and current scikit-learn
# raises on that combination, so the seed is not passed.
kf = KFold(n_splits=5)
lasso = linear_model.MultiTaskLassoCV(alphas=np.logspace(-4, 0, 20), cv=kf, max_iter=10000,
                                      n_jobs=-1, random_state=seed)
ridge = linear_model.RidgeCV(alphas=np.logspace(-4, 0, 20), cv=kf)
net = linear_model.MultiTaskElasticNetCV(l1_ratio=0.9, eps=0.0001, cv=kf,
                                         max_iter=10000, random_state=seed)
clf_array = [lasso, ridge, net]
X = t_d_a[box_columns_all + feature_ratio_columns]
y = t_d_a
# my_custom_loss_func returns a mean IoU, so the default greater-is-better applies.
iou_scorer = make_scorer(my_custom_loss_func)
# The loop variable is deliberately not called `clf`: that name holds the fitted
# box model used for the submission, and these estimators are never fitted in
# place (cross_val_score works on clones).
for estimator in clf_array:
    vanilla_scores = cross_val_score(estimator, X, y[columns_true], cv=kf, n_jobs=-1,
                                     scoring=iou_scorer)
    bagging_clf = BaggingRegressor(estimator, max_samples=0.4, random_state=seed, n_jobs=-1)
    bagging_scores = cross_val_score(bagging_clf, X, y[columns_true], cv=kf, n_jobs=-1,
                                     scoring=iou_scorer)
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]\n".format(
        estimator.__class__.__name__, vanilla_scores.mean(), vanilla_scores.std()))
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(
        estimator.__class__.__name__, bagging_scores.mean(), bagging_scores.std()))

Mean of: 0.552, std: (+/-) 0.010 [MultiTaskLassoCV]

Mean of: 0.555, std: (+/-) 0.008 [Bagging MultiTaskLassoCV]

Mean of: 0.549, std: (+/-) 0.010 [RidgeCV]

Mean of: 0.552, std: (+/-) 0.008 [Bagging RidgeCV]

Mean of: 0.558, std: (+/-) 0.005 [MultiTaskElasticNetCV]

Mean of: 0.557, std: (+/-) 0.005 [Bagging MultiTaskElasticNetCV]



In [168]:
# Distinct item ids in the evaluation frame, then the ids that have at least
# one row with IoU below 0.2.
ids = list(set(data['itemId']))
print(set(data.loc[data['iou'] < .2, 'itemId']))

{33665, 4322, 24450, 31749, 2757, 17191, 32551, 31033, 3019, 23372, 23342, 12016, 22483, 32281, 11258, 13855}


In [169]:
# Predict test boxes with both models. `clf` must be the fitted corner-format
# model and `clf_center` the fitted centre-format model from the training cells.
# `columns_ratio` is not defined anywhere in the notebook; `feature_ratio_columns`
# is the name the configuration cell defines -- presumably a rename, TODO confirm.
X_submit_box = te_d[box_columns_all + feature_ratio_columns]
y_submit_box = clf.predict(X_submit_box)
X_submit_center = te_d[center_columns_all + feature_ratio_columns]
center_submit = clf_center.predict(X_submit_center)
# Centre format -> Xmin, Ymin, Xmax, Ymax
y_submit_center = np.c_[center_submit[:, xc] - center_submit[:, l] / 2,
                        center_submit[:, yc] - center_submit[:, w] / 2,
                        center_submit[:, xc] + center_submit[:, l] / 2,
                        center_submit[:, yc] + center_submit[:, w] / 2]

# `columns_id` replaces the undefined `box_columns_id`. Both frames share the
# same column names so they can be pooled later.
y_submit_pd_box = pd.DataFrame(data=np.c_[y_submit_box, te_d['itemId']], columns=columns_id)
y_submit_pd_box['itemId'] = pd.to_numeric(y_submit_pd_box['itemId'], downcast='integer')
y_submit_pd_center = pd.DataFrame(data=np.c_[y_submit_center, te_d['itemId']], columns=columns_id)
y_submit_pd_center['itemId'] = pd.to_numeric(y_submit_pd_center['itemId'], downcast='integer')

In [170]:
# Spot-check one item: corner-model rows first, then centre-model rows.
print(y_submit_pd_box.loc[y_submit_pd_box['itemId'] == 4099])
print(y_submit_pd_center.loc[y_submit_pd_center['itemId'] == 4099])

        Xmin        Ymin        Xmax         Ymax  itemId
0  77.284505  834.176848  403.467454  1118.765395    4099
1  74.968861  831.994653  399.640068  1120.642465    4099
2  74.953123  832.829583  401.386052  1124.102984    4099
        Xmin        Ymin        Xmax         Ymax  itemId
0  77.282745  833.468816  401.483732  1117.800286    4099
1  75.965920  832.607623  401.228075  1119.956980    4099
2  75.337940  832.735574  402.465928  1123.161499    4099


In [179]:
# Pool the predictions of both models and average them per item.
result = pd.concat([y_submit_pd_box, y_submit_pd_center])
# Disabled alternative: per-item weighted_averaging instead of the plain mean.
# ids = list(set(result['itemId']))
# answers = pd.DataFrame(np.zeros((len(ids), len(box_columns_id)), dtype=int), index=ids, columns=box_columns_id)
# y_submit = result.values
# for iid in ids:
#     weighted_averaging(iid, result, y_submit, answers, columns)
answers = result.groupby('itemId', as_index=False).mean()
# Submission file: itemId followed by the four corners, no header and no index.
answers[['itemId'] + columns].to_csv('submit2601.csv', index=False, header=False)
# Compare against a reference submission that carries the *_true corner columns.
data = answers.merge(pd.read_csv("submit-max.csv"), on=['itemId'])
data["iou"] = data[columns + columns_true].apply(intersection_over_union, axis=1)
data["iou"].mean()

0.8476452700582842